### Install

In [1]:
!pip install tqdm
!pip install lightgbm
!pip install xgboost
!pip install vecstack
!pip install scikit-optimize

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Import

In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, LogN, Real
import scipy.stats as stats

from sklearn import linear_model, svm, ensemble
from xgboost import XGBRegressor
import lightgbm

from vecstack import StackingTransformer

from collections import Counter

from tqdm import tqdm

from sklearn.utils.fixes import loguniform

### Prepare the data

In [3]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/220919/fifa_pay_prediction/03_work_code/Querkowe

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/220919/fifa_pay_prediction/03_work_code/Querkowe


In [4]:
train_data = pd.read_csv('../../00_original_data/FIFA_train.csv')

In [5]:
train_data.head()

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves,value
0,0,L. Messi,31,south america,2021,ST,left,5.0,94,94,4.0,110500000.0
1,3,De Gea,27,europe,2020,GK,right,4.0,91,93,1.0,72000000.0
2,7,L. Suárez,31,south america,2021,ST,right,5.0,91,91,3.0,80000000.0
3,8,Sergio Ramos,32,europe,2020,DF,right,4.0,91,91,3.0,51000000.0
4,9,J. Oblak,25,europe,2021,GK,right,3.0,90,93,1.0,68000000.0


In [6]:
train_data.shape

(8932, 12)

In [7]:
#contract_until 변수 int 형으로 변환
# 계약 연도만 추출
def func(string:object) -> int:
    """계약 연도만 추출하여 int로 반환"""
    string = string[-4:]
    return int(string)


train_data['contract_until'] = train_data['contract_until'].apply(func)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(train_data[train_data.keys().drop(['id', 'name', 'value'])], train_data['value'], random_state=42)

In [9]:
x_train.head()

Unnamed: 0,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves
3100,29,south america,2019,DF,right,1.0,70,70,2.0
8813,20,europe,2019,DF,right,1.0,52,63,2.0
847,24,europe,2021,DF,left,2.0,76,79,3.0
7395,17,europe,2021,DF,right,1.0,61,83,2.0
8681,18,asia,2022,MF,right,1.0,53,72,2.0


In [10]:
x_test.shape

(2233, 9)

#### Pipeline Preprocess

In [11]:
numeric_features = ['age', 'reputation', 'stat_overall', 'stat_potential', 'stat_skill_moves']
numeric_transformer = StandardScaler() # RobustScaler

categorical_features = ['continent', 'position', 'prefer_foot']
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore') 

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [12]:
preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

preprocessor_pipe.fit(x_train)

x_train_transformed = preprocessor_pipe.transform(x_train)

### Bayesian Search

In [13]:
svr_param_grid = {'C' : Real(0.1, 1000, 'log-uniform'), 
             'gamma' : Real(1e-4, 1, 'log-uniform'),
             'kernel' : ['rbf']}

svr_hpo = BayesSearchCV(svm.SVR(), svr_param_grid, refit=True, verbose=2, n_jobs=-1, cv=5)

svr_hpo.fit(x_train_transformed, y_train)
print('The best parameters are ', svr_hpo.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [14]:
lsvr_param_grid = {'C' : Real(0.1, 1000, prior='log-uniform')
                   , 'random_state' : [42]
                   }

lsvr_hpo = BayesSearchCV(svm.LinearSVR(), lsvr_param_grid, refit=True, verbose=2, n_jobs=-1, n_iter=72, cv=5)

lsvr_hpo.fit(x_train_transformed, y_train)
print('The best parameters are ', lsvr_hpo.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [18]:
xtr_param_grid = {
    'n_estimators' : Integer(10, 1000, 'log-uniform')
    , 'n_jobs' : [-1]
    , 'max_depth' : Integer(1, 8)
    , 'random_state' : [42]
}

xtr_hpo = BayesSearchCV(ensemble.ExtraTreesRegressor(), xtr_param_grid, refit=True, verbose=2, n_jobs=-1, n_iter=72, cv=5)

xtr_hpo.fit(x_train_transformed, y_train)
print('The best parameters are ', xtr_hpo.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [20]:
rfr_param_grid = {
    'n_estimators' : Integer(10, 1000, 'log-uniform')
    , 'n_jobs' : [-1]
    , 'max_depth' : Integer(1, 8)
    , 'random_state' : [42]
}

rfr_hpo = BayesSearchCV(ensemble.RandomForestRegressor(), rfr_param_grid, refit=True, verbose=2, n_jobs=-1, n_iter=72, cv=5)

rfr_hpo.fit(x_train_transformed, y_train)
print('The best parameters are ', rfr_hpo.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [22]:
gbr_param_grid = {
    'n_estimators' : Integer(10, 1000, 'log-uniform')
    , 'max_depth' : Integer(1, 8)
    , 'learning_rate' : Real(1e-3, 1e-1, 'log-uniform')
    , 'random_state' : [42]
}

gbr_hpo = BayesSearchCV(ensemble.GradientBoostingRegressor(), gbr_param_grid, refit=True, verbose=2, n_jobs=-1, n_iter=72, cv=5)

gbr_hpo.fit(x_train_transformed, y_train)
print('The best parameters are ', gbr_hpo.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [23]:
xgbr_param_grid = {
    'n_estimators' : Integer(10, 1000, 'log-uniform')
    , 'gamma' : Real(1e-3, 1e-1, 'log-uniform')
    , 'max_depth' : Integer(1, 8)
    , 'learning_rate' : Real(1e-3, 1e-1, 'log-uniform')
    , 'random_state' : [42]
}

xgbr_hpo = BayesSearchCV(XGBRegressor(), xgbr_param_grid, refit=True, verbose=2, n_jobs=-1, n_iter=72, cv=5)

xgbr_hpo.fit(x_train_transformed, y_train)
print('The best parameters are ', xgbr_hpo.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [26]:
# Light GBM

lgbr_param_grid = {
    'boosting_type' : ['gbdt', 'dart', 'goss']
    , 'gamma' : Real(1e-3, 1e-1, 'log-uniform')
    , 'max_depth' : Integer(1, 8)
    , 'learning_rate' : Real(1e-3, 1, 'log-uniform')
    , 'random_state' : [42]
}

lgbr_hpo = BayesSearchCV(lightgbm.LGBMRegressor(), lgbr_param_grid, refit=True, verbose=2, n_jobs=-1, n_iter=72, cv=5)

lgbr_hpo.fit(x_train_transformed, y_train)
print('The best parameters are ', lgbr_hpo.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [29]:
# Light GBM

lgbr_param_grid = {
    'boosting_type' : ['rf']
    , 'gamma' : Real(1e-3, 1e-1, 'log-uniform')
    , 'max_depth' : Integer(1, 8)
    , 'learning_rate' : Real(1e-3, 1, 'log-uniform')
    , 'random_state' : [42]
    , 'baggeng_freq' : [1]
    , 'bagging_fraction' : Real(0.1, 0.9)
}

lgbr_hpo = BayesSearchCV(lightgbm.LGBMRegressor(), lgbr_param_grid, refit=True, verbose=2, n_jobs=-1, n_iter=72, cv=5)

lgbr_hpo.fit(x_train_transformed, y_train)
print('The best parameters are ', lgbr_hpo.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


LightGBMError: ignored

In [30]:
# SGD regressor

sgdr_param_grid = {
    'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive']
    , 'alpha' : Real(1e-4, 1e-2, 'log-uniform')
    , 'random_state' : [42]
}

sgdr_hpo = BayesSearchCV(linear_model.SGDRegressor(), sgdr_param_grid, refit=True, verbose=2, n_jobs=-1, n_iter=72, cv=5)

sgdr_hpo.fit(x_train_transformed, y_train)
print('The best parameters are ', sgdr_hpo.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [31]:
linr_param_grid = {
    'copy_X' : [True]
    , 'n_jobs' : [-1]
}

linr_hpo = BayesSearchCV(linear_model.LinearRegression(), linr_param_grid, refit=True, verbose=2, n_jobs=-1, n_iter=72, cv=5)

linr_hpo.fit(x_train_transformed, y_train)
print('The best parameters are ', linr_hpo.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi