### Install

In [1]:
!pip install tqdm
!pip install lightgbm
!pip install xgboost
!pip install vecstack
!pip install scikit-optimize

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Import

In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, LogN, Real
import scipy.stats as stats
from sklearn.utils.fixes import loguniform

from sklearn import linear_model, svm, ensemble
from xgboost import XGBRegressor
import lightgbm

from vecstack import StackingTransformer

from collections import Counter

from tqdm import tqdm

### Prepare the data

In [3]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/220919/fifa_pay_prediction/03_work_code/Querkowe

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/220919/fifa_pay_prediction/03_work_code/Querkowe


In [4]:
train_data = pd.read_csv('../../00_original_data/FIFA_train.csv')

In [5]:
#contract_until 변수 int 형으로 변환
# 계약 연도만 추출
def func(string:object) -> int:
    """계약 연도만 추출하여 int로 반환"""
    string = string[-4:]
    return int(string)


train_data['contract_until'] = train_data['contract_until'].apply(func)

In [6]:
x_train, x_test, y_train, y_test = train_test_split(train_data[train_data.keys().drop(['id', 'name', 'value'])], train_data['value'], random_state=42)

#### Pipeline Preprocess

In [7]:
numeric_features = ['age', 'reputation', 'stat_overall', 'stat_potential', 'stat_skill_moves']
numeric_transformer = StandardScaler() # RobustScaler

categorical_features = ['continent', 'position', 'prefer_foot']
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore') 

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [8]:
preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

preprocessor_pipe.fit(x_train)

x_train_transformed = preprocessor_pipe.transform(x_train)

### Random Search

In [9]:
svr_param_grid = {'C' : loguniform(1e-1, 1e3), 
             'gamma' : loguniform(1e-4, 1e0),
             'kernel' : ['rbf']}

svr_grid = RandomizedSearchCV(svm.SVR(), svr_param_grid, refit=True, verbose=2)

svr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', svr_grid.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END C=1.0683930644884565, gamma=0.0009364150677682173, kernel=rbf; total time=   3.2s
[CV] END C=1.0683930644884565, gamma=0.0009364150677682173, kernel=rbf; total time=   3.3s
[CV] END C=1.0683930644884565, gamma=0.0009364150677682173, kernel=rbf; total time=   5.5s
[CV] END C=1.0683930644884565, gamma=0.0009364150677682173, kernel=rbf; total time=   5.6s
[CV] END C=1.0683930644884565, gamma=0.0009364150677682173, kernel=rbf; total time=   6.1s
[CV] END C=16.803646390985232, gamma=0.0811957710059441, kernel=rbf; total time=   3.7s
[CV] END C=16.803646390985232, gamma=0.0811957710059441, kernel=rbf; total time=   2.8s
[CV] END C=16.803646390985232, gamma=0.0811957710059441, kernel=rbf; total time=   3.3s
[CV] END C=16.803646390985232, gamma=0.0811957710059441, kernel=rbf; total time=   3.5s
[CV] END C=16.803646390985232, gamma=0.0811957710059441, kernel=rbf; total time=   2.7s
[CV] END C=0.31140694890520054, gamma=0.0019

In [10]:
lsvr_param_grid = {'C' : loguniform(1e-1, 1e3)
                   , 'random_state' : [42]}

lsvr_grid = RandomizedSearchCV(svm.LinearSVR(), lsvr_param_grid, refit=True, verbose=2)

lsvr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', lsvr_grid.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END .............C=0.11098748365891312, random_state=42; total time=   0.0s
[CV] END .............C=0.11098748365891312, random_state=42; total time=   0.0s
[CV] END .............C=0.11098748365891312, random_state=42; total time=   0.0s
[CV] END .............C=0.11098748365891312, random_state=42; total time=   0.0s
[CV] END .............C=0.11098748365891312, random_state=42; total time=   0.0s
[CV] END ..............C=16.371258949096802, random_state=42; total time=   0.0s
[CV] END ..............C=16.371258949096802, random_state=42; total time=   0.0s
[CV] END ..............C=16.371258949096802, random_state=42; total time=   0.0s
[CV] END ..............C=16.371258949096802, random_state=42; total time=   0.0s
[CV] END ..............C=16.371258949096802, random_state=42; total time=   0.0s
[CV] END .............C=0.11171714662708848, random_state=42; total time=   0.0s
[CV] END .............C=0.11171714662708848, ran

In [13]:
xtr_param_grid = {
    'n_estimators' : stats.randint(100, 1000)
    , 'n_jobs' : [-1]
    , 'max_depth' : stats.randint(-1, 5)
    , 'random_state' : [42]
}

xtr_grid = RandomizedSearchCV(ensemble.ExtraTreesRegressor(), xtr_param_grid, refit=True, verbose=2)

xtr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', xtr_grid.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=1, n_estimators=339, n_jobs=-1, random_state=42; total time=   2.2s
[CV] END max_depth=1, n_estimators=339, n_jobs=-1, random_state=42; total time=   0.8s
[CV] END max_depth=1, n_estimators=339, n_jobs=-1, random_state=42; total time=   0.8s
[CV] END max_depth=1, n_estimators=339, n_jobs=-1, random_state=42; total time=   0.8s
[CV] END max_depth=1, n_estimators=339, n_jobs=-1, random_state=42; total time=   0.8s
[CV] END max_depth=4, n_estimators=815, n_jobs=-1, random_state=42; total time=   2.9s
[CV] END max_depth=4, n_estimators=815, n_jobs=-1, random_state=42; total time=   3.0s
[CV] END max_depth=4, n_estimators=815, n_jobs=-1, random_state=42; total time=   3.0s
[CV] END max_depth=4, n_estimators=815, n_jobs=-1, random_state=42; total time=   3.0s
[CV] END max_depth=4, n_estimators=815, n_jobs=-1, random_state=42; total time=   3.0s
[CV] END max_depth=2, n_estimators=138, n_jobs=-1, random_state=42; t

In [14]:
rfr_param_grid = {
    'n_estimators' : stats.randint(100, 1000)
    , 'n_jobs' : [-1]
    , 'max_depth' : stats.randint(-1, 5)
    , 'random_state' : [42]
}

rfr_grid = RandomizedSearchCV(ensemble.RandomForestRegressor(), rfr_param_grid, refit=True, verbose=2)

rfr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', rfr_grid.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=2, n_estimators=779, n_jobs=-1, random_state=42; total time=   2.5s
[CV] END max_depth=2, n_estimators=779, n_jobs=-1, random_state=42; total time=   2.5s
[CV] END max_depth=2, n_estimators=779, n_jobs=-1, random_state=42; total time=   2.5s
[CV] END max_depth=2, n_estimators=779, n_jobs=-1, random_state=42; total time=   2.5s
[CV] END max_depth=2, n_estimators=779, n_jobs=-1, random_state=42; total time=   2.4s
[CV] END max_depth=0, n_estimators=130, n_jobs=-1, random_state=42; total time=   0.1s
[CV] END max_depth=0, n_estimators=130, n_jobs=-1, random_state=42; total time=   1.1s
[CV] END max_depth=0, n_estimators=130, n_jobs=-1, random_state=42; total time=   1.1s
[CV] END max_depth=0, n_estimators=130, n_jobs=-1, random_state=42; total time=   1.1s
[CV] END max_depth=0, n_estimators=130, n_jobs=-1, random_state=42; total time=   1.1s
[CV] END max_depth=4, n_estimators=443, n_jobs=-1, random_state=42; t

In [15]:
gbr_param_grid = {
    'n_estimators' : stats.randint(100, 1000)
    , 'max_depth' : stats.randint(-1, 5)
    , 'learning_rate' : loguniform(1e-3, 1e-1)
    , 'random_state' : [42]
}

gbr_grid = RandomizedSearchCV(ensemble.GradientBoostingRegressor(), gbr_param_grid, refit=True, verbose=2)

gbr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', gbr_grid.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END learning_rate=0.06228578707760924, max_depth=2, n_estimators=663, random_state=42; total time=   1.9s
[CV] END learning_rate=0.06228578707760924, max_depth=2, n_estimators=663, random_state=42; total time=   1.9s
[CV] END learning_rate=0.06228578707760924, max_depth=2, n_estimators=663, random_state=42; total time=   1.9s
[CV] END learning_rate=0.06228578707760924, max_depth=2, n_estimators=663, random_state=42; total time=   1.9s
[CV] END learning_rate=0.06228578707760924, max_depth=2, n_estimators=663, random_state=42; total time=   1.9s
[CV] END learning_rate=0.049577508855568365, max_depth=3, n_estimators=925, random_state=42; total time=   3.7s
[CV] END learning_rate=0.049577508855568365, max_depth=3, n_estimators=925, random_state=42; total time=   3.8s
[CV] END learning_rate=0.049577508855568365, max_depth=3, n_estimators=925, random_state=42; total time=   3.7s
[CV] END learning_rate=0.049577508855568365, max

In [16]:
xgbr_param_grid = {
    'n_estimators' : stats.randint(100, 1000)
    , 'gamma' : loguniform(1e-3, 1e-1)
    , 'max_depth' : stats.randint(-1, 5)
    , 'learning_rate' : loguniform(1e-3, 1e-1)
    , 'random_state' : [42]
}

xgbr_grid = RandomizedSearchCV(XGBRegressor(), xgbr_param_grid, refit=True, verbose=2)

xgbr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', xgbr_grid.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END gamma=0.033627692968982395, learning_rate=0.0013547836593743741, max_depth=0, n_estimators=121, random_state=42; total time=   0.3s
[CV] END gamma=0.033627692968982395, learning_rate=0.0013547836593743741, max_depth=0, n_estimators=121, random_state=42; total time=   0.1s
[CV] END gamma=0.033627692968982395, learning_rate=0.0013547836593743741, max_depth=0, n_estimators=121, random_state=42; total time=   0.1s
[CV] END gamma=0.033627692968982395, learning_rate=0.0013547836593743741, max_depth=0, n_estimators=121, random_state=42; total time=   0.1s
[CV] END gamma=0.033627692968982395, learning_rate=0.0013547836593743741, max_depth=0, n_estimators=121, random_state=42; total time=   0.1s
[CV] END gamma=0.04586484324176051, learning_rate=0.06867462147439204, max_depth=4, n_estimators=628, random_state=42; total time=   2.3s
[CV] END gamma=0.04586484324176051, learning_rate=0.06867462147439204, max_depth=4, n_estimators

In [17]:
# Light GBM

lgbr_param_grid = {
    'boosting_type' : ['gbdt', 'dart', 'rf', 'goss']
    , 'gamma' : loguniform(1e-3, 1e-1)
    , 'max_depth' : stats.randint(-1, 5)
    , 'learning_rate' : loguniform(1e-3, 1e0)
    , 'random_state' : [42]
}

lgbr_grid = RandomizedSearchCV(lightgbm.LGBMRegressor(), lgbr_param_grid, refit=True, verbose=2)

lgbr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', lgbr_grid.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END boosting_type=goss, gamma=0.011466240935797812, learning_rate=0.007896941231469617, max_depth=4, random_state=42; total time=   0.1s
[CV] END boosting_type=goss, gamma=0.011466240935797812, learning_rate=0.007896941231469617, max_depth=4, random_state=42; total time=   0.1s
[CV] END boosting_type=goss, gamma=0.011466240935797812, learning_rate=0.007896941231469617, max_depth=4, random_state=42; total time=   0.1s
[CV] END boosting_type=goss, gamma=0.011466240935797812, learning_rate=0.007896941231469617, max_depth=4, random_state=42; total time=   0.1s
[CV] END boosting_type=goss, gamma=0.011466240935797812, learning_rate=0.007896941231469617, max_depth=4, random_state=42; total time=   0.1s
[CV] END boosting_type=gbdt, gamma=0.033173639397717856, learning_rate=0.45702046798546164, max_depth=4, random_state=42; total time=   0.1s
[CV] END boosting_type=gbdt, gamma=0.033173639397717856, learning_rate=0.457020467985461

In [20]:
# SGD regressor

sgdr_param_grid = {
    'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive']
    , 'alpha' : loguniform(1e-5, 1e-2)
    , 'random_state' : [42]
}

sgdr_grid = RandomizedSearchCV(linear_model.SGDRegressor(), sgdr_param_grid, refit=True, verbose=2)

sgdr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', sgdr_grid.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END alpha=0.004484585452411359, learning_rate=invscaling, random_state=42; total time=   0.0s
[CV] END alpha=0.004484585452411359, learning_rate=invscaling, random_state=42; total time=   0.0s
[CV] END alpha=0.004484585452411359, learning_rate=invscaling, random_state=42; total time=   0.0s
[CV] END alpha=0.004484585452411359, learning_rate=invscaling, random_state=42; total time=   0.0s
[CV] END alpha=0.004484585452411359, learning_rate=invscaling, random_state=42; total time=   0.0s
[CV] END alpha=0.0001166678101674488, learning_rate=optimal, random_state=42; total time=   0.1s
[CV] END alpha=0.0001166678101674488, learning_rate=optimal, random_state=42; total time=   0.1s
[CV] END alpha=0.0001166678101674488, learning_rate=optimal, random_state=42; total time=   0.1s
[CV] END alpha=0.0001166678101674488, learning_rate=optimal, random_state=42; total time=   0.1s
[CV] END alpha=0.0001166678101674488, learning_rate=opti

In [21]:
linr_param_grid = {
    'copy_X' : [True]
    , 'n_jobs' : [-1]
}

linr_grid = RandomizedSearchCV(linear_model.LinearRegression(), linr_param_grid, refit=True, verbose=2)

linr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', linr_grid.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .............................copy_X=True, n_jobs=-1; total time=   0.0s
[CV] END .............................copy_X=True, n_jobs=-1; total time=   0.0s
[CV] END .............................copy_X=True, n_jobs=-1; total time=   0.0s
[CV] END .............................copy_X=True, n_jobs=-1; total time=   0.0s
[CV] END .............................copy_X=True, n_jobs=-1; total time=   0.0s
The best parameters are  {'n_jobs': -1, 'copy_X': True}
