# Prediction By Machine Learning

### Install

In [1]:
!pip install tqdm
!pip install lightgbm
!pip install xgboost
!pip install vecstack
!pip install scikit-optimize

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vecstack
  Downloading vecstack-0.4.0.tar.gz (18 kB)
Building wheels for collected packages: vecstack
  Building wheel for vecstack (setup.py) ... [?25l[?25hdone
  Created wheel for vecstack: filename=vecstack-0.4.0-py3-none-any.whl size=19877 sha256=94cb4a75c61e90ab3f50613757965517443cf3b47cc6ccf61a176f5807cf3bb8
  Stored in directory: /root/.cache/pip/wheels/28/fe/0c/fe8e43660e3316d7ce204e59a79a72246c0ae9b6c5c79841c8
Successfully built vecstack
Installing collected packages: vecstack
Successfully installed vecstack-0.4.0
Looking in indexes: https://pypi.org/simple, https://us-python

### Import

In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, LogN, Real
import scipy.stats as stats

from sklearn import linear_model, svm, ensemble
from xgboost import XGBRegressor
import lightgbm

from vecstack import StackingTransformer

from collections import Counter

from tqdm import tqdm

### Prepare the data

In [3]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/220919/fifa_pay_prediction/03_work_code/Querkowe

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/220919/fifa_pay_prediction/03_work_code/Querkowe


In [4]:
train_data = pd.read_csv('../../00_original_data/FIFA_train.csv')

In [5]:
train_data.head()

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves,value
0,0,L. Messi,31,south america,2021,ST,left,5.0,94,94,4.0,110500000.0
1,3,De Gea,27,europe,2020,GK,right,4.0,91,93,1.0,72000000.0
2,7,L. Suárez,31,south america,2021,ST,right,5.0,91,91,3.0,80000000.0
3,8,Sergio Ramos,32,europe,2020,DF,right,4.0,91,91,3.0,51000000.0
4,9,J. Oblak,25,europe,2021,GK,right,3.0,90,93,1.0,68000000.0


In [6]:
train_data.shape

(8932, 12)

In [7]:
#contract_until 변수 int 형으로 변환
# 계약 연도만 추출
def func(string:object) -> int:
    """계약 연도만 추출하여 int로 반환"""
    string = string[-4:]
    return int(string)


train_data['contract_until'] = train_data['contract_until'].apply(func)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(train_data[train_data.keys().drop(['id', 'name', 'value'])], train_data['value'], random_state=42)

In [9]:
x_train.head()

Unnamed: 0,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves
3100,29,south america,2019,DF,right,1.0,70,70,2.0
8813,20,europe,2019,DF,right,1.0,52,63,2.0
847,24,europe,2021,DF,left,2.0,76,79,3.0
7395,17,europe,2021,DF,right,1.0,61,83,2.0
8681,18,asia,2022,MF,right,1.0,53,72,2.0


In [10]:
x_test.shape

(2233, 9)

#### Pipeline Preprocess

In [11]:
numeric_features = ['age', 'reputation', 'stat_overall', 'stat_potential', 'stat_skill_moves']
numeric_transformer = StandardScaler() # RobustScaler

categorical_features = ['continent', 'position', 'prefer_foot']
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore') 

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [12]:
preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

preprocessor_pipe.fit(x_train)

x_train_transformed = preprocessor_pipe.transform(x_train)

### Grid Search

In [13]:
svr_param_grid = {'C' : [0.1, 1, 10, 100, 1000], 
             'gamma' : [0.1],
             'kernel' : ['rbf']}

svr_grid = GridSearchCV(svm.SVR(), svr_param_grid, refit=True, verbose=2)

svr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', svr_grid.best_params_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.6s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.4s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   2.5s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   2.5s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   2.6s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   2.6s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   2.6s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   2.5s
[CV] END ......................C=0.1, gamma=0.0

In [14]:
lsvr_param_grid = {'C' : [0.1, 1, 10, 100, 1000]
                   , 'random_state' : [42]}

lsvr_grid = GridSearchCV(svm.LinearSVR(), lsvr_param_grid, refit=True, verbose=2)

lsvr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', lsvr_grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END .............................C=0.1, random_state=42; total time=   0.0s
[CV] END .............................C=0.1, random_state=42; total time=   0.0s
[CV] END .............................C=0.1, random_state=42; total time=   0.0s
[CV] END .............................C=0.1, random_state=42; total time=   0.0s
[CV] END .............................C=0.1, random_state=42; total time=   0.0s
[CV] END ...............................C=1, random_state=42; total time=   0.0s
[CV] END ...............................C=1, random_state=42; total time=   0.0s
[CV] END ...............................C=1, random_state=42; total time=   0.0s
[CV] END ...............................C=1, random_state=42; total time=   0.0s
[CV] END ...............................C=1, random_state=42; total time=   0.0s
[CV] END ..............................C=10, random_state=42; total time=   0.0s
[CV] END ..............................C=10, rand

In [15]:
xtr_param_grid = {
    'n_estimators' : [100, 200, 500, 1000]
    , 'n_jobs' : [-1]
    , 'max_depth' : [-1, 3, 5]
    , 'random_state' : [42]
}

xtr_grid = GridSearchCV(ensemble.ExtraTreesRegressor(), xtr_param_grid, refit=True, verbose=2)

xtr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', xtr_grid.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END max_depth=-1, n_estimators=100, n_jobs=-1, random_state=42; total time=   1.2s
[CV] END max_depth=-1, n_estimators=100, n_jobs=-1, random_state=42; total time=   1.0s
[CV] END max_depth=-1, n_estimators=100, n_jobs=-1, random_state=42; total time=   0.9s
[CV] END max_depth=-1, n_estimators=100, n_jobs=-1, random_state=42; total time=   0.9s
[CV] END max_depth=-1, n_estimators=100, n_jobs=-1, random_state=42; total time=   0.9s
[CV] END max_depth=-1, n_estimators=200, n_jobs=-1, random_state=42; total time=   1.0s
[CV] END max_depth=-1, n_estimators=200, n_jobs=-1, random_state=42; total time=   0.9s
[CV] END max_depth=-1, n_estimators=200, n_jobs=-1, random_state=42; total time=   1.0s
[CV] END max_depth=-1, n_estimators=200, n_jobs=-1, random_state=42; total time=   1.6s
[CV] END max_depth=-1, n_estimators=200, n_jobs=-1, random_state=42; total time=   2.1s
[CV] END max_depth=-1, n_estimators=500, n_jobs=-1, random_

In [16]:
rfr_param_grid = {
    'n_estimators' : [100, 200, 500, 1000]
    , 'n_jobs' : [-1]
    , 'max_depth' : [-1, 3, 5]
    , 'random_state' : [42]
}

rfr_grid = GridSearchCV(ensemble.RandomForestRegressor(), rfr_param_grid, refit=True, verbose=2)

rfr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', rfr_grid.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END max_depth=-1, n_estimators=100, n_jobs=-1, random_state=42; total time=   0.1s
[CV] END max_depth=-1, n_estimators=100, n_jobs=-1, random_state=42; total time=   1.0s
[CV] END max_depth=-1, n_estimators=100, n_jobs=-1, random_state=42; total time=   0.9s
[CV] END max_depth=-1, n_estimators=100, n_jobs=-1, random_state=42; total time=   1.0s
[CV] END max_depth=-1, n_estimators=100, n_jobs=-1, random_state=42; total time=   1.0s
[CV] END max_depth=-1, n_estimators=200, n_jobs=-1, random_state=42; total time=   1.0s
[CV] END max_depth=-1, n_estimators=200, n_jobs=-1, random_state=42; total time=   1.0s
[CV] END max_depth=-1, n_estimators=200, n_jobs=-1, random_state=42; total time=   1.0s
[CV] END max_depth=-1, n_estimators=200, n_jobs=-1, random_state=42; total time=   1.0s
[CV] END max_depth=-1, n_estimators=200, n_jobs=-1, random_state=42; total time=   1.0s
[CV] END max_depth=-1, n_estimators=500, n_jobs=-1, random_

In [17]:
gbr_param_grid = {
    'n_estimators' : [10, 100, 1000]
    , 'max_depth' : [-1, 3, 5]
    , 'learning_rate' : [0.001, 0.01, 0.1]
    , 'random_state' : [42]
}

gbr_grid = GridSearchCV(ensemble.GradientBoostingRegressor(), gbr_param_grid, refit=True, verbose=2)

gbr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', gbr_grid.best_params_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END learning_rate=0.001, max_depth=-1, n_estimators=10, random_state=42; total time=   0.0s
[CV] END learning_rate=0.001, max_depth=-1, n_estimators=10, random_state=42; total time=   0.0s
[CV] END learning_rate=0.001, max_depth=-1, n_estimators=10, random_state=42; total time=   0.0s
[CV] END learning_rate=0.001, max_depth=-1, n_estimators=10, random_state=42; total time=   0.0s
[CV] END learning_rate=0.001, max_depth=-1, n_estimators=10, random_state=42; total time=   0.0s
[CV] END learning_rate=0.001, max_depth=-1, n_estimators=100, random_state=42; total time=   0.0s
[CV] END learning_rate=0.001, max_depth=-1, n_estimators=100, random_state=42; total time=   0.0s
[CV] END learning_rate=0.001, max_depth=-1, n_estimators=100, random_state=42; total time=   0.0s
[CV] END learning_rate=0.001, max_depth=-1, n_estimators=100, random_state=42; total time=   0.0s
[CV] END learning_rate=0.001, max_depth=-1, n_estimators=100,

In [18]:
xgbr_param_grid = {
    'n_estimators' : [10, 100, 1000]
    , 'gamma' : [0.1, 0.01, 0.001]
    , 'max_depth' : [-1, 3, 5]
    , 'learning_rate' : [0.001, 0.01, 0.1]
    , 'random_state' : [42]
}

xgbr_grid = GridSearchCV(XGBRegressor(), xgbr_param_grid, refit=True, verbose=2)

xgbr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', xgbr_grid.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END gamma=0.1, learning_rate=0.001, max_depth=-1, n_estimators=10, random_state=42; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=-1, n_estimators=10, random_state=42; total time=   0.0s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=-1, n_estimators=10, random_state=42; total time=   0.0s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=-1, n_estimators=10, random_state=42; total time=   0.0s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=-1, n_estimators=10, random_state=42; total time=   0.0s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=-1, n_estimators=100, random_state=42; total time=   0.0s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=-1, n_estimators=100, random_state=42; total time=   0.0s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=-1, n_estimators=100, random_state=42; total time=   0.0s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=-1, n_estimators=100

In [19]:
# Light GBM

lgbr_param_grid = {
    'boosting_type' : ['gbdt', 'dart', 'rf', 'goss']
    , 'gamma' : [0.1, 0.01, 0.001]
    , 'max_depth' : [-1, 3, 5]
    , 'learning_rate' : [0.001, 0.01, 0.1, 1]
    , 'random_state' : [42]
}

lgbr_grid = GridSearchCV(lightgbm.LGBMRegressor(), lgbr_param_grid, refit=True, verbose=2)

lgbr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', lgbr_grid.best_params_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV] END boosting_type=gbdt, gamma=0.1, learning_rate=0.001, max_depth=-1, random_state=42; total time=   0.1s
[CV] END boosting_type=gbdt, gamma=0.1, learning_rate=0.001, max_depth=-1, random_state=42; total time=   0.1s
[CV] END boosting_type=gbdt, gamma=0.1, learning_rate=0.001, max_depth=-1, random_state=42; total time=   0.1s
[CV] END boosting_type=gbdt, gamma=0.1, learning_rate=0.001, max_depth=-1, random_state=42; total time=   0.1s
[CV] END boosting_type=gbdt, gamma=0.1, learning_rate=0.001, max_depth=-1, random_state=42; total time=   0.1s
[CV] END boosting_type=gbdt, gamma=0.1, learning_rate=0.001, max_depth=3, random_state=42; total time=   0.0s
[CV] END boosting_type=gbdt, gamma=0.1, learning_rate=0.001, max_depth=3, random_state=42; total time=   0.0s
[CV] END boosting_type=gbdt, gamma=0.1, learning_rate=0.001, max_depth=3, random_state=42; total time=   0.0s
[CV] END boosting_type=gbdt, gamma=0.1, learning_rat

In [20]:
# SGD regressor

sgdr_param_grid = {
    'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive']
    , 'alpha' : [0.00001, 0.0001, 0.001 ,0.01]
    , 'random_state' : [42]
}

sgdr_grid = GridSearchCV(linear_model.SGDRegressor(), sgdr_param_grid, refit=True, verbose=2)

sgdr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', sgdr_grid.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END alpha=1e-05, learning_rate=constant, random_state=42; total time=   0.0s
[CV] END alpha=1e-05, learning_rate=constant, random_state=42; total time=   0.0s
[CV] END alpha=1e-05, learning_rate=constant, random_state=42; total time=   0.0s
[CV] END alpha=1e-05, learning_rate=constant, random_state=42; total time=   0.0s
[CV] END alpha=1e-05, learning_rate=constant, random_state=42; total time=   0.0s
[CV] END alpha=1e-05, learning_rate=optimal, random_state=42; total time=   0.1s
[CV] END alpha=1e-05, learning_rate=optimal, random_state=42; total time=   0.1s
[CV] END alpha=1e-05, learning_rate=optimal, random_state=42; total time=   0.1s
[CV] END alpha=1e-05, learning_rate=optimal, random_state=42; total time=   0.1s
[CV] END alpha=1e-05, learning_rate=optimal, random_state=42; total time=   0.1s
[CV] END alpha=1e-05, learning_rate=invscaling, random_state=42; total time=   0.0s
[CV] END alpha=1e-05, learning_rate=invs

In [21]:
linr_param_grid = {
    'copy_X' : [True]
    , 'n_jobs' : [-1]
}

linr_grid = GridSearchCV(linear_model.LinearRegression(), linr_param_grid, refit=True, verbose=2)

linr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', linr_grid.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .............................copy_X=True, n_jobs=-1; total time=   0.0s
[CV] END .............................copy_X=True, n_jobs=-1; total time=   0.0s
[CV] END .............................copy_X=True, n_jobs=-1; total time=   0.0s
[CV] END .............................copy_X=True, n_jobs=-1; total time=   0.0s
[CV] END .............................copy_X=True, n_jobs=-1; total time=   0.0s
The best parameters are  {'copy_X': True, 'n_jobs': -1}
