# Prediction By Machine Learning

### Install

In [None]:
!pip install tqdm
!pip install lightgbm
!pip install xgboost
!pip install vecstack

In [33]:
!pip install scikit-optimize

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[K     |████████████████████████████████| 100 kB 3.7 MB/s 
[?25hCollecting pyaml>=16.9
  Downloading pyaml-21.10.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-21.10.1 scikit-optimize-0.9.0


### Import

In [9]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn import linear_model, svm, ensemble
from xgboost import XGBRegressor
import lightgbm

from vecstack import StackingTransformer

from collections import Counter

from tqdm import tqdm

In [34]:
from skopt import BayesSearchCV
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats
from skopt.space import Categorical, Integer

### Prepare the data

In [10]:
# Mount Google Drive inside the Colab VM so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')
# Switch to the project folder; relative paths used later (e.g. '../Data/...') resolve from here.
# NOTE(review): this absolute path is specific to one Drive layout — adjust it when running elsewhere.
%cd /content/drive/MyDrive/Colab Notebooks/220919/fifa_pay_prediction/Querkowe

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/220919/fifa_pay_prediction/Querkowe


In [11]:
# Load the labelled training set; the path is relative to the current working directory.
train_data = pd.read_csv('../Data/FIFA_train.csv')

In [12]:
# Preview the first rows to check the column names and value formats.
train_data.head()

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves,value
0,0,L. Messi,31,south america,2021,ST,left,5.0,94,94,4.0,110500000.0
1,3,De Gea,27,europe,2020,GK,right,4.0,91,93,1.0,72000000.0
2,7,L. Suárez,31,south america,2021,ST,right,5.0,91,91,3.0,80000000.0
3,8,Sergio Ramos,32,europe,2020,DF,right,4.0,91,91,3.0,51000000.0
4,9,J. Oblak,25,europe,2021,GK,right,3.0,90,93,1.0,68000000.0


In [13]:
# (rows, columns) of the raw training frame.
train_data.shape

(8932, 12)

In [20]:
# Convert the `contract_until` column to an integer year.
# Only the year (the trailing four characters) is extracted.
def func(string: object) -> int:
    """Extract the contract year from a `contract_until` value and return it as an int.

    The value may be a bare year (``"2021"``), a full date ending in the year
    (``"Jun 30, 2019"``), or an integer that has already been converted.
    Accepting integers keeps the conversion cell idempotent, so it can be
    re-run on a column that was converted before.

    Args:
        string: Raw cell value whose last four characters are the year.

    Returns:
        The four-digit year as an ``int``.

    Raises:
        ValueError: If the last four characters do not form an integer.
    """
    # str() tolerates ints; strip() keeps stray trailing whitespace from eating a digit.
    year_text = str(string).strip()[-4:]
    return int(year_text)


# Replace each raw `contract_until` entry with its integer year, element by element.
train_data['contract_until'] = train_data['contract_until'].map(func)

In [21]:
# Hold out part of the labelled data for evaluation (train_test_split's default test share).
# `id` and `name` identify a player and `value` is the prediction target, so none of them
# is passed on as a feature.
# A fixed random_state makes the split — and every score computed from it — reproducible
# after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    train_data.drop(columns=['id', 'name', 'value']),
    train_data['value'],
    random_state=42,
)

In [22]:
# Inspect a few rows of the training features returned by the split.
x_train.head()

Unnamed: 0,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves
4931,21,europe,2022,MF,right,1.0,66,71,3.0
7493,19,asia,2021,ST,left,1.0,60,70,2.0
8820,18,europe,2023,MF,right,1.0,52,70,2.0
8054,20,south america,2021,MF,right,1.0,58,66,2.0
5341,33,europe,2019,ST,right,1.0,66,66,2.0


In [23]:
# (rows, columns) of the held-out feature frame.
x_test.shape

(2233, 9)

#### Preprocessing Pipeline

In [25]:
# Numeric columns are standardised.
# `contract_until` is listed here on purpose: it is converted to an integer year before the
# split, but a column that appears in neither feature list is silently dropped by
# ColumnTransformer (its default is remainder='drop').
numeric_features = [
    'age',
    'reputation',
    'stat_overall',
    'stat_potential',
    'stat_skill_moves',
    'contract_until',
]
numeric_transformer = StandardScaler()  # RobustScaler is the alternative the author had in mind

# Categorical columns are one-hot encoded; categories unseen during fit are encoded as
# all zeros instead of raising at transform time.
categorical_features = ['continent', 'position', 'prefer_foot']
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore')

# Apply each transformer to its own column subset and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [26]:
# Wrap the column transformer in a single-step pipeline, fit it on the training
# features only, then encode those same training features.
preprocessor_pipe = Pipeline([('preprocessor', preprocessor)])
x_train_transformed = preprocessor_pipe.fit(x_train).transform(x_train)

### Grid Search

In [27]:
# Exhaustive search over C and gamma for an RBF-kernel SVR (5 x 5 = 25 candidates).
# NOTE(review): the target is in raw currency units (tens of millions in the preview),
# so C <= 1000 may be far too small for SVR to fit well — consider scaling the target.
svr_param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

svr_grid = GridSearchCV(
    estimator=svm.SVR(),
    param_grid=svr_param_grid,
    refit=True,  # retrain the best candidate on all of the training data
    verbose=2,
)
svr_grid.fit(x_train_transformed, y_train)

print('The best parameters are ', svr_grid.best_params_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.6s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.5s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.5s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.4s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   2.6s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   2.6s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   2.6s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   2.7s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   3.1s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   2.9s
[CV] END ......................C=0.1, gamma=0.0

In [35]:
# Tune only the regularisation strength C of a linear SVR (5 candidates).
lsvr_param_grid = dict(C=[0.1, 1, 10, 100, 1000])

lsvr_grid = GridSearchCV(
    estimator=svm.LinearSVR(),
    param_grid=lsvr_param_grid,
    refit=True,  # retrain the best candidate on all of the training data
    verbose=2,
)
lsvr_grid.fit(x_train_transformed, y_train)

print('The best parameters are ', lsvr_grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END ..............................................C=0.1; total time=   0.0s
[CV] END ..............................................C=0.1; total time=   0.0s
[CV] END ..............................................C=0.1; total time=   0.0s
[CV] END ..............................................C=0.1; total time=   0.0s
[CV] END ..............................................C=0.1; total time=   0.0s
[CV] END ................................................C=1; total time=   0.0s
[CV] END ................................................C=1; total time=   0.0s
[CV] END ................................................C=1; total time=   0.0s
[CV] END ................................................C=1; total time=   0.0s
[CV] END ................................................C=1; total time=   0.0s
[CV] END ...............................................C=10; total time=   0.0s
[CV] END ........................................

In [29]:
# Grid over forest size and tree depth for ExtraTrees (4 x 4 = 16 candidates).
# `n_jobs` is a single-value entry: it is not tuned, only forwarded to every candidate,
# and stays in the grid so `best_params_` keeps the same keys as before.
xtr_param_grid = {
    'n_estimators': [100, 200, 500, 1000],
    'n_jobs': [-1],
    'max_depth': [3, 4, 5, 8],
}

# ExtraTrees draws random split thresholds; without a fixed seed the CV scores and the
# selected parameters can differ from run to run.
xtr_grid = GridSearchCV(
    ensemble.ExtraTreesRegressor(random_state=42),
    xtr_param_grid,
    refit=True,  # retrain the best candidate on all of the training data
    verbose=2,
)

xtr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', xtr_grid.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ...........max_depth=3, n_estimators=100, n_jobs=-1; total time=   1.7s
[CV] END ...........max_depth=3, n_estimators=100, n_jobs=-1; total time=   0.4s
[CV] END ...........max_depth=3, n_estimators=100, n_jobs=-1; total time=   0.4s
[CV] END ...........max_depth=3, n_estimators=100, n_jobs=-1; total time=   0.5s
[CV] END ...........max_depth=3, n_estimators=100, n_jobs=-1; total time=   0.4s
[CV] END ...........max_depth=3, n_estimators=200, n_jobs=-1; total time=   0.7s
[CV] END ...........max_depth=3, n_estimators=200, n_jobs=-1; total time=   0.7s
[CV] END ...........max_depth=3, n_estimators=200, n_jobs=-1; total time=   0.7s
[CV] END ...........max_depth=3, n_estimators=200, n_jobs=-1; total time=   0.7s
[CV] END ...........max_depth=3, n_estimators=200, n_jobs=-1; total time=   0.7s
[CV] END ...........max_depth=3, n_estimators=500, n_jobs=-1; total time=   1.6s
[CV] END ...........max_depth=3, n_estimators=50

In [30]:
# Grid over forest size and tree depth for RandomForest (4 x 4 = 16 candidates).
# `n_jobs` is a single-value entry: it is not tuned, only forwarded to every candidate,
# and stays in the grid so `best_params_` keeps the same keys as before.
rfr_param_grid = {
    'n_estimators': [100, 200, 500, 1000],
    'n_jobs': [-1],
    'max_depth': [3, 4, 5, 8],
}

# Random forests bootstrap the rows for every tree; without a fixed seed the CV scores
# and the selected parameters can differ from run to run.
rfr_grid = GridSearchCV(
    ensemble.RandomForestRegressor(random_state=42),
    rfr_param_grid,
    refit=True,  # retrain the best candidate on all of the training data
    verbose=2,
)

rfr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', rfr_grid.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ...........max_depth=3, n_estimators=100, n_jobs=-1; total time=   0.5s
[CV] END ...........max_depth=3, n_estimators=100, n_jobs=-1; total time=   0.5s
[CV] END ...........max_depth=3, n_estimators=100, n_jobs=-1; total time=   0.5s
[CV] END ...........max_depth=3, n_estimators=100, n_jobs=-1; total time=   0.5s
[CV] END ...........max_depth=3, n_estimators=100, n_jobs=-1; total time=   0.5s
[CV] END ...........max_depth=3, n_estimators=200, n_jobs=-1; total time=   0.8s
[CV] END ...........max_depth=3, n_estimators=200, n_jobs=-1; total time=   0.9s
[CV] END ...........max_depth=3, n_estimators=200, n_jobs=-1; total time=   0.9s
[CV] END ...........max_depth=3, n_estimators=200, n_jobs=-1; total time=   0.9s
[CV] END ...........max_depth=3, n_estimators=200, n_jobs=-1; total time=   0.8s
[CV] END ...........max_depth=3, n_estimators=500, n_jobs=-1; total time=   1.9s
[CV] END ...........max_depth=3, n_estimators=50

In [31]:
# Grid over ensemble size, tree depth and shrinkage for gradient boosting
# (3 x 3 x 4 = 36 candidates).
gbr_param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[3, 4, 5],
    learning_rate=[0.001, 0.01, 0.05, 0.1],
)

gbr_grid = GridSearchCV(
    estimator=ensemble.GradientBoostingRegressor(),
    param_grid=gbr_param_grid,
    refit=True,  # retrain the best candidate on all of the training data
    verbose=2,
)
gbr_grid.fit(x_train_transformed, y_train)

print('The best parameters are ', gbr_grid.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END .learning_rate=0.001, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END .learning_rate=0.001, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END .learning_rate=0.001, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END .learning_rate=0.001, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END .learning_rate=0.001, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END .learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.8s
[CV] END .learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.8s
[CV] END .learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.8s
[CV] END .learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.8s
[CV] END .learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.8s
[CV] END .learning_rate=0.001, max_depth=3, n_estimators=500; total time=   2.1s
[CV] END .learning_rate=0.001, max_depth=3, n_e

In [32]:
# Grid over ensemble size, minimum split loss (gamma), tree depth and learning rate
# for XGBoost (3 x 3 x 3 x 4 = 108 candidates).
xgbr_param_grid = dict(
    n_estimators=[200, 500, 1000],
    gamma=[0.1, 0.01, 0.001],
    max_depth=[3, 4, 5],
    learning_rate=[0.001, 0.01, 0.05, 0.1],
)

xgbr_grid = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=xgbr_param_grid,
    refit=True,  # retrain the best candidate on all of the training data
    verbose=2,
)
xgbr_grid.fit(x_train_transformed, y_train)

print('The best parameters are ', xgbr_grid.best_params_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.8s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.6s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.6s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.6s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.6s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=500; total time=   1.4s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=500; total time=   1.4s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=500; total time=   1.5s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=500; total time=   1.4s
[CV] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=500; total time=   1.4s
[CV] END gamma=0.1, learnin

In [None]:
# Grid over ensemble size, minimum split gain, tree depth and learning rate for LightGBM
# (3 x 3 x 3 x 4 = 108 candidates).
# `gamma` is XGBoost's name for the minimum loss reduction required to split and is not a
# LightGBM parameter; LGBMRegressor exposes the same idea as `min_split_gain`. Searching
# over `gamma` would triple the number of fits without changing any fitted model.
lgbr_param_grid = {
    'n_estimators': [200, 500, 1000],
    'min_split_gain': [0.1, 0.01, 0.001],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
}

lgbr_grid = GridSearchCV(
    lightgbm.LGBMRegressor(),
    lgbr_param_grid,
    refit=True,  # retrain the best candidate on all of the training data
    verbose=2,
)

lgbr_grid.fit(x_train_transformed, y_train)
print('The best parameters are ', lgbr_grid.best_params_)