### Install

In [3]:
!pip install tqdm
!pip install lightgbm
!pip install xgboost
!pip install vecstack

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Import

In [4]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn import linear_model, svm, ensemble
from xgboost import XGBRegressor
import lightgbm

from vecstack import StackingTransformer

from collections import Counter

from tqdm import tqdm

### Prepare the data

In [5]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/220919/fifa_pay_prediction/03_work_code/Querkowe

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/220919/fifa_pay_prediction/03_work_code/Querkowe


In [6]:
train_data = pd.read_csv('../../00_original_data/FIFA_train.csv')

In [7]:
#contract_until 변수 int 형으로 변환
# 계약 연도만 추출
def func(string:object) -> int:
    """계약 연도만 추출하여 int로 반환"""
    string = string[-4:]
    return int(string)


train_data['contract_until'] = train_data['contract_until'].apply(func)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(train_data[train_data.keys().drop(['id', 'name', 'value'])], train_data['value'], random_state=42)

#### Pipeline Preprocess

In [9]:
numeric_features = ['age', 'reputation', 'stat_overall', 'stat_potential', 'stat_skill_moves']
numeric_transformer = StandardScaler() # RobustScaler

categorical_features = ['continent', 'position', 'prefer_foot']
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore') 

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [10]:
preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

preprocessor_pipe.fit(x_train)

x_train_transformed = preprocessor_pipe.transform(x_train)

In [12]:
x_test_transformed = preprocessor_pipe.transform(x_test)

### Build & Learn Model

#### SVM

- Grid search : {'C': 1000, 'gamma': 0.1, 'kernel': 'rbf'}
- Randomized search : {'C': 417.8148444007955, 'gamma': 0.3547275579413753, 'kernel': 'rbf'}
- Bayesian search : [('C', 1000.0), ('gamma', 0.0512015475365109), ('kernel', 'rbf')]

In [11]:
svr_g = svm.SVR(C=1000, gamma=0.1, kernel='rbf').fit(x_train_transformed, y_train)
svr_r = svm.SVR(C=417.8148444007955, gamma=0.3547275579413753, kernel='rbf').fit(x_train_transformed, y_train)
svr_b = svm.SVR(C=1000, gamma=0.0512015475365109, kernel='rbf').fit(x_train_transformed, y_train)

In [18]:
print('svm regressor with grid search :', mean_squared_error(y_test, svr_g.predict(x_test_transformed))**.5)
print('svm regressor with rand search :', mean_squared_error(y_test, svr_r.predict(x_test_transformed))**.5)
print('svm regressor with baye search :', mean_squared_error(y_test, svr_b.predict(x_test_transformed))**.5)

svm regressor with grid search 6657087.995002561
svm regressor with rand search 6799938.573693998
svm regressor with baye search 6622588.779257085


#### Linear SVM

- Grid search : {'C': 1000, 'random_state': 42}
- Randomized search : {'C': 524.8859871129727, 'random_state': 42}
- Bayesian search : [('C', 999.8265739695903), ('random_state', 42)]

In [19]:
lsvr_g = svm.LinearSVR(C=1000, random_state=42).fit(x_train_transformed, y_train)
lsvr_r = svm.LinearSVR(C=524.8859871129727, random_state=42).fit(x_train_transformed, y_train)
lsvr_b = svm.LinearSVR(C=999.8265739695903, random_state=42).fit(x_train_transformed, y_train)

In [20]:
print('lsvm regressor with grid search :', mean_squared_error(y_test, lsvr_g.predict(x_test_transformed))**.5)
print('lsvm regressor with rand search :', mean_squared_error(y_test, lsvr_r.predict(x_test_transformed))**.5)
print('lsvm regressor with baye search :', mean_squared_error(y_test, lsvr_b.predict(x_test_transformed))**.5)

lsvm regressor with grid search : 5511990.828885629
lsvm regressor with rand search : 5822254.725812118
lsvm regressor with baye search : 5512656.161270678


#### Extra Trees

- Grid search : {'max_depth': 5, 'n_estimators': 1000, 'n_jobs': -1, 'random_state': 42} 
- Randomized search : {'max_depth': 4, 'n_estimators': 538, 'n_jobs': -1, 'random_state': 42}
- Bayesian search : [('max_depth', 8), ('n_estimators', 724), ('n_jobs', -1), ('random_state', 42)]

In [21]:
xtr_g_param = {'max_depth': 5, 'n_estimators': 1000, 'n_jobs': -1, 'random_state': 42}
xtr_r_param = {'max_depth': 4, 'n_estimators': 538, 'n_jobs': -1, 'random_state': 42}
xtr_b_param = {'max_depth': 8, 'n_estimators': 724, 'n_jobs': -1, 'random_state': 42}

In [23]:
xtr_g = ensemble.ExtraTreesRegressor(**xtr_g_param).fit(x_train_transformed, y_train)
xtr_r = ensemble.ExtraTreesRegressor(**xtr_r_param).fit(x_train_transformed, y_train)
xtr_b = ensemble.ExtraTreesRegressor(**xtr_b_param).fit(x_train_transformed, y_train)

In [25]:
print('extra trees regressor with grid search :', mean_squared_error(y_test, xtr_g.predict(x_test_transformed))**.5)
print('extra trees regressor with rand search :', mean_squared_error(y_test, xtr_r.predict(x_test_transformed))**.5)
print('extra trees regressor with baye search :', mean_squared_error(y_test, xtr_b.predict(x_test_transformed))**.5)

extra trees regressor with grid search : 1822018.0061618283
extra trees regressor with rand search : 2161332.733954486
extra trees regressor with baye search : 1296636.8223825248


#### Random Forest

- Grid search : {'max_depth': 5, 'n_estimators': 1000, 'n_jobs': -1, 'random_state': 42}
- Randomized search : {'max_depth': 4, 'n_estimators': 443, 'n_jobs': -1, 'random_state': 42}
- Bayesian search : [('max_depth', 8), ('n_estimators', 20), ('n_jobs', -1), ('random_state', 42)]

In [26]:
rfr_g_param = {'max_depth': 5, 'n_estimators': 1000, 'n_jobs': -1, 'random_state': 42}
rfr_r_param = {'max_depth': 4, 'n_estimators': 443, 'n_jobs': -1, 'random_state': 42}
rfr_b_param = {'max_depth': 8, 'n_estimators': 20, 'n_jobs': -1, 'random_state': 42}

In [27]:
rfr_g = ensemble.ExtraTreesRegressor(**rfr_g_param).fit(x_train_transformed, y_train)
rfr_r = ensemble.ExtraTreesRegressor(**rfr_r_param).fit(x_train_transformed, y_train)
rfr_b = ensemble.ExtraTreesRegressor(**rfr_b_param).fit(x_train_transformed, y_train)

In [29]:
print('random forest regressor with grid search :', mean_squared_error(y_test, rfr_g.predict(x_test_transformed))**.5)
print('random forest regressor with rand search :', mean_squared_error(y_test, rfr_r.predict(x_test_transformed))**.5)
print('random forest regressor with baye search :', mean_squared_error(y_test, rfr_b.predict(x_test_transformed))**.5)

random forest regressor with grid search : 1822018.0061618283
random forest regressor with rand search : 2165748.074051069
random forest regressor with baye search : 1336860.190827881


#### Gradient Boost

- Grid search : {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000, 'random_state': 42}
- Randomized search : {'learning_rate': 0.049577508855568365, 'max_depth': 3, 'n_estimators': 925, 'random_state': 42}
- Bayesian search : [('learning_rate', 0.06344498415616949), ('max_depth', 5), ('n_estimators', 129), ('random_state', 42)]

In [30]:
gbr_g_param = {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000, 'random_state': 42}
gbr_r_param = {'learning_rate': 0.049577508855568365, 'max_depth': 3, 'n_estimators': 925, 'random_state': 42}
gbr_b_param = {'learning_rate': 0.06344498415616949, 'max_depth': 5, 'n_estimators': 129, 'random_state': 42}

In [32]:
gbr_g = ensemble.GradientBoostingRegressor(**gbr_g_param).fit(x_train_transformed, y_train)
gbr_r = ensemble.GradientBoostingRegressor(**gbr_r_param).fit(x_train_transformed, y_train)
gbr_b = ensemble.GradientBoostingRegressor(**gbr_b_param).fit(x_train_transformed, y_train)

In [33]:
print('gradient boost regressor with grid search :', mean_squared_error(y_test, gbr_g.predict(x_test_transformed))**.5)
print('gradient boost regressor with rand search :', mean_squared_error(y_test, gbr_r.predict(x_test_transformed))**.5)
print('gradient boost regressor with baye search :', mean_squared_error(y_test, gbr_b.predict(x_test_transformed))**.5)

gradient boost regressor with grid search : 805099.9780213628
gradient boost regressor with rand search : 854630.5583619052
gradient boost regressor with baye search : 976341.6441060889


#### Extra Gradient Boost

- Grid search : {'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000, 'random_state': 42}
- Randomized search : {'gamma': 0.04586484324176051, 'learning_rate': 0.06867462147439204, 'max_depth': 4, 'n_estimators': 628, 'random_state': 42}
- Bayesian search : [('gamma', 0.001), ('learning_rate', 0.06795626895549607), ('max_depth', 3), ('n_estimators', 632), ('random_state', 42)]

In [34]:
xgbr_g_param = {'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000, 'random_state': 42}
xgbr_r_param = {'gamma': 0.04586484324176051, 'learning_rate': 0.06867462147439204, 'max_depth': 4, 'n_estimators': 628, 'random_state': 42}
xgbr_b_param = {'gamma': 0.001, 'learning_rate':  0.06795626895549607, 'max_depth': 3, 'n_estimators': 632, 'random_state': 42}

In [36]:
xgbr_g = XGBRegressor(**xgbr_g_param).fit(x_train_transformed, y_train)
xgbr_r = XGBRegressor(**xgbr_r_param).fit(x_train_transformed, y_train)
xgbr_b = XGBRegressor(**xgbr_b_param).fit(x_train_transformed, y_train)



In [37]:
print('extra gradient boost regressor with grid search :', mean_squared_error(y_test, xgbr_g.predict(x_test_transformed))**.5)
print('extra gradient boost regressor with rand search :', mean_squared_error(y_test, xgbr_r.predict(x_test_transformed))**.5)
print('extra gradient boost regressor with baye search :', mean_squared_error(y_test, xgbr_b.predict(x_test_transformed))**.5)

extra gradient boost regressor with grid search : 822431.2926722522
extra gradient boost regressor with rand search : 878991.4366171896
extra gradient boost regressor with baye search : 884630.6329353064


#### Light GBM

- Grid search : {'boosting_type': 'gbdt', 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'random_state': 42}
- Randomized search : {'boosting_type': 'gbdt', 'gamma': 0.033173639397717856, 'learning_rate': 0.45702046798546164, 'max_depth': 4, 'random_state': 42}
- Bayesian search : [('boosting_type', 'gbdt'), ('gamma', 0.018521662961496488), ('learning_rate', 0.34556810356817463), ('max_depth', 6), ('random_state', 42)]

In [41]:
lgbr_g_param = {'boosting_type': 'gbdt', 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'random_state': 42}
lgbr_r_param = {'boosting_type': 'gbdt', 'gamma': 0.033173639397717856, 'learning_rate': 0.45702046798546164, 'max_depth': 4, 'random_state': 42}
lgbr_b_param = {'boosting_type': 'gbdt', 'gamma': 0.018521662961496488, 'learning_rate': 0.34556810356817463, 'max_depth': 6, 'random_state': 42}

In [44]:
lgbr_g = lightgbm.LGBMRegressor(**lgbr_g_param).fit(x_train_transformed, y_train)
lgbr_r = lightgbm.LGBMRegressor(**lgbr_r_param).fit(x_train_transformed, y_train)
lgbr_b = lightgbm.LGBMRegressor(**lgbr_b_param).fit(x_train_transformed, y_train)

In [45]:
print('light gbm regressor with grid search :', mean_squared_error(y_test, lgbr_g.predict(x_test_transformed))**.5)
print('light gbm regressor with rand search :', mean_squared_error(y_test, lgbr_r.predict(x_test_transformed))**.5)
print('light gbm regressor with baye search :', mean_squared_error(y_test, lgbr_b.predict(x_test_transformed))**.5)

light gbm regressor with grid search : 1415702.9198474837
light gbm regressor with rand search : 1212245.7860615046
light gbm regressor with baye search : 1254460.5394929934


#### SGD Regressor

- Grid search : {'alpha': 0.001, 'learning_rate': 'adaptive', 'random_state': 42}
- Randomized search : {'alpha': 0.003974513629048665, 'learning_rate': 'adaptive', 'random_state': 42}
- Bayesian search : [('alpha', 0.00021345511640311532), ('learning_rate', 'adaptive'), ('random_state', 42)]

In [46]:
sgdr_g_param = {'alpha': 0.001, 'learning_rate': 'adaptive', 'random_state': 42}
sgdr_r_param = {'alpha': 0.003974513629048665, 'learning_rate': 'adaptive', 'random_state': 42}
sgdr_b_param = {'alpha': 0.00021345511640311532, 'learning_rate': 'adaptive', 'random_state': 42}

In [47]:
sgdr_g = linear_model.SGDRegressor(**sgdr_g_param).fit(x_train_transformed, y_train)
sgdr_r = linear_model.SGDRegressor(**sgdr_r_param).fit(x_train_transformed, y_train)
sgdr_b = linear_model.SGDRegressor(**sgdr_b_param).fit(x_train_transformed, y_train)

In [48]:
print('SGD regressor with grid search :', mean_squared_error(y_test, sgdr_g.predict(x_test_transformed))**.5)
print('SGD regressor with rand search :', mean_squared_error(y_test, sgdr_r.predict(x_test_transformed))**.5)
print('SGD regressor with baye search :', mean_squared_error(y_test, sgdr_b.predict(x_test_transformed))**.5)

SGD regressor with grid search : 4011425.5227940665
SGD regressor with rand search : 4012793.499460694
SGD regressor with baye search : 4011095.7429131125


#### Linear Regressor

In [51]:
linear = linear_model.LinearRegression(n_jobs=-1).fit(x_train_transformed, y_train)

In [52]:
print('linear regression :', mean_squared_error(y_test, linear.predict(x_test_transformed))**.5)

linear regression : 4011095.5641239663
