# I. Import libraries and load data



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

In [None]:
# Location of the preprocessed dataset on the mounted Google Drive.
csv_path = "/content/drive/MyDrive/Colab Notebooks/machine learning project/processed data.csv"
# Read the CSV and discard the 'Unnamed: 0' column in one chain.
data = pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,MP,Min,Goals,Assists,CrdY,CrdR,Comp,Shots,Acceleration,Aggression,...,Vision,Volleys,age,name,overall,potential,preferred_foot,position common,market value,wage value
0,34,2983,0.06,0.0,0.15,0.03,Ligue 1,0.54,37,81,...,45,24,33,Yunis Abdelhamid,76,76,Left,defender,3600000.0,23000.0
1,31,2462,0.04,0.0,0.44,0.11,Ligue 1,0.66,50,79,...,65,38,21,Salis Abdul Samed,70,75,Right,midfielder,2200000.0,7000.0
2,34,2956,0.0,0.06,0.27,0.0,Ligue 1,0.91,77,76,...,66,37,28,Laurent Abergel,75,75,Right,midfielder,4900000.0,18000.0
3,24,726,0.0,0.12,0.37,0.0,Bundesliga,2.22,81,39,...,44,53,22,Dickson Abiama,68,76,Right,striker,2700000.0,9000.0
4,30,2536,0.14,0.0,0.07,0.04,Serie A,0.57,64,77,...,65,39,33,Francesco Acerbi,83,83,Left,defender,17500000.0,75000.0


# II. Convert categorical data to numeric data and remove columns that are not used to build the model

In [None]:
# Comp = name of league.
# Encode each league as an integer: leagues are taken in order of first
# appearance in the column and paired position-by-position with `league_codes`.
league_names = data['Comp'].unique()
league_codes = [1, 3, 2, 4, 5]
# Fail loudly if the number of distinct leagues does not match the number of
# codes, rather than leaving some rows unmapped.
if len(league_names) != len(league_codes):
    raise ValueError(
        f"Expected {len(league_codes)} distinct leagues in 'Comp', found {len(league_names)}"
    )
# Assign the mapped column back explicitly. Calling
# data['Comp'].replace(..., inplace=True) acts on an intermediate Series:
# pandas 2.x warns about this pattern and with Copy-on-Write enabled it does
# not modify `data` at all.
data['Comp'] = data['Comp'].map(dict(zip(league_names, league_codes)))
# Drop the name and preferred_foot columns.
data = data.drop(columns=['name', 'preferred_foot'])

In [None]:
# One-hot encode the remaining categorical columns, then replace
# 'market value' with its natural logarithm (column position is kept).
data = pd.get_dummies(data).assign(
    **{'market value': lambda frame: np.log(frame['market value'])}
)
data.head()

Unnamed: 0,MP,Min,Goals,Assists,CrdY,CrdR,Comp,Shots,Acceleration,Aggression,...,Volleys,age,overall,potential,market value,wage value,position common_defender,position common_goalkeeper,position common_midfielder,position common_striker
0,34,2983,0.06,0.0,0.15,0.03,1,0.54,37,81,...,24,33,76,76,15.096444,23000.0,1,0,0,0
1,31,2462,0.04,0.0,0.44,0.11,1,0.66,50,79,...,38,21,70,75,14.603968,7000.0,0,0,1,0
2,34,2956,0.0,0.06,0.27,0.0,1,0.91,77,76,...,37,28,75,75,15.404746,18000.0,0,0,1,0
3,24,726,0.0,0.12,0.37,0.0,3,2.22,81,39,...,53,22,68,76,14.808762,9000.0,0,0,0,1
4,30,2536,0.14,0.0,0.07,0.04,2,0.57,64,77,...,39,33,83,83,16.677711,75000.0,1,0,0,0


# III. Split into training and test sets



In [None]:
# Work on a copy so the encoded frame `data` stays untouched.
data_build = data.copy()
target_column = 'market value'
# Target: the 'market value' column; features: every other column.
y = data_build[target_column]
x = data_build.drop(columns=[target_column])
# Hold out 33% of the rows for testing, with a fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=3,
)

# IV. Ensemble models

## 1. Adaboost

In [None]:
# Weak learner for boosting: a regression tree limited to depth 13, fixed seed.
DCT = DecisionTreeRegressor(random_state=0, max_depth=13)
# The tree is passed positionally on purpose: the keyword for this argument was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
# keyword was removed in 1.4, so `base_estimator=` raises TypeError on current
# releases. The first positional parameter is the weak learner in both old and
# new versions.
regressor = AdaBoostRegressor(DCT)
# Search over the number of boosting rounds (1, 4, ..., 97) with a fixed seed.
params = {
    'n_estimators': range(1, 100, 3),
    'random_state': [3],
}
scoring_fnc = 'neg_mean_squared_error'
# 5-fold cross-validated grid search scored by negative mean squared error.
grid = GridSearchCV(estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=5, verbose=1)
grid.fit(x_train, y_train)

Fitting 5 folds for each of 33 candidates, totalling 165 fits


GridSearchCV(cv=5,
             estimator=AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=13,
                                                                              random_state=0)),
             param_grid={'n_estimators': range(1, 100, 3), 'random_state': [3]},
             scoring='neg_mean_squared_error', verbose=1)

In [None]:
# Show every parameter (nested ones included) of the best estimator found by the grid search.
grid.best_estimator_.get_params(deep=True)

{'base_estimator': DecisionTreeRegressor(max_depth=13, random_state=0),
 'base_estimator__ccp_alpha': 0.0,
 'base_estimator__criterion': 'squared_error',
 'base_estimator__max_depth': 13,
 'base_estimator__max_features': None,
 'base_estimator__max_leaf_nodes': None,
 'base_estimator__min_impurity_decrease': 0.0,
 'base_estimator__min_samples_leaf': 1,
 'base_estimator__min_samples_split': 2,
 'base_estimator__min_weight_fraction_leaf': 0.0,
 'base_estimator__random_state': 0,
 'base_estimator__splitter': 'best',
 'learning_rate': 1.0,
 'loss': 'linear',
 'n_estimators': 94,
 'random_state': 3}

In [None]:
# Train the final AdaBoost model with the hyperparameters chosen by the grid
# search (n_estimators = 94) and evaluate it on the held-out test set.
# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# and the positional form works on both.
model = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=13, random_state=0),
    n_estimators=94,
    learning_rate=1.0,
    random_state=3,
)
model.fit(x_train, y_train)
y_pre = model.predict(x_test)
# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric,
# but R^2 is not: it normalises by the variance of its first argument, so the
# true targets must come first.
print('Root mean squared', mean_squared_error(y_test, y_pre) ** (1 / 2))
print('R2-scored ', r2_score(y_test, y_pre))

Root mean squared 0.12748218988490542
R2-scored  0.9894280792870761


## 2. XGBoost

In [None]:
# Base XGBoost regressor with library defaults; the grid below supplies the tuned values.
regressor = XGBRegressor()

# Hyperparameter grid: 4 * 10 * 3 * 4 = 480 candidate combinations.
params = {
    'n_estimators': [500, 1000, 2000, 6000],
    # 0.01, 0.03, ..., 0.19
    'learning_rate': [pct / 100 for pct in range(1, 21, 2)],
    # 0.0, 0.2, 0.4
    'gamma': [fifth / 5 for fifth in range(0, 3, 1)],
    # 3, 5, 7, 9
    'max_depth': list(range(3, 11, 2)),
}
scoring_fnc = 'neg_mean_squared_error'
# 5-fold cross-validated exhaustive search, scored by negative MSE.
grid = GridSearchCV(
    estimator=regressor,
    param_grid=params,
    scoring=scoring_fnc,
    cv=5,
    verbose=5,
)
grid.fit(x_train, y_train)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits
[CV 1/5] END gamma=0.0, learning_rate=0.01, max_depth=3, n_estimators=500;, score=-0.026 total time=   1.1s
[CV 2/5] END gamma=0.0, learning_rate=0.01, max_depth=3, n_estimators=500;, score=-0.025 total time=   1.0s
[CV 3/5] END gamma=0.0, learning_rate=0.01, max_depth=3, n_estimators=500;, score=-0.027 total time=   1.0s
[CV 4/5] END gamma=0.0, learning_rate=0.01, max_depth=3, n_estimators=500;, score=-0.022 total time=   1.0s
[CV 5/5] END gamma=0.0, learning_rate=0.01, max_depth=3, n_estimators=500;, score=-0.030 total time=   1.0s
[CV 1/5] END gamma=0.0, learning_rate=0.01, max_depth=3, n_estimators=1000;, score=-0.011 total time=   1.9s
[CV 2/5] END gamma=0.0, learning_rate=0.01, max_depth=3, n_estimators=1000;, score=-0.009 total time=   2.0s
[CV 3/5] END gamma=0.0, learning_rate=0.01, max_depth=3, n_estimators=1000;, score=-0.009 total time=   1.9s
[CV 4/5] END gamma=0.0, learning_rate=0.01, max_depth=3, n_estimators

GridSearchCV(cv=5, estimator=XGBRegressor(),
             param_grid={'gamma': [0.0, 0.2, 0.4],
                         'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.09, 0.11,
                                           0.13, 0.15, 0.17, 0.19],
                         'max_depth': [3, 5, 7, 9],
                         'n_estimators': [500, 1000, 2000, 6000]},
             scoring='neg_mean_squared_error', verbose=5)

In [None]:
# Show every parameter of the best XGBoost estimator found by the grid search.
grid.best_estimator_.get_params(deep=True)

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0.0,
 'importance_type': 'gain',
 'learning_rate': 0.03,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 2000,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'reg:linear',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 1,
 'verbosity': 1}

In [None]:
# Train the final XGBoost model with the hyperparameters chosen by the grid
# search and evaluate it on the held-out test set.
model = XGBRegressor(n_estimators=2000, learning_rate=0.03, gamma=0.0, max_depth=3)
model.fit(x_train, y_train)
y_pre = model.predict(x_test)
# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric,
# but R^2 is not: it normalises by the variance of its first argument, so the
# true targets must come first.
print('Root mean squared', mean_squared_error(y_test, y_pre) ** (1 / 2))
print('R2-scored ', r2_score(y_test, y_pre))

Root mean squared 0.07013753747205126
R2-scored  0.9968735027541525
