# Hyperparameters tuning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

In [2]:
seed=42
kf=4

In [3]:
XY_train=pd.read_excel('X10Y10_CONC.xlsx')

In [4]:
LassoFeatToDrop =['Student_Internet', 'Student_ActiveWorking', 'Student_Parish',
       'FTH_Nation_CHN', 'FTH_Nation_EEUR', 'FTH_Nation_OTHERS',
       'FTH_Nation_RICH', 'SES_STDRESP_ProfClass_UnivII',
       'SES_STDRESP_ProfClass_Unknown_NoProfession',
       'SES_FATH_ProfClass_Unknown_NoProfession', 'SES_MOTH_ProfClass_BasicI',
       'SES_MOTH_ProfClass_UnivI', 'SES_MOTH_ProfClass_Unknown_NoProfession',
       'SES_STDRESP_JobSit_HomeAffairs', 'SES_STDRESP_JobSit_Other',
       'SES_STDRESP_JobSit_Retired', 'SES_STDRESP_JobSit_Student',
       'SES_STDRESP_JobSit_Unemployed', 'SES_STDRESP_JobSit_Unknown',
       'SES_FATH_JobSit_Employer', 'SES_FATH_JobSit_HomeAffairs',
       'SES_FATH_JobSit_Other', 'SES_FATH_JobSit_Retired',
       'SES_FATH_JobSit_Student', 'SES_FATH_JobSit_Unknown',
       'SES_MOTH_JobSit_HomeAffairs', 'SES_MOTH_JobSit_Other',
       'SES_MOTH_JobSit_SelfEmployed', 'SES_MOTH_JobSit_Unemployed',
       'SES_MOTH_JobSit_Unknown', 'SES_STDRESP_AcadEduc_Basic_II',
       'SES_STDRESP_AcadEduc_Basic_III', 'SES_STDRESP_AcadEduc_PostGraduation',
       'SES_STDRESP_AcadEduc_Unknown', 'SES_FATH_AcadEduc_Bachelor',
       'SES_FATH_AcadEduc_NoFormalEducation', 'SES_FATH_AcadEduc_Other',
       'SES_FATH_AcadEduc_PostGraduation', 'SES_FATH_AcadEduc_Unknown',
       'SES_MOTH_AcadEduc_Bachelor', 'SES_MOTH_AcadEduc_NoFormalEducation',
       'SES_MOTH_AcadEduc_Other', 'SES_MOTH_AcadEduc_Unknown',
       'SES%_Parish_IliteracyRate', 'SES%_Parish_PostSecondarySchoolingRate',
       'SES%_Parish_PrimarySector', 'Tch_FixedTermStaff',
       'Tch_PedagogicZoneNoDefinitivePermanentStaff',
       'Tch_SchoolClusterNoDefinitivePermanentStaff',
       'Tch_SchoolNoDefinitivePermanentStaff', 'Tch_AcadEduc_Phd_Master',
       'Teacher_TemporaryReplacement', 'Teacher_EducationSupport',
       'Teacher_Age', 'Teacher_TeachingDedicatedTime',
       'Teacher_NoTeachingDedicatedTime']

In [5]:
XY_train=XY_train.drop(LassoFeatToDrop, axis=1)

In [6]:
XY_train=XY_train.drop(['Unnamed: 0', 'AcYear_11', 'AcYear_12'], axis=1)

In [7]:
X_train=XY_train.iloc[:,:64]
Y_train=XY_train.iloc[:,-1]

In [8]:
Y_train=Y_train.to_numpy()

In [9]:
# standardization
scaler=StandardScaler()

## Search Grid CV: XGBoost

In [10]:
xgb_rgr = xgb.XGBRegressor(max_depth=4, learning_rate=0.05, n_estimators=50,
                           verbosity=1, objective='reg:squarederror',
                           booster='gbtree', n_jobs=-1, gamma=0,
                           min_child_weight=1, max_delta_step=0, subsample=0.6,
                           colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,
                           reg_alpha=0, reg_lambda=1.5, scale_pos_weight=1, base_score=0.5,
                           random_state=seed, seed=None, missing=None, importance_type='gain')


In [15]:
# A+B

max_depth= range(10, 55, 5) # [30, 35, 40, 45, 50]

d = len(X_train)
cw= np.arange(0.001, 0.051, 0.001) # [0, 0.00005, 0.0001,0.00015, 0.0002]


min_child_weight=np.round(d*np.array(cw),0) # [3.0] 

subsample = [i/10. for i in range(1,11)] #[6,7,8,9] [1.0] 

colsample_bytree = [i/10. for i in range(1,11)] #  [0.7] 

# C+D
# C
learning_rate= np.arange(0.01, 1.005, 0.005) # [.3, .2, .1, .05, .01, .005]# eta
# D

reg_lambda=[0, 0.1, 0.2, 0.4, 0.6, 0.8, 1, 2, 4, 6, 8, 10, 12, 14, 16, 18 ,20]



params_xgb = {"xgb__learning_rate": learning_rate,
              "xgb__reg_lambda": reg_lambda,
              "xgb__max_depth": max_depth,
              "xgb__min_child_weight": min_child_weight,
              "xgb__subsample" : subsample,
              "xgb__colsample_bytree":colsample_bytree}

In [16]:
min_child_weight

array([  33.,   65.,   98.,  131.,  164.,  196.,  229.,  262.,  294.,
        327.,  360.,  392.,  425.,  458.,  491.,  523.,  556.,  589.,
        621.,  654.,  687.,  720.,  752.,  785.,  818.,  850.,  883.,
        916.,  948.,  981., 1014., 1047., 1079., 1112., 1145., 1177.,
       1210., 1243., 1276., 1308., 1341., 1374., 1406., 1439., 1472.,
       1504., 1537., 1570., 1603., 1635.])

In [17]:
# Define sklearn pipeline
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('xgb', xgb_rgr))
pipeline = Pipeline(estimators)

In [18]:
xgb_grid = RandomizedSearchCV(pipeline, params_xgb,
                           n_jobs=-1, cv=kf,n_iter=200,
                           verbose=1, scoring='neg_mean_absolute_error', return_train_score=True)

In [19]:
grid_result=xgb_grid.fit(X_train,Y_train)

Fitting 4 folds for each of 200 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 97.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 245.4min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed: 245.6min finished


In [20]:
# summarize results

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means=grid_result.cv_results_['mean_test_score']
stds=grid_result.cv_results_['std_test_score']
params=grid_result.cv_results_['params']

for mean, stdev, param in zip(means,stds, params):
    print ("%f (%f) with: %r" % (mean,stdev,param))

Best: -1.907338 using {'xgb__subsample': 1.0, 'xgb__reg_lambda': 0.4, 'xgb__min_child_weight': 131.0, 'xgb__max_depth': 20, 'xgb__learning_rate': 0.41999999999999993, 'xgb__colsample_bytree': 1.0}
-2.019704 (0.003918) with: {'xgb__subsample': 1.0, 'xgb__reg_lambda': 6, 'xgb__min_child_weight': 1635.0, 'xgb__max_depth': 45, 'xgb__learning_rate': 0.6849999999999999, 'xgb__colsample_bytree': 0.3}
-2.014418 (0.005860) with: {'xgb__subsample': 1.0, 'xgb__reg_lambda': 0.6, 'xgb__min_child_weight': 1472.0, 'xgb__max_depth': 20, 'xgb__learning_rate': 0.9599999999999999, 'xgb__colsample_bytree': 0.5}
-1.975299 (0.005193) with: {'xgb__subsample': 1.0, 'xgb__reg_lambda': 1, 'xgb__min_child_weight': 621.0, 'xgb__max_depth': 50, 'xgb__learning_rate': 0.355, 'xgb__colsample_bytree': 0.7}
-2.162580 (0.009516) with: {'xgb__subsample': 0.2, 'xgb__reg_lambda': 12, 'xgb__min_child_weight': 1079.0, 'xgb__max_depth': 50, 'xgb__learning_rate': 0.8349999999999999, 'xgb__colsample_bytree': 1.0}
-2.004907 (0.0

In [21]:
df_resgscv_xgb=pd.DataFrame(grid_result.cv_results_)

In [22]:
grid_result.best_params_

{'xgb__subsample': 1.0,
 'xgb__reg_lambda': 0.4,
 'xgb__min_child_weight': 131.0,
 'xgb__max_depth': 20,
 'xgb__learning_rate': 0.41999999999999993,
 'xgb__colsample_bytree': 1.0}

In [23]:
df_resgscv_xgb.to_excel('HYPER_RESULTS_II_XGB.xlsx', sheet_name='XGB_CV_ABCD')


#with pd.ExcelWriter('HYPER_RESULTS_II.xlsx',engine='openpyxl', mode='a') as writer:
 #    df_resgscv_xgb.to_excel(writer, sheet_name='XGB_CV_ABCD')


In [24]:
df_resgscv_xgb.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgb__subsample,param_xgb__reg_lambda,param_xgb__min_child_weight,param_xgb__max_depth,param_xgb__learning_rate,param_xgb__colsample_bytree,...,split3_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,mean_train_score,std_train_score
0,1.933337,0.037256,0.04718,0.010163,1.0,6.0,1635,45,0.685,0.3,...,-2.014227,-2.019704,0.003918,53,-1.980662,-1.981538,-1.981074,-1.983976,-1.981812,0.001287
1,2.885217,0.03782,0.041847,0.011303,1.0,0.6,1472,20,0.96,0.5,...,-2.015538,-2.014418,0.00586,42,-1.944413,-1.950228,-1.934263,-1.938821,-1.941931,0.005989
2,5.886502,0.097678,0.058589,0.012998,1.0,1.0,621,50,0.355,0.7,...,-1.975833,-1.975299,0.005193,9,-1.876096,-1.881172,-1.878432,-1.873446,-1.877287,0.002854
3,2.063484,0.072865,0.035033,0.004159,0.2,12.0,1079,50,0.835,1.0,...,-2.159808,-2.16258,0.009516,154,-2.14533,-2.150313,-2.153813,-2.152693,-2.150537,0.003261
4,5.125951,0.138551,0.059736,0.008301,0.4,20.0,229,15,0.46,0.7,...,-2.006749,-2.004907,0.006972,33,-1.894727,-1.890111,-1.878845,-1.885142,-1.887206,0.005899


In [25]:
xgb_rgr = xgb.XGBRegressor(max_depth=40, learning_rate=0.05, n_estimators=50,
                           verbosity=1, objective='reg:squarederror',
                           booster='gbtree', n_jobs=-1, gamma=0,
                           min_child_weight=3, max_delta_step=0, subsample=1,
                           colsample_bytree=0.7, colsample_bylevel=1, colsample_bynode=1,
                           reg_alpha=0, reg_lambda=1.5, scale_pos_weight=1, base_score=0.5,
                           random_state=seed, seed=None, missing=None, importance_type='gain')


In [26]:
X_vtrain, X_vtest, Y_vtrain, Y_vtest=train_test_split(X_train,Y_train,test_size=0.3, random_state=seed)

In [27]:
scaler.fit(X_vtrain)
X_vtrain_std=scaler.transform(X_vtrain)
X_vtest_std=scaler.transform(X_vtest)

In [28]:
dtrain = xgb.DMatrix(X_vtrain, label=Y_vtrain)
dtest = xgb.DMatrix(X_vtest, label=Y_vtest)

In [29]:
num_boost_round = 999
#maximum number of boosting rounds allowed

params={'eta': 0.41999999999999993,
 'max_depth': 20,
 'min_child_weight': 131,
 'subsample': 1.0,
 'colsample_bytree': 1.0,
 'lambda': 0.4,
 'objective': 'reg:squarederror',
 'eval_metric': 'mae'}

In [30]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")])# early_stopping_rounds=0

[0]	Test-mae:7.55323
[1]	Test-mae:4.46469
[2]	Test-mae:2.93514
[3]	Test-mae:2.33135
[4]	Test-mae:2.12323
[5]	Test-mae:2.0492
[6]	Test-mae:2.01612
[7]	Test-mae:2.00141
[8]	Test-mae:1.99082
[9]	Test-mae:1.9813
[10]	Test-mae:1.97624
[11]	Test-mae:1.97239
[12]	Test-mae:1.96804
[13]	Test-mae:1.96655
[14]	Test-mae:1.96362
[15]	Test-mae:1.95888
[16]	Test-mae:1.95406
[17]	Test-mae:1.9498
[18]	Test-mae:1.94883
[19]	Test-mae:1.94693
[20]	Test-mae:1.94535
[21]	Test-mae:1.94098
[22]	Test-mae:1.93712
[23]	Test-mae:1.93393
[24]	Test-mae:1.93354
[25]	Test-mae:1.93433
[26]	Test-mae:1.92992
[27]	Test-mae:1.92824
[28]	Test-mae:1.92778
[29]	Test-mae:1.92711
[30]	Test-mae:1.92572
[31]	Test-mae:1.92633
[32]	Test-mae:1.92368
[33]	Test-mae:1.92226
[34]	Test-mae:1.9223
[35]	Test-mae:1.91986
[36]	Test-mae:1.91971
[37]	Test-mae:1.91821
[38]	Test-mae:1.9179
[39]	Test-mae:1.91796
[40]	Test-mae:1.91716
[41]	Test-mae:1.91569
[42]	Test-mae:1.91383
[43]	Test-mae:1.91297
[44]	Test-mae:1.91277
[45]	Test-mae:1.91259
[46

[363]	Test-mae:1.89892
[364]	Test-mae:1.89873
[365]	Test-mae:1.89892
[366]	Test-mae:1.89882
[367]	Test-mae:1.89895
[368]	Test-mae:1.89906
[369]	Test-mae:1.8989
[370]	Test-mae:1.89911
[371]	Test-mae:1.89914
[372]	Test-mae:1.89985
[373]	Test-mae:1.90021
[374]	Test-mae:1.90099
[375]	Test-mae:1.90107
[376]	Test-mae:1.90109
[377]	Test-mae:1.9012
[378]	Test-mae:1.90144
[379]	Test-mae:1.90162
[380]	Test-mae:1.90158
[381]	Test-mae:1.9015
[382]	Test-mae:1.90161
[383]	Test-mae:1.90169
[384]	Test-mae:1.90176
[385]	Test-mae:1.90178
[386]	Test-mae:1.9022
[387]	Test-mae:1.90261
[388]	Test-mae:1.90271
[389]	Test-mae:1.90263
[390]	Test-mae:1.9028
[391]	Test-mae:1.90258
[392]	Test-mae:1.90303
[393]	Test-mae:1.90277
[394]	Test-mae:1.90305
[395]	Test-mae:1.90288
[396]	Test-mae:1.9031
[397]	Test-mae:1.90302
[398]	Test-mae:1.90292
[399]	Test-mae:1.90327
[400]	Test-mae:1.90299
[401]	Test-mae:1.90329
[402]	Test-mae:1.90376
[403]	Test-mae:1.90366
[404]	Test-mae:1.90357
[405]	Test-mae:1.9036
[406]	Test-mae:1.9

[722]	Test-mae:1.92965
[723]	Test-mae:1.93007
[724]	Test-mae:1.93002
[725]	Test-mae:1.92993
[726]	Test-mae:1.92999
[727]	Test-mae:1.92969
[728]	Test-mae:1.92993
[729]	Test-mae:1.92989
[730]	Test-mae:1.93028
[731]	Test-mae:1.93007
[732]	Test-mae:1.93017
[733]	Test-mae:1.9301
[734]	Test-mae:1.93018
[735]	Test-mae:1.93041
[736]	Test-mae:1.93046
[737]	Test-mae:1.93043
[738]	Test-mae:1.9304
[739]	Test-mae:1.93063
[740]	Test-mae:1.93075
[741]	Test-mae:1.93072
[742]	Test-mae:1.93081
[743]	Test-mae:1.93105
[744]	Test-mae:1.93118
[745]	Test-mae:1.93124
[746]	Test-mae:1.93143
[747]	Test-mae:1.93146
[748]	Test-mae:1.93157
[749]	Test-mae:1.93173
[750]	Test-mae:1.93176
[751]	Test-mae:1.93174
[752]	Test-mae:1.93183
[753]	Test-mae:1.93162
[754]	Test-mae:1.93159
[755]	Test-mae:1.93172
[756]	Test-mae:1.93165
[757]	Test-mae:1.93157
[758]	Test-mae:1.93158
[759]	Test-mae:1.93182
[760]	Test-mae:1.93183
[761]	Test-mae:1.93199
[762]	Test-mae:1.93233
[763]	Test-mae:1.93247
[764]	Test-mae:1.93257
[765]	Test-ma