# Hyperparameter tuning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import math
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

In [2]:
# seed: random_state handed to the estimators; kf: number of cross-validation folds (passed as cv=kf).
seed, kf = 42, 4

In [3]:
# Load the combined feature/target table; its last column (FinalMark) is split off as the target further down.
XY_train=pd.read_excel('X10Y10_CONC.xlsx')

In [4]:
XY_train.columns

Index(['Unnamed: 0', 'AcYear_11', 'AcYear_12', 'Std_Gender_F', 'N_Retentions',
       'School_Size', 'Class_Size', 'Student_Computer', 'Student_Internet',
       'Student_NumberEnrolments',
       ...
       'Teacher_TemporaryReplacement', 'Teacher_EducationSupport',
       'Teacher_Age', 'Teacher_TeachingDedicatedTime',
       'Teacher_NoTeachingDedicatedTime',
       'Teacher_EducationSupportDedicatedTime', 'SubjClass_Foreign_Lang',
       'SubjClass_Qual', 'SubjClass_Quant', 'FinalMark'],
      dtype='object', length=124)

In [5]:
# Columns removed from XY_train before any model is tuned.
# NOTE(review): the name suggests these were discarded by an earlier Lasso-based
# selection step; that step is not part of this notebook — confirm the provenance.
LassoFeatToDrop =['Student_Internet', 'Student_ActiveWorking', 'Student_Parish',
       'FTH_Nation_CHN', 'FTH_Nation_EEUR', 'FTH_Nation_OTHERS',
       'FTH_Nation_RICH', 'SES_STDRESP_ProfClass_UnivII',
       'SES_STDRESP_ProfClass_Unknown_NoProfession',
       'SES_FATH_ProfClass_Unknown_NoProfession', 'SES_MOTH_ProfClass_BasicI',
       'SES_MOTH_ProfClass_UnivI', 'SES_MOTH_ProfClass_Unknown_NoProfession',
       'SES_STDRESP_JobSit_HomeAffairs', 'SES_STDRESP_JobSit_Other',
       'SES_STDRESP_JobSit_Retired', 'SES_STDRESP_JobSit_Student',
       'SES_STDRESP_JobSit_Unemployed', 'SES_STDRESP_JobSit_Unknown',
       'SES_FATH_JobSit_Employer', 'SES_FATH_JobSit_HomeAffairs',
       'SES_FATH_JobSit_Other', 'SES_FATH_JobSit_Retired',
       'SES_FATH_JobSit_Student', 'SES_FATH_JobSit_Unknown',
       'SES_MOTH_JobSit_HomeAffairs', 'SES_MOTH_JobSit_Other',
       'SES_MOTH_JobSit_SelfEmployed', 'SES_MOTH_JobSit_Unemployed',
       'SES_MOTH_JobSit_Unknown', 'SES_STDRESP_AcadEduc_Basic_II',
       'SES_STDRESP_AcadEduc_Basic_III', 'SES_STDRESP_AcadEduc_PostGraduation',
       'SES_STDRESP_AcadEduc_Unknown', 'SES_FATH_AcadEduc_Bachelor',
       'SES_FATH_AcadEduc_NoFormalEducation', 'SES_FATH_AcadEduc_Other',
       'SES_FATH_AcadEduc_PostGraduation', 'SES_FATH_AcadEduc_Unknown',
       'SES_MOTH_AcadEduc_Bachelor', 'SES_MOTH_AcadEduc_NoFormalEducation',
       'SES_MOTH_AcadEduc_Other', 'SES_MOTH_AcadEduc_Unknown',
       'SES%_Parish_IliteracyRate', 'SES%_Parish_PostSecondarySchoolingRate',
       'SES%_Parish_PrimarySector', 'Tch_FixedTermStaff',
       'Tch_PedagogicZoneNoDefinitivePermanentStaff',
       'Tch_SchoolClusterNoDefinitivePermanentStaff',
       'Tch_SchoolNoDefinitivePermanentStaff', 'Tch_AcadEduc_Phd_Master',
       'Teacher_TemporaryReplacement', 'Teacher_EducationSupport',
       'Teacher_Age', 'Teacher_TeachingDedicatedTime',
       'Teacher_NoTeachingDedicatedTime']

In [6]:
# Remove the listed feature columns. XY_train is rebound in place, so running this
# cell a second time raises KeyError because the columns are already gone.
XY_train = XY_train.drop(columns=LassoFeatToDrop)

In [7]:
# Remove the leftover index column and the two academic-year indicator columns.
XY_train = XY_train.drop(columns=['Unnamed: 0', 'AcYear_11', 'AcYear_12'])

In [8]:
XY_train.shape

(32706, 65)

In [9]:
# Positional split: the target is the last column, every other column is a feature.
# Slicing with :-1 instead of a hard-coded width keeps features and target
# consistent if the list of dropped columns ever changes.
X_train = XY_train.iloc[:, :-1]
Y_train = XY_train.iloc[:, -1]

In [10]:
# Convert the target to a plain ndarray. np.asarray accepts both a Series and an
# ndarray, so re-running this cell is harmless (Series.to_numpy() would fail on the
# second run because Y_train is already an ndarray by then).
Y_train = np.asarray(Y_train)

In [11]:
# Standalone standardizer.
# NOTE(review): every pipeline visible in this notebook builds its own
# StandardScaler() step, so this instance is not referenced by them.
scaler=StandardScaler()

## Hyperparameter search with cross-validation

### Random Forest

In [12]:
# Base random forest. n_estimators, min_samples_leaf and min_samples_split are only
# starting values: the search grid defined for this model overrides all three.
# criterion is left at its default (squared error). The explicit spelling "mse" was
# deprecated in scikit-learn 1.0 and removed in 1.2, whereas the default selects the
# same loss on every version.
rf_rgr = RandomForestRegressor(n_estimators=50, min_samples_leaf=0.001,
                               min_samples_split=0.001, random_state=seed)

In [18]:
# Number of trees: 11 evenly spaced integers from 300 to 700 (step 40).
n_estim = np.linspace(300, 700, 11).astype(int)

# Leaf / split sizes are given as fractions of the training rows: 50 values from 0.1% to 5%.
min_samples_lf = np.linspace(0.001, 0.05, 50)
min_samples_sp = np.linspace(0.001, 0.05, 50)

# Search space for the 'rf' pipeline step (keys carry the step prefix "rf__").
params_rf = {
    "rf__n_estimators": n_estim,
    "rf__min_samples_leaf": min_samples_lf,    # minimum share of samples required at a leaf
    "rf__min_samples_split": min_samples_sp,   # minimum share of samples required to split a node
    "rf__bootstrap": [True, False],            # whether each tree is built on a bootstrap sample
}

#"rf__n_estimators": [10, 20, 50, 100, 150, 200], # number of trees in the forest


In [19]:
# Standardize, then fit the forest. Keeping the scaler inside the pipeline means it
# is re-fit on the training part of every CV split.
estimators = [
    ('standardize', StandardScaler()),
    ('rf', rf_rgr),
]
pipeline = Pipeline(estimators)

In [20]:
# Randomized search over params_rf: 200 sampled settings, kf-fold CV, scored by
# negative mean absolute error (higher is better). random_state pins which
# settings are sampled so that a re-run evaluates the same candidates.
rf_grid = RandomizedSearchCV(pipeline, params_rf,
                             n_jobs=-1, cv=kf, n_iter=200, random_state=seed,
                             verbose=1, scoring='neg_mean_absolute_error', return_train_score=True)

In [22]:
# Run the random-forest search (800 fits; about 174 min on 4 workers in the recorded run).
# fit returns the search object itself; the name grid_result is rebound by every later search.
grid_result=rf_grid.fit(X_train,Y_train)

Fitting 4 folds for each of 200 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 40.9min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 96.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 172.1min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed: 173.7min finished


In [23]:
# Feature importances of the refitted best forest, one row per input feature in the
# column order of the data passed to fit; no feature names are attached to the rows.
df_Feat_import_gscv = pd.DataFrame(grid_result.best_estimator_.named_steps['rf'].feature_importances_)

In [24]:
# Report the winning setting, then the mean (std) CV score of every sampled candidate.
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params'))

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -2.037677 using {'rf__n_estimators': 420, 'rf__min_samples_split': 0.009000000000000001, 'rf__min_samples_leaf': 0.001, 'rf__bootstrap': True}
-2.102719 (0.005255) with: {'rf__n_estimators': 540, 'rf__min_samples_split': 0.04, 'rf__min_samples_leaf': 0.034, 'rf__bootstrap': True}
-2.106607 (0.005818) with: {'rf__n_estimators': 460, 'rf__min_samples_split': 0.013000000000000001, 'rf__min_samples_leaf': 0.044000000000000004, 'rf__bootstrap': False}
-2.101118 (0.003298) with: {'rf__n_estimators': 340, 'rf__min_samples_split': 0.025, 'rf__min_samples_leaf': 0.035, 'rf__bootstrap': False}
-2.100408 (0.003020) with: {'rf__n_estimators': 620, 'rf__min_samples_split': 0.033, 'rf__min_samples_leaf': 0.036000000000000004, 'rf__bootstrap': False}
-2.107508 (0.005700) with: {'rf__n_estimators': 300, 'rf__min_samples_split': 0.019000000000000003, 'rf__min_samples_leaf': 0.045, 'rf__bootstrap': False}
-2.055470 (0.004933) with: {'rf__n_estimators': 300, 'rf__min_samples_split': 0.0190000000000

In [25]:
# Keep the full random-forest CV table under its own name, since grid_result is rebound by the next search.
df_resgscv_rf=pd.DataFrame(grid_result.cv_results_)

In [26]:
grid_result.best_params_

{'rf__n_estimators': 420,
 'rf__min_samples_split': 0.009000000000000001,
 'rf__min_samples_leaf': 0.001,
 'rf__bootstrap': True}

In [27]:
# Write the random-forest results to the workbook. A plain to_excel call with a path
# creates the file (replacing any existing one); the later result cells open the
# same file in append mode to add their own sheets.
df_resgscv_rf.to_excel('HYPER_RESULTS_II.xlsx', sheet_name='RF_CV')

### SVR

In [28]:
# Target upper bound on the number of rows per bagged SVR; used to derive how many bagged estimators are needed.
samples_limit=7500

In [29]:
# Smallest number of equal-sized chunks such that each chunk holds at most samples_limit rows.
n_estimators=math.ceil(len(X_train)/samples_limit)
n_estimators

5

In [30]:
# Rows per bagged estimator when the full training set is divided evenly (rounded down).
max_samples= math.floor(len(X_train)/n_estimators)
max_samples

6541

In [31]:
len(X_train)-(n_estimators*max_samples)

1

In [32]:
# RBF-kernel SVR used as the base learner; C and gamma set here are only starting
# values, both are overridden by the search grid for this model.
svm_rgr = SVR(
    kernel='rbf', degree=3, gamma='scale', coef0=0.0,
    C=1.0, epsilon=0.2, tol=0.001,
    shrinking=True, cache_size=1000, max_iter=-1, verbose=True,
)

# Ensemble of n_estimators SVRs, each fit on max_samples rows drawn without
# replacement (bootstrap=False) and on all features (max_features=1.0).
# NOTE(review): max_samples was derived from the full training set, while during
# cross-validation each fit only sees the training part of a split — confirm the
# resulting overlap between the bags is intended.
svr_bagging = BaggingRegressor(
    base_estimator=svm_rgr,
    n_estimators=n_estimators, max_samples=max_samples, max_features=1.0,
    bootstrap=False, bootstrap_features=False, oob_score=False,
    warm_start=False, n_jobs=-1, random_state=seed, verbose=0,
)


In [33]:
# Standardize, then fit the bagged SVR; the scaler is re-fit on the training part of every CV split.
estimators = [
    ('standardize', StandardScaler()),
    ('svr', svr_bagging),
]
pipeline = Pipeline(estimators)

In [34]:
svr_bagging.get_params().keys()

dict_keys(['base_estimator__C', 'base_estimator__cache_size', 'base_estimator__coef0', 'base_estimator__degree', 'base_estimator__epsilon', 'base_estimator__gamma', 'base_estimator__kernel', 'base_estimator__max_iter', 'base_estimator__shrinking', 'base_estimator__tol', 'base_estimator__verbose', 'base_estimator', 'bootstrap', 'bootstrap_features', 'max_features', 'max_samples', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [35]:
# Both ranges hold 50 log-spaced values from 1e-3 to 1e2, i.e. [0.001 : 100].
# C: penalty applied to errors that fall outside the epsilon-insensitive channel.
C_range = np.logspace(start=-3, stop=2, num=50)
# gamma: can be seen as the inverse of the radius of influence of a sample.
gamma_range = np.logspace(start=-3, stop=2, num=50)

# Keys address the SVR nested inside the bagging step of the pipeline.
param_grid = {
    'svr__base_estimator__gamma': gamma_range,
    'svr__base_estimator__C': C_range,
}

In [36]:
# Randomized search over param_grid: 200 sampled (gamma, C) pairs, kf-fold CV,
# scored by negative mean absolute error. random_state pins which pairs are
# sampled so that a re-run evaluates the same candidates.
grid = RandomizedSearchCV(pipeline, param_grid,
                          n_jobs=-1, cv=kf, n_iter=200, random_state=seed,
                          verbose=1, scoring='neg_mean_absolute_error', return_train_score=True)


In [37]:
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'standardize', 'svr', 'standardize__copy', 'standardize__with_mean', 'standardize__with_std', 'svr__base_estimator__C', 'svr__base_estimator__cache_size', 'svr__base_estimator__coef0', 'svr__base_estimator__degree', 'svr__base_estimator__epsilon', 'svr__base_estimator__gamma', 'svr__base_estimator__kernel', 'svr__base_estimator__max_iter', 'svr__base_estimator__shrinking', 'svr__base_estimator__tol', 'svr__base_estimator__verbose', 'svr__base_estimator', 'svr__bootstrap', 'svr__bootstrap_features', 'svr__max_features', 'svr__max_samples', 'svr__n_estimators', 'svr__n_jobs', 'svr__oob_score', 'svr__random_state', 'svr__verbose', 'svr__warm_start'])

In [38]:
# Run the SVR search (800 fits; about 553 min on 4 workers in the recorded run).
# This rebinds grid_result, which until here referred to the random-forest search.
grid_result=grid.fit(X_train, Y_train)

Fitting 4 folds for each of 200 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 31.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 134.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 305.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 548.0min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed: 552.7min finished


In [39]:
# Report the winning setting, then the mean (std) CV score of every sampled candidate.
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params'))

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -2.033680 using {'svr__base_estimator__gamma': 0.004094915062380427, 'svr__base_estimator__C': 9.540954763499943}
-2.119721 (0.012502) with: {'svr__base_estimator__gamma': 0.10985411419875583, 'svr__base_estimator__C': 7.543120063354623}
-2.267599 (0.011024) with: {'svr__base_estimator__gamma': 9.540954763499943, 'svr__base_estimator__C': 0.005179474679231213}
-2.264725 (0.010959) with: {'svr__base_estimator__gamma': 0.06866488450043001, 'svr__base_estimator__C': 0.0020235896477251575}
-2.257315 (0.010950) with: {'svr__base_estimator__gamma': 0.0020235896477251575, 'svr__base_estimator__C': 0.0032374575428176433}
-2.274924 (0.010886) with: {'svr__base_estimator__gamma': 15.264179671752334, 'svr__base_estimator__C': 0.28117686979742307}
-2.264881 (0.010960) with: {'svr__base_estimator__gamma': 0.08685113737513529, 'svr__base_estimator__C': 0.0032374575428176433}
-2.268921 (0.010984) with: {'svr__base_estimator__gamma': 19.306977288832496, 'svr__base_estimator__C': 0.05428675439323

In [40]:
# NOTE(review): df_rescv holds the same cv_results_ table that the next cell stores
# as df_resgscv_svr, and it is not referenced again in the visible notebook.
df_rescv=pd.DataFrame(grid_result.cv_results_)

In [41]:
# Keep the full SVR CV table under its own name, since grid_result is rebound by the next search.
df_resgscv_svr=pd.DataFrame(grid_result.cv_results_)

In [42]:
grid_result.best_params_

{'svr__base_estimator__gamma': 0.004094915062380427,
 'svr__base_estimator__C': 9.540954763499943}

In [43]:
# Append the SVR results as a new sheet; mode='a' requires the workbook to exist already.
with pd.ExcelWriter('HYPER_RESULTS_II.xlsx', mode='a', engine='openpyxl') as writer:
    df_resgscv_svr.to_excel(writer, sheet_name='SVR_CV')

SVR Fit best Model

### Ridge Regression

In [44]:
# Base ridge regressor; alpha is only a starting value and is overridden by the
# search grid for this model.
# normalize is deliberately not passed: it defaulted to False, was deprecated in
# scikit-learn 1.0 and removed in 1.2. Feature scaling is done by the
# StandardScaler step of the pipeline instead.
ridge_rgr = Ridge(alpha=1.0, fit_intercept=True, copy_X=True, max_iter=None,
                  tol=0.001, solver='auto', random_state=seed)

In [45]:

# Candidate L2 penalties: a finer set between 0 and 1, then even values from 2 to 20.
fine_alphas = [0, 0.1, 0.2, 0.4, 0.6, 0.8, 1]
coarse_alphas = list(range(2, 21, 2))
params_ridge = {"ridge__alpha": fine_alphas + coarse_alphas}


In [46]:
# Standardize, then fit the ridge model; the scaler is re-fit on the training part of every CV split.
estimators = [
    ('standardize', StandardScaler()),
    ('ridge', ridge_rgr),
]
pipeline = Pipeline(estimators)

In [47]:
# The ridge grid has only 17 candidates, fewer than the 200 iterations used for the
# other models, so every candidate is evaluated anyway. GridSearchCV states that
# directly, is deterministic, and avoids the "n_iter larger than the search space"
# warning a randomized search emits here. Scoring and CV match the other searches.
ridge_grid = GridSearchCV(pipeline, params_ridge,
                          n_jobs=-1, cv=kf,
                          verbose=1, scoring='neg_mean_absolute_error', return_train_score=True)

In [48]:
# Run the ridge search; this rebinds grid_result, which until here referred to the SVR search.
grid_result=ridge_grid.fit(X_train,Y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 4 folds for each of 17 candidates, totalling 68 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  68 out of  68 | elapsed:    4.0s finished


In [49]:
# Report the winning setting, then the mean (std) CV score of every candidate.
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params'))

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -2.060516 using {'ridge__alpha': 0}
-2.060516 (0.007056) with: {'ridge__alpha': 0}
-2.060516 (0.007056) with: {'ridge__alpha': 0.1}
-2.060516 (0.007056) with: {'ridge__alpha': 0.2}
-2.060517 (0.007056) with: {'ridge__alpha': 0.4}
-2.060517 (0.007056) with: {'ridge__alpha': 0.6}
-2.060517 (0.007056) with: {'ridge__alpha': 0.8}
-2.060518 (0.007056) with: {'ridge__alpha': 1}
-2.060519 (0.007056) with: {'ridge__alpha': 2}
-2.060523 (0.007056) with: {'ridge__alpha': 4}
-2.060526 (0.007057) with: {'ridge__alpha': 6}
-2.060530 (0.007057) with: {'ridge__alpha': 8}
-2.060534 (0.007057) with: {'ridge__alpha': 10}
-2.060537 (0.007058) with: {'ridge__alpha': 12}
-2.060541 (0.007058) with: {'ridge__alpha': 14}
-2.060544 (0.007059) with: {'ridge__alpha': 16}
-2.060548 (0.007059) with: {'ridge__alpha': 18}
-2.060551 (0.007060) with: {'ridge__alpha': 20}


In [50]:
# Keep the full ridge CV table under its own name, since grid_result is rebound by the next search.
df_resgscv_ridge=pd.DataFrame(grid_result.cv_results_)

In [51]:
grid_result.best_params_

{'ridge__alpha': 0}

In [52]:
# Append the ridge results as a new sheet; mode='a' requires the workbook to exist already.
with pd.ExcelWriter('HYPER_RESULTS_II.xlsx', mode='a', engine='openpyxl') as writer:
    df_resgscv_ridge.to_excel(writer, sheet_name='RIDGE_CV')

### K-nearest Neighbours Regression

In [53]:
# Base k-NN regressor with Euclidean distance (minkowski, p=2); n_neighbors and
# weights are only starting values, both are overridden by the search grid.
knn_rgr = KNeighborsRegressor(
    n_neighbors=5, weights='uniform',
    algorithm='auto', leaf_size=30,
    metric='minkowski', p=2, metric_params=None,
    n_jobs=None,
)

In [54]:
# Neighbourhood sizes 1..200, each tried with uniform and distance-based weighting.
k_range = range(1, 201)

params_knn = {
    "knn__n_neighbors": k_range,
    "knn__weights": ['uniform', 'distance'],
}

In [55]:
# Standardize, then fit the k-NN model; the scaler is re-fit on the training part of every CV split.
estimators = [
    ('standardize', StandardScaler()),
    ('knn', knn_rgr),
]
pipeline = Pipeline(estimators)

In [56]:
# Randomized search over params_knn: 200 sampled settings, kf-fold CV, scored by
# negative mean absolute error. random_state pins which settings are sampled so
# that a re-run evaluates the same candidates.
knn_grid = RandomizedSearchCV(pipeline, params_knn,
                              n_jobs=-1, cv=kf, n_iter=200, random_state=seed,
                              verbose=1, scoring='neg_mean_absolute_error', return_train_score=True)

In [57]:
# Run the k-NN search (800 fits; about 577 min on 4 workers in the recorded run).
# This rebinds grid_result, which until here referred to the ridge search.
grid_result=knn_grid.fit(X_train,Y_train)

Fitting 4 folds for each of 200 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 28.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 128.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 299.7min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 570.7min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed: 576.8min finished


In [58]:
# Report the winning setting, then the mean (std) CV score of every sampled candidate.
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params'))

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -2.085407 using {'knn__weights': 'distance', 'knn__n_neighbors': 18}
-2.115749 (0.007094) with: {'knn__weights': 'distance', 'knn__n_neighbors': 169}
-2.105009 (0.007307) with: {'knn__weights': 'uniform', 'knn__n_neighbors': 80}
-2.109892 (0.006362) with: {'knn__weights': 'uniform', 'knn__n_neighbors': 8}
-2.093266 (0.008855) with: {'knn__weights': 'distance', 'knn__n_neighbors': 45}
-2.113883 (0.007411) with: {'knn__weights': 'uniform', 'knn__n_neighbors': 132}
-2.100537 (0.007731) with: {'knn__weights': 'distance', 'knn__n_neighbors': 70}
-2.115003 (0.007232) with: {'knn__weights': 'distance', 'knn__n_neighbors': 164}
-2.110518 (0.006808) with: {'knn__weights': 'uniform', 'knn__n_neighbors': 113}
-2.112819 (0.007248) with: {'knn__weights': 'distance', 'knn__n_neighbors': 144}
-2.110064 (0.007021) with: {'knn__weights': 'uniform', 'knn__n_neighbors': 110}
-2.119944 (0.006751) with: {'knn__weights': 'uniform', 'knn__n_neighbors': 186}
-2.117556 (0.007603) with: {'knn__weights': '

In [59]:
# Keep the full k-NN CV table under its own name, separate from the reusable grid_result handle.
df_resgscv_knn=pd.DataFrame(grid_result.cv_results_)

In [60]:
grid_result.best_params_

{'knn__weights': 'distance', 'knn__n_neighbors': 18}

In [61]:
# Append the k-NN results as a new sheet; mode='a' requires the workbook to exist already.
with pd.ExcelWriter('HYPER_RESULTS_II.xlsx', mode='a', engine='openpyxl') as writer:
    df_resgscv_knn.to_excel(writer, sheet_name='KNN_CV')