## Bootstrap procedure for Bias-Variance Decomposition

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from mlxtend.evaluate import bias_variance_decomp

In [2]:
# Single random seed reused by the estimators and by the bootstrap resampling below.
seed = 42

In [3]:
# Read the train and test tables from the two Excel workbooks (paths are relative).
X12Y12_train, X12Y12_test = (
    pd.read_excel(workbook) for workbook in ('X12Y12_train.xlsx', 'X12Y12_test.xlsx')
)

In [4]:
# Column names removed from both the train and the test frame before modelling.
# NOTE(review): the name suggests these were discarded by a Lasso feature-selection
# step performed elsewhere -- confirm against that analysis.
LassoFeatToDrop = [
    # Student_* / STD_Resp_* columns
    'Student_Internet',
    'Student_ActiveWorking',
    'Student_Parish',
    'Student_County',
    'STD_Resp_Himself',
    'STD_Resp_LegalResp',
    # FTH_Nation_* columns
    'FTH_Nation_BR',
    'FTH_Nation_CHN',
    'FTH_Nation_EEUR',
    'FTH_Nation_OTHERS',
    # SES_*_ProfClass_* columns
    'SES_STDRESP_ProfClass_UnivII',
    'SES_STDRESP_ProfClass_Unknown_NoProfession',
    'SES_FATH_ProfClass_BasicI',
    'SES_FATH_ProfClass_UnivI',
    'SES_FATH_ProfClass_Unknown_NoProfession',
    'SES_MOTH_ProfClass_BasicI',
    'SES_MOTH_ProfClass_UnivI',
    'SES_MOTH_ProfClass_Unknown_NoProfession',
    # SES_*_JobSit_* columns
    'SES_STDRESP_JobSit_Other',
    'SES_STDRESP_JobSit_Retired',
    'SES_STDRESP_JobSit_SelfEmployed',
    'SES_STDRESP_JobSit_Unemployed',
    'SES_STDRESP_JobSit_Unknown',
    'SES_FATH_JobSit_Employer',
    'SES_FATH_JobSit_HomeAffairs',
    'SES_FATH_JobSit_Retired',
    'SES_FATH_JobSit_SelfEmployed',
    'SES_FATH_JobSit_Student',
    'SES_FATH_JobSit_Unknown',
    'SES_MOTH_JobSit_Employer',
    'SES_MOTH_JobSit_HomeAffairs',
    'SES_MOTH_JobSit_Retired',
    'SES_MOTH_JobSit_SelfEmployed',
    'SES_MOTH_JobSit_Student',
    'SES_MOTH_JobSit_Unemployed',
    'SES_MOTH_JobSit_Unknown',
    # SES_*_AcadEduc_* columns
    'SES_STDRESP_AcadEduc_Bachelor',
    'SES_STDRESP_AcadEduc_Basic_III',
    'SES_STDRESP_AcadEduc_Degree',
    'SES_STDRESP_AcadEduc_NoFormalEducation',
    'SES_STDRESP_AcadEduc_PostGraduation',
    'SES_STDRESP_AcadEduc_Unknown',
    'SES_FATH_AcadEduc_Bachelor',
    'SES_FATH_AcadEduc_Basic_III',
    'SES_FATH_AcadEduc_NoFormalEducation',
    'SES_FATH_AcadEduc_Other',
    'SES_FATH_AcadEduc_Unknown',
    'SES_MOTH_AcadEduc_Basic_II',
    'SES_MOTH_AcadEduc_NoFormalEducation',
    'SES_MOTH_AcadEduc_Other',
    'SES_MOTH_AcadEduc_Unknown',
    # SES_Scholarship_* and SES%_Parish_* columns
    'SES_Scholarship_Half_Support',
    'SES%_Parish_CollectiveDwellings',
    'SES%_Parish_IliteracyRate',
    'SES%_Parish_PrimarySector',
    'SES%_Parish_UnemploymentRate',
    # Tch_* columns
    'Tch_FixedTermStaff',
    'Tch_PedagogicZoneDefinitivePermanentStaff',
    'Tch_PedagogicZoneNoDefinitivePermanentStaff',
    'Tch_SchoolClusterDefinitivePermanentStaff',
    'Tch_SchoolClusterNoDefinitivePermanentStaff',
    'Tch_SchoolNoDefinitivePermanentStaff',
    'Tch_AcadEduc_Bachelor',
    'Tch_AcadEduc_Phd_Master',
    'Tch_Step_567',
    # Teacher_* columns
    'Teacher_TemporaryReplacement',
    'Teacher_Age',
    'Teacher_TeachingDedicatedTime',
]

In [5]:
# Remove every column listed in LassoFeatToDrop from both tables.
XY_train = X12Y12_train.drop(columns=LassoFeatToDrop)
XY_test = X12Y12_test.drop(columns=LassoFeatToDrop)

In [6]:
# Drop the 'Unnamed: 0' column and the two AcYear_* columns from both tables.
# NOTE(review): 'Unnamed: 0' looks like an index column written out with the workbook -- confirm.
extra_cols = ['Unnamed: 0', 'AcYear_11', 'AcYear_12']
XY_train = XY_train.drop(columns=extra_cols)
XY_test = XY_test.drop(columns=extra_cols)

In [7]:
# Display (rows, columns) of the test table after the column drops above.
XY_test.shape

(9105, 53)

In [8]:
# Split each table into features (every column except the last) and target (last column).
# Slicing with :-1 instead of a literal 52 ties the split to the actual table width;
# for the 53-column test table both forms select the same 52 feature columns.
X_train = XY_train.iloc[:, :-1]
Y_train = XY_train.iloc[:, -1]

X_test = XY_test.iloc[:, :-1]
Y_test = XY_test.iloc[:, -1]

# The estimators are fitted on X_train and evaluated on X_test, so widths must agree.
assert X_train.shape[1] == X_test.shape[1], "train/test feature counts differ"

In [9]:
# Convert the feature tables to NumPy arrays.
# np.asarray yields the same array as DataFrame.to_numpy() but also accepts an ndarray,
# so re-running this cell (which rebinds the same names) no longer raises AttributeError.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

In [10]:
# Convert the target columns to NumPy arrays.
# np.asarray yields the same array as Series.to_numpy() but also accepts an ndarray,
# so re-running this cell (which rebinds the same names) no longer raises AttributeError.
Y_train = np.asarray(Y_train)
Y_test = np.asarray(Y_test)

In [11]:
# Standardization: create the scaler object (it is fitted separately).
scaler = StandardScaler()

In [12]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test features.
# NOTE(review): X_train_std / X_test_std are not referenced by the later cells shown
# here; the pipelines defined below each carry their own StandardScaler.
tscale = scaler.fit(X_train)
X_train_std, X_test_std = (tscale.transform(features) for features in (X_train, X_test))

## Bootstrap

In [13]:
# Random-forest regressor used in the decomposition.
# `criterion` is deliberately not passed: squared error is the library default, and the
# explicit "mse" spelling was deprecated in scikit-learn 1.0 and is rejected from 1.2 on,
# so omitting it keeps the same split criterion on both old and new versions.
rf_rgr = RandomForestRegressor(n_estimators=420, min_samples_leaf=0.001,
                               min_samples_split=0.009, bootstrap=True, random_state=seed)

In [14]:
# RBF-kernel support vector regressor used in the decomposition.
# verbose is switched off: the estimator is refitted once per bootstrap round, and with
# verbose=True every fit prints a "[LibSVM]" marker, flooding the cell output.
svm_rgr = SVR(C=9.541, cache_size=1000, coef0=0.0, degree=3, epsilon=0.2, gamma=0.004,
              kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [15]:
# Gradient-boosted tree regressor used in the decomposition; one keyword per line.
# NOTE(review): `seed=None` and `missing=None` are passed alongside `random_state`;
# check they are still accepted by the installed xgboost version.
xgb_rgr = xgb.XGBRegressor(
    max_depth=20,
    learning_rate=0.42,
    n_estimators=156,
    verbosity=1,
    objective='reg:squarederror',
    booster='gbtree',
    n_jobs=-1,
    gamma=0,
    min_child_weight=131.0,
    max_delta_step=0,
    subsample=1.0,
    colsample_bytree=0.7,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=0,
    reg_lambda=0.4,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=seed,
    seed=None,
    missing=None,
    importance_type='gain',
)

In [16]:
# Pipeline: standardize the features, then fit the random-forest regressor.
estimators = [
    ('standardize', StandardScaler()),
    ('RF', rf_rgr),
]
pipeline_RF = Pipeline(steps=estimators)

In [17]:
# Pipeline: standardize the features, then fit the support vector regressor.
estimators = [
    ('standardize', StandardScaler()),
    ('SVR', svm_rgr),
]
pipeline_SVR = Pipeline(steps=estimators)

In [18]:
# Pipeline: standardize the features, then fit the XGBoost regressor.
estimators = [
    ('standardize', StandardScaler()),
    ('XGB', xgb_rgr),
]
pipeline_XGB = Pipeline(steps=estimators)

In [19]:
# Bias-variance decomposition of the random-forest pipeline with squared-error loss,
# using 200 bootstrap rounds drawn with the shared seed.
mse, mean_bias, mean_var = bias_variance_decomp(
    pipeline_RF,
    X_train, Y_train,
    X_test, Y_test,
    loss='mse',
    num_rounds=200,
    random_seed=seed,
)

In [20]:
# Column labels of the results table; they must match the keys of each result dict.
columns = ['MSE', 'Mean_bias', 'Mean_var']

In [21]:
# Empty results table with the agreed column labels; one row per model is added below.
df_results = pd.DataFrame(columns=columns)

In [22]:
# Store the random-forest scores under the 'RF' row label.
# Writing to a fixed label (rather than the next integer position) means re-running
# this cell overwrites the row instead of appending a duplicate. The Series makes the
# values align with the table's columns by name.
new_results_dic = {'MSE': mse, 'Mean_bias': mean_bias, 'Mean_var': mean_var}
df_results.loc['RF'] = pd.Series(new_results_dic)

In [23]:
# Make sure the most recently added row is labelled 'RF', then show the table.
last_label = df_results.index[-1]
df_results.rename(index={last_label: 'RF'}, inplace=True)
df_results.head()

Unnamed: 0,MSE,Mean_bias,Mean_variance
RF,6.241351,6.083633,


In [24]:
# Bias-variance decomposition of the SVR pipeline with squared-error loss,
# using 200 bootstrap rounds drawn with the shared seed.
mse, mean_bias, mean_var = bias_variance_decomp(
    pipeline_SVR,
    X_train, Y_train,
    X_test, Y_test,
    loss='mse',
    num_rounds=200,
    random_seed=seed,
)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [25]:
# Store the SVR scores under the 'SVR' row label.
# Writing to a fixed label (rather than the next integer position) means re-running
# this cell overwrites the row instead of appending a duplicate. The Series makes the
# values align with the table's columns by name.
new_results_dic = {'MSE': mse, 'Mean_bias': mean_bias, 'Mean_var': mean_var}
df_results.loc['SVR'] = pd.Series(new_results_dic)

In [26]:
# Make sure the most recently added row is labelled 'SVR', then show the table.
last_label = df_results.index[-1]
df_results.rename(index={last_label: 'SVR'}, inplace=True)
df_results.head()

Unnamed: 0,MSE,Mean_bias,Mean_variance
RF,6.241351,6.083633,
SVR,6.615612,6.332203,


In [27]:
# Bias-variance decomposition of the XGBoost pipeline with squared-error loss,
# using 200 bootstrap rounds drawn with the shared seed.
mse, mean_bias, mean_var = bias_variance_decomp(
    pipeline_XGB,
    X_train, Y_train,
    X_test, Y_test,
    loss='mse',
    num_rounds=200,
    random_seed=seed,
)

In [28]:
# Store the XGBoost scores under the 'XGB' row label.
# Writing to a fixed label (rather than the next integer position) means re-running
# this cell overwrites the row instead of appending a duplicate. The Series makes the
# values align with the table's columns by name.
new_results_dic = {'MSE': mse, 'Mean_bias': mean_bias, 'Mean_var': mean_var}
df_results.loc['XGB'] = pd.Series(new_results_dic)

In [29]:
# Make sure the most recently added row is labelled 'XGB', then show the table.
last_label = df_results.index[-1]
df_results.rename(index={last_label: 'XGB'}, inplace=True)
df_results.head()

Unnamed: 0,MSE,Mean_bias,Mean_variance
RF,6.241351,6.083633,
SVR,6.615612,6.332203,
XGB,6.658265,5.610973,


In [30]:
# Persist the results table to an Excel workbook in the working directory.
df_results.to_excel(
    excel_writer='FINALRESULTS_BiasVariance_12.xlsx',
    sheet_name='Decomposition',
)

In [31]:
# Display mean_var; at this point it holds the value returned by the most recent
# bias_variance_decomp call (the one on pipeline_XGB).
mean_var

1.0472912191103787