<h1>Hyperparameter Exploration</h1>

In [None]:
import pandas as pd
# pd.options.mode.chained_assignment = None  # default='warn'
import os
import re
import glob
import numpy as np
from functools import reduce
import seaborn as sns

from sklearn.preprocessing import scale
from sklearn import linear_model
from sklearn import tree
from sklearn import preprocessing
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics  
from sklearn import ensemble

# For reproducibility
import random
import numpy as np
r_state = 42
random.seed(r_state) 
np.random.seed(r_state)

from timeit import default_timer as timer
import datetime

# Needed on a Mac
import matplotlib as mpl
mpl.use('TkAgg')
%matplotlib inline
import matplotlib.pyplot as plt 

In [None]:
def load_status_scores(dtype):
    """Load the neighbourhood SES score table for one data variant.

    Parameters
    ----------
    dtype : str
        Suffix selecting which score file to read; it is spliced into the
        file name as ``scores<dtype>.csv`` (e.g. ``'Untransformed'``).

    Returns
    -------
    pandas.DataFrame
        The score table indexed by the first CSV column, with rows that
        contain any missing value removed, the two rank columns dropped
        and the SES columns renamed to human-readable labels.

    Raises
    ------
    KeyError
        If the file lacks the ``RANK_10`` / ``RANK_19`` columns.
    """
    # Build the path from the argument rather than from the notebook-level
    # `to_use` variable, so the function works for any variant it is asked for.
    path = '/Users/ritalaplante/Desktop/Thesis Data and Analytics/04-Neighborhood Scores/scores' + dtype + '.csv'
    status = pd.read_csv(path, index_col=0)  # SES scores

    readable_names = {
        'SES_10':'SES 2010',
        'SES_19':'SES 2019',
        'SES_ASC':'SES Ascent 2010-2019',
        'SES_PR_10':'SES 2010 Percentile', # 99 = High-status
        'SES_PR_19':'SES 2019 Percentile', # 99 = High-status
        'SES_PR_ASC':'SES Percentile Ascent 2010-2019'
    }

    # Chain instead of mutating in place: drop incomplete rows, discard the
    # rank columns, then apply the readable column labels.
    return (
        status
        .dropna()
        .drop(['RANK_10','RANK_19'], axis=1)
        .rename(columns=readable_names)
    )

def classifier_report(clf, y_true, y_hat):
    """Build a plain-text summary of regression metrics for a set of predictions.

    Only regression metrics are reported (R2, mean squared error, mean
    absolute error and explained variance); `clf` is accepted but not used.

    Parameters
    ----------
    clf : object
        The fitted estimator (unused by this function).
    y_true : array-like
        Observed target values.
    y_hat : array-like
        Predicted target values.

    Returns
    -------
    str
        One line per metric, followed by a blank line.
    """
    # (line template, scoring function) in the order they are reported
    report_rows = (
        ("R2:        {0:8.5f}", metrics.r2_score),                  # Coefficient of determination
        ("MSE:       {0:8.5f}", metrics.mean_squared_error),        # Mean squared error regression loss
        ("MAE:       {0:8.5f}", metrics.mean_absolute_error),       # Mean absolute error regression loss
        ("Expl. Var: {0:8.5f}", metrics.explained_variance_score),  # Explained variance regression score
    )

    lines = [template.format(score(y_true, y_hat)) for template, score in report_rows]

    # Each metric line ends with a newline, plus one trailing blank line
    return "\n".join(lines) + "\n" + "\n"

<h2>Exploring Hyperparameters</h2>

In [None]:
# Take a parameter grid and explore a hyperparameter space
# using Cross-Fold Validation...
def explore_extr_hyper(params, x_train, y_train, x_eval=None, y_eval=None):
    """Grid-search an ExtraTreesRegressor and report hold-out performance.

    Runs a 4-fold GridSearchCV over `params`, scored by negative mean
    absolute error, prints the best score and parameters, then prints a
    regression report for the best estimator on a hold-out set.

    Parameters
    ----------
    params : dict or list of dict
        Parameter grid passed to GridSearchCV as `param_grid`.
    x_train, y_train
        Training features and target used for the cross-validated search.
    x_eval, y_eval : optional
        Hold-out features and target used for the printed report. When
        omitted, the notebook-level `X_test` / `y_test` are used, which
        keeps existing three-argument calls working unchanged.

    Returns
    -------
    GridSearchCV
        The fitted search object (with train scores retained).
    """
    # Fall back to the notebook-level hold-out split for backward compatibility
    if x_eval is None:
        x_eval = X_test
    if y_eval is None:
        y_eval = y_test

    clf = ensemble.ExtraTreesRegressor(n_jobs=-1, random_state=r_state)
    cv  = model_selection.GridSearchCV(estimator=clf, param_grid=params, cv=4, n_jobs=2,
                                       return_train_score=True, verbose=1, scoring='neg_mean_absolute_error')

    cv.fit(x_train, y_train)

    print("Best score: " + str(cv.best_score_))
    print("Best parameters: " + str(cv.best_params_))

    # With the default refit=True, GridSearchCV has already refitted the best
    # estimator on the whole of x_train / y_train, so no extra fit is needed.
    best_clf = cv.best_estimator_
    y_pred  = best_clf.predict(x_eval)

    print(classifier_report(best_clf, y_eval, y_pred))
    return cv

# Output the results of a Cross-Validation process
# to a data frame. Currently focussed on training and
# testing scores.
def cv_to_df(cvr):
    """Flatten the results of a fitted cross-validation search into a data frame.

    Parameters
    ----------
    cvr : object
        A fitted search object exposing a `cv_results_` mapping with the keys
        'params', 'mean_train_score', 'mean_test_score', 'std_train_score',
        'std_test_score' and 'rank_test_score' (train scores require the
        search to have been run with `return_train_score=True`).

    Returns
    -------
    pandas.DataFrame
        One row per candidate, with the mean/std of the training and test
        scores, the candidate's rank, and one column per hyperparameter.
        A hyperparameter that a candidate does not define is left missing
        in that row.
    """
    results = cvr.cv_results_
    candidates = results['params']

    # Create a data frame from the numbers
    df = pd.DataFrame({
        'Training Score': results['mean_train_score'],
        'Test Score': results['mean_test_score'],
        'Std. of Training Scores': results['std_train_score'],
        'Std. of Test Scores': results['std_test_score'],
    })

    # Add the rank of the result
    df['rank'] = pd.Series(results['rank_test_score'], index=df.index)

    # Collect every hyperparameter name seen in any candidate, in order of
    # first appearance. Looking only at the first candidate would miss (or
    # fail on) parameters when the search was given a list of grids.
    param_names = list(dict.fromkeys(
        name for candidate in candidates for name in candidate
    ))

    # One column per hyperparameter; build each as a Series on the frame's
    # own index to avoid setting-with-copy warnings.
    for name in param_names:
        values = [candidate.get(name) for candidate in candidates]
        df[name] = pd.Series(values, index=df.index)

    return df

In [None]:
# Selects which data variant to load: `to_use` is the suffix spliced into
# every input and output file name in this notebook. Change it here to
# switch variant.
to_use = 'Untransformed'

SES = load_status_scores(to_use)  # SES scores for 2010 and 2019, plus the 2010-2019 ascent

# Read the transformed-and-scaled feature tables for 2010 and 2019,
# indexed by the first CSV column.
# NOTE(review): d19_trs2 is not used by the cells shown below - confirm whether it is still needed.
d10_trs2 = pd.read_csv('/Users/ritalaplante/Desktop/Thesis Data and Analytics/05-Transformed and Scaled Data/TransformedAndScaled2010' + to_use + '.csv', index_col=0)
d19_trs2 = pd.read_csv('/Users/ritalaplante/Desktop/Thesis Data and Analytics/05-Transformed and Scaled Data/TransformedAndScaled2019' + to_use + '.csv', index_col=0)

# # Data about variables used later in process
# vardb = pd.read_csv(os.path.join('data','variables.csv'), index_col=False)
# vardb.drop('Description', axis=1, inplace=True)

In [None]:
# Replace missing feature values with 0 (in place) before the data is split
# and handed to the grid search.
d10_trs2.fillna(0, inplace = True)
# load_status_scores() already drops rows containing NaN, so this is a
# safeguard only and should leave SES unchanged.
SES.fillna(0, inplace = True)

<p>Split the dataset into testing and training where the test set size is 20%</p>

In [None]:
# Hold out 20% of the rows for testing; r_state fixes the shuffle so the
# split is repeatable. The target is the 2010-2019 SES ascent.
# NOTE(review): features and target are paired by row position, not by index
# label - this assumes d10_trs2 and SES hold the same rows in the same
# order; confirm against the upstream notebooks.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    d10_trs2, SES['SES Ascent 2010-2019'], test_size=0.2, random_state=r_state)

<h2>n_estimators</h2>

In [None]:
# Candidate forest sizes: 20, 40, ..., 2000 trees
param_grid = {"n_estimators": list(range(20, 2001, 20))}

# Time the whole search, including the hold-out report it prints
start = timer()
cv1 = explore_extr_hyper(param_grid, X_train, y_train)
duration = timer() - start
print(
    "Execution complete in: {0:15.1f}s".format(duration)
    + " (" + str(datetime.timedelta(seconds=duration)) + ")"
)

# Save the per-candidate scores to a CSV file
cv_to_df(cv1).to_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/06-Hyperparameter Results/' + to_use + '-Scores-n_estimators.csv',
    index=False,
)

<p>Fitting 4 folds for each of 100 candidates, totalling 400 fits <br>
Best score: -0.34136118333074356 <br>
Best parameters: {'n_estimators': 140}<br>
R2:         0.51562<br>
MSE:        0.17629<br>
MAE:        0.32551<br>
Expl. Var:  0.51792<br>


Execution complete in:           951.6s (0:15:51.591884)</p>

<h2>max_depth</h2>

In [None]:
# Candidate tree depths: 10, 20, ..., 160
param_grid = {"max_depth": list(range(10, 161, 10))}

# Time the whole search, including the hold-out report it prints
start = timer()
cv2 = explore_extr_hyper(param_grid, X_train, y_train)
duration = timer() - start
print(
    "Execution complete in: {0:15.1f}s".format(duration)
    + " (" + str(datetime.timedelta(seconds=duration)) + ")"
)

# Save the per-candidate scores to a CSV file
cv_to_df(cv2).to_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/06-Hyperparameter Results/' + to_use + '-Scores-max_depth.csv',
    index=False,
)

<p>Fitting 4 folds for each of 16 candidates, totalling 64 fits<br>
Best score: -0.34289399061295284<br>
Best parameters: {'max_depth': 120}<br>
R2:         0.51194<br>
MSE:        0.17763<br>
MAE:        0.32866<br>
Expl. Var:  0.51453<br>


Execution complete in:            19.5s (0:00:19.544697)</p>

<h2>min_samples_leaf</h2>

In [None]:
# Candidate minimum leaf sizes: 1, 2, ..., 25 samples
param_grid = {"min_samples_leaf": list(range(1, 26))}

# Time the whole search, including the hold-out report it prints
start = timer()
cv3 = explore_extr_hyper(param_grid, X_train, y_train)
duration = timer() - start
print(
    "Execution complete in: {0:15.1f}s".format(duration)
    + " (" + str(datetime.timedelta(seconds=duration)) + ")"
)

# Save results to CSV file
cv_to_df(cv3).to_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/06-Hyperparameter Results/' + to_use + '-Scores-min_samples_leaf.csv',
    index=False,
)

<p>Fitting 4 folds for each of 25 candidates, totalling 100 fits<br>
Best score: -0.34216994251968924<br>
Best parameters: {'min_samples_leaf': 3}<br>
R2:         0.52231<br>
MSE:        0.17385<br>
MAE:        0.32741<br>
Expl. Var:  0.52351<br>


Execution complete in:            14.5s (0:00:14.508016)</p>

<h2>max_features and bootstrap</h2>

In [None]:
param_grid = {
    # Fractions of the feature set: 0.1, 0.2, ..., 1.0. Each value is rounded
    # because np.arange accumulates float error (e.g. 0.30000000000000004),
    # which would otherwise end up in the grid and in the saved results.
    "max_features"  : [round(float(x), 1) for x in np.arange(start=0.1, stop=1.01, step=0.1)], # For regression normally n_features (worth trying after shorter runs)
    "bootstrap"     : [True, False]    # Not normally needed for ExtraTrees, but seems to improve performance?
}

# 'auto' is intentionally not added: it was removed from scikit-learn in 1.3
# (deprecated in 1.1) and, for regressors, meant "all features", which the
# 1.0 fraction above already covers. Runs recorded with 'auto' in the grid
# therefore list two more candidates than this grid produces.
param_grid['max_features'].append('sqrt')

start = timer()
cv4 = explore_extr_hyper(param_grid, X_train, y_train)
duration = timer() - start
print("Execution complete in: {0:15.1f}s".format(duration) + " (" + str(datetime.timedelta(seconds=duration)) + ")")

# Save results to CSV file
cv_to_df(cv4).to_csv('/Users/ritalaplante/Desktop/Thesis Data and Analytics/06-Hyperparameter Results/' + to_use + '-Scores-max_features_and_bootstrap.csv', index=False)

<p>Fitting 4 folds for each of 24 candidates, totalling 96 fits<br>
Best score: -0.34217298840233246<br>
Best parameters: {'bootstrap': False, 'max_features': 0.5}<br>
R2:         0.53018<br>
MSE:        0.17099<br>
MAE:        0.32363<br>
Expl. Var:  0.53210<br>


Execution complete in:            14.6s (0:00:14.599733)</p>