In [1]:
import time
import pandas as pd
import altair as alt
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error

from xgboost import XGBRegressor

# import other functions
from scripts_misc.imputer import *
from scripts_misc.feature_eng import *
from scripts_misc.drop import *

In [2]:
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
def report_performance(model, X_train, y_train, X_valid, y_valid, mode='mean', floor=False):
    """
    Evaluate train and validation performance on a fitted model.
    
    Parameters
    ---------     
    model: sklearn.ensemble._gb.GradientBoostingRegressor
        scikit-learn model
    X_train: pandas.core.frame.DataFrame
        X of training set
    y_train: pandas.core.series.Series
        y of training set
    X_valid: pandas.core.frame.DataFrame        
        X of validation set
    y_valid: pandas.core.series.Series
        y of validation set     
    mode: string
        'mean' or 'median'
    floor : boolean
        if true, all the negative values are turned into 0s
    
    Returns
    -------
    errors: list
        
    """
    if mode == 'mean':
        if floor:
            errors = [np.sqrt(mean_squared_error(y_train, list(map(lambda x: 0 if x<0 else x, model.predict(X_train))))), 
                  np.sqrt(mean_squared_error(y_valid, list(map(lambda x: 0 if x<0 else x, model.predict(X_valid)))))]
            metric = 'root mean squared'
        else:
            errors = [np.sqrt(mean_squared_error(y_train, model.predict(X_train))), 
                      np.sqrt(mean_squared_error(y_valid, model.predict(X_valid)))]
            metric = 'root mean squared'
            
        print('Training RMSE:', errors[0])
        print('Validation RMSE:', errors[1])
        
    elif mode == 'median':
        if floor:
            errors = [mean_absolute_error(y_train, list(map(lambda x: 0 if x<0 else x, model.predict(X_train)))), 
                      mean_absolute_error(y_valid, list(map(lambda x: 0 if x<0 else x, model.predict(X_valid))))]
            metric= 'mean absolute'
        else:
            errors = [mean_absolute_error(y_train, model.predict(X_train)), 
                      mean_absolute_error(y_valid, model.predict(X_valid))]
            metric= 'mean absolute'
            
        print('Training MAE:', errors[0])
        print('Validation MAE:', errors[1])
        
    return(errors)

In [4]:
def report_importance(model, n, df):
    """
    Return column names and Gini coefficients of
    n most important features.
    
    Parameters
    ----------
    model: sklearn.ensemble._gb.GradientBoostingRegressor
        gradient boosting regressor
        
    n: int
        number of features
        
    df: pd.DataFrame
        either `X_train` or `X_valid`
    
    Returns
    -------
    pd.DataFrame
    
    """
    # code attribution: https://tinyurl.com/ya52tn2p
    values = model.feature_importances_
    indices = (-values).argsort()[:n]
    
    # get column names of n most important features
    col_names = df.iloc[:, list(indices)].columns.to_list()
    
    # get Gini coefficient of n most important features
    gini_coeff = list(np.sort(values)[-n:][::-1])

    data = {'feature': col_names, 'Gini': gini_coeff}
    
    result = pd.DataFrame(data)
    
    display(result)

In [5]:
def report_search(search):
    """
    Print the best hyperparameter settings and
    search.cv_results_ as a dataframe.
    
    Parameters
    ----------
    search: sklearn.model_selection.RandomizedSearchCV
        
    """
    print(search.best_params_)
    
    results = pd.DataFrame(search.cv_results_)
    
    return results

Load the data:

In [6]:
df = pd.read_csv('../data/train_data.zip')

In [7]:
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1804425,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,244.2,0.157475,0.009783,0.147692
1,1812706,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
2,1812706,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
3,1812706,11,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,24.0,258.95,0.157475,0.009783,0.147692
4,1812706,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,258.95,0.157475,0.009783,0.147692


In [8]:
# drop rows missing target variable
df = drop_missing_unacast(df)

In [9]:
# remove playgrounds where 'external_id' == 'CA00070678'
df = df.query("external_id != 'CA00070678'")

Create `X` and `y`:

In [10]:
X = df.drop('unacast_session_count', axis=1)
y = df.loc[:, 'unacast_session_count']

Split the data into training and validation sets:

In [11]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=0.2,
                                                      random_state=2020)

In [12]:
# number of observations in training set
X_train.shape[0]

39592

In [13]:
# number of observations in validation set
X_valid.shape[0]

9898

Pre-process `X_train` and `X_valid`:

In [14]:
# impute NaN values
result = impute_data(X_train, X_valid)

In [15]:
X_train = result[0]
X_valid = result[1]

In [16]:
# perform feature engineering
X_train = comb_cols(X_train)
X_valid = comb_cols(X_valid)

In [17]:
# perform feature selection
X_train = drop_columns(X_train)
X_valid = drop_columns(X_valid)

In [18]:
# check the number of categorical columns to OHE
X_train.dtypes.value_counts()

int64      422
float64    201
object       3
dtype: int64

In [19]:
# perform OHE (climate, density_class, income_class)
X_train_valid = clean_categorical(X_train, X_valid)
X_train = X_train_valid[0]
X_valid = X_train_valid[1]

In [20]:
# check if there are any missing values in X_train, y_train
print(X_train.isna().sum().sum())
print(y_train.isna().sum())

0
0


In [21]:
# check if there are any missing values in X_valid, y_valid
print(X_valid.isna().sum().sum())
print(y_valid.isna().sum())

0
0


In [22]:
X_train.shape[1]

632

## Models to predict mean session count

### Gradient boosting regression

Fit a model with default parameters:

In [38]:
gbr = GradientBoostingRegressor(n_estimators=200, 
                                random_state=2020, verbose=1) 
gbr.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1       66167.6312            7.64m
         2       60407.0510            7.63m
         3       55700.6114            7.59m
         4       51808.1136            7.56m
         5       48584.9494            7.51m
         6       45809.8496            7.46m
         7       43589.0397            7.42m
         8       41635.1151            7.39m
         9       40082.9219            7.35m
        10       38686.2115            7.30m
        20       30737.9380            6.90m
        30       27290.4657            6.57m
        40       25126.6220            6.22m
        50       23686.2311            5.84m
        60       22423.3906            5.46m
        70       21605.3804            5.07m
        80       20649.3892            4.68m
        90       19862.4662            4.29m
       100       19269.5782            3.91m
       200       15089.5869            0.00s


GradientBoostingRegressor(n_estimators=200, random_state=2020, verbose=1)

In [39]:
# calculate RMSE
report_performance(gbr, X_train, y_train, X_valid, y_valid)

Training RMSE: 122.83967985225044
Validation RMSE: 138.86903174128227


In [40]:
# report Gini coefficients
report_importance(gbr, 20, X_train)

Unnamed: 0,feature,Gini
0,houses_per_sq_km,0.490098
1,walk_score,0.036173
2,year,0.029934
3,B08301e10,0.026956
4,B17020e6,0.02265
5,B25012e3,0.018782
6,month,0.016195
7,historic_foggy,0.01469
8,B25012e17,0.01458
9,Adult_obesity,0.010792


### Examine the trade-off between `n_estimators` and `learning_rate`

In [None]:
# code attribution: Tom Beuzen, DSCI 571 

#def get_trade_off(n_min, n_max, rate,
#                  X_train, y_train, X_valid, y_valid):
#    """
#    Return the trade-off between error and
#    number of boosting stages (`n_estimators`)
#    for a given learning rate.
#    
#    Parameters
#    ----------
#    n_min: int
#        minimum `n_estimators` to fit
#        
#    n_max: int
#        maximum `n_estimators` to fit
#        
#    rate: float
#        `learning_rate`
#    
#    Returns
#    -------
#    altair.Chart
#    
#    """
#    n_dict = {'n': [], 'train_error': [], 'valid_error': []}    
#    
#    for n in np.arange(low, high, 100):
#        model = GradientBoostingRegressor(random_state=2020).fit(X_train, y_train)
#        
#        n_dict['n'].append(n)
#        n_dict['train_error'] = mean_squared_error(y_train, 
#                                                   model.predict(X_train)) ** 0.5
#        n_dict['train_error'] = mean_squared_error(y_valid, 
#                                                   model.predict(X_valid)) ** 0.5
#        
#
#    n_df = pd.DataFrame(n_dict)
#    n_df = n_df.melt(id_vars='n', value_name='error', var_name='data')
#    
#    return n_df

`learning_rate=0.1`

In [26]:
param_grid_02 = {'n_estimators': [200, 250, 300, 350, 400]}

gbr_opt_02 = GradientBoostingRegressor(learning_rate=0.1, random_state=2020)

gbr_rs_02 = GridSearchCV(gbr_opt_02, param_grid_02,
                         scoring='neg_mean_squared_error',
                         n_jobs=-1,
                         verbose=10)

gbr_rs_02.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done  13 out of  25 | elapsed: 16.8min remaining: 15.5min
[Parallel(n_jobs=-1)]: Done  16 out of  25 | elapsed: 20.1min remaining: 11.3min
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed: 42.3min remaining: 13.4min
[Parallel(n_jobs=-1)]: Done  22 out of  25 | elapsed: 50.2min remaining:  6.9min
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed: 59.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed: 59.4min finished


GridSearchCV(estimator=GradientBoostingRegressor(random_state=2020), n_jobs=-1,
             param_grid={'n_estimators': [200, 250, 300, 350, 400]},
             scoring='neg_mean_squared_error', verbose=10)

In [28]:
# print grid search results
report_search(gbr_rs_02)

{'n_estimators': 400}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,402.28921,0.464514,0.089504,0.004087,200,{'n_estimators': 200},-16267.747965,-19590.920223,-21590.891386,-18352.661151,-17574.869391,-18675.418023,1813.856863,5
1,502.170098,0.799819,0.096067,0.003812,250,{'n_estimators': 250},-15380.302511,-18843.028962,-20494.321051,-17599.419738,-16762.655625,-17815.945577,1750.271737,4
2,602.19942,1.634605,0.101443,0.003574,300,{'n_estimators': 300},-14660.62419,-18273.149878,-19620.569371,-16962.589277,-15964.801724,-17096.346888,1731.334459,3
3,1267.592485,304.961435,0.343842,0.294081,350,{'n_estimators': 350},-14185.270116,-17611.841068,-18916.275385,-16645.047523,-15451.203463,-16561.927511,1645.259226,2
4,1796.687439,215.268244,0.249638,0.140675,400,{'n_estimators': 400},-13776.051415,-17219.292885,-18218.777696,-16285.75094,-15194.392387,-16138.853065,1548.538232,1


In [29]:
# calculate MSE of best estimator
report_performance(gbr_rs_02, X_train, y_train, X_valid, y_valid)

Training RMSE: 104.8846857026784
Validation RMSE: 126.41784908158999


Random search for hyperparameter optimization (default learning rate):

In [26]:
param_grid = {'n_estimators': [100, 200, 300, 400],
              'max_depth': [3, 4, 5, 6, 7],
              'max_features': ['auto', 'sqrt'],
              'subsample': [0.8, 0.9, 1]}

In [27]:
gbr_opt = GradientBoostingRegressor(random_state=2020)

gbr_rs = RandomizedSearchCV(gbr_opt, param_grid,
                            scoring='neg_mean_squared_error',
                            n_jobs=-1,
                            verbose=10,
                            random_state=2020)

gbr_rs.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   33.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed: 16.6min remaining:  3.7min
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed: 21.2min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 33.3min finished


In [31]:
# print random search results
report_search(gbr_rs)

{'subsample': 0.8, 'n_estimators': 300, 'max_features': 'auto', 'max_depth': 5}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_n_estimators,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,31.874775,0.148698,0.436807,0.120289,0.9,200,sqrt,6,"{'subsample': 0.9, 'n_estimators': 200, 'max_f...",-11519.622719,-14381.512884,-16201.996711,-15607.248755,-13208.588387,-14183.793891,1684.475763,5
1,15.984039,0.112916,0.225727,0.069267,0.9,100,sqrt,6,"{'subsample': 0.9, 'n_estimators': 100, 'max_f...",-13484.816234,-16724.109929,-19305.339339,-17560.837031,-15309.049146,-16476.830336,1977.500553,8
2,21.097317,0.040241,0.161587,0.029823,0.9,200,sqrt,4,"{'subsample': 0.9, 'n_estimators': 200, 'max_f...",-15425.816254,-18539.827897,-21030.849197,-18316.09076,-17264.080447,-18115.332911,1826.409895,9
3,678.393513,1.923945,0.15772,0.010629,1.0,200,auto,5,"{'subsample': 1.0, 'n_estimators': 200, 'max_f...",-11482.781493,-14390.22011,-15442.319743,-14563.907862,-12656.176124,-13707.081067,1432.745209,3
4,38.163724,0.100336,0.244896,0.041657,0.8,400,sqrt,4,"{'subsample': 0.8, 'n_estimators': 400, 'max_f...",-12648.319407,-15862.434336,-18537.657002,-16666.88626,-14707.010224,-15684.461446,1965.254544,6
5,1182.70172,90.381651,0.414526,0.422503,0.8,300,auto,5,"{'subsample': 0.8, 'n_estimators': 300, 'max_f...",-10960.813127,-13235.416982,-14789.71521,-13837.296953,-11258.765816,-12816.401618,1481.97204,1
6,33.504508,0.163685,0.186794,0.005493,1.0,300,sqrt,4,"{'subsample': 1.0, 'n_estimators': 300, 'max_f...",-13066.041617,-16020.699045,-19533.528967,-16376.371532,-15271.67294,-16053.66282,2085.765305,7
7,16.864336,0.067604,0.118602,0.003699,1.0,200,sqrt,3,"{'subsample': 1.0, 'n_estimators': 200, 'max_f...",-18603.712413,-21434.009975,-25557.419112,-20969.735238,-20047.722766,-21322.519901,2327.585886,10
8,96.026111,37.623874,0.548352,0.417269,0.9,300,sqrt,7,"{'subsample': 0.9, 'n_estimators': 300, 'max_f...",-10281.382629,-13541.711304,-14959.318144,-14089.465474,-11780.451449,-12930.4658,1683.645913,2
9,143.750401,37.113718,0.604736,0.327674,1.0,200,sqrt,6,"{'subsample': 1.0, 'n_estimators': 200, 'max_f...",-11026.745754,-14024.157128,-16657.34819,-14786.642699,-12943.059293,-13887.590613,1874.837317,4


In [36]:
# calculate MSE of best estimator
report_performance(gbr_rs, X_train, y_train, X_valid, y_valid)

Training RMSE: 64.63476315627366
Validation RMSE: 113.63002689115837


Perform another iteration of random search:

In [27]:
param_grid_03 = {'learning_rate': [0.08, 0.1, 0.12, 0.14],
                 'max_depth': [6, 7],
                 'max_features': [0.75, 1],
                 'subsample': [0.7, 0.8, 0.9]}

In [28]:
gbr_opt_03 = GradientBoostingRegressor(n_estimators=300, random_state=2020)

gbr_rs_03 = RandomizedSearchCV(gbr_opt_03, param_grid_03, scoring='neg_mean_squared_error',
                               n_jobs=-1, verbose=10, random_state=2020)

gbr_rs_03.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 24.5min
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed: 26.9min remaining:  5.9min
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed: 37.9min remaining:  2.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 47.3min finished


RandomizedSearchCV(estimator=GradientBoostingRegressor(n_estimators=300,
                                                       random_state=2020),
                   n_jobs=-1,
                   param_distributions={'learning_rate': [0.08, 0.1, 0.12,
                                                          0.14],
                                        'max_depth': [6, 7],
                                        'max_features': [0.75, 1],
                                        'subsample': [0.7, 0.8, 0.9]},
                   random_state=2020, scoring='neg_mean_squared_error',
                   verbose=10)

In [29]:
# report random search results
report_search(gbr_rs_03).sort_values(by='rank_test_score')

{'subsample': 0.9, 'max_features': 0.75, 'max_depth': 6, 'learning_rate': 0.14}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_max_features,param_max_depth,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,824.459262,1.711668,0.150739,0.012724,0.9,0.75,6,0.14,"{'subsample': 0.9, 'max_features': 0.75, 'max_...",-10367.780396,-12805.703943,-13244.158967,-12719.334172,-11061.161585,-12039.627813,1118.234199,1
6,674.500599,1.134846,0.150286,0.009423,0.7,0.75,6,0.08,"{'subsample': 0.7, 'max_features': 0.75, 'max_...",-10285.406149,-12533.19445,-13559.447403,-13915.257767,-10424.28073,-12143.5173,1529.975297,2
8,746.707701,1.388386,0.14157,0.003881,0.8,0.75,6,0.1,"{'subsample': 0.8, 'max_features': 0.75, 'max_...",-10532.389434,-12518.273587,-13972.740221,-13555.025245,-10573.396443,-12230.364986,1449.286262,3
2,783.70779,1.33283,0.167183,0.003184,0.7,0.75,7,0.12,"{'subsample': 0.7, 'max_features': 0.75, 'max_...",-10169.745608,-12956.262313,-13994.13483,-13282.163688,-10758.329821,-12232.127252,1493.79488,4
9,667.113126,6.062879,0.136318,0.00415,0.7,0.75,6,0.1,"{'subsample': 0.7, 'max_features': 0.75, 'max_...",-10361.560596,-12968.676559,-14206.339764,-14606.178567,-10564.518505,-12541.454798,1782.002271,5
3,4.339619,0.044223,0.179337,0.003519,0.9,1.0,7,0.12,"{'subsample': 0.9, 'max_features': 1, 'max_dep...",-14192.584891,-18522.141906,-22083.176537,-18288.647069,-17547.975643,-18126.905209,2515.889151,6
1,4.303509,0.072177,0.224907,0.033982,0.8,1.0,7,0.1,"{'subsample': 0.8, 'max_features': 1, 'max_dep...",-14351.395599,-17438.768574,-21852.922294,-19230.409955,-18123.030126,-18199.30531,2441.423234,7
5,4.354843,0.015972,0.181948,0.005664,0.9,1.0,7,0.1,"{'subsample': 0.9, 'max_features': 1, 'max_dep...",-14409.483052,-18276.326517,-22908.545772,-19968.759265,-17151.405825,-18542.904086,2834.938828,8
0,3.883218,0.021781,0.275706,0.003828,0.9,1.0,6,0.1,"{'subsample': 0.9, 'max_features': 1, 'max_dep...",-15406.023937,-19204.070545,-23829.044245,-19898.864794,-17787.890631,-19225.178831,2767.728909,9
7,3.691487,0.033392,0.152333,0.003181,0.8,1.0,6,0.1,"{'subsample': 0.8, 'max_features': 1, 'max_dep...",-15412.058959,-19034.074046,-23620.314401,-20005.757078,-18848.899277,-19384.220752,2627.471709,10


In [31]:
# calculate MSE of best estimator
report_performance(gbr_rs_03, X_train, y_train, X_valid, y_valid)

Training RMSE: 40.57947542558486
Validation RMSE: 105.50280880660851


In [58]:
# plot residual for training set
residual = y_train - gbr_rs_03.predict(X_train)

residual_dict = {'y_train': y_train.to_list(), 'residual': residual.to_list()}
residual_df = pd.DataFrame(residual_dict)

alt.Chart(residual_df).mark_circle().encode(
    alt.X('y_train:Q'), 
    alt.Y('residual:Q')
)

In [59]:
# plot residual for validation set
residual_v = y_valid - gbr_rs_03.predict(X_valid)

residual_v_dict = {'y_valid': y_valid.to_list(), 'residual': residual_v.to_list()}
residual_v_df = pd.DataFrame(residual_v_dict)

alt.Chart(residual_v_df).mark_circle().encode(
    alt.X('y_valid:Q'), 
    alt.Y('residual:Q')
)

### XGBoost

Fit a model with default parameters:

In [48]:
xgbr = XGBRegressor(n_estimators=200, verbosity=1, random_state=2020)
xgbr.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=0, num_parallel_tree=1, random_state=2020,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=1)

In [49]:
# calculate MSE
report_performance(xgbr, X_train, y_train, X_valid, y_valid)

Training RMSE: 38.80612133676185
Validation RMSE: 105.35892966539521


In [50]:
# print feature importances
report_importance(xgbr, 20, X_train)

Unnamed: 0,feature,Gini
0,houses_per_sq_km,0.137375
1,B17020e6,0.09699
2,single_no_kids,0.043833
3,B20004e17,0.035555
4,B08301e6,0.023577
5,B25012e7,0.017746
6,B20004e14,0.015624
7,B19101e8,0.014225
8,men_without_health_insurance,0.013836
9,B17020e4,0.010832


Random search for hyperparameter optimization:

In [43]:
xgbr_params = {'n_estimators': [200, 300, 400, 500],
               'max_depth': [3, 4, 5, 6, 7],
               'colsample_bytree': [0.6, 0.8, 1],
               'subsample': [0.8, 0.9, 1]}

In [44]:
xgbr_opt = XGBRegressor(verbosity=1, random_state=2020)

xgbr_rs = RandomizedSearchCV(xgbr_opt, xgbr_params,
                            scoring='neg_mean_squared_error',
                            n_jobs=-1,
                            verbose=10,
                            random_state=2020)

xgbr_rs.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 20.5min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 35.7min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 48.6min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 85.3min
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed: 100.2min remaining: 22.0min
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed: 108.5min remaining:  6.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 110.8min finished


RandomizedSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n_jobs...
                                          num_parallel_tree=None,
                                          random_state=2020, reg_alpha=None,
                                          reg_lambda=None,
        

In [45]:
# print random search results
report_search(xgbr_rs)

{'subsample': 0.8, 'n_estimators': 400, 'max_depth': 7, 'colsample_bytree': 1}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_n_estimators,param_max_depth,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1227.118021,5.117772,2.505538,0.708212,0.8,400,7,1.0,"{'subsample': 0.8, 'n_estimators': 400, 'max_d...",-10240.555552,-13510.320139,-11821.716491,-13746.286233,-9887.56896,-11841.289475,1599.651965,1
1,467.805288,250.587785,0.916923,0.739449,0.8,400,4,0.8,"{'subsample': 0.8, 'n_estimators': 400, 'max_d...",-10967.289922,-14128.301581,-13854.697046,-15094.466039,-12270.864474,-13263.123812,1463.402724,10
2,728.20781,135.784257,0.964923,0.750522,0.8,300,5,0.6,"{'subsample': 0.8, 'n_estimators': 300, 'max_d...",-10395.868815,-13098.153984,-13371.828139,-13670.047975,-11911.035097,-12489.386802,1205.036867,8
3,912.713303,6.472891,1.735126,0.693951,0.9,300,6,0.6,"{'subsample': 0.9, 'n_estimators': 300, 'max_d...",-10200.669356,-12635.541976,-12372.017941,-14575.391271,-11447.150284,-12246.154166,1443.275813,5
4,725.863013,46.206923,1.797075,0.054984,0.9,200,7,0.6,"{'subsample': 0.9, 'n_estimators': 200, 'max_d...",-10125.952453,-12657.148827,-12788.464963,-13617.445314,-10980.998173,-12034.001946,1281.74016,3
5,1376.226234,17.739908,1.218453,0.742313,0.8,400,4,1.0,"{'subsample': 0.8, 'n_estimators': 400, 'max_d...",-10694.747075,-13681.770019,-13931.730581,-15491.255432,-11476.822176,-13055.265057,1741.164924,9
6,1594.381782,36.173284,0.762905,0.657102,0.9,500,4,1.0,"{'subsample': 0.9, 'n_estimators': 500, 'max_d...",-10509.20647,-12932.808112,-13240.101367,-13895.330078,-11032.196614,-12321.928528,1314.623698,7
7,1525.503676,20.723955,1.290756,0.941781,1.0,400,6,0.8,"{'subsample': 1, 'n_estimators': 400, 'max_dep...",-10770.782195,-11820.095376,-12750.414035,-13305.904893,-10796.313689,-11888.702038,1019.683819,2
8,1155.875581,46.855016,0.649334,0.433409,1.0,300,5,1.0,"{'subsample': 1, 'n_estimators': 300, 'max_dep...",-9560.402806,-12703.910565,-12899.85783,-14176.721936,-11155.954257,-12099.369479,1591.102059,4
9,634.55085,150.037998,0.72161,0.47361,1.0,200,5,1.0,"{'subsample': 1, 'n_estimators': 200, 'max_dep...",-9855.622069,-12810.892706,-13088.281837,-14370.679884,-11397.330982,-12304.561496,1546.596624,6


In [46]:
# calculate MSE of best estimator
report_performance(xgbr_rs, X_train, y_train, X_valid, y_valid)

Training RMSE: 16.70276960130824
Validation RMSE: 100.91885748480932


Perform another iteration of random search:

In [46]:
xgbr_params_02 = {'learning_rate': [0.08, 0.09, 0.1],
                 'max_depth': [3, 4, 5, 6, 7],
                 'colsample_bytree': [0.25, 0.5, 0.75, 1],
                 'subsample': [0.7, 0.8, 0.9]}

In [48]:
xgbr_opt_02 = XGBRegressor(n_estimators=400, n_jobs=-1, random_state=2020)

xgbr_rs_02 = RandomizedSearchCV(xgbr_opt_02, xgbr_params_02,
                               scoring='neg_mean_squared_error',
                               n_jobs=-1,
                               verbose=10,
                               random_state=2020)

xgbr_rs_02.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 11.6min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 35.3min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 44.9min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 55.9min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 95.9min
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed: 123.6min remaining: 27.1min
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed: 141.2min remaining:  9.0min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 147.0min finished


RandomizedSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=400, n_jobs...
                                          random_state=2020, reg_alpha=None,
                                          reg_lambda=None,
                                          scale_pos_weight=None, subsample

In [54]:
# report random search results
report_search(xgbr_rs_02).sort_values('rank_test_score')

{'subsample': 0.9, 'max_depth': 7, 'learning_rate': 0.08, 'colsample_bytree': 1}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_max_depth,param_learning_rate,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,2267.45333,76.19171,1.824472,0.933691,0.9,7,0.08,1.0,"{'subsample': 0.9, 'max_depth': 7, 'learning_r...",-8592.771707,-11748.519206,-11109.089066,-12061.796917,-8931.740834,-10488.783546,1446.743021,1
7,1746.083209,26.462268,1.919594,1.16988,0.9,7,0.08,0.75,"{'subsample': 0.9, 'max_depth': 7, 'learning_r...",-8731.429786,-11704.311485,-10881.864095,-11878.7111,-9684.001638,-10576.063621,1205.138681,2
9,1460.56108,397.611753,0.88098,0.798031,0.9,6,0.08,1.0,"{'subsample': 0.9, 'max_depth': 6, 'learning_r...",-9048.849056,-11728.402932,-11604.922191,-12508.046565,-9846.381665,-10947.320482,1287.995011,3
0,2112.063297,3.744002,2.508655,0.028454,0.7,6,0.1,1.0,"{'subsample': 0.7, 'max_depth': 6, 'learning_r...",-9085.35871,-11823.441901,-11966.828094,-12249.499343,-9679.128419,-10960.851293,1309.729104,4
2,674.143803,39.929782,1.073186,0.972021,0.7,7,0.09,0.25,"{'subsample': 0.7, 'max_depth': 7, 'learning_r...",-8967.556042,-12265.963902,-12307.324139,-12628.146312,-10512.972389,-11336.392557,1397.486744,5
3,578.946128,8.800075,2.159975,0.863934,0.8,6,0.1,0.25,"{'subsample': 0.8, 'max_depth': 6, 'learning_r...",-8759.60179,-12549.850609,-12676.167246,-13283.287416,-11310.432085,-11715.867829,1611.111011,6
6,1721.112484,55.367787,1.506475,0.859215,0.8,5,0.08,1.0,"{'subsample': 0.8, 'max_depth': 5, 'learning_r...",-10178.833194,-12560.240782,-13784.910626,-13476.998036,-10958.099602,-12191.816448,1407.09176,7
1,737.947367,60.056809,1.834681,0.0392,0.7,4,0.1,0.5,"{'subsample': 0.7, 'max_depth': 4, 'learning_r...",-11530.300974,-14778.267581,-14933.531355,-14955.323609,-12851.716968,-13809.828098,1387.585302,8
5,1400.590852,38.12151,1.790216,0.024566,0.7,4,0.08,1.0,"{'subsample': 0.7, 'max_depth': 4, 'learning_r...",-11747.249557,-14660.307902,-15400.259629,-15219.794097,-12797.58779,-13965.039795,1442.165025,9
4,694.589968,43.509481,0.983616,0.664946,0.8,4,0.08,0.5,"{'subsample': 0.8, 'max_depth': 4, 'learning_r...",-11663.987387,-14918.510554,-16312.66958,-15060.040332,-13296.464175,-14250.334405,1609.619131,10


In [55]:
# report MSE of best estimator
report_performance(xgbr_rs_02, X_train, y_train, X_valid, y_valid)

Training RMSE: 37.282374286388006
Validation RMSE: 101.39192502620274


Perform another iteration of random search (fewer trees to speed up computation):

In [22]:
# decreased `n_estimator` (will be underfit); explore higher values of `learning_rate` to compensate
xgbr_params_03 = {'learning_rate': [0.08, 0.1, 0.12, 0.14],
                  'max_depth': [5, 6, 7],
                  'colsample_bytree': [0.4, 0.6, 0.8, 1],
                  'subsample': [0.25, 0.5, 0.75, 1]}

In [23]:
xgbr_opt_03 = XGBRegressor(n_estimators=300, n_jobs=-1, random_state=2020)

xgbr_rs_03 = RandomizedSearchCV(xgbr_opt_03, xgbr_params_03,
                                scoring='neg_mean_squared_error',
                                n_jobs=-1,
                                verbose=10,
                                random_state=2020)

xgbr_rs_03.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 18.4min
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed: 23.2min remaining:  5.1min
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed: 25.5min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 27.9min finished


RandomizedSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=300, n_jobs...
                                          random_state=2020, reg_alpha=None,
                                          reg_lambda=None,
                                          scale_pos_weight=None, subsample

In [26]:
# report random search results
report_search(xgbr_rs_03).sort_values(by='rank_test_score')

{'subsample': 1, 'max_depth': 7, 'learning_rate': 0.08, 'colsample_bytree': 0.8}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_max_depth,param_learning_rate,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,325.810278,0.901136,0.407213,0.008684,1.0,7,0.08,0.8,"{'subsample': 1, 'max_depth': 7, 'learning_rat...",-8968.479366,-11817.722798,-11533.404084,-12799.178029,-9384.585953,-10900.674046,1474.960506,1
7,303.682632,0.523493,0.364723,0.003966,0.75,6,0.14,0.8,"{'subsample': 0.75, 'max_depth': 6, 'learning_...",-10218.852415,-12490.064913,-12137.289995,-12295.598138,-9747.883824,-11377.937857,1153.78489,2
8,294.264907,2.640237,0.39918,0.006451,0.5,7,0.14,0.6,"{'subsample': 0.5, 'max_depth': 7, 'learning_r...",-9515.874865,-12735.688402,-13169.182783,-13665.013019,-10737.319891,-11964.615792,1577.306102,3
5,313.874704,1.907739,0.322192,0.005322,0.75,5,0.12,1.0,"{'subsample': 0.75, 'max_depth': 5, 'learning_...",-10730.614603,-13039.74334,-13324.631275,-13146.136202,-10569.238847,-12162.072853,1239.065868,4
0,160.977379,0.214598,0.370292,0.006466,0.75,6,0.14,0.4,"{'subsample': 0.75, 'max_depth': 6, 'learning_...",-9769.180439,-12767.74206,-13127.676247,-13841.288562,-11378.237318,-12176.824925,1446.252719,5
2,274.410404,0.481538,0.316382,0.005627,0.5,5,0.1,0.8,"{'subsample': 0.5, 'max_depth': 5, 'learning_r...",-11524.718263,-14739.891585,-15486.359877,-15529.111278,-12381.466023,-13932.309405,1662.446894,6
3,138.179036,0.3132,0.342913,0.007962,0.25,6,0.14,0.4,"{'subsample': 0.25, 'max_depth': 6, 'learning_...",-12317.504535,-14518.151874,-15073.760296,-15800.493568,-13065.271888,-14155.036432,1284.332183,7
9,138.832085,2.007636,0.33815,0.008503,0.25,6,0.12,0.4,"{'subsample': 0.25, 'max_depth': 6, 'learning_...",-12357.845847,-15615.717535,-15621.87467,-15507.49098,-13734.12174,-14567.410154,1316.900888,8
6,217.672314,0.541771,0.301185,0.010584,0.25,5,0.12,0.8,"{'subsample': 0.25, 'max_depth': 5, 'learning_...",-12888.569368,-16761.468121,-16685.321447,-17486.358821,-14293.235118,-15622.990575,1740.237233,9
1,268.994025,0.191114,0.304733,0.004748,0.25,5,0.08,1.0,"{'subsample': 0.25, 'max_depth': 5, 'learning_...",-13351.209142,-16671.032757,-17568.643071,-16755.805822,-14976.320825,-15864.602323,1513.573116,10


In [28]:
# report MSE of best estimator
report_performance(xgbr_rs_03, X_train, y_train, X_valid, y_valid)

Training RMSE: 43.90104519469594
Validation RMSE: 103.64245450655704


## Models to predict median session count

### Gradient boosting regression

Fit a model with default parameters:

In [100]:
gbr_median = GradientBoostingRegressor(loss='quantile', 
                                       n_estimators=200, 
                                       random_state=2020, 
                                       alpha=0.5, 
                                       verbose=1)

gbr_median.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1          49.6413           12.16m
         2          48.8643           10.66m
         3          47.9232            9.96m
         4          47.0868           11.01m
         5          46.3201           11.67m
         6          45.6908           11.18m
         7          45.1657           10.73m
         8          44.7744           10.42m
         9          44.2259           10.33m
        10          44.0035           10.67m
        20          41.1483            9.27m
        30          39.4312            8.38m
        40          37.4947            7.68m
        50          36.2444            7.06m
        60          35.5483            6.49m
        70          34.9688            5.97m
        80          34.5666            5.56m
        90          34.1763            5.08m
       100          33.8831            4.67m
       200          31.3121            0.00s


GradientBoostingRegressor(alpha=0.5, loss='quantile', n_estimators=200,
                          random_state=2020, verbose=1)

In [102]:
# calculate MSE
report_performance(gbr_median, X_train, y_train, X_valid, y_valid, 'median')

Training mean absolute error: 62.62429042016808
Validation mean absolute error: 64.65782781787502


In [103]:
# print feature importances
report_importance(gbr_median, 20, X_train)

Unnamed: 0,feature,Gini
0,walk_score,0.161907
1,month,0.082855
2,houses_per_sq_km,0.075265
3,distance_to_M,0.063977
4,latitude,0.053011
5,year,0.049015
6,distance_to_S,0.014898
7,intersection_count,0.010238
8,distance_to_I,0.008728
9,Number_of_holidays,0.007777


Random search for hyperparameter optimization:

In [27]:
# reuse grid from previous search
param_grid

{'n_estimators': [100, 200, 300, 300, 400],
 'max_depth': [3, 4, 5, 6, 7],
 'max_features': ['auto', 'sqrt'],
 'subsample': [0.8, 0.9, 1]}

In [28]:
gbr_opt_med = GradientBoostingRegressor(loss='quantile', 
                                        random_state=2020, 
                                        alpha=0.5, 
                                        verbose=1)

gbr_rs_med = RandomizedSearchCV(gbr_opt_med, param_grid,
                                scoring='neg_mean_absolute_error',
                                n_jobs=-1,
                                verbose=10,
                                random_state=2020)

gbr_rs_med.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 25.3min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 26.6min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 27.2min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 28.4min
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed: 33.1min remaining:  7.3min
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed: 39.1min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 39.2min finished


      Iter       Train Loss   Remaining Time 
         1          48.9464           30.27m
         2          47.5345           31.21m
         3          46.1944           30.88m
         4          45.0688           30.93m
         5          44.0706           30.99m
         6          43.0525           31.09m
         7          42.1842           31.03m
         8          41.3288           30.90m
         9          40.2647           30.88m
        10          39.5916           31.04m
        20          34.4058           32.15m
        30          31.3095           32.00m
        40          29.4221           31.06m
        50          27.9485           30.00m
        60          27.0079           28.98m
        70          26.3622           28.01m
        80          25.6475           26.99m
        90          24.6728           25.98m
       100          24.0673           25.11m
       200          20.7117           16.50m
       300          19.7225            8.23m
       40

RandomizedSearchCV(estimator=GradientBoostingRegressor(alpha=0.5,
                                                       loss='quantile',
                                                       random_state=2020,
                                                       verbose=1),
                   n_jobs=-1,
                   param_distributions={'max_depth': [3, 4, 5, 6, 7],
                                        'max_features': ['auto', 'sqrt'],
                                        'n_estimators': [100, 200, 300, 300,
                                                         400],
                                        'subsample': [0.8, 0.9, 1]},
                   random_state=2020, scoring='neg_mean_absolute_error',
                   verbose=10)

In [30]:
# print random search results
report_search(gbr_rs_med)

{'subsample': 1, 'n_estimators': 400, 'max_features': 'auto', 'max_depth': 6}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_n_estimators,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1607.950257,2.416849,0.197307,0.007025,1.0,400,auto,6,"{'subsample': 1, 'n_estimators': 400, 'max_fea...",-45.405686,-47.344008,-49.193437,-47.497276,-46.51665,-47.191411,1.24652,1
1,602.272887,0.77538,0.110898,0.005468,1.0,300,auto,3,"{'subsample': 1, 'n_estimators': 300, 'max_fea...",-59.910962,-60.824399,-63.392073,-60.685513,-61.224982,-61.207586,1.172426,8
2,311.994019,0.608517,0.09408,0.004857,0.9,100,auto,5,"{'subsample': 0.9, 'n_estimators': 100, 'max_f...",-56.936723,-57.895093,-60.578778,-58.78654,-58.069979,-58.453423,1.215503,7
3,30.529378,0.067044,0.152927,0.004971,0.8,300,sqrt,4,"{'subsample': 0.8, 'n_estimators': 300, 'max_f...",-55.48445,-57.158952,-60.123882,-58.027889,-57.229758,-57.604986,1.507322,6
4,25.516948,0.04702,0.131388,0.007263,0.8,200,sqrt,5,"{'subsample': 0.8, 'n_estimators': 200, 'max_f...",-53.262738,-55.530084,-58.905275,-56.201474,-54.944757,-55.768866,1.846135,5
5,71.137122,0.141936,0.201343,0.001756,1.0,400,sqrt,6,"{'subsample': 1, 'n_estimators': 400, 'max_fea...",-45.987838,-47.57282,-49.685276,-47.645625,-46.888415,-47.555995,1.220404,2
6,45.959406,0.23367,0.168829,0.006684,0.8,300,sqrt,6,"{'subsample': 0.8, 'n_estimators': 300, 'max_f...",-46.701862,-48.657629,-51.256359,-48.857557,-47.899978,-48.674677,1.49606,4
7,671.704876,1.394942,0.128929,0.00475,0.8,200,auto,6,"{'subsample': 0.8, 'n_estimators': 200, 'max_f...",-47.216323,-48.515008,-50.467099,-48.48498,-47.905585,-48.517799,1.083467,3
8,26.815963,0.101058,0.115249,0.004857,1.0,300,sqrt,3,"{'subsample': 1, 'n_estimators': 300, 'max_fea...",-62.435328,-64.297643,-66.668148,-63.965415,-64.381042,-64.349515,1.355831,10
9,247.090773,0.742826,0.080436,0.003505,0.9,100,auto,4,"{'subsample': 0.9, 'n_estimators': 100, 'max_f...",-62.537665,-63.527792,-66.439633,-64.737632,-63.326393,-64.113823,1.359874,9


In [31]:
# calculate MSE of best estimator
report_performance(gbr_rs_med, X_train, y_train, X_valid, y_valid, 'median')

Training MAE: 38.9779679570756
Validation MAE: 46.43257917584963


Run another random search for median regression:

In [None]:
median_grid = {'learning_rate': [0.01, 0.03, 0.05, 0.1],
               'max_depth': [6, 7],
               'max_features': ['auto', 'sqrt'],
               'subsample': [0.7, 0.8, 0.9, 1]}

In [None]:
gbr_opt_median = GradientBoostingRegressor(n_estimators=400, loss='quantile', random_state=2020, 
                                           alpha=0.5, verbose=1)

gbr_rs_median = RandomizedSearchCV(gbr_opt_median, median_grid, scoring='neg_mean_absolute_error', 
                                   n_jobs=-1, verbose=10, random_state=2020)

gbr_rs_median.fit(X_train, y_train)

### Experiment: Model where `month` is encoded as periodic data

In [24]:
X_train_trig = X_train.copy()
X_valid_trig = X_valid.copy()

# Handle cyclical feature
X_train_trig['month_sin'] = np.sin((X_train_trig['month']-1)*(2.*np.pi/12))
X_train_trig['month_cos'] = np.cos((X_train_trig['month']-1)*(2.*np.pi/12))
X_train_trig = X_train_trig.drop(columns=['month'])

X_valid_trig['month_sin'] = np.sin((X_valid_trig['month']-1)*(2.*np.pi/12))
X_valid_trig['month_cos'] = np.cos((X_valid_trig['month']-1)*(2.*np.pi/12))
X_valid_trig = X_valid_trig.drop(columns=['month'])

Fit a model with default parameters:

In [25]:
xgbr_trig = XGBRegressor(n_estimators=200, verbosity=1, random_state=2020)
xgbr_trig.fit(X_train_trig, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=0, num_parallel_tree=1, random_state=2020,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=1)

In [29]:
# calculate MSE
report_performance(xgbr_trig, X_train_trig, y_train, X_valid_trig, y_valid)

Training RMSE: 37.54722470328811
Validation RMSE: 107.59426043403701


In [31]:
# report feature importance
report_importance(xgbr_trig, 10, X_train)

Unnamed: 0,feature,Gini
0,motor_vehicle_theft,0.13069
1,B23008e14,0.071494
2,B01001e26,0.032982
3,B19101e16,0.031874
4,single_6_to_17,0.030182
5,B25012e15,0.023204
6,B09002e20,0.022998
7,B17012e4,0.020056
8,B11016e2,0.019423
9,historic_temp_60_to_70,0.017911


Fit a model with optimal hyperparameters found earlier:

In [30]:
xgbr_opt_trig = XGBRegressor(n_estimators=400, 
                             learning_rate=0.08, 
                             max_depth=7,
                             subsample=0.9, 
                             colsample_bytree=1, 
                             random_state=2020)

xgbr_opt_trig.fit(X_train_trig, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.08, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=400, n_jobs=0, num_parallel_tree=1, random_state=2020,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [32]:
# calculate MSE
report_performance(xgbr_opt_trig, X_train_trig, y_train, X_valid_trig, y_valid)

Training RMSE: 37.15141805462154
Validation RMSE: 102.38114994913344


In [33]:
# report feature importance
report_importance(xgbr_opt_trig, 10, X_train)

Unnamed: 0,feature,Gini
0,motor_vehicle_theft,0.14366
1,B23008e14,0.081734
2,B01001e26,0.041505
3,B09002e20,0.018216
4,B25012e11,0.016537
5,B19101e16,0.014588
6,historic_temp_70_to_80,0.01264
7,B12001e11,0.01242
8,C18108e4,0.010878
9,B25012e15,0.010756


### Experiment: Model trained on data where columns that made the mixed model "fail" have been dropped

In [25]:
# load in list of columns to drop based on Sirine's analysis
to_delete = ['monthly_repeated_sessions', 'B23008e21', 'B23008e23', 'B12001e11', 'B23008e26', 'B11005e14', 'B11005e17',
             'B20004e9', 'B19101e7', 'B19101e4', 'B12001e15', 'B09002e19', 'B23008e27', 'B20004e7', 'B11016e10', 'B17012e31',
             'B19101e8', 'C18108e5', 'B25012e8', 'B25012e15', 'B17020e4', 'B25012e16', 'B25012e10', 'B25012e6', 'B09002e17',
             'B25012e12', 'B25012e11', 'B08301e6', 'B17012e6', 'B19101e6', 'B25012e4', 'B19101e12', 'B17020e5', 'B23008e18',
             'B19101e5', 'B12001e16', 'B08303e12', 'B17020e11', 'B10010e1', 'B19101e11', 'B08303e13', 'B16007e5', 'B19101e16',
             'B11016e12', 'B17020e3', 'B19101e3', 'B25012e14', 'B09002e18', 'B17012e2', 'B19101e2', 'B25012e5', 'B17012e5',
             'B10010e2', 'B11005e4', 'B19101e15', 'B11005e5', 'B19101e17', 'B17012e4', 'B17012e7', 'B20004e8', 'B17020e10',
             'B09018e8', 'B20004e6', 'B17020e6', 'B10002e5', 'B11016e11', 'B10002e3', 'B17012e26', 'B25012e3', 'B11005e10',
             'B08301e5', 'B17012e25', 'B17012e20', 'B08301e4', 'B19101e13', 'B10002e4', 'B23008e13', 'C18108e3', 'B09018e7',
             'B11001e9', 'B10002e1', 'C18108e4', 'B23008e19', 'B10010e3', 'B16007e7', 'B23008e12', 'B15003e23', 'B15003e25',
             'B11005e6', 'B11003e7', 'B27003e1', 'B15003e20', 'B09018e1', 'B11016e3', 'B01001e4', 'B11016e2', 'B15003e21',
             'B15003e22', 'B19083e1', 'B15003e24', 'B23020e1', 'B11016e4', 'B09018e5', 'B19125e2', 'B16007e3', 'B01001e28',
             'B23020e3', 'B23020e2', 'B12001e6', 'B27001e30', 'B01001e29', 'B25103e2', 'B16007e4', 'B12001e9', 'B23008e10',
             'B09018e4', 'B01001e30', 'B11003e5', 'B23008e2', 'B23008e5', 'B09002e16', 'B25012e9', 'B01002e3', 'B20004e5',
             'B25012e2', 'B08303e1', 'B23025e4', 'B23008e4', 'B23025e7', 'B12001e1', 'B15003e17', 'B11005e7', 'B01001e3',
             'B23025e2', 'B09002e8', 'B22003e5', 'B08301e1', 'B14002e1', 'B09002e11', 'B27001e2', 'B25064e1', 'B25001e1',
             'B19058e2', 'B09001e5', 'B09002e10', 'B09002e13', 'B09002e12', 'B19301e1', 'B25119e2', 'B19125e3', 'B13016e4',
             'B11001e5', 'B25119e3', 'B15003e18', 'B01001e27', 'B13016e5', 'B11005e9', 'B20004e2', 'B15003e1', 'B10001e4',
             'B11001e6', 'B13016e7', 'B12001e7', 'B23008e9', 'B13016e6', 'B23008e7', 'B23008e6', 'B09002e14', 'B09001e10',
             'B11005e8', 'B13016e3', 'male_60_69', 'male_80_over', 'female_22_29', 'female_30_39', 'female_40_49',
             'female_50_59', 'female_60_69', 'female_70_79', 'female_80_over', 'total_enrolled', 'not_enrolled',
             'enrolled_nursery_pre_private', 'enrolled_kinder_public', 'enrolled_kinder_private', 'enrolled_grades_1_4_public',
             'enrolled_grades_1_4_private', 'enrolled_grades_5_8_public', 'enrolled_grades_9_12_private',
             'enrolled_undergrad_public', 'enrolled_undergrad_private', 'enrolled_graduate_public', 'less_than_12_no_diploma',
             'travel_15_minutes_less', 'travel_15_29_minutes', 'travel_30_44_minutes', 'single_under_6', 'single_0_to_17',
             'single_6_to_17', 'single_no_kids', 'four_or_more_in_family_household', 'four_or_more_in_nonfamily_household',
             'men_without_health_insurance', 'women_with_health_insurnace', 'women_without_health_insurance',
             'avg_birth_weight', 'avg_age_of_mother', 'longitude', 'latitude', 'alcohol', 'amenity', 'bank', 'bar', 'cafe',
             'camp_site', 'car_repair', 'childcare', 'clothes_store', 'convenience_store', 'fast_food', 'fire_station',
             'fitness_or_sports_centre', 'fuel', 'healthcare', 'hotel', 'museum_or_gallery', 'restaurant', 'shop',
             'supermarket', 'tourism', 'distance_to_U', 'distance_to_nearest_school', 'walk_score', 'bike_score', 'k_avg',
             'streets_per_node_avg', 'edge_length_total', 'edge_length_avg', 'street_length_avg',
             'streets_per_node_counts_2', 'streets_per_node_counts_3', 'streets_per_node_counts_5', 'n_osdw',
             'k_avg_osdw', 'streets_per_node_avg_osdw', 'edge_length_total_osdw', 'streets_per_node_counts_1_osdw',
             'streets_per_node_counts_3_osdw', 'streets_per_node_counts_5_osdw', 'intersection_count_osid',
             'edge_length_total_osid', 'edge_length_avg_osid', 'street_length_total_osid', 'street_segments_count_osid',
             'streets_per_node_counts_1_osid', 'streets_per_node_counts_2_osid', 'streets_per_node_counts_4_osid',
             'streets_per_node_counts_5_osid', 'violent_crime', 'criminal_homicide', 'rape', 'robbery',
             'aggravated_assault', 'property_crime', 'burglary', 'larceny_theft', 'motor_vehicle_theft',
             'houses_per_sq_km', 'historic_number_of_sessions', 'historic_sessions_per_day', 'historic_unique_sessions',
             'historic_unique_sessions_per_day', 'historic_repeat_sessions', 'historic_repeat_sessions_per_day',
             'historic_total_session_length', 'historic_avg_session_length', 'historic_avg_light_activity',
             'historic_avg_moderate_activity', 'historic_avg_vigorous_activity', 'historic_avg_mod_plus_vig',
             'historic_hour_7', 'historic_hour_8', 'historic_hour_9', 'historic_hour_10', 'historic_hour_11',
             'historic_hour_13', 'historic_hour_14', 'historic_hour_15', 'historic_hour_16', 'historic_hour_17',
             'historic_hour_19', 'historic_hour_20', 'historic_rain', 'historic_foggy', 'historic_snow', 'Green_2016',
             'Libertarians_2016', 'Poor_physical_health_days', 'Adult_smoking', 'Adult_obesity', 'weather_clear',
             'weather_rain', 'weather_fog', 'temp_avg_35_below', 'temp_max_35_below', 'temp_max_45_55',
             'state_amount_per_capita', 'historic_slide_count_comb', 'monthly_climb_count_comb', 'monthly_tube_count_comb',
             'historic_tube_count_comb', 'monthly_overhang_count_comb', 'historic_overhang_count_comb',
             'monthly_bridge_count_comb', 'historic_bridge_count_comb', 'monthly_swing_count_comb',
             'historic_swing_count_comb', 'historic_obsta_count_comb', 'historic_crawls_count_comb',
             'monthly_hour_night', 'historic_hour_night', 'avg_wind_calm', 'avg_wind_light_air', 'avg_wind_light_br',
             'avg_wind_gentle_br', 'avg_wind_moderate_br', 'monthly_ws_calm', 'monthly_ws_light_air', 'monthly_ws_light_br',
             'monthly_ws_gentle_br', 'monthly_ws_moderate_br', 'historic_ws_calm', 'historic_ws_light_air',
             'historic_ws_light_br', 'historic_ws_gentle_br', 'historic_ws_moderate_br', 'avg_fertility_rate', 'HI', 'LI',
             'MI', 'HD', 'LD', 'MD']

In [28]:
# drop columns that made the mixed model fail 
X_train_drop = X_train.drop(columns=to_delete)
X_valid_drop = X_valid.drop(columns=to_delete)

In [33]:
# check that the right number of columns has been dropped
print(X_train.shape[1] - X_train_drop.shape[1] == len(to_delete))
print(X_valid.shape[1] - X_valid_drop.shape[1] == len(to_delete))

True
True


Fit an `XGBRegressor` with little tuning:

In [34]:
xgbr_drop = XGBRegressor(n_estimators=200, verbosity=1, random_state=2020)
xgbr_drop.fit(X_train_drop, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=0, num_parallel_tree=1, random_state=2020,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=1)

In [35]:
# calculate MSE
report_performance(xgbr_drop, X_train_drop, y_train, X_valid_drop, y_valid)

Training RMSE: 40.03033544581413
Validation RMSE: 109.87190831097047


In [36]:
# report feature importance
report_importance(xgbr_drop, 10, X_train_drop)

Unnamed: 0,feature,Gini
0,B08301e10,0.138635
1,Republicans_2016,0.078515
2,B09001e8,0.054149
3,B08301e3,0.048823
4,B25012e7,0.047178
5,B25103e3,0.045587
6,B22003e7,0.02083
7,Republican_12_Votes,0.016525
8,B25012e17,0.015337
9,street_length_avg_osid,0.015


Fit an `XGBRegressor` with optimal hyperparameters found earlier:

In [37]:
xgbr_opt_drop = XGBRegressor(n_estimators=400, 
                             learning_rate=0.08, 
                             max_depth=7,
                             subsample=0.9, 
                             colsample_bytree=1, 
                             random_state=2020)

xgbr_opt_drop.fit(X_train_drop, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.08, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=400, n_jobs=0, num_parallel_tree=1, random_state=2020,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [38]:
# calculate MSE
report_performance(xgbr_opt_drop, X_train_drop, y_train, X_valid_drop, y_valid)

Training RMSE: 38.88005741697605
Validation RMSE: 106.06158661543161


In [39]:
# report feature importance
report_importance(xgbr_opt_drop, 10, X_train_drop)

Unnamed: 0,feature,Gini
0,B08301e10,0.170529
1,Republicans_2016,0.05215
2,B08301e3,0.050341
3,B25012e7,0.046219
4,Democrats_2016,0.042383
5,B25103e3,0.0269
6,B22003e7,0.026259
7,B09001e8,0.022609
8,Republican_12_Votes,0.013153
9,street_segments_count,0.012725


### Experiment : set all the predicted values that are negative to 0

In [23]:
xgbf_floor = XGBRegressor(subsample=0.9, max_depth=7, learning_rate=0.08, colsample_bytree=1, n_estimators=400, random_state=2020, scoring='neg_mean_squared_error',)
xgbf_floor.fit(X_train, y_train.to_numpy().ravel())

Parameters: { scoring } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.08, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=400, n_jobs=0, num_parallel_tree=1, random_state=2020,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             scoring='neg_mean_squared_error', subsample=0.9,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [29]:
report_performance(xgbf_floor, X_train, y_train, X_valid, y_valid)

Training root mean squared error: 37.282374286388006
Validation root mean squared error: 101.39192502620274


[37.282374286388006, 101.39192502620274]

In [30]:
report_performance(xgbf_floor, X_train, y_train, X_valid, y_valid, floor=True)

Training root mean squared error: 37.193505707067956
Validation root mean squared error: 101.32664233737974


[37.193505707067956, 101.32664233737974]

## Summary

- Input data
   - `playground_stats_capped.csv`
   - `unacast_session_count` was capped at 4000 because the initial round of modeling revealed that outliers were inflating the RMSE values
   - Updated preprocessing functions
   
   
- Building models to predict mean session count using `GradientBoostingRegressor`
   - `n_estimators=200`:
      - Validation RMSE: 139
   - Set `learning_rate=0.1` and performed a randomized search:
      - `{'subsample': 0.8, 'n_estimators': 300, 'max_features': 'auto', 'max_depth': 5}` gave rise to a validation RMSE of 114
   - Set `n_estimators=300` and performed another randomized search:
      - Fixed number of estimators since it takes too long to perform additional boosting stages (process can't be parallelized)
      - `{'subsample': 0.9, 'max_features': 0.75, 'max_depth': 6, 'learning_rate': 0.14}` gave rise to a validation RMSE of 106
      
      
- Building models to predict mean session count using `XGBRegressor`
   - Default setting:
      - Validation RMSE: 106
   - First randomized search:
      - `{'subsample': 0.8, 'n_estimators': 400, 'max_depth': 7, 'colsample_bytree': 1}` gave rise to a validation RMSE of 101 (seems to overfit)
   - Second randomized search (fix `n_estimators=400`):
      - `{'subsample': 0.9, 'max_depth': 7, 'learning_rate': 0.08, 'colsample_bytree': 1}` gave rise to a validation RMSE of 101 (less overfit than previous model)
   - Third randomized search (fix `n_estimators=300` to speed up computation):
      - `{'subsample': 1, 'max_depth': 7, 'learning_rate': 0.08, 'colsample_bytree': 0.8}` gave rise to a validation RMSE of 104
      

- Building models to predict median session count using `GradientBoostingRegressor`
   - Default settings:
      - Validation MAE: 65
   - Randomized search:
      - `{'subsample': 1, 'n_estimators': 400, 'max_features': 'auto', 'max_depth': 6}` gave rise to a validation MAE of 46
      

- Building models where `month` is encoded as periodic data (using sine and cosine)
   - `XBGRegressor` with default settings:
      - Validation RMSE: 108
   - `XGBRegressor` with combination found in second round of randomized search
      - Validation RMSE: 102
      

**Main takeaways**
- The drop in RMSE values is mostly attributable to the capping of the target variable
- Hit a wall: difficult to get validation RMSE to dip below 100

**Recommendations**
- In this iteration, grid search wasn't performed
   - Maybe the randomized search wasn't looking through the "right" parameter space
   - Could be worth running a thorough grid search on a subset of the training data to "re-center" future randomized searches
- Compare time to fit `GradientBoostingRegressor` versus `XGBRegressor`