In [1]:
import time
import pandas as pd
import altair as alt
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error

from xgboost import XGBRegressor

# import other functions
from imputer import *
from feature_eng import *
from drop import *
from preprocessing import *

In [2]:
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
def report_performance(model, X_train, y_train, X_valid, y_valid, 
                       mode='mean'):
    """
    Evaluate train and validation performance on a fitted model.
    
    Parameters
    ---------     
    model: sklearn.ensemble._gb.GradientBoostingRegressor
        scikit-learn model
    X_train: pandas.core.frame.DataFrame
        X of training set
    y_train: pandas.core.series.Series
        y of training set
    X_valid: pandas.core.frame.DataFrame        
        X of validation set
    y_valid: pandas.core.series.Series
        y of validation set     
    mode: string
        'mean' or 'median'
    
    Returns
    -------
    errors: list
        
    """
    if mode == 'mean':
        errors = [mean_squared_error(y_train, 
                                     model.predict(X_train)) ** 0.5, 
                  mean_squared_error(y_valid, 
                                     model.predict(X_valid)) ** 0.5]
        
        print('Training RMSE:', errors[0])
        print('Validation RMSE:', errors[1])
        
        
    elif mode == 'median':
        errors = [mean_absolute_error(y_train, 
                                      model.predict(X_train)), 
                  mean_absolute_error(y_valid, 
                                      model.predict(X_valid))]
        
        print('Training MAE:', errors[0])
        print('Validation MAE:', errors[1])

In [4]:
def report_importance(model, n, df):
    """
    Return column names and Gini coefficients of
    n most important features.
    
    Parameters
    ----------
    model: sklearn.ensemble._gb.GradientBoostingRegressor
        gradient boosting regressor
        
    n: int
        number of features
        
    df: pd.DataFrame
        either `X_train` or `X_valid`
    
    Returns
    -------
    pd.DataFrame
    
    """
    # code attribution: https://tinyurl.com/ya52tn2p
    values = model.feature_importances_
    indices = (-values).argsort()[:n]
    
    # get column names of n most important features
    col_names = df.iloc[:, list(indices)].columns.to_list()
    
    # get Gini coefficient of n most important features
    gini_coeff = list(np.sort(values)[-n:][::-1])

    data = {'feature': col_names, 'Gini': gini_coeff}
    
    result = pd.DataFrame(data)
    
    display(result)

In [5]:
def report_search(search):
    """
    Print the best hyperparameter settings and
    search.cv_results_ as a dataframe.
    
    Parameters
    ----------
    search: sklearn.model_selection.RandomizedSearchCV
        
    """
    print(search.best_params_)
    
    results = pd.DataFrame(search.cv_results_)
    
    display(results)

Load the data:

In [15]:
df = pd.read_csv('../data/train_data.zip')

In [16]:
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1804425,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,244.2,0.157475,0.009783,0.147692
1,1812706,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
2,1812706,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
3,1812706,11,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,24.0,258.95,0.157475,0.009783,0.147692
4,1812706,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,258.95,0.157475,0.009783,0.147692


In [17]:
# drop rows missing target variable
df = drop_missing_unacast(df)

In [18]:
# check that playgrounds 'external_id' == 'CA00070678' have been removed
df = df.query("external_id != 'CA00070678'")

Create `X` and `y`:

In [19]:
X = df.drop('unacast_session_count', axis=1)
y = df.loc[:, 'unacast_session_count']

Split the data into training and validation sets:

In [20]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=0.2,
                                                      random_state=2020)

In [21]:
# number of observations in training set
X_train.shape[0]

39592

In [22]:
# number of observations in validation set
X_valid.shape[0]

9898

Pre-process `X_train` and `X_valid`:

In [23]:
# impute NaN values
result = impute_data(X_train, X_valid)

In [24]:
X_train = result[0]
X_valid = result[1]

In [25]:
# perform feature engineering
X_train = comb_cols(X_train)
X_valid = comb_cols(X_valid)

In [26]:
# perform feature selection
X_train = drop_columns(X_train)
X_valid = drop_columns(X_valid)

In [27]:
# check the number of categorical columns to OHE
X_train.dtypes.value_counts()

int64      422
float64    201
object       3
dtype: int64

In [28]:
# OHE remaining categorical features
X_train = clean_categorical(X_train)
X_valid = clean_categorical(X_valid)

In [29]:
# check if there are any missing values in X_train, y_train
print(X_train.isna().sum().sum())
print(y_train.isna().sum())

0
0


In [30]:
# check if there are any missing values in X_valid, y_valid
print(X_valid.isna().sum().sum())
print(y_valid.isna().sum())

0
0


### Models to predict mean session count

#### Gradient boosting regression

Fit a model with default parameters:

In [38]:
gbr = GradientBoostingRegressor(n_estimators=200, 
                                random_state=2020, verbose=1) 
gbr.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1       66167.6312            7.64m
         2       60407.0510            7.63m
         3       55700.6114            7.59m
         4       51808.1136            7.56m
         5       48584.9494            7.51m
         6       45809.8496            7.46m
         7       43589.0397            7.42m
         8       41635.1151            7.39m
         9       40082.9219            7.35m
        10       38686.2115            7.30m
        20       30737.9380            6.90m
        30       27290.4657            6.57m
        40       25126.6220            6.22m
        50       23686.2311            5.84m
        60       22423.3906            5.46m
        70       21605.3804            5.07m
        80       20649.3892            4.68m
        90       19862.4662            4.29m
       100       19269.5782            3.91m
       200       15089.5869            0.00s


GradientBoostingRegressor(n_estimators=200, random_state=2020, verbose=1)

In [39]:
# calculate MSE
report_performance(gbr, X_train, y_train, X_valid, y_valid)

Training RMSE: 122.83967985225044
Validation RMSE: 138.86903174128227


In [40]:
# report Gini coefficients
report_importance(gbr, 20, X_train)

Unnamed: 0,feature,Gini
0,houses_per_sq_km,0.490098
1,walk_score,0.036173
2,year,0.029934
3,B08301e10,0.026956
4,B17020e6,0.02265
5,B25012e3,0.018782
6,month,0.016195
7,historic_foggy,0.01469
8,B25012e17,0.01458
9,Adult_obesity,0.010792


Random search for hyperparameter optimization (default learning rate):

In [31]:
param_grid = {'n_estimators': [100, 200, 300, 300, 400],
              'max_depth': [3, 4, 5, 6, 7],
              'max_features': ['auto', 'sqrt'],
              'subsample': [0.8, 0.9, 1]}

In [27]:
gbr_opt = GradientBoostingRegressor(random_state=2020)

gbr_rs = RandomizedSearchCV(gbr_opt, param_grid,
                            scoring='neg_mean_squared_error',
                            n_jobs=-1,
                            verbose=10,
                            random_state=2020)

gbr_rs.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   33.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed: 16.6min remaining:  3.7min
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed: 21.2min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 33.3min finished


In [31]:
# print random search results
report_search(gbr_rs)

{'subsample': 0.8, 'n_estimators': 300, 'max_features': 'auto', 'max_depth': 5}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_n_estimators,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,31.874775,0.148698,0.436807,0.120289,0.9,200,sqrt,6,"{'subsample': 0.9, 'n_estimators': 200, 'max_f...",-11519.622719,-14381.512884,-16201.996711,-15607.248755,-13208.588387,-14183.793891,1684.475763,5
1,15.984039,0.112916,0.225727,0.069267,0.9,100,sqrt,6,"{'subsample': 0.9, 'n_estimators': 100, 'max_f...",-13484.816234,-16724.109929,-19305.339339,-17560.837031,-15309.049146,-16476.830336,1977.500553,8
2,21.097317,0.040241,0.161587,0.029823,0.9,200,sqrt,4,"{'subsample': 0.9, 'n_estimators': 200, 'max_f...",-15425.816254,-18539.827897,-21030.849197,-18316.09076,-17264.080447,-18115.332911,1826.409895,9
3,678.393513,1.923945,0.15772,0.010629,1.0,200,auto,5,"{'subsample': 1.0, 'n_estimators': 200, 'max_f...",-11482.781493,-14390.22011,-15442.319743,-14563.907862,-12656.176124,-13707.081067,1432.745209,3
4,38.163724,0.100336,0.244896,0.041657,0.8,400,sqrt,4,"{'subsample': 0.8, 'n_estimators': 400, 'max_f...",-12648.319407,-15862.434336,-18537.657002,-16666.88626,-14707.010224,-15684.461446,1965.254544,6
5,1182.70172,90.381651,0.414526,0.422503,0.8,300,auto,5,"{'subsample': 0.8, 'n_estimators': 300, 'max_f...",-10960.813127,-13235.416982,-14789.71521,-13837.296953,-11258.765816,-12816.401618,1481.97204,1
6,33.504508,0.163685,0.186794,0.005493,1.0,300,sqrt,4,"{'subsample': 1.0, 'n_estimators': 300, 'max_f...",-13066.041617,-16020.699045,-19533.528967,-16376.371532,-15271.67294,-16053.66282,2085.765305,7
7,16.864336,0.067604,0.118602,0.003699,1.0,200,sqrt,3,"{'subsample': 1.0, 'n_estimators': 200, 'max_f...",-18603.712413,-21434.009975,-25557.419112,-20969.735238,-20047.722766,-21322.519901,2327.585886,10
8,96.026111,37.623874,0.548352,0.417269,0.9,300,sqrt,7,"{'subsample': 0.9, 'n_estimators': 300, 'max_f...",-10281.382629,-13541.711304,-14959.318144,-14089.465474,-11780.451449,-12930.4658,1683.645913,2
9,143.750401,37.113718,0.604736,0.327674,1.0,200,sqrt,6,"{'subsample': 1.0, 'n_estimators': 200, 'max_f...",-11026.745754,-14024.157128,-16657.34819,-14786.642699,-12943.059293,-13887.590613,1874.837317,4


In [36]:
# calculate MSE of best estimator
report_performance(gbr_rs, X_train, y_train, X_valid, y_valid)

Training RMSE: 64.63476315627366
Validation RMSE: 113.63002689115837


#### XGBoost

Fit a model with default parameters:

In [48]:
xgbr = XGBRegressor(n_estimators=200, verbosity=1, random_state=2020)
xgbr.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=0, num_parallel_tree=1, random_state=2020,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=1)

In [49]:
# calculate MSE
report_performance(xgbr, X_train, y_train, X_valid, y_valid)

Training RMSE: 38.80612133676185
Validation RMSE: 105.35892966539521


In [50]:
# print feature importances
report_importance(xgbr, 20, X_train)

Unnamed: 0,feature,Gini
0,houses_per_sq_km,0.137375
1,B17020e6,0.09699
2,single_no_kids,0.043833
3,B20004e17,0.035555
4,B08301e6,0.023577
5,B25012e7,0.017746
6,B20004e14,0.015624
7,B19101e8,0.014225
8,men_without_health_insurance,0.013836
9,B17020e4,0.010832


Random search for hyperparameter optimization:

In [43]:
xgbr_params = {'n_estimators': [200, 300, 400, 500],
               'max_depth': [3, 4, 5, 6, 7],
               'colsample_bytree': [0.6, 0.8, 1],
               'subsample': [0.8, 0.9, 1]}

In [44]:
xgbr_opt = XGBRegressor(verbosity=1, random_state=2020)

xgbr_rs = RandomizedSearchCV(xgbr_opt, xgbr_params,
                            scoring='neg_mean_squared_error',
                            n_jobs=-1,
                            verbose=10,
                            random_state=2020)

xgbr_rs.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 20.5min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 35.7min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 48.6min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 85.3min
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed: 100.2min remaining: 22.0min
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed: 108.5min remaining:  6.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 110.8min finished


RandomizedSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n_jobs...
                                          num_parallel_tree=None,
                                          random_state=2020, reg_alpha=None,
                                          reg_lambda=None,
        

In [45]:
# print random search results
report_search(xgbr_rs)

{'subsample': 0.8, 'n_estimators': 400, 'max_depth': 7, 'colsample_bytree': 1}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_n_estimators,param_max_depth,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1227.118021,5.117772,2.505538,0.708212,0.8,400,7,1.0,"{'subsample': 0.8, 'n_estimators': 400, 'max_d...",-10240.555552,-13510.320139,-11821.716491,-13746.286233,-9887.56896,-11841.289475,1599.651965,1
1,467.805288,250.587785,0.916923,0.739449,0.8,400,4,0.8,"{'subsample': 0.8, 'n_estimators': 400, 'max_d...",-10967.289922,-14128.301581,-13854.697046,-15094.466039,-12270.864474,-13263.123812,1463.402724,10
2,728.20781,135.784257,0.964923,0.750522,0.8,300,5,0.6,"{'subsample': 0.8, 'n_estimators': 300, 'max_d...",-10395.868815,-13098.153984,-13371.828139,-13670.047975,-11911.035097,-12489.386802,1205.036867,8
3,912.713303,6.472891,1.735126,0.693951,0.9,300,6,0.6,"{'subsample': 0.9, 'n_estimators': 300, 'max_d...",-10200.669356,-12635.541976,-12372.017941,-14575.391271,-11447.150284,-12246.154166,1443.275813,5
4,725.863013,46.206923,1.797075,0.054984,0.9,200,7,0.6,"{'subsample': 0.9, 'n_estimators': 200, 'max_d...",-10125.952453,-12657.148827,-12788.464963,-13617.445314,-10980.998173,-12034.001946,1281.74016,3
5,1376.226234,17.739908,1.218453,0.742313,0.8,400,4,1.0,"{'subsample': 0.8, 'n_estimators': 400, 'max_d...",-10694.747075,-13681.770019,-13931.730581,-15491.255432,-11476.822176,-13055.265057,1741.164924,9
6,1594.381782,36.173284,0.762905,0.657102,0.9,500,4,1.0,"{'subsample': 0.9, 'n_estimators': 500, 'max_d...",-10509.20647,-12932.808112,-13240.101367,-13895.330078,-11032.196614,-12321.928528,1314.623698,7
7,1525.503676,20.723955,1.290756,0.941781,1.0,400,6,0.8,"{'subsample': 1, 'n_estimators': 400, 'max_dep...",-10770.782195,-11820.095376,-12750.414035,-13305.904893,-10796.313689,-11888.702038,1019.683819,2
8,1155.875581,46.855016,0.649334,0.433409,1.0,300,5,1.0,"{'subsample': 1, 'n_estimators': 300, 'max_dep...",-9560.402806,-12703.910565,-12899.85783,-14176.721936,-11155.954257,-12099.369479,1591.102059,4
9,634.55085,150.037998,0.72161,0.47361,1.0,200,5,1.0,"{'subsample': 1, 'n_estimators': 200, 'max_dep...",-9855.622069,-12810.892706,-13088.281837,-14370.679884,-11397.330982,-12304.561496,1546.596624,6


In [46]:
# calculate MSE of best estimator
report_performance(xgbr_rs, X_train, y_train, X_valid, y_valid)

Training RMSE: 16.70276960130824
Validation RMSE: 100.91885748480932


### Models to predict median session count

#### Gradient boosting regression

Fit a model with default parameters:

In [100]:
gbr_median = GradientBoostingRegressor(loss='quantile', 
                                       n_estimators=200, 
                                       random_state=2020, 
                                       alpha=0.5, 
                                       verbose=1)

gbr_median.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1          49.6413           12.16m
         2          48.8643           10.66m
         3          47.9232            9.96m
         4          47.0868           11.01m
         5          46.3201           11.67m
         6          45.6908           11.18m
         7          45.1657           10.73m
         8          44.7744           10.42m
         9          44.2259           10.33m
        10          44.0035           10.67m
        20          41.1483            9.27m
        30          39.4312            8.38m
        40          37.4947            7.68m
        50          36.2444            7.06m
        60          35.5483            6.49m
        70          34.9688            5.97m
        80          34.5666            5.56m
        90          34.1763            5.08m
       100          33.8831            4.67m
       200          31.3121            0.00s


GradientBoostingRegressor(alpha=0.5, loss='quantile', n_estimators=200,
                          random_state=2020, verbose=1)

In [102]:
# calculate MSE
report_performance(gbr_median, X_train, y_train, 
                   X_valid, y_valid, 'median')

Training mean absolute error: 62.62429042016808
Validation mean absolute error: 64.65782781787502


In [103]:
# print feature importances
report_importance(gbr_median, 20, X_train)

Unnamed: 0,feature,Gini
0,walk_score,0.161907
1,month,0.082855
2,houses_per_sq_km,0.075265
3,distance_to_M,0.063977
4,latitude,0.053011
5,year,0.049015
6,distance_to_S,0.014898
7,intersection_count,0.010238
8,distance_to_I,0.008728
9,Number_of_holidays,0.007777


Random search for hyperparameter optimization:

In [32]:
# reuse grid from previous search
param_grid

{'n_estimators': [100, 200, 300, 300, 400],
 'max_depth': [3, 4, 5, 6, 7],
 'max_features': ['auto', 'sqrt'],
 'subsample': [0.8, 0.9, 1]}

In [None]:
gbr_opt_med = GradientBoostingRegressor(loss='quantile', 
                                        random_state=2020, 
                                        alpha=0.5, 
                                        verbose=1)

gbr_rs_med = RandomizedSearchCV(gbr_opt_med, param_grid,
                                scoring='neg_mean_absolute_error',
                                n_jobs=-1,
                                verbose=10,
                                random_state=2020)

gbr_rs_med.fit(X_train, y_train)

In [None]:
# print random search results

In [None]:
# calculate MSE of best estimator

In [None]:
# print feature importances