In [1]:
import time
import pandas as pd
import altair as alt
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error

from xgboost import XGBRegressor

# import other functions
from imputer import *
from feature_eng import *
from drop import *
from preprocessing import *

In [2]:
# lift Altair's default row limit so charts can be built from the full dataset
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [31]:
def report_performance(model, X_train, y_train, X_valid, y_valid,
                       mode='mean'):
    """
    Print and return the train and validation error of a fitted model.

    The metric depends on what the model is meant to predict: root mean
    squared error (RMSE) for `mode='mean'` and mean absolute error (MAE)
    for `mode='median'`.

    Parameters
    ----------
    model: sklearn.ensemble._gb.GradientBoostingRegressor
        fitted model; any estimator exposing `predict` works
    X_train: pandas.core.frame.DataFrame
        X of training set
    y_train: pandas.core.series.Series
        y of training set
    X_valid: pandas.core.frame.DataFrame
        X of validation set
    y_valid: pandas.core.series.Series
        y of validation set
    mode: string
        'mean' (report RMSE) or 'median' (report MAE)

    Returns
    -------
    errors: list
        `[training_error, validation_error]`

    Raises
    ------
    ValueError
        If `mode` is neither 'mean' nor 'median'.

    """
    # validate `mode` before doing any (potentially slow) prediction
    if mode == 'mean':
        label = 'root mean squared error'
    elif mode == 'median':
        label = 'mean absolute error'
    else:
        raise ValueError(
            f"mode must be 'mean' or 'median', got {mode!r}")

    errors = []
    for X, y in ((X_train, y_train), (X_valid, y_valid)):
        predictions = model.predict(X)
        if mode == 'mean':
            # square root of the MSE, i.e. RMSE in the units of the target
            errors.append(mean_squared_error(y, predictions) ** 0.5)
        else:
            errors.append(mean_absolute_error(y, predictions))

    print(f'Training {label}: {errors[0]}')
    print(f'Validation {label}: {errors[1]}')

    return errors

In [97]:
def report_importance(model, n, df):
    """
    Return column names and importance scores of the
    n most important features, most important first.

    The score column keeps the name 'Gini' so existing readers of the
    table are unaffected.
    NOTE(review): the scores are whatever `model.feature_importances_`
    holds; for the XGBoost model fitted in this notebook they are
    gain-based, so 'Gini' is a loose label.

    Parameters
    ----------
    model: sklearn.ensemble._gb.GradientBoostingRegressor
        fitted model exposing `feature_importances_`

    n: int
        number of features (non-negative)

    df: pd.DataFrame
        either `X_train` or `X_valid`; its columns must be in the same
        order as the features the model was fitted on

    Returns
    -------
    pd.DataFrame
        columns 'feature' and 'Gini', at most `n` rows

    Raises
    ------
    ValueError
        If `n` is negative.

    """
    if n < 0:
        raise ValueError('n must be a non-negative integer')

    # code attribution: https://tinyurl.com/ya52tn2p
    importances = np.asarray(model.feature_importances_)

    # positions of the n largest scores; the stable sort keeps tied
    # features in column order
    top = np.argsort(-importances, kind='stable')[:n]

    # names and scores are both taken from `top`, so they always stay
    # paired and have equal length (including n == 0)
    return pd.DataFrame({'feature': df.columns[top].to_list(),
                         'Gini': importances[top].tolist()})

In [None]:
def report_search(search):
    """
    Show the outcome of a hyperparameter search.

    Prints the winning hyperparameter combination first, followed by
    the full `cv_results_` table converted to a dataframe.

    Parameters
    ----------
    search: sklearn.model_selection.RandomizedSearchCV
        fitted search object

    """
    # best settings found by the search
    print(search.best_params_)

    # per-candidate cross-validation results, one row per candidate
    print(pd.DataFrame(search.cv_results_))

Load the data:

In [4]:
# read the training data from the zipped CSV (relative path)
df = pd.read_csv('../data/train_data.zip')

In [5]:
# preview the first rows to sanity-check the load
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1804425,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,244.2,0.157475,0.009783,0.147692
1,1812706,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
2,1812706,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
3,1812706,11,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,24.0,258.95,0.157475,0.009783,0.147692
4,1812706,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,258.95,0.157475,0.009783,0.147692


In [6]:
# drop rows missing target variable (`unacast_session_count`)
# NOTE(review): `drop_missing_unacast` comes from one of the star-imported
# project modules (presumably `drop`) — confirm
df = drop_missing_unacast(df)

In [7]:
# exclude the playground with 'external_id' == 'CA00070678'
# (this filters the rows out; it does not merely check for them)
# NOTE(review): the preview above shows numeric-looking ids — confirm the
# column holds strings, otherwise this comparison cannot match anything
df = df.query("external_id != 'CA00070678'")

Create `X` and `y`:

In [8]:
# target: the session count; predictors: every other column
y = df['unacast_session_count']
X = df.drop(columns='unacast_session_count')

Split the data into training and validation sets:

In [9]:
# hold out 20% of the rows for validation; the fixed seed keeps the
# split reproducible across runs
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=0.2,
                                                      random_state=2020)

In [10]:
# row count of the training split
len(X_train)

39592

In [11]:
# row count of the validation split
len(X_valid)

9898

Pre-process `X_train` and `X_valid`:

In [12]:
# impute NaN values
# (`result` is indexed below: item 0 is the training set, item 1 the
# validation set)
result = impute_data(X_train, X_valid)

In [13]:
# replace both splits with their imputed versions
X_train, X_valid = result[0], result[1]

In [14]:
# perform feature engineering on both splits with the same helper
# NOTE(review): `comb_cols` is a project helper (presumably from
# `feature_eng`, judging by its name it combines columns) — confirm
X_train = comb_cols(X_train)
X_valid = comb_cols(X_valid)

In [15]:
# perform feature selection on both splits with the same helper
# NOTE(review): `drop_columns` is a project helper (presumably from
# `drop`); which columns it removes is not visible here — confirm
X_train = drop_columns(X_train)
X_valid = drop_columns(X_valid)

In [16]:
# check the number of categorical columns to OHE:
# count columns per dtype; the `object` columns are the ones still
# to be encoded
X_train.dtypes.value_counts()

int64      422
float64    201
object       3
dtype: int64

In [17]:
# OHE remaining categorical features
X_train = clean_categorical(X_train)
X_valid = clean_categorical(X_valid)

# The two splits are encoded independently, so a category present in only
# one of them would leave the splits with different dummy columns (or a
# different column order). Align the validation columns with the training
# columns: dummies absent from validation are filled with 0, dummies absent
# from training are dropped. This is a no-op when the columns already match.
# NOTE(review): if `clean_categorical` already guarantees identical columns
# this line is redundant but harmless — confirm.
X_valid = X_valid.reindex(columns=X_train.columns, fill_value=0)

In [18]:
# remaining NaN count in the training predictors (first line) and the
# training target (second line); both should be zero after imputation
print(X_train.isna().values.sum(), y_train.isna().sum(), sep='\n')

0
0


In [19]:
# remaining NaN count in the validation predictors (first line) and the
# validation target (second line); both should be zero after imputation
print(X_valid.isna().values.sum(), y_valid.isna().sum(), sep='\n')

0
0


### Models to predict mean session count

#### Gradient boosting regression

Fit a baseline model (default hyperparameters, except `n_estimators=200`):

In [92]:
# baseline gradient boosting model: n_estimators=200 with a fixed seed,
# every other hyperparameter left at its default; verbose=1 logs the
# training loss per iteration
gbr = GradientBoostingRegressor(n_estimators=200, 
                                random_state=2020, verbose=1) 
gbr.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1       66167.6312            8.73m
         2       60407.0510            8.71m
         3       55700.6114           10.19m
         4       51808.1136           10.47m
         5       48584.9494           10.39m
         6       45809.8496           10.67m
         7       43589.0397           11.16m
         8       41635.1151           11.02m
         9       40082.9219           10.80m
        10       38686.2115           10.58m
        20       30737.9380            9.45m
        30       27290.4657            8.42m
        40       25126.6220            7.68m
        50       23686.2311            7.08m
        60       22423.3906            6.53m
        70       21605.3804            5.99m
        80       20649.3892            5.51m
        90       19862.4662            5.12m
       100       19269.5782            4.64m
       200       15089.5869            0.00s


GradientBoostingRegressor(n_estimators=200, random_state=2020, verbose=1)

In [93]:
# report train/validation RMSE (the helper takes the square root of the
# MSE in its default 'mean' mode)
report_performance(gbr, X_train, y_train, X_valid, y_valid)

Training mean squared error: 122.83967985225044
Validation mean squared error: 138.86903174128227


In [94]:
# report the 20 most important features with their importance scores
# (shown in the 'Gini' column)
report_importance(gbr, 20, X_train)

Unnamed: 0,feature,Gini
0,houses_per_sq_km,0.490098
1,walk_score,0.036173
2,year,0.029934
3,B08301e10,0.026956
4,B17020e6,0.02265
5,B25012e3,0.018782
6,month,0.016195
7,historic_foggy,0.01469
8,B25012e17,0.01458
9,Adult_obesity,0.010792


Random search for hyperparameter optimization:

In [None]:
# search space sampled by the randomized search below
# `None` means "consider all features at each split"; it replaces the
# equivalent 'auto' setting, which newer scikit-learn releases no longer
# accept for GradientBoostingRegressor
param_grid = {'n_estimators': [100, 200, 300, 400],
              'max_depth': [3, 4, 5, 6, 7],
              'max_features': [None, 'sqrt'],
              'subsample': [0.8, 0.9, 1.0]}

In [None]:
# estimator to tune; only the seed is fixed, the rest comes from the search
gbr_opt = GradientBoostingRegressor(random_state=2020)

# randomized search over `param_grid`, scored by negative MSE and run
# on all available cores; the seed makes the sampled candidates repeatable
gbr_rs = RandomizedSearchCV(
    estimator=gbr_opt,
    param_distributions=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=5,
    random_state=2020,
)

gbr_search = gbr_rs.fit(X_train, y_train)

In [None]:
# print random search results

In [None]:
# calculate MSE of best estimator

In [None]:
# print feature importances 

#### XGBoost

Fit a baseline model (default hyperparameters, except `n_estimators=200`):

In [95]:
# baseline XGBoost model: n_estimators=200 with a fixed seed; the other
# hyperparameters are not set here
xgbr = XGBRegressor(n_estimators=200, verbosity=1, random_state=2020)
xgbr.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=0, num_parallel_tree=1, random_state=2020,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=1)

In [96]:
# report train/validation RMSE (the helper takes the square root of the
# MSE in its default 'mean' mode)
report_performance(xgbr, X_train, y_train, X_valid, y_valid)

Training mean squared error: 38.80612133676185
Validation mean squared error: 105.35892966539521


In [98]:
# print feature importances (top 20)
# NOTE: the fitted model's repr shows importance_type='gain', so these
# scores are gain-based even though the table labels the column 'Gini'
report_importance(xgbr, 20, X_train)

Unnamed: 0,feature,Gini
0,houses_per_sq_km,0.137375
1,B17020e6,0.09699
2,single_no_kids,0.043833
3,B20004e17,0.035555
4,B08301e6,0.023577
5,B25012e7,0.017746
6,B20004e14,0.015624
7,B19101e8,0.014225
8,men_without_health_insurance,0.013836
9,B17020e4,0.010832


Random search for hyperparameter optimization:

In [None]:
# TODO: define the search space for the XGBoost mean model (placeholder, empty for now)
params = {}

In [None]:
# print random search results

In [None]:
# calculate MSE of best estimator

In [None]:
# print feature importances

### Models to predict median session count

#### Gradient boosting regression

Fit a baseline median model (quantile loss with `alpha=0.5`, `n_estimators=200`, other hyperparameters at their defaults):

In [100]:
# median regression: quantile loss evaluated at the 0.5 quantile, with
# n_estimators=200 and a fixed seed; verbose=1 logs the training loss
gbr_median = GradientBoostingRegressor(loss='quantile', 
                                       n_estimators=200, 
                                       random_state=2020, 
                                       alpha=0.5, 
                                       verbose=1)

gbr_median.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1          49.6413           12.16m
         2          48.8643           10.66m
         3          47.9232            9.96m
         4          47.0868           11.01m
         5          46.3201           11.67m
         6          45.6908           11.18m
         7          45.1657           10.73m
         8          44.7744           10.42m
         9          44.2259           10.33m
        10          44.0035           10.67m
        20          41.1483            9.27m
        30          39.4312            8.38m
        40          37.4947            7.68m
        50          36.2444            7.06m
        60          35.5483            6.49m
        70          34.9688            5.97m
        80          34.5666            5.56m
        90          34.1763            5.08m
       100          33.8831            4.67m
       200          31.3121            0.00s


GradientBoostingRegressor(alpha=0.5, loss='quantile', n_estimators=200,
                          random_state=2020, verbose=1)

In [102]:
# report train/validation MAE ('median' mode uses mean absolute error,
# not MSE)
report_performance(gbr_median, X_train, y_train, 
                   X_valid, y_valid, 'median')

Training mean absolute error: 62.62429042016808
Validation mean absolute error: 64.65782781787502


In [103]:
# print feature importances of the median model (top 20)
report_importance(gbr_median, 20, X_train)

Unnamed: 0,feature,Gini
0,walk_score,0.161907
1,month,0.082855
2,houses_per_sq_km,0.075265
3,distance_to_M,0.063977
4,latitude,0.053011
5,year,0.049015
6,distance_to_S,0.014898
7,intersection_count,0.010238
8,distance_to_I,0.008728
9,Number_of_holidays,0.007777


Random search for hyperparameter optimization:

In [None]:
# TODO: define the search space for the gradient boosting median model (placeholder, empty for now)
params = {}

In [None]:
# print random search results

In [None]:
# calculate MSE of best estimator

In [None]:
# print feature importances

#### XGBoost

Fit a model with default parameters:

In [None]:
# calculate MSE

In [None]:
# print feature importances

Random search for hyperparameter optimization:

In [None]:
# TODO: define the search space for the XGBoost median model (placeholder, empty for now)
params = {}

In [None]:
# print random search results

In [None]:
# calculate MSE using best estimator

In [None]:
# print feature importances