In [1]:
import time
import pandas as pd
import altair as alt
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor

from xgboost import XGBRegressor

# import other functions
from imputer import *
from feature_eng import *
from drop import *

In [2]:
def add_lagged_target(df):
    """
    Add the lag-1 target as a feature and drop each playground's first row.

    The rows are ordered by ("external_id", "year", "month"); the value of
    "unacast_session_count" from the preceding row is stored in a new
    column "session_lagged_1". The first row of every playground has no
    preceding row of its own, so it is removed.

    Parameters
    ----------------
    df : pd.DataFrame
       A dataframe containing the columns "external_id", "month", "year"
       and "unacast_session_count". It is not modified.


    Returns
    ----------------
    pd.DataFrame
        A new dataframe with the extra "session_lagged_1" column and
        without the first occurrence of each playground, sorted by
        ["external_id","year","month"]. Index labels are the row positions
        in the input `df`.
    """
    sort_keys = ["external_id", "year", "month"]

    # Work on a re-labelled copy: labels 0..n-1 record each row's position
    # in the input, and the caller's frame is left untouched.
    out = df.reset_index(drop=True).sort_values(by=sort_keys)

    # Shift after sorting so each row sees the previous row's target.
    # The only rows where this crosses a playground boundary are the first
    # rows of each playground, which are removed below.
    out["session_lagged_1"] = out["unacast_session_count"].shift(1)

    # After sorting, a row has a predecessor within its own playground
    # exactly when its external_id has already been seen. Selecting with a
    # mask (instead of concat + drop_duplicates(keep=False)) cannot remove
    # rows that merely happen to be duplicated in the data, and avoids the
    # row multiplication a key-based self-merge causes on repeated keys.
    has_predecessor = out["external_id"].duplicated(keep="first")
    return out.loc[has_predecessor].copy()

In [3]:
def report_performance(model, X_train, y_train, X_valid, y_valid, 
                       mode='mean'):
    """
    Evaluate train and validation performance on a fitted model.

    The two errors are printed and also returned.
    
    Parameters
    ---------     
    model: object
        fitted estimator exposing ``predict`` (e.g. a scikit-learn or
        XGBoost regressor, or a fitted search object)
    X_train: pandas.core.frame.DataFrame
        X of training set
    y_train: pandas.core.series.Series
        y of training set
    X_valid: pandas.core.frame.DataFrame        
        X of validation set
    y_valid: pandas.core.series.Series
        y of validation set     
    mode: string
        'mean' reports the root mean squared error (RMSE);
        'median' reports the mean absolute error (MAE)
    
    Returns
    -------
    errors: list
        ``[training_error, validation_error]`` for the chosen metric

    Raises
    ------
    ValueError
        if `mode` is neither 'mean' nor 'median'
    """
    if mode == 'mean':
        label = 'RMSE'

        def score(y_true, y_pred):
            # root of the mean squared error
            return mean_squared_error(y_true, y_pred) ** 0.5
    elif mode == 'median':
        label = 'MAE'
        score = mean_absolute_error
    else:
        # previously an unknown mode fell through and reported nothing
        raise ValueError("mode must be 'mean' or 'median', got {!r}".format(mode))

    errors = [score(y_train, model.predict(X_train)),
              score(y_valid, model.predict(X_valid))]

    print('Training {}:'.format(label), errors[0])
    print('Validation {}:'.format(label), errors[1])

    # the docstring always promised this list; it was never returned
    return errors

In [4]:
def report_search(search):
    """
    Summarise a fitted hyperparameter search.

    Prints the winning hyperparameter combination and hands back the
    full cross-validation log as a table.

    Parameters
    ----------
    search: sklearn.model_selection.RandomizedSearchCV
        fitted search object; its ``best_params_`` and ``cv_results_``
        attributes are read

    Returns
    -------
    pandas.DataFrame
        ``search.cv_results_`` converted to a dataframe
    """
    best_settings = search.best_params_
    print(best_settings)

    return pd.DataFrame(search.cv_results_)

In [5]:
# lift Altair's default cap on the number of rows a chart's dataset may hold
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [6]:
# load the training data from the zipped CSV (path is relative to the notebook)
df = pd.read_csv('../data/train_data.zip')

In [7]:
# create the lagged target column `session_lagged_1`; this also sorts the
# rows by playground and date and drops each playground's first row
df = add_lagged_target(df)

In [8]:
# impute missing lagged (t-1) target values within each playground.
# `transform` is used instead of `apply` because it always returns a result
# aligned to df's own index; on recent pandas, `apply` prepends the group
# key to the index and the column assignment no longer lines up.
# NOTE(review): this runs before the train/validation split and fills in
# both directions, so a filled lag can draw on later rows (including the
# row's own target) — confirm this is acceptable for evaluation.
df['session_lagged_1'] = (
    df.groupby('external_id')['session_lagged_1']
      .transform(lambda x: x.interpolate(limit_direction='both'))
)

In [9]:
# drop rows missing target variable
# (drop_missing_unacast comes from one of the star-imported local modules)
df = drop_missing_unacast(df)

In [10]:
# time-based hold-out: rows from July 2019 onwards form `valid`,
# every other row goes to `train`
train = df.loc[df['year'].ne(2019) | df['month'].le(6)]
valid = df.loc[df['year'].eq(2019) & df['month'].gt(6)]

In [11]:
# separate the target from the features for the training rows
y_train = train['unacast_session_count']
X_train = train.drop(columns='unacast_session_count')

In [12]:
# separate the target from the features for the validation rows
y_valid = valid['unacast_session_count']
X_valid = valid.drop(columns='unacast_session_count')

In [13]:
# fill NaN feature values; impute_data hands back the imputed training
# frame first and the imputed validation frame second
result = impute_data(X_train, X_valid)
X_train, X_valid = result[0], result[1]

In [14]:
# feature engineering: run comb_cols on the training and validation features
X_train, X_valid = map(comb_cols, (X_train, X_valid))

In [15]:
# column dropping: run drop_columns on the training and validation features
X_train, X_valid = map(drop_columns, (X_train, X_valid))

In [16]:
# one-hot encode the categorical columns (climate, density_class,
# income_class); the training frame comes back first, validation second
X_train_valid = clean_categorical(X_train, X_valid)
X_train, X_valid = X_train_valid[0], X_train_valid[1]

In [17]:
# baseline: plain linear regression on the prepared features,
# scored on the training and validation sets
lr = LinearRegression().fit(X_train, y_train)
report_performance(lr, X_train, y_train, X_valid, y_valid)

Training RMSE: 78.7432924136308
Validation RMSE: 171.80114351082685


In [19]:
# build a (month, year) -> running month number lookup, stored in
# `cross_val_ind`, from the rows of one reference playground; the numbers
# are used to index the cross-validation folds. Only the first 16 months
# are kept.
# NOTE(review): assumes this playground has one row per month, in
# chronological order, after the earlier row drops — confirm.
months_df = (
    df.query("external_id == '01a78f56-5cc9-4309-8676-057933848570'")
      .loc[:, ['month', 'year']]
      .reset_index(drop=True)
      .reset_index()
      .rename(columns={'index': 'cross_val_ind'})
      .head(16)
)

In [20]:
# attach each row's month number (`cross_val_ind`) by matching on month
# and year; X_train's original row labels are kept as the index
X_train_ind = (
    X_train.reset_index()
           .merge(months_df, how='left', on=['month', 'year'])
           .set_index('index')
)

In [21]:
# move the original row labels into an `index` column so that
# X_train_ind.index is positional (0..n-1)
# NOTE(review): not idempotent — re-running this cell on its own fails
# because the `index` column already exists
X_train_ind = X_train_ind.reset_index()

In [22]:
# inspect the result; `cross_val_ind` is the last column
X_train_ind

Unnamed: 0,index,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,HI,LI,MI,HD,LD,MD,A,C,D,cross_val_ind
0,2445,3,2018,13,2,11,392153.846154,91923.076923,60769.230769,0.0,...,0,0,1,1,0,0,1,0,0,0
1,47628,4,2018,0,0,0,0.000000,0.000000,0.000000,0.0,...,0,0,1,1,0,0,1,0,0,1
2,42531,5,2018,4,1,3,319000.000000,60000.000000,40000.000000,0.0,...,0,0,1,1,0,0,1,0,0,2
3,2443,6,2018,0,0,0,0.000000,0.000000,0.000000,0.0,...,0,0,1,1,0,0,1,0,0,3
4,42530,7,2018,0,0,0,0.000000,0.000000,0.000000,0.0,...,0,0,1,1,0,0,1,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39629,19014,2,2019,0,0,0,0.000000,0.000000,0.000000,0.0,...,0,0,1,0,1,0,0,1,0,11
39630,49286,3,2019,0,0,0,0.000000,0.000000,0.000000,0.0,...,0,0,1,0,1,0,0,1,0,12
39631,44250,4,2019,0,0,0,0.000000,0.000000,0.000000,0.0,...,0,0,1,0,1,0,0,1,0,13
39632,24063,5,2019,0,0,0,0.000000,0.000000,0.000000,0.0,...,0,0,1,0,1,0,0,1,0,14


In [110]:
# get indices for time dependent modeling (multi-month test windows)
# NOTE(review): every name assigned here is reassigned by the "NEW SPLITS"
# cell that follows, and this cell's execution count is out of sequence —
# it looks superseded; confirm before relying on it.

# split 0
# train: 2018-03 to 2018-04; test: 2018-05 to 2018-06
train_0 = X_train_ind.loc[X_train_ind['cross_val_ind'].isin([0, 1])].index
test_0 = X_train_ind.loc[X_train_ind['cross_val_ind'].isin([2, 3])].index

# split 1
# train: 2018-03 to 2018-06; test: 2018-07 to 2018-09
train_1 = train_0.union(test_0)
test_1 = X_train_ind.loc[X_train_ind['cross_val_ind'].isin([4, 5, 6])].index

# split 2
# train: 2018-03 to 2018-09; test: 2018-10 to 2018-12
train_2 = train_1.union(test_1)
test_2 = X_train_ind.loc[X_train_ind['cross_val_ind'].isin([7, 8, 9])].index

# split 3
# train: 2018-03 to 2018-12; test: 2019-01 to 2019-03
train_3 = train_2.union(test_2)
test_3 = X_train_ind.loc[X_train_ind['cross_val_ind'].isin([10, 11, 12])].index

# split 4
# train: 2018-03 to 2019-03; test: 2019-04 to 2019-06
train_4 = train_3.union(test_3)
test_4 = X_train_ind.loc[X_train_ind['cross_val_ind'].isin([13, 14, 15])].index

In [23]:
# NEW SPLITS: expanding-window indices for time dependent modeling.
# Each split tests on a single month and trains on every month before it;
# the next split's training set absorbs the previous test month.

# split 0
# train: 2018-03 to 2019-01; test: 2019-02
train_0 = X_train_ind.loc[X_train_ind['cross_val_ind'].lt(11)].index
test_0 = X_train_ind.loc[X_train_ind['cross_val_ind'].eq(11)].index

# split 1
# train: 2018-03 to 2019-02; test: 2019-03
train_1 = train_0.union(test_0)
test_1 = X_train_ind.loc[X_train_ind['cross_val_ind'].eq(12)].index

# split 2
# train: 2018-03 to 2019-03; test: 2019-04
train_2 = train_1.union(test_1)
test_2 = X_train_ind.loc[X_train_ind['cross_val_ind'].eq(13)].index

# split 3
# train: 2018-03 to 2019-04; test: 2019-05
train_3 = train_2.union(test_2)
test_3 = X_train_ind.loc[X_train_ind['cross_val_ind'].eq(14)].index

# split 4
# train: 2018-03 to 2019-05; test: 2019-06
train_4 = train_3.union(test_3)
test_4 = X_train_ind.loc[X_train_ind['cross_val_ind'].eq(15)].index

In [24]:
# pair every training index with its test index: one (train, test) tuple
# per cross-validation split, in split order
folds = list(zip(
    (train_0, train_1, train_2, train_3, train_4),
    (test_0, test_1, test_2, test_3, test_4),
))

Train gradient boosting models

In [26]:
# first gradient-boosted model: 200 trees with a fixed seed; no other
# hyperparameters are set here
xgbr = XGBRegressor(n_estimators=200, verbosity=1, random_state=2020)
xgbr.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=0, num_parallel_tree=1, random_state=2020,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=1)

In [27]:
# report training and validation RMSE (report_performance's default mode)
report_performance(xgbr, X_train, y_train, X_valid, y_valid)

Training RMSE: 25.318972957117463
Validation RMSE: 183.399891758579


In [28]:
# candidate hyperparameter values sampled by the randomized search
xgbr_params = dict(
    n_estimators=list(range(200, 401, 50)),
    max_depth=list(range(3, 8)),
    colsample_bytree=[0.6, 0.8, 1],
    subsample=[0.8, 0.9, 1],
)

In [30]:
# randomized search over `xgbr_params`, scored by negative MSE on the
# precomputed (train, test) index pairs in `folds`; the number of sampled
# candidates is left at the library default
xgbr_opt = XGBRegressor(verbosity=1, random_state=2020)

xgbr_rs = RandomizedSearchCV(
    estimator=xgbr_opt,
    param_distributions=xgbr_params,
    scoring='neg_mean_squared_error',
    cv=folds,
    n_jobs=-1,
    verbose=10,
    random_state=2020,
)

xgbr_rs.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed: 15.8min remaining:  3.5min
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed: 17.5min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 18.4min finished


RandomizedSearchCV(cv=[(Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            39619, 39620, 39621, 39622, 39623, 39624, 39625, 39626, 39627,
            39628],
           dtype='int64', length=27244),
                        Int64Index([   11,    26,    42,    58,    73,    89,   105,   121,   137,
              153,
            ...
            39488, 39504, 39518, 39534, 39550, 39566, 39581, 39597, 39613,
            39629],
           dtype='int64', length=2451)),
                       (Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            39620, 39621, 39622, 39623, 3962...
                                          random_state=2020, reg_alpha=None,
                                          reg_lambda=None,
                                          scale_pos_weight=None, subsample=None,
                                          tree_method=None,
 

In [31]:
# print the best hyperparameters and display the per-candidate CV results
report_search(xgbr_rs)

{'subsample': 0.9, 'n_estimators': 350, 'max_depth': 3, 'colsample_bytree': 0.8}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_n_estimators,param_max_depth,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,220.838542,22.724874,0.096678,0.002695,1.0,200,7,1.0,"{'subsample': 1, 'n_estimators': 200, 'max_dep...",-20065.145285,-9027.98318,-11370.974153,-9215.453433,-4430.646272,-10822.040465,5145.600025,7
1,151.054087,14.786485,0.096268,0.001745,0.8,200,7,0.6,"{'subsample': 0.8, 'n_estimators': 200, 'max_d...",-16219.405693,-10183.856125,-12281.999778,-10014.608507,-4472.951962,-10634.564413,3806.768884,4
2,140.526189,14.320026,0.077465,0.001324,0.9,350,3,0.8,"{'subsample': 0.9, 'n_estimators': 350, 'max_d...",-15795.112396,-9514.406416,-14007.885503,-8989.360773,-4449.994538,-10551.351925,4004.654356,1
3,120.635959,12.225603,0.085755,0.002555,1.0,250,5,0.6,"{'subsample': 1, 'n_estimators': 250, 'max_dep...",-15271.444695,-9013.463906,-14648.179291,-9205.560764,-4626.118175,-10552.953366,3958.482179,2
4,270.276698,26.106677,0.125483,0.004688,1.0,300,7,0.8,"{'subsample': 1, 'n_estimators': 300, 'max_dep...",-17880.296096,-9827.312542,-11432.365759,-9343.190495,-4658.877765,-10628.408532,4270.013073,3
5,127.199297,12.590215,0.072685,0.001872,0.8,300,3,0.8,"{'subsample': 0.8, 'n_estimators': 300, 'max_d...",-15108.211548,-8430.342703,-17446.547298,-11956.454632,-4788.613208,-11546.033878,4539.688958,10
6,176.629278,20.005571,0.088518,0.00111,1.0,350,4,0.8,"{'subsample': 1, 'n_estimators': 350, 'max_dep...",-18149.158921,-10684.066914,-12778.822701,-10754.755026,-4413.035003,-11355.967713,4408.836243,9
7,242.885517,23.901205,0.093997,0.003844,0.8,200,7,1.0,"{'subsample': 0.8, 'n_estimators': 200, 'max_d...",-19715.947411,-10069.29218,-11879.337604,-9241.098796,-5218.845337,-11224.904266,4772.206282,8
8,86.195784,8.558522,0.072209,0.001906,0.8,200,4,0.6,"{'subsample': 0.8, 'n_estimators': 200, 'max_d...",-15490.36653,-10441.397038,-12960.343968,-10012.342801,-4946.020483,-10770.094164,3512.270735,6
9,124.806329,12.796169,0.074029,0.003913,1.0,200,5,0.8,"{'subsample': 1, 'n_estimators': 200, 'max_dep...",-17507.301754,-9334.313963,-12706.09626,-9144.183199,-4729.318328,-10684.242701,4249.640969,5


In [32]:
# training and validation RMSE of the tuned model; the fitted search
# object is passed directly and used for prediction
report_performance(xgbr_rs, X_train, y_train, X_valid, y_valid)

Training RMSE: 43.88789677620338
Validation RMSE: 179.13805122349103
