In [1]:
import time

import altair as alt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

# import other functions
from imputer import *
from feature_eng import *
from drop import *

In [2]:
def add_lagged_target(df):
    """
    Add the previous observation of the target as a feature.

    Rows are ordered by ["external_id", "year", "month"]. Within each
    playground, "unacast_session_count" is shifted down by one row and stored
    as "session_lagged_1". The first row of every playground has no
    predecessor and is removed from the result.

    The lag is computed per playground, so repeated records are kept and
    never multiply rows (the previous merge + drop_duplicates approach
    deleted every copy of a repeated row).

    Parameters
    ----------------
    df : pd.DataFrame
       A dataframe containing the columns "external_id", "month", "year"
       and "unacast_session_count". It is not modified.


    Returns
    ----------------
    pd.DataFrame
        All columns of `df` plus "session_lagged_1", without the first
        occurrence of each playground, sorted by ["external_id","year","month"].
        The index holds each row's position in `df`.
    """
    sort_keys = ["external_id", "year", "month"]

    # Use positional labels (what the former merge-based version returned),
    # then order rows so that "previous row" means "previous month".
    out = df.reset_index(drop=True).sort_values(by=sort_keys)

    per_playground = out.groupby("external_id", sort=False)

    # shift inside each playground so values never leak across playgrounds
    lagged = per_playground["unacast_session_count"].shift(1)
    # the first row of each playground has nothing to lag from
    is_first = per_playground.cumcount() == 0

    out["session_lagged_1"] = lagged
    return out.loc[~is_first].copy()

In [3]:
def report_performance(model, X_train, y_train, X_valid, y_valid, 
                       mode='mean'):
    """
    Evaluate train and validation performance on a fitted model.

    The two errors are printed and also returned.

    Parameters
    ---------     
    model: fitted estimator
        any object exposing `predict(X)` (e.g. a scikit-learn regressor)
    X_train: pandas.core.frame.DataFrame
        X of training set
    y_train: pandas.core.series.Series
        y of training set
    X_valid: pandas.core.frame.DataFrame        
        X of validation set
    y_valid: pandas.core.series.Series
        y of validation set     
    mode: string
        'mean' reports the root mean squared error (RMSE),
        'median' reports the mean absolute error (MAE)

    Returns
    -------
    errors: list
        [training error, validation error]

    Raises
    ------
    ValueError
        If `mode` is neither 'mean' nor 'median'.
    """
    # fail loudly instead of silently reporting nothing
    if mode not in ('mean', 'median'):
        raise ValueError("mode must be 'mean' or 'median', got {!r}".format(mode))

    train_pred = model.predict(X_train)
    valid_pred = model.predict(X_valid)

    if mode == 'mean':
        label = 'RMSE'
        errors = [mean_squared_error(y_train, train_pred) ** 0.5,
                  mean_squared_error(y_valid, valid_pred) ** 0.5]
    else:
        label = 'MAE'
        errors = [mean_absolute_error(y_train, train_pred),
                  mean_absolute_error(y_valid, valid_pred)]

    print('Training ' + label + ':', errors[0])
    print('Validation ' + label + ':', errors[1])

    return errors

In [117]:
def report_search(search):
    """
    Show the winning hyperparameters of a fitted search and hand back
    its full cross-validation log.

    Parameters
    ----------
    search: sklearn.model_selection.RandomizedSearchCV
        a fitted search object exposing `best_params_` and `cv_results_`

    Returns
    -------
    pandas.DataFrame
        `search.cv_results_` converted to a dataframe
    """
    best = search.best_params_
    print(best)
    return pd.DataFrame(search.cv_results_)

In [4]:
# turn off Altair's max-rows check so charts can be built from large dataframes
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [50]:
# load the training data (zipped CSV; the path is relative to the notebook's working directory)
df = pd.read_csv('../data/train_data.zip')

In [51]:
# create lagged target variable column (`session_lagged_1`); this also drops the
# first row of every playground and sorts by external_id, year, month.
# NOTE: `df` is overwritten, so re-run the load cell before re-running this one.
df = add_lagged_target(df)

In [52]:
# impute missing lagged (t-1) target variable, separately for each playground.
# `transform` returns a Series aligned to df's own index; `groupby.apply` on
# pandas >= 2.0 prepends the group key to the index, so assigning its result
# back to a column misaligns or fails.
# limit_direction='both' also fills gaps at the start and end of a playground's series.
df['session_lagged_1'] = (
    df.groupby('external_id')['session_lagged_1']
      .transform(lambda s: s.interpolate(limit_direction='both'))
)

In [53]:
# drop rows missing target variable
# (`drop_missing_unacast` comes from one of the star imports above — presumably drop.py; confirm)
df = drop_missing_unacast(df)

In [54]:
# Time-based hold-out: July 2019 onwards is the validation set,
# every other row is training data.
train = df.loc[lambda d: (d['year'] != 2019) | (d['month'] <= 6)]
valid = df.loc[lambda d: (d['year'] == 2019) & (d['month'] > 6)]

In [55]:
# separate the target column from the training features
y_train = train['unacast_session_count']
X_train = train.drop(columns='unacast_session_count')

In [56]:
# separate the target column from the validation features
y_valid = valid['unacast_session_count']
X_valid = valid.drop(columns='unacast_session_count')

In [57]:
# fill NaN values in both feature sets with a single `impute_data` call
# (helper comes from the star imports above — presumably imputer.py; confirm)
result = impute_data(X_train, X_valid)
X_train, X_valid = result[0], result[1]

In [58]:
# feature engineering: run `comb_cols` on the training set, then on the validation set
X_train, X_valid = comb_cols(X_train), comb_cols(X_valid)

In [59]:
# column dropping: run `drop_columns` on the training set, then on the validation set
X_train, X_valid = drop_columns(X_train), drop_columns(X_valid)

In [60]:
# preview the first five rows of the processed training features
X_train.head(5)

Unnamed: 0,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,monthly_count_ramp,...,monthly_ws_light_air,monthly_ws_light_br,monthly_ws_gentle_br,monthly_ws_moderate_br,historic_ws_calm,historic_ws_light_air,historic_ws_light_br,historic_ws_gentle_br,historic_ws_moderate_br,avg_fertility_rate
2445,3,2018,13,2,11,392153.846154,91923.076923,60769.230769,0.0,0,...,0,13,0,0,0.0,3.0,27.0,4.0,1.0,71.81125
47628,4,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0.0,3.0,27.0,4.0,1.0,71.81125
42531,5,2018,4,1,3,319000.0,60000.0,40000.0,0.0,0,...,0,0,4,0,0.0,3.0,27.0,4.0,1.0,71.81125
2443,6,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0.0,3.0,27.0,4.0,1.0,71.81125
42530,7,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0.0,3.0,27.0,4.0,1.0,71.81125


In [61]:
def ohe_test(X_train, X_valid, to_encode=['income_class', 'density_class', 'climate']):
    """
    One-hot encode categorical columns, fitting the encoder on the training set only.

    The encoder is fitted on `X_train[to_encode]` and then applied to both
    sets. The encoded columns are appended to the right of the remaining
    columns and the original categorical columns are dropped.

    The new columns are named after the category labels themselves, so the
    labels must be unique across all encoded columns. A category that appears
    only in `X_valid` makes the encoder raise, since it uses scikit-learn's
    default handling of unknown categories.

    Parameters
    ----------
    X_train: pandas.core.frame.DataFrame
        training features; not modified
    X_valid: pandas.core.frame.DataFrame
        validation features; not modified
    to_encode: list of str
        names of the categorical columns to encode

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        the encoded training and validation sets, in that order
    """
    encode_cols = list(to_encode)

    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old name; try the new keyword first and fall back for older versions.
    try:
        ohe = OneHotEncoder(sparse_output=False, dtype=int)
    except TypeError:
        ohe = OneHotEncoder(sparse=False, dtype=int)

    # fit on the training data only, then reuse the same categories for validation
    train_encoded = ohe.fit_transform(X_train.loc[:, encode_cols])
    valid_encoded = ohe.transform(X_valid.loc[:, encode_cols])

    # one output column per category, in the encoder's order
    ohe_cols = np.concatenate(ohe.categories_).ravel()

    sub_df_train = pd.DataFrame(train_encoded, index=X_train.index, columns=ohe_cols)
    sub_df_valid = pd.DataFrame(valid_encoded, index=X_valid.index, columns=ohe_cols)

    # replace the categorical columns with their encoded counterparts
    X_train_output = pd.concat((X_train.drop(columns=encode_cols), sub_df_train), axis=1)
    X_valid_output = pd.concat((X_valid.drop(columns=encode_cols), sub_df_valid), axis=1)

    return (X_train_output, X_valid_output)

In [62]:
# one-hot encode the default categorical columns (income_class, density_class, climate)
X_train_valid = ohe_test(X_train, X_valid)
X_train, X_valid = X_train_valid[0], X_train_valid[1]

In [63]:
# baseline: ordinary least squares on all features, scored with RMSE
lr = LinearRegression().fit(X_train, y_train)
report_performance(lr, X_train, y_train, X_valid, y_valid)

Training RMSE: 78.74329241363074
Validation RMSE: 171.80114353341787


In [64]:
# Build a (month, year) -> `cross_val_ind` lookup used to define the
# cross-validation folds: consecutive integers (0, 1, 2, ...) over the distinct
# calendar months present in the training set, in chronological order.
# Derived from X_train itself instead of the first 16 rows of one hard-coded
# playground, so it does not depend on that playground having a complete,
# gap-free history.
months_df = (
    X_train.loc[:, ['month', 'year']]
    .drop_duplicates()
    .sort_values(by=['year', 'month'])
    .reset_index(drop=True)
    .rename_axis('cross_val_ind')
    .reset_index()
)

In [65]:
# attach `cross_val_ind` to every training row: left merge with months_df on
# their shared columns, keeping X_train's original row labels as the index
X_train_ind = X_train.reset_index().merge(months_df, how='left').set_index('index')

In [107]:
# Training rows with `cross_val_ind` attached, on a fresh 0..n-1 index so that
# the labels used for the folds equal row positions in X_train (the left merge
# keeps X_train's row order); the original labels stay in the 'index' column.
# Rebuilt from X_train so the cell is idempotent: re-running it gives the same
# frame instead of inserting another index column.
# validate='many_to_one' guards against duplicate months in months_df, which
# would duplicate rows and break the position alignment.
X_train_ind = X_train.reset_index().merge(
    months_df, how='left', on=['month', 'year'], validate='many_to_one'
)

In [109]:
# inspect the training frame together with its `cross_val_ind` column
X_train_ind

Unnamed: 0,index,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,HI,LI,MI,HD,LD,MD,A,C,D,cross_val_ind
0,2445,3,2018,13,2,11,392153.846154,91923.076923,60769.230769,0.0,...,0,0,1,1,0,0,1,0,0,0
1,47628,4,2018,0,0,0,0.000000,0.000000,0.000000,0.0,...,0,0,1,1,0,0,1,0,0,1
2,42531,5,2018,4,1,3,319000.000000,60000.000000,40000.000000,0.0,...,0,0,1,1,0,0,1,0,0,2
3,2443,6,2018,0,0,0,0.000000,0.000000,0.000000,0.0,...,0,0,1,1,0,0,1,0,0,3
4,42530,7,2018,0,0,0,0.000000,0.000000,0.000000,0.0,...,0,0,1,1,0,0,1,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39629,19014,2,2019,0,0,0,0.000000,0.000000,0.000000,0.0,...,0,0,1,0,1,0,0,1,0,11
39630,49286,3,2019,0,0,0,0.000000,0.000000,0.000000,0.0,...,0,0,1,0,1,0,0,1,0,12
39631,44250,4,2019,0,0,0,0.000000,0.000000,0.000000,0.0,...,0,0,1,0,1,0,0,1,0,13
39632,24063,5,2019,0,0,0,0.000000,0.000000,0.000000,0.0,...,0,0,1,0,1,0,0,1,0,14


In [110]:
# Row indices for time-dependent cross-validation (expanding training window).
# NOTE(review): train_0 / test_0 are reassigned by the later "NEW SPLITS" cell.

# split 0
# train: 2018-03 to 2018-04; test: 2018-05 to 2018-06
train_0 = X_train_ind.loc[X_train_ind['cross_val_ind'].isin([0, 1])].index
test_0 = X_train_ind.loc[X_train_ind['cross_val_ind'].isin([2, 3])].index

In [111]:
# Each split trains on everything the previous split saw (train + test) and
# tests on the next three months.
# NOTE(review): these names are reassigned by the later "NEW SPLITS" cell.

# split 1
# train: 2018-03 to 2018-06; test: 2018-07 to 2018-09
train_1 = train_0.union(test_0)
test_1 = X_train_ind.loc[X_train_ind['cross_val_ind'].isin([4, 5, 6])].index

# split 2
# train: 2018-03 to 2018-09; test: 2018-10 to 2018-12
train_2 = train_1.union(test_1)
test_2 = X_train_ind.loc[X_train_ind['cross_val_ind'].isin([7, 8, 9])].index

# split 3
# train: 2018-03 to 2018-12; test: 2019-01 to 2019-03
train_3 = train_2.union(test_2)
test_3 = X_train_ind.loc[X_train_ind['cross_val_ind'].isin([10, 11, 12])].index

# split 4
# train: 2018-03 to 2019-03; test: 2019-04 to 2019-06
train_4 = train_3.union(test_3)
test_4 = X_train_ind.loc[X_train_ind['cross_val_ind'].isin([13, 14, 15])].index

In [120]:
# NEW SPLITS: get indices for time dependent modeling
# Expanding window with a one-month test set: each split trains on every month
# before the test month. Month labels follow the cross_val_ind mapping shown in
# the X_train_ind preview (0 = 2018-03, ..., 15 = 2019-06).
cv_month = X_train_ind['cross_val_ind']

# split 0
# train: 2018-03 to 2019-01 (cross_val_ind 0-10); test: 2019-02 (cross_val_ind 11)
train_0 = X_train_ind[cv_month < 11].index
test_0 = X_train_ind[cv_month == 11].index

# split 1
# train: 2018-03 to 2019-02; test: 2019-03 (cross_val_ind 12)
train_1 = train_0.union(test_0)
test_1 = X_train_ind[cv_month == 12].index

# split 2
# train: 2018-03 to 2019-03; test: 2019-04 (cross_val_ind 13)
train_2 = train_1.union(test_1)
test_2 = X_train_ind[cv_month == 13].index

# split 3
# train: 2018-03 to 2019-04; test: 2019-05 (cross_val_ind 14)
train_3 = train_2.union(test_2)
test_3 = X_train_ind[cv_month == 14].index

# split 4
# train: 2018-03 to 2019-05; test: 2019-06 (cross_val_ind 15)
train_4 = train_3.union(test_3)
test_4 = X_train_ind[cv_month == 15].index

In [121]:
# pair each training index with its test index: one (train, test) tuple per split
folds = list(zip((train_0, train_1, train_2, train_3, train_4),
                 (test_0, test_1, test_2, test_3, test_4)))

Train gradient boosting models

In [125]:
# NOTE(review): `xgbr_params` is not defined in any cell visible in this part of
# the notebook — make sure it is defined above, otherwise this cell raises
# NameError on Restart & Run All.
# Base estimator: 300 boosting rounds with a fixed seed.
xgbr_opt = XGBRegressor(n_estimators=300, n_jobs=-1, random_state=2020)

# Randomized search scored by negative MSE on the explicit (train, test) index
# pairs in `folds`; n_iter=1 evaluates a single sampled parameter setting.
# NOTE(review): n_jobs=-1 is set on both the estimator and the search — check
# whether the nested parallelism oversubscribes the CPU.
xgbr_rs = RandomizedSearchCV(xgbr_opt, xgbr_params, n_iter=1,
                             scoring='neg_mean_squared_error', n_jobs=-1,
                             cv=folds, verbose=10, random_state=2020)

# `folds` holds row positions taken from X_train_ind, which must share X_train's row order
xgbr_rs.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  4.1min remaining:  6.2min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  4.4min remaining:  2.9min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  5.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  5.0min finished


RandomizedSearchCV(cv=[(Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            39619, 39620, 39621, 39622, 39623, 39624, 39625, 39626, 39627,
            39628],
           dtype='int64', length=27244),
                        Int64Index([   11,    26,    42,    58,    73,    89,   105,   121,   137,
              153,
            ...
            39488, 39504, 39518, 39534, 39550, 39566, 39581, 39597, 39613,
            39629],
           dtype='int64', length=2451)),
                       (Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            39620, 39621, 39622, 39623, 3962...
                                          num_parallel_tree=None,
                                          random_state=2020, reg_alpha=None,
                                          reg_lambda=None,
                                          scale_pos_weight=None, subsample=N