In [1]:
import time
import pandas as pd
import altair as alt
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error

from xgboost import XGBRegressor

# import other functions
from imputer import *
from feature_eng import *
from drop import *

In [2]:
# Lift Altair's default cap on the number of rows a chart's dataset may contain,
# so that large DataFrames can be plotted later in the notebook.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
def report_performance(model, X_train, y_train, X_valid, y_valid,
                       mode='mean'):
    """
    Evaluate train and validation performance on a fitted model.

    The two errors are printed and also returned so that callers can
    store or compare them.

    Parameters
    ---------
    model: fitted estimator
        any fitted model exposing ``predict`` (e.g. a scikit-learn
        GradientBoostingRegressor or an XGBRegressor)
    X_train: pandas.core.frame.DataFrame
        X of training set
    y_train: pandas.core.series.Series
        y of training set
    X_valid: pandas.core.frame.DataFrame
        X of validation set
    y_valid: pandas.core.series.Series
        y of validation set
    mode: string
        'mean'   -> report the mean squared error
        'median' -> report the mean absolute error

    Returns
    -------
    errors: list
        [training error, validation error] for the selected metric

    Raises
    ------
    ValueError
        if `mode` is neither 'mean' nor 'median'
    """
    # Validate up front: an unknown mode used to fall through both branches
    # and crash later with an UnboundLocalError on `metric`.
    if mode not in ('mean', 'median'):
        raise ValueError(
            f"mode must be 'mean' or 'median', got {mode!r}"
        )

    if mode == 'mean':
        metric, score = 'squared', mean_squared_error
    else:
        metric, score = 'absolute', mean_absolute_error

    errors = [score(y_train, model.predict(X_train)),
              score(y_valid, model.predict(X_valid))]

    print('Training mean', metric, 'error:', errors[0])
    print('Validation mean', metric, 'error:', errors[1])

    return errors

Load the data:

In [4]:
# Load the training data straight from the zipped CSV (path is relative to the notebook).
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, i.e. an
# index that was written into the CSV — confirm it is dropped before modelling
# (or consider loading with index_col=0).
df = pd.read_csv('../data/train_data.zip')

In [5]:
# preview the first rows to sanity-check the load
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1804425,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,244.2,0.157475,0.009783,0.147692
1,1812706,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
2,1812706,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
3,1812706,11,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,24.0,258.95,0.157475,0.009783,0.147692
4,1812706,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,258.95,0.157475,0.009783,0.147692


In [6]:
# Remove every row for the playground with external_id 'CA00070678'
# (a no-op when those rows have already been removed from the file).
# NOTE(review): the preview shows numeric-looking external_id values — confirm the
# column is string-typed, otherwise this string comparison may not behave as intended.
df = df.query("external_id != 'CA00070678'")

Create `X` and `y`:

In [7]:
# Separate the prediction target from the remaining (feature) columns.
y = df['unacast_session_count']
X = df.drop(columns=['unacast_session_count'])

Split the data into training and validation sets:

In [8]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=0.2,
                                                      random_state=2020)

In [9]:
# number of observations (rows) in the training set — the 80% share of the split
X_train.shape[0]

40080

In [10]:
# number of observations (rows) in the validation set — the 20% hold-out share
X_valid.shape[0]

10020

Pre-process `X_train` and `X_valid`:

In [11]:
# impute NaN values in both splits with the project helper `impute_data`
# NOTE(review): presumably the imputation is fitted on X_train only and then
# applied to X_valid — confirm, to rule out leakage from the validation set.
result = impute_data(X_train, X_valid)

In [12]:
# Rebind the splits to the imputed frames: item 0 is training, item 1 is validation.
X_train, X_valid = result[0], result[1]

In [13]:
# perform feature engineering: the same `comb_cols` helper is applied to each split
# NOTE: both names are rebound to the helper's output, so re-running this cell
# feeds already-engineered frames back into comb_cols — restart and run all
# cells in order to reproduce the results.
X_train = comb_cols(X_train)
X_valid = comb_cols(X_valid)

In [14]:
# perform feature selection: the same `drop_columns` helper is applied to each
# split (presumably leaving both with an identical set of columns — confirm)
# NOTE: both names are rebound to the helper's output, so this cell is meant
# to be run once per kernel session, in order.
X_train = drop_columns(X_train)
X_valid = drop_columns(X_valid)

In [15]:
# count columns per dtype; the object-dtype columns are the categorical
# features that still need one-hot encoding (OHE)
X_train.dtypes.value_counts()

int64      422
float64    216
object       3
dtype: int64

In [None]:
# OHE remaining categorical features

In [None]:
# check if there are any missing values in X_train, y_train

In [None]:
# check if there are any missing values in X_valid, y_valid

### Models to predict mean session count

#### Gradient boosting regression

Fit a model with default parameters:

In [None]:
# calculate MSE

In [None]:
# print feature importances

Random search for hyperparameter optimization:

In [None]:
def report_search(search):
    """
    Summarise a fitted hyperparameter search on standard output.

    The winning hyperparameter combination is printed first, followed
    by the contents of search.cv_results_ rendered as a dataframe.

    Parameters
    ----------
    search: sklearn.model_selection.RandomizedSearchCV
        fitted search object exposing `best_params_` and `cv_results_`

    """
    winning_params = search.best_params_
    print(winning_params)

    # tabulate the raw cross-validation results before printing them
    print(pd.DataFrame(search.cv_results_))

In [None]:
# hyperparameter search space for the gradient boosting random search
# (placeholder — still empty; presumably meant for RandomizedSearchCV, TODO fill in)
params = {}

In [None]:
# print random search results

In [None]:
# calculate MSE of best estimator

In [None]:
# print feature importances 

#### XGBoost

Fit a model with default parameters:

In [None]:
# calculate MSE

In [None]:
# print feature importances

Random search for hyperparameter optimization:

In [None]:
# hyperparameter search space for the XGBoost random search
# (placeholder — still empty; presumably meant for RandomizedSearchCV, TODO fill in)
params = {}

In [None]:
# print random search results

In [None]:
# calculate MSE of best estimator

In [None]:
# print feature importances

### Models to predict median session count

#### Gradient boosting regression

Fit a model with default parameters:

In [None]:
# calculate MAE (report_performance with mode='median')

In [None]:
# print feature importances

Random search for hyperparameter optimization:

In [None]:
# hyperparameter search space for the gradient boosting random search (median model)
# (placeholder — still empty; presumably meant for RandomizedSearchCV, TODO fill in)
params = {}

In [None]:
# print random search results

In [None]:
# calculate MAE of best estimator (report_performance with mode='median')

In [None]:
# print feature importances

#### XGBoost

Fit a model with default parameters:

In [None]:
# calculate MAE (report_performance with mode='median')

In [None]:
# print feature importances

Random search for hyperparameter optimization:

In [None]:
# hyperparameter search space for the XGBoost random search (median model)
# (placeholder — still empty; presumably meant for RandomizedSearchCV, TODO fill in)
params = {}

In [None]:
# print random search results

In [None]:
# calculate MAE using best estimator (report_performance with mode='median')

In [None]:
# print feature importances