In [2]:
import time
import pandas as pd
import altair as alt
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error

from xgboost import XGBRegressor

# import other functions
from imputer import *
from feature_eng import *
from drop import *

In [8]:
def report_performance(model, X_train, y_train, X_valid, y_valid,
                       mode='mean'):
    """
    Evaluate train and validation performance on a fitted model.

    The two errors are printed and also returned, so they can be stored
    or compared across models.

    Parameters
    ----------
    model: object exposing a ``predict`` method
        fitted model (e.g. a scikit-learn GradientBoostingRegressor)
    X_train: pandas.core.frame.DataFrame
        X of training set
    y_train: pandas.core.series.Series
        y of training set
    X_valid: pandas.core.frame.DataFrame
        X of validation set
    y_valid: pandas.core.series.Series
        y of validation set
    mode: string
        'mean' reports the mean squared error,
        'median' reports the mean absolute error

    Returns
    -------
    errors: list
        [training error, validation error] for the selected metric

    Raises
    ------
    ValueError
        If `mode` is neither 'mean' nor 'median'.
    """
    # Pick the metric first so that a misspelled mode fails immediately with
    # a clear message, before any (potentially slow) prediction is made.
    if mode == 'mean':
        score, metric = mean_squared_error, 'squared'
    elif mode == 'median':
        score, metric = mean_absolute_error, 'absolute'
    else:
        raise ValueError(
            "mode must be 'mean' or 'median', got {!r}".format(mode)
        )

    errors = [score(y_train, model.predict(X_train)),
              score(y_valid, model.predict(X_valid))]

    print('Training mean', metric, 'error:', errors[0])
    print('Validation mean', metric, 'error:', errors[1])

    return errors

Load the data:

In [3]:
# read the training data; pandas infers the zip compression from the file extension
df = pd.read_csv('../data/train_data.zip')

Create `X` and `y`:

In [4]:
# separate the target column from the remaining (feature) columns
y = df['unacast_session_count']
X = df.drop(columns='unacast_session_count')

In [None]:
# check that playgrounds with 'external_id' == 'CA00070678' are removed

Split the data into training and validation sets:

In [5]:
# hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=2020
)

In [6]:
# size of the training set (row count)
len(X_train)

40096

In [7]:
# size of the validation set (row count)
len(X_valid)

10024

Pre-process `X_train` and `X_valid`:

In [8]:
# impute NaN values

In [None]:
# perform feature engineering

In [1]:
# perform feature selection

In [2]:
# one-hot encode remaining categorical features

In [4]:
# check if there are any missing values in X_train, y_train

In [5]:
# check if there are any missing values in X_valid, y_valid

### Models to predict mean session count

#### Gradient boosting regression

Fit a model with default parameters:

In [20]:
# calculate MSE

In [None]:
# print feature importances

Random search for hyperparameter optimization:

In [None]:
def report_search(search):
    """
    Summarise a finished hyperparameter search.

    Prints the winning hyperparameter combination first, followed by
    the full ``cv_results_`` table rendered as a dataframe.

    Parameters
    ----------
    search: sklearn.model_selection.RandomizedSearchCV
        fitted search object exposing ``best_params_`` and ``cv_results_``

    """
    best_settings = search.best_params_
    print(best_settings)

    print(pd.DataFrame(search.cv_results_))

In [21]:
# TODO: search space for the gradient boosting random search (mean target) is still empty
params = {}

In [13]:
# print random search results

In [11]:
# calculate MSE of best estimator

In [None]:
# print feature importances 

#### XGBoost

Fit a model with default parameters:

In [None]:
# calculate MSE

In [None]:
# print feature importances

Random search for hyperparameter optimization:

In [22]:
# TODO: search space for the XGBoost random search (mean target) is still empty
params = {}

In [29]:
# print random search results

In [15]:
# calculate MSE of best estimator

In [None]:
# print feature importances

### Models to predict median session count

#### Gradient boosting regression

Fit a model with default parameters:

In [None]:
# calculate MAE (report_performance with mode='median')

In [None]:
# print feature importances

Random search for hyperparameter optimization:

In [25]:
# TODO: search space for the gradient boosting random search (median target) is still empty
params = {}

In [28]:
# print random search results

In [15]:
# calculate MAE of best estimator (report_performance with mode='median')

In [None]:
# print feature importances

#### XGBoost

Fit a model with default parameters:

In [None]:
# calculate MAE (report_performance with mode='median')

In [None]:
# print feature importances

Random search for hyperparameter optimization:

In [26]:
# TODO: search space for the XGBoost random search (median target) is still empty
params = {}

In [27]:
# print random search results

In [15]:
# calculate MAE of best estimator (report_performance with mode='median')

In [None]:
# print feature importances