**Note: cell outputs have been cleared because the saved notebook otherwise exceeds 100 MB (most likely because Altair embeds the full dataset in every chart once the row limit is disabled).**

In [1]:
import time
import pandas as pd
import altair as alt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression, LogisticRegression, ElasticNet, Lasso

# import other functions
from scripts_misc.imputer import *
from scripts_misc.feature_eng import *
from scripts_misc.drop import *

from xgboost import XGBRegressor, XGBClassifier

In [None]:
# Remove Altair's cap on the number of rows a chart may be built from,
# so the charts below can be drawn from the full dataframe.
alt.data_transformers.disable_max_rows()

In [None]:
def report_performance(model, X_train, y_train, X_valid, y_valid, 
                       mode='mean'):
    """
    Evaluate train and validation performance on a fitted model.

    Prints the training and validation error and returns both values.

    Parameters
    ----------
    model: object
        fitted estimator exposing a ``predict(X)`` method
        (e.g. a scikit-learn or XGBoost model)
    X_train: pandas.core.frame.DataFrame
        X of training set
    y_train: pandas.core.series.Series
        y of training set
    X_valid: pandas.core.frame.DataFrame
        X of validation set
    y_valid: pandas.core.series.Series
        y of validation set
    mode: string
        'mean' reports the root mean squared error (RMSE);
        'median' reports the mean absolute error (MAE)

    Returns
    -------
    errors: list
        [training error, validation error]

    Raises
    ------
    ValueError
        if `mode` is neither 'mean' nor 'median'
    """
    # Validate the mode before doing any (potentially slow) prediction.
    if mode == 'mean':
        label = 'RMSE'
    elif mode == 'median':
        label = 'MAE'
    else:
        raise ValueError(
            "mode must be 'mean' or 'median', got {!r}".format(mode))

    # (true values, predictions) for the training and validation sets, in that order
    splits = [(y_train, model.predict(X_train)),
              (y_valid, model.predict(X_valid))]

    if mode == 'mean':
        # RMSE is the square root of the mean squared error
        errors = [mean_squared_error(y_true, y_pred) ** 0.5
                  for y_true, y_pred in splits]
    else:
        errors = [mean_absolute_error(y_true, y_pred)
                  for y_true, y_pred in splits]

    print('Training {}:'.format(label), errors[0])
    print('Validation {}:'.format(label), errors[1])

    return errors

### Load the data

In [None]:
# Read the training data into a dataframe.
# NOTE: relative path — it resolves against the kernel's working directory.
df = pd.read_csv('../data/train_data.zip')

In [None]:
# drop rows missing target variable
# (drop_missing_unacast is a project helper; presumably it removes rows whose
#  'unacast_session_count' is NaN — confirm in scripts_misc)
df = drop_missing_unacast(df)

In [None]:
# remove playgrounds where 'external_id' == 'CA00070678'
# TODO(review): record why this playground is excluded so the filter can be
# justified or revisited later.
df = df.query("external_id != 'CA00070678'")

### Set 300 as an arbitrary cut-off between a low and high session count

In [None]:
# Flag observations whose session count exceeds the 300 cut-off.
# .assign builds a new frame instead of writing into the filtered result of
# the earlier .query(...) call, which would raise SettingWithCopyWarning.
df = df.assign(over_300=df['unacast_session_count'] > 300)

In [None]:
# Number of "high" observations: summing a boolean column counts its True values.
int(df['over_300'].sum())

In [None]:
# Bar chart of how often each session count occurs among the 'high' observations
high_count_chart = (
    alt.Chart(df[df['over_300']])
    .mark_bar()
    .encode(
        x=alt.X('unacast_session_count:Q'),
        y=alt.Y('count()'),
    )
)
high_count_chart

### Create a classifier to predict *low* and *high* observations

In [None]:
# Features: everything except the label and the raw count the label is derived from
label_cols = ['over_300', 'unacast_session_count']
X = df.drop(columns=label_cols)
# Target: the boolean high/low flag
y = df['over_300']

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=2020)

In [None]:
# Step 1 - imputation: the helper takes both splits and returns them as a pair
result = impute_data(X_train, X_valid)
X_train, X_valid = result[0], result[1]

# Step 2 - feature engineering, applied to each split on its own (train first)
X_train, X_valid = [comb_cols(split) for split in (X_train, X_valid)]

# Step 3 - feature selection, applied to each split on its own (train first)
X_train, X_valid = [drop_columns(split) for split in (X_train, X_valid)]

# Step 4 - one-hot encoding (climate, density_class, income_class), done jointly
X_train_valid = clean_categorical(X_train, X_valid)
X_train, X_valid = X_train_valid[0], X_train_valid[1]

In [None]:
# check if there are any missing values in X_train, y_train
# (first print: total NaN cells across all feature columns; second: NaN labels)
print(X_train.isna().sum().sum())
print(y_train.isna().sum())

In [None]:
# check if there are any missing values in X_valid, y_valid
# (first print: total NaN cells across all feature columns; second: NaN labels)
print(X_valid.isna().sum().sum())
print(y_valid.isna().sum())

In [None]:
# Fit an XGBoost classifier (n_estimators=200, all other hyperparameters left
# at their defaults) to predict the boolean 'over_300' label.
xgbc = XGBClassifier(n_estimators=200)
xgbc.fit(X_train, y_train)

#### Confusion matrix

In [None]:
# Cross-tabulate the true validation labels (first argument) against the
# classifier's predictions (second argument).
confusion_matrix(y_valid, xgbc.predict(X_valid))

#### Classification report

In [None]:
# Text summary of the classifier's per-class metrics on the validation set.
print(classification_report(y_valid, xgbc.predict(X_valid)))

The classifier could be further improved by tuning it to address the class imbalance.

#### Misclassified observations in validation set

In [None]:
# Element-wise flag: True where the predicted label equals the true label.
# NOTE: despite its name, `clf` is not a classifier; only its index (inherited
# from y_valid) is used below to look the validation rows up in df.
clf = xgbc.predict(X_valid) == y_valid
index = clf.index

In [None]:
# Pair each validation row's true session count and label with the predicted label
clf_df = (
    df.loc[index, ['unacast_session_count', 'over_300']]
    .assign(pred=xgbc.predict(X_valid))
)

In [None]:
# Class probabilities for every validation row; element 1 of each row is kept
# as the probability of the 'over 300' class.
# NOTE(review): assumes the classes are ordered [False, True] — confirm via xgbc.classes_
prob = xgbc.predict_proba(X_valid)
clf_df['prob_over_300'] = [item[1] for item in prob]

In [None]:
# select misclassified observations (rows whose true label differs from the predicted label)
misclf_df = clf_df[clf_df['over_300'] != clf_df['pred']]

In [None]:
# Histogram (at most 20 bins) of the true session count among misclassified rows
session_bins = alt.X(
    'unacast_session_count:Q',
    bin=alt.Bin(maxbins=20),
    title='unacast_session_count',
)
alt.Chart(misclf_df).mark_bar().encode(session_bins, alt.Y('count()'))

In [None]:
# Scatter plot: true session count (x) against the predicted probability of 'over 300' (y)
misclf_points = alt.Chart(misclf_df).mark_circle()
misclf_points.encode(
    alt.X('unacast_session_count:Q'),
    alt.Y('prob_over_300'),
)

Most of the misclassified observations lie at the boundary, as expected.

In [None]:
# Partition the data by the boolean flag: True rows are 'high', the rest are 'low'
is_high = df['over_300']
df_hi = df[is_high]
df_lo = df[~is_high]

### Create training and validation sets for *low* observations

In [None]:
# Features: everything except the high/low flag and the session count itself
non_feature_cols = ['over_300', 'unacast_session_count']
X_lo = df_lo.drop(columns=non_feature_cols)
# Target: the session count (regression)
y_lo = df_lo['unacast_session_count']

In [None]:
# Hold out 20% of the low-count rows for validation.
# The seed is fixed (same value as the classifier split) so that the split,
# and therefore every error reported for the low-count models, is reproducible.
X_train_lo, X_valid_lo, y_train_lo, y_valid_lo = train_test_split(
    X_lo, y_lo, test_size=0.2, random_state=2020
)

# impute NaN values
result_lo = impute_data(X_train_lo, X_valid_lo)
X_train_lo, X_valid_lo = result_lo[0], result_lo[1]

# perform feature engineering
X_train_lo = comb_cols(X_train_lo)
X_valid_lo = comb_cols(X_valid_lo)

# perform feature selection
X_train_lo = drop_columns(X_train_lo)
X_valid_lo = drop_columns(X_valid_lo)

# perform OHE (climate, density_class, income_class)
X_train_valid_lo = clean_categorical(X_train_lo, X_valid_lo)
X_train_lo, X_valid_lo = X_train_valid_lo[0], X_train_valid_lo[1]

# check if there are any missing values in X_train, y_train
# (first print: total NaN cells across all feature columns; second: NaN targets)
print(X_train_lo.isna().sum().sum())
print(y_train_lo.isna().sum())

# check if there are any missing values in X_valid, y_valid
print(X_valid_lo.isna().sum().sum())
print(y_valid_lo.isna().sum())

In [None]:
#X_train_lo.to_csv('../data/X_train_lo.csv')
#X_valid_lo.to_csv('../data/X_valid_lo.csv')

#y_train_lo.to_frame().to_csv('../data/y_train_lo.csv')
#y_valid_lo.to_frame().to_csv('../data/y_valid_lo.csv')

#### `LinearRegression`

In [None]:
# Baseline: linear regression with default settings, fitted on the
# low-count training set.
lr = LinearRegression()
lr.fit(X_train_lo, y_train_lo)

In [None]:
# Training vs. validation error of the linear model (default mode='mean', i.e. RMSE).
report_performance(lr, X_train_lo, y_train_lo, X_valid_lo, y_valid_lo)

In [None]:
def plot_residuals(model, X, y):
    """
    Scatter the residuals of a fitted model against the true target values.

    The residual is computed as ``true - pred``.

    Parameters
    ----------
    model:
        fitted estimator exposing a ``predict(X)`` method
    X: pd.DataFrame
        features to predict on
    y: pandas.core.series.Series
        true target values corresponding to the rows of X

    Returns
    -------
    altair Chart
        circle marks with the true value on the x-axis and the residual
        on the y-axis

    """
    # one row per observation: prediction, truth and their difference
    residual_df = pd.DataFrame({'pred': model.predict(X), 'true': y})
    residual_df['res'] = residual_df['true'] - residual_df['pred']

    x_channel = alt.X('true:Q')
    y_channel = alt.Y('res:Q')

    return alt.Chart(residual_df).mark_circle().encode(x_channel, y_channel)

In [None]:
# Residuals of the linear model on the low-count validation set.
plot_residuals(lr, X_valid_lo, y_valid_lo)

#### `XGBoost`

In [None]:
# XGBoost regressor (n_estimators=200, fixed random_state) fitted on the
# low-count training set.
xgbr = XGBRegressor(n_estimators=200, verbosity=1, random_state=2020)
xgbr.fit(X_train_lo, y_train_lo)

In [None]:
# Training vs. validation error of the XGBoost model (default mode='mean', i.e. RMSE).
report_performance(xgbr, X_train_lo, y_train_lo, X_valid_lo, y_valid_lo)

In [None]:
# Residuals of the XGBoost model on the low-count validation set.
plot_residuals(xgbr, X_valid_lo, y_valid_lo)

### Create training and validation sets for *high* observations

In [None]:
# Features: everything except the high/low flag and the session count itself
excluded_cols = ['over_300', 'unacast_session_count']
X_hi = df_hi.drop(columns=excluded_cols)
# Target: the session count (regression)
y_hi = df_hi['unacast_session_count']

In [None]:
# Hold out 20% of the high-count rows for validation.
# The seed is fixed (same value as the classifier split) so that the split,
# and any model evaluated on it, is reproducible.
X_train_hi, X_valid_hi, y_train_hi, y_valid_hi = train_test_split(
    X_hi, y_hi, test_size=0.2, random_state=2020
)

# impute NaN values
result_hi = impute_data(X_train_hi, X_valid_hi)
X_train_hi, X_valid_hi = result_hi[0], result_hi[1]

# perform feature engineering
X_train_hi = comb_cols(X_train_hi)
X_valid_hi = comb_cols(X_valid_hi)

# perform feature selection
X_train_hi = drop_columns(X_train_hi)
X_valid_hi = drop_columns(X_valid_hi)

# perform OHE (climate, density_class, income_class)
X_train_valid_hi = clean_categorical(X_train_hi, X_valid_hi)
X_train_hi, X_valid_hi = X_train_valid_hi[0], X_train_valid_hi[1]

# check if there are any missing values in X_train, y_train
# (first print: total NaN cells across all feature columns; second: NaN targets)
print(X_train_hi.isna().sum().sum())
print(y_train_hi.isna().sum())

# check if there are any missing values in X_valid, y_valid
print(X_valid_hi.isna().sum().sum())
print(y_valid_hi.isna().sum())

In [None]:
#X_train_hi.to_csv('../data/X_train_hi.csv')
#X_valid_hi.to_csv('../data/X_valid_hi.csv')

In [None]:
#y_train_hi.to_frame().to_csv('../data/y_train_hi.csv')
#y_valid_hi.to_frame().to_csv('../data/y_valid_hi.csv')

# Summary

- Purpose: proof of concept for a two-step model
   - First, classify an observation as either "high" (>300) or "low" (<=300)
   - Then, predict the number of sessions using a model either trained on high-count data or low-count data
- Check out the companion file `classifier_regressor_model.md` that contains the work done in R


- Created an `XGBClassifier` with little tuning
   - F1 scores for "low" and "high" were 0.99 and 0.85, respectively
   
   
- Regression model built for high-count data
   - Poisson regression (log link) used to predict `unacast_session_count`
   - Validation RMSE was `1.22287e+73`
   - Other generalized linear models suitable for count data were considered (e.g. negative binomial); however, the model coefficients couldn't be obtained in R


- Regression model built for low-count data
   - `LinearRegression`
      - Validation RMSE: 54
   - `XGBRegressor`
      - Validation RMSE: 35
   - Poisson regression (log link) 
      - Validation RMSE: 53
      
- It is worth noting that the (validation) residual plots show an increasing trend