In [34]:
import time
import pandas as pd
import altair as alt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# import other functions
from imputer import *
from feature_eng import *
from drop import *

from xgboost import XGBRegressor, XGBClassifier

In [2]:
def report_performance(model, X_train, y_train, X_valid, y_valid, 
                       mode='mean'):
    """
    Evaluate train and validation performance on a fitted model.
    
    Parameters
    ---------     
    model: sklearn.ensemble._gb.GradientBoostingRegressor
        scikit-learn model
    X_train: pandas.core.frame.DataFrame
        X of training set
    y_train: pandas.core.series.Series
        y of training set
    X_valid: pandas.core.frame.DataFrame        
        X of validation set
    y_valid: pandas.core.series.Series
        y of validation set     
    mode: string
        'mean' or 'median'
    
    Returns
    -------
    errors: list
        
    """
    if mode == 'mean':
        errors = [mean_squared_error(y_train, 
                                     model.predict(X_train)) ** 0.5, 
                  mean_squared_error(y_valid, 
                                     model.predict(X_valid)) ** 0.5]
        
        print('Training RMSE:', errors[0])
        print('Validation RMSE:', errors[1])
        
        
    elif mode == 'median':
        errors = [mean_absolute_error(y_train, 
                                      model.predict(X_train)), 
                  mean_absolute_error(y_valid, 
                                      model.predict(X_valid))]
        
        print('Training MAE:', errors[0])
        print('Validation MAE:', errors[1])

In [3]:
df = pd.read_csv('../data/train_data.zip')

In [4]:
# drop rows missing target variable
df = drop_missing_unacast(df)

In [5]:
# remove playgrounds where 'external_id' == 'CA00070678'
df = df.query("external_id != 'CA00070678'")

In [6]:
df['over_300'] = df['unacast_session_count'] > 300

In [7]:
df[df['over_300'] == True].shape[0]

4150

In [8]:
X = df.drop(columns=['over_300', 'unacast_session_count'], axis=1)
y = df.loc[:, 'over_300']

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=0.2,
                                                      random_state=2020)

In [10]:
# impute NaN values
result = impute_data(X_train, X_valid)

In [11]:
X_train = result[0]
X_valid = result[1]

In [12]:
# perform feature engineering
X_train = comb_cols(X_train)
X_valid = comb_cols(X_valid)

In [13]:
# perform feature selection
X_train = drop_columns(X_train)
X_valid = drop_columns(X_valid)

In [14]:
# perform OHE (climate, density_class, income_class)
X_train_valid = clean_categorical(X_train, X_valid)
X_train = X_train_valid[0]
X_valid = X_train_valid[1]

In [15]:
# check if there are any missing values in X_train, y_train
print(X_train.isna().sum().sum())
print(y_train.isna().sum())

0
0


In [16]:
# check if there are any missing values in X_valid, y_valid
print(X_valid.isna().sum().sum())
print(y_valid.isna().sum())

0
0


In [31]:
clf = RandomForestClassifier(n_estimators = 500, max_depth=10,class_weight="balanced")
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [32]:
clf.score(X_valid, y_valid)

0.9445342493433017

In [33]:
confusion_matrix(y_valid, clf.predict(X_valid))

array([[8618,  464],
       [  85,  731]])

In [None]:
print(classification_report())

In [None]:
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)

In [281]:
# fit a preliminary logistic regression model
clf = LogisticRegression(class_weight='balanced', max_iter=700, verbose=2).fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   39.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   39.1s finished


In [282]:
clf.score(X_valid, y_valid)

0.7955142453020813

In [290]:
confusion_matrix(y_valid, xgbc.predict(X_valid))

array([[9489,   34],
       [  82,  293]], dtype=int64)

In [338]:
xgbc.predict(X_valid) == y_valid

12721    True
17871    True
46441    True
48833    True
50069    True
         ... 
36902    True
5882     True
22158    True
35672    True
31331    True
Name: over_500, Length: 9898, dtype: bool

### Create two sets of data

In [132]:
df = pd.read_csv('../data/train_data.zip')

# drop rows missing target variable
df = drop_missing_unacast(df)

# remove playgrounds where 'external_id' == 'CA00070678'
df = df.query("external_id != 'CA00070678'")

In [268]:
# True if unacast_session_count > 500; False otherwise
df['over_500'] = df['unacast_session_count'] > 300

In [269]:
df[~df['over_500']].shape[0] + df[df['over_500']].shape[0] == df.shape[0]

True

In [270]:
df_hi = df[df['over_500']]
df_lo = df[~df['over_500']]

Create `hi` model:

In [271]:
X_hi = df_hi.drop(columns=['over_500', 'unacast_session_count'], axis=1)
y_hi = df_hi.loc[:, 'unacast_session_count']

In [272]:
X_train_hi, X_valid_hi, y_train_hi, y_valid_hi = train_test_split(X_hi, y_hi, test_size=0.2)

# impute NaN values
result_hi = impute_data(X_train_hi, X_valid_hi)
X_train_hi = result_hi[0]
X_valid_hi = result_hi[1]

# perform feature engineering
X_train_hi = comb_cols(X_train_hi)
X_valid_hi = comb_cols(X_valid_hi)

# perform feature selection
X_train_hi = drop_columns(X_train_hi)
X_valid_hi = drop_columns(X_valid_hi)

# perform OHE (climate, density_class, income_class)
X_train_valid_hi = clean_categorical(X_train_hi, X_valid_hi)
X_train_hi = X_train_valid_hi[0]
X_valid_hi = X_train_valid_hi[1]

# check if there are any missing values in X_train, y_train
print(X_train_hi.isna().sum().sum())
print(y_train_hi.isna().sum())

# check if there are any missing values in X_valid, y_valid
print(X_valid_hi.isna().sum().sum())
print(y_valid_hi.isna().sum())

0
0
0
0


In [273]:
lr = LinearRegression()
lr.fit(X_train_hi, y_train_hi)

LinearRegression()

In [274]:
report_performance(lr, X_train_hi, y_train_hi, X_valid_hi, y_valid_hi)

Training RMSE: 304.058813712788
Validation RMSE: 479.0032637453416


In [264]:
X_lo = df_lo.drop(columns=['over_500', 'unacast_session_count'], axis=1)
y_lo = df_lo.loc[:, 'unacast_session_count']

In [265]:
X_train_lo, X_valid_lo, y_train_lo, y_valid_lo = train_test_split(X_lo, y_lo, test_size=0.2)

# impute NaN values
result_lo = impute_data(X_train_lo, X_valid_lo)
X_train_lo = result_lo[0]
X_valid_lo = result_lo[1]

# perform feature engineering
X_train_lo = comb_cols(X_train_lo)
X_valid_lo = comb_cols(X_valid_lo)

# perform feature selection
X_train_lo = drop_columns(X_train_lo)
X_valid_lo = drop_columns(X_valid_lo)

# perform OHE (climate, density_class, income_class)
X_train_valid_lo = clean_categorical(X_train_lo, X_valid_lo)
X_train_lo = X_train_valid_lo[0]
X_valid_lo = X_train_valid_lo[1]

# check if there are any missing values in X_train, y_train
print(X_train_lo.isna().sum().sum())
print(y_train_lo.isna().sum())

# check if there are any missing values in X_valid, y_valid
print(X_valid_lo.isna().sum().sum())
print(y_valid_lo.isna().sum())

0
0
0
0


In [266]:
lr = LinearRegression()
lr.fit(X_train_lo, y_train_lo)

LinearRegression()

In [267]:
report_performance(lr, X_train_lo, y_train_lo, X_valid_lo, y_valid_lo)

Training RMSE: 53.13742745697422
Validation RMSE: 54.236752855382875


In [278]:
xgbr = XGBRegressor(n_estimators=100, verbosity=1, random_state=2020)
xgbr.fit(X_train_hi, y_train_hi)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=2020,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=1)

In [279]:
report_performance(xgbr, X_train_hi, y_train_hi, X_valid_hi, y_valid_hi)

Training RMSE: 43.772364779746084
Validation RMSE: 273.9819090342743
