In [1]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler


#Download libraries
import re
from collections import defaultdict
import pandas as pd
import altair as alt
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import time


alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
# Import functions from scripts
from scripts_misc.preprocessing_old import *
from scripts_misc.PCA_data import *

In [3]:
df = pd.read_csv('../data/old_train_data.zip')

### Running preprocessing and data split

In [5]:
clean_df = preprocessing_na(df)
clean_df = clean_categorical_old(clean_df)

In [10]:
X = clean_df.drop(['unacast_session_count','external_id', 'state'],axis=1)
y = clean_df['unacast_session_count']

In [11]:
#clean_df.drop('unacast_session_count',axis=1)

In [12]:
#clean_df.columns[clean_df.isna().any()].tolist()

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)

### Running single model fit to test

In [91]:
rf = RandomForestRegressor(n_estimators=1000, max_depth = 15, min_samples_split = 100, max_features = 0.10, bootstrap = True)

In [93]:
t0 = time.time()
rf.fit(X_train, y_train)
t1 = time.time()
tr_time = t1-t0

In [94]:
tr_time/60

19.015274806817374

In [95]:
rf.score(X_train,y_train)

0.37345521902158674

In [96]:
rf.score(X_test,y_test)

0.18648354515924348

### setting hyperparameter dictionary for optimization

In [126]:
max_depth = [1+i*4 for i in range(1,5)]
min_samples_leaf = [50+i*50 for i in range(2,8)]
max_features = [0.05*i for i in range(1,8)]
bootstrap = True
max_samples = [0.05*i for i in range(14,21)]
d = {"max_depth":max_depth, "min_samples_leaf":min_samples_leaf, "max_features":max_features}

In [124]:
d.values()

dict_values([[5, 9, 13, 17], [150, 200, 250, 300, 350, 400], [0.05, 0.1, 0.15000000000000002, 0.2, 0.25, 0.30000000000000004, 0.35000000000000003]])

In [125]:
c = 1
for val in d.values():
    c *= len(val)
c

168

### Running Randomized grid search for optimization

In [130]:
t0 = time.time()
rf_cv = RandomForestRegressor(n_estimators=750, bootstrap = True)
rgscv = RandomizedSearchCV(rf_cv,param_distributions=d,return_train_score=True ,n_iter=15 ,scoring=['neg_root_mean_squared_error'], refit=False)
search = rgscv.fit(X, y)
t1 = time.time()
cv_time = t1-t0

In [131]:
cv_time/60

599.4347654660543

In [158]:
d = search.cv_results_

In [168]:
#d

### Extracting and printing RGSCV results

In [175]:
rf_rgscv_results = pd.DataFrame(data=d).sort_values(by='rank_test_neg_root_mean_squared_error')
rf_rgscv_results=rf_rgscv_results.iloc[:,:22]

In [176]:
rf_rgscv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_features,param_max_depth,split0_test_neg_root_mean_squared_error,split1_test_neg_root_mean_squared_error,split2_test_neg_root_mean_squared_error,...,mean_test_neg_root_mean_squared_error,std_test_neg_root_mean_squared_error,rank_test_neg_root_mean_squared_error,split0_train_neg_root_mean_squared_error,split1_train_neg_root_mean_squared_error,split2_train_neg_root_mean_squared_error,split3_train_neg_root_mean_squared_error,split4_train_neg_root_mean_squared_error,mean_train_neg_root_mean_squared_error,std_train_neg_root_mean_squared_error
1,1016.926774,3.763795,0.587974,0.011191,150,0.3,13,-464.515447,-302.730338,-629.580331,...,-455.094796,159.378756,1,-481.215124,-512.288838,-433.786685,-435.254739,-519.565405,-476.422158,36.560282
6,1197.509361,8.609965,0.597603,0.012514,150,0.35,13,-464.434362,-302.716681,-629.421185,...,-455.160011,159.24192,2,-481.438013,-512.404457,-433.465274,-435.162117,-519.539372,-476.401847,36.678772
14,773.598454,0.646408,0.293532,0.005846,200,0.3,9,-469.116574,-308.985715,-632.90214,...,-460.035437,157.964256,3,-486.421906,-517.017877,-439.159816,-440.049705,-524.077024,-481.345266,36.357245
0,170.13329,1.54478,0.78284,0.004134,150,0.05,17,-469.553148,-310.363621,-633.059468,...,-460.885842,158.57553,4,-486.676589,-517.196068,-439.685469,-440.214125,-525.015812,-481.757612,36.461703
10,326.690148,2.641367,0.684469,0.002193,200,0.1,17,-471.175422,-313.376754,-634.705612,...,-462.867679,157.827871,5,-489.015941,-519.653478,-442.311804,-442.694866,-526.770448,-484.089307,36.248093
2,626.061902,3.289099,0.29434,0.006392,250,0.25,9,-471.471676,-313.456268,-635.223201,...,-463.548105,157.226865,6,-489.649715,-520.223689,-442.509509,-444.068146,-527.718017,-484.833815,36.243282
13,452.836061,2.845391,0.520202,0.003601,250,0.15,13,-472.613154,-315.70658,-635.963329,...,-464.642483,157.018427,7,-490.870122,-521.446642,-444.231599,-444.53498,-528.691804,-485.955029,36.24006
12,297.650893,2.421671,0.532775,0.007687,250,0.1,13,-474.514704,-318.194485,-636.889917,...,-466.500964,156.712289,8,-493.145293,-523.082607,-445.983079,-445.964647,-530.364993,-487.708124,36.287801
7,128.459541,0.376328,0.356902,0.003815,200,0.05,9,-475.063084,-319.331639,-637.667586,...,-467.553307,156.666474,9,-493.409384,-523.399621,-446.76338,-446.792077,-530.814128,-488.235718,36.09354
3,557.549278,4.709319,0.420704,0.00359,350,0.2,17,-476.147388,-321.006546,-638.344953,...,-468.683273,155.995321,10,-495.083086,-525.064618,-448.74342,-449.001338,-532.649134,-490.108319,35.936752


In [177]:
rf_rgscv_results.to_csv("../results/RF_v1_gscv_results.csv", index = False)

In [148]:
# min_sample_leaf = []
# max_features = []
# max_depth = []
# for i in search.cv_results_['params']:
#     min_sample_leaf.append(i['min_sample_leaf'])
#     max_features.append(i['max_features'])
#     max_depth.append(i['max_depth'])

[{'min_samples_leaf': 150, 'max_features': 0.05, 'max_depth': 17},
 {'min_samples_leaf': 150,
  'max_features': 0.30000000000000004,
  'max_depth': 13},
 {'min_samples_leaf': 250, 'max_features': 0.25, 'max_depth': 9},
 {'min_samples_leaf': 350, 'max_features': 0.2, 'max_depth': 17},
 {'min_samples_leaf': 400, 'max_features': 0.25, 'max_depth': 17},
 {'min_samples_leaf': 350,
  'max_features': 0.15000000000000002,
  'max_depth': 5},
 {'min_samples_leaf': 150,
  'max_features': 0.35000000000000003,
  'max_depth': 13},
 {'min_samples_leaf': 200, 'max_features': 0.05, 'max_depth': 9},
 {'min_samples_leaf': 350, 'max_features': 0.1, 'max_depth': 17},
 {'min_samples_leaf': 250, 'max_features': 0.05, 'max_depth': 17},
 {'min_samples_leaf': 200, 'max_features': 0.1, 'max_depth': 17},
 {'min_samples_leaf': 300, 'max_features': 0.1, 'max_depth': 13},
 {'min_samples_leaf': 250, 'max_features': 0.1, 'max_depth': 13},
 {'min_samples_leaf': 250,
  'max_features': 0.15000000000000002,
  'max_depth':

## Summary


- Preliminary round of modeling with SKlearn's RandomForestRegressor to minimize the RMSE (finding the mean)

    - MAE minimization in SKlearn's random forest is problmatic due to implementation, which leads to **very** long run times 

- Input data
    - Missing values were mostly imputed with 0s
    - Columns with a high proportion of missing values were dropped
    
- Tuning and optimization
    - Assuming that the quality of the random forest increases linearly with the number of trees, a medium-ish number was selected to save time (n=750)
    - optimized over `max_depth`, `min_samples_leaf`, and `max_features`. Explanation on each can be found [here](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)
    - train RMSE was generally between 470 to 490, and validation RMSE between 455 and 470, which suggests slight under fitting.
    - Generally runs with higher percentage of columns used per tree performed better, however this is expected due to slight undefitting.