In [1]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler


#Download libraries
import re
from collections import defaultdict
import pandas as pd
import altair as alt
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import time


alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
df = pd.read_csv('../data/train_data.zip')

### Importing 5 helpful functions

In [3]:
def dict_to_columns_df(col, key, val):
    """
    This functions takes a dataframe column which is in the
    form of list of dictionaries and creates a dataframe
    from the keys of the in the inner list of dictionaries 
    e.g. "[{'key': A, 'val': 1}, {'key': B, 'val': 2}]"
    
    Parameters
    ----------------
    col : DataFrame Series, the columns whose values are the in the format
    of a list of dictionaries.
    
    key : the keys in the inner dictionary from which column names are to be extracted
    
    val : the keys in the inner dictionary from which values in the column needs to
    be extracted
    
    
    Returns
    ----------------
    DataFrame
        With the new columns created from the keys of the inner dictionary
        
    """
    key_list = set()
    i=0
    # getting all the new column names
    while i < len(col):
        if type(col[i]) != float:
            dic_list = eval(col[i]) #converting col value from string to list
            for dic in range(len(dic_list)):
                if re.match('[a-zA-Z]', dic_list[dic][str(key)][0]): #removing spanish names
                    key_list.add("monthly_"+dic_list[dic][str(key)])
        i+=1
    
    all_cols_dict = defaultdict(list)
    
    i = 0
    while i < len(col):
        if type(col[i]) != float:
            dic_list = eval(col[i]) #converting col value from string to list

            for col_names in list(key_list):
                flag = 0 #to check if a column name exists in the dictionary
                for dic in range(len(dic_list)):
                    if dic_list[dic][str(key)] == col_names[8:]: #getting values from the inner dictionary matching the key
                        all_cols_dict[col_names].append(dic_list[dic][str(val)]) #putting inner dict values to new default dict
                        flag = 1
                        break
                
                if flag==0:
                    all_cols_dict[col_names].append(None)

        else:
            for col_names in list(key_list):
                all_cols_dict[col_names].append(None)

        i+=1
    new_cols_df = pd.DataFrame(all_cols_dict)
    
    # checking new df has same number of columns as given column
    if new_cols_df.shape[0] == col.shape[0]:
        return new_cols_df
    else:
        print("Column dimensions don't match")

In [4]:
def biba_pp(full_data):  
    
    """
    Performs the pre-processing of the columns for the biba data
    
    Paramters
    ---------------
    
    full_data : DataFrame, with no operations done on the biba columns
    
    Returns
    ---------------
    DataFrame
        with processed biba columns
    
    """
    biba_games_df = pd.DataFrame()
    biba_games_df = pd.concat([full_data.loc[:, 'monthly_number_of_sessions':'distance_to_nearest_bus_stop'],
                               full_data.loc[:, 'historic_number_of_sessions':'historic_snow']], axis = 1)
    
    #extracting categorical features
    categorical_features = biba_games_df.loc[:, biba_games_df.dtypes == "object"]
     
    # creating cols from list of dictionaries
    monthly_survey_df = dict_to_columns_df(categorical_features['monthly_survey'], 'question', 'avg_answer')
    monthly_weekday_counts_df = dict_to_columns_df(categorical_features['monthly_weekday_counts'], 'weekday', 'count')
    
    biba_games_df = pd.concat([biba_games_df, monthly_survey_df, monthly_weekday_counts_df], axis = 1)
    
    #dropping categorical features
    biba_games_df = biba_games_df.drop(columns = list(categorical_features.columns))
    
    #dropping historic hours with low fill rate
    numerical_cols_to_remove = ['historic_hour_0', 'historic_hour_23', 'historic_hour_22', 'historic_hour_21',
                                'historic_hour_7','historic_hour_6','historic_hour_5','historic_hour_4', 
                                'historic_hour_3','historic_hour_2','historic_hour_1', 'MonthYear']
    
    biba_games_df = biba_games_df.drop(columns = numerical_cols_to_remove)
    
    impute_biba_games_df =  biba_games_df.fillna(0)
    
    #removing the previous columns in the input data
    cols_to_drop = list(df.loc[:, 'monthly_number_of_sessions': 'distance_to_nearest_bus_stop'].columns) +\
                    list(df.loc[:, 'historic_number_of_sessions' : 'historic_snow'].columns)
    
    
    full_data = full_data.drop(columns = cols_to_drop)
    
    #adding processed columns
    full_data = pd.concat([full_data, impute_biba_games_df], axis = 1)
    
    return full_data

In [5]:
def preprocess_neighbour(input_data):
    """
    Given the original dataframe, preprocess the columns
    related to locale information (`city` to
    `houses_per_sq_km`). Drop columns with >30%
    NaN values and replace remaining NaN values with 0.
    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame
    Returns
    -------
    output_data : pandas.core.frame.DataFrame
    """
    
    df_neighbour = input_data.loc[:, 'city':'houses_per_sq_km']
    df_neighbour.drop(columns=['climate'])
    missing = df_neighbour.isna()
    
    # Count number of missing values for each column
    num_missing = missing.sum().sort_values(ascending=False)
    
    # Calculate proportion of missing values for each column
    prop_missing = num_missing / df.shape[0]
    
    # Create a list of columns with >30% of values missing
    to_drop = prop_missing[prop_missing > 0.3].index.to_list()
    
    # Add `country` to the list since all playgrounds are in the U.S.
    # Add `city` and `county` since lat. and long. should take care of them
    to_drop.append('country')
    to_drop.append('city')
    to_drop.append('county')
    
    # Drop columns with names in list
    output_data = input_data.drop(to_drop, axis=1)
    
    # Fill in remaining NaN values in locale-related columns with 0
    to_impute = prop_missing[(0 < prop_missing) & (prop_missing <= 0.3)].index.to_list()
    to_impute.remove('city')
    to_impute.remove('county')
    output_data[to_impute] = output_data[to_impute].fillna(0)
    output_data['climate'] = input_data['climate']

    return output_data

In [6]:
def preprocess_weather(input_data):
    """
    Given the original dataframe, preprocess the columns
    related to weather information (`Democrats_08_Votes` to
    the end + `climate`). Impute NaN of `Number_of_holidays` 
    by using the values the we have for the same month,
    impute NaN of `Green_2016` by using values found online, or 0, 
    and replace remaining NaN values with 0.
    
    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame
    
    Returns
    -------
    output_data : pandas.core.frame.DataFrame
    
    """
    
    df_weather = input_data.loc[:, 'Democrats_08_Votes':]
    df_weather['state'] = input_data['state']
    df_weather['climate'] = input_data['climate']
    df_weather['external_id'] = input_data['external_id']
    df_weather['month'] = input_data['month']
    df_weather['year'] = input_data['year']
    
    
    #fill up NaNs for `Number_of_holidays` column
    #I sorted the values so that the values are ordered by time, and the NaNs are at the end of each time period
    df_weather = df_weather.sort_values(['month', 'year', 'Number_of_holidays'])
    df_weather['Number_of_holidays'] = df_weather['Number_of_holidays'].fillna(method='ffill')
    
    #fill up NaNs for the `Green_2016` column
    #I only found values for Alaska and North Carolina, so I just put 0 for the other states
    df_weather['Green_2016'] = np.where(
     df_weather['state'] == 'Alaska', 5735, 
         np.where(
            df_weather['state'] == 'North Carolina', 12105,  
             np.where(
                df_weather['Green_2016'].isnull(), 0, df_weather['Green_2016'] 
             )
         )
    )
    
    df_weather['climate'] = df_weather['climate'].fillna(df_weather['climate'].mode()[0])
    
    #Substitute every remaining NaNs by 0
    df_weather = df_weather.fillna(value=0)
    
    output_data = input_data.copy()
    output_data.loc[:, 'Democrats_08_Votes':] = df_weather.loc[:, 'Democrats_08_Votes':]
    output_data['climate'] = df_weather['climate']
    
    #Tests
    
    #Check that there are no missing values in the `Number_of_holidays` column
    if not output_data['Number_of_holidays'].isnull().sum() == 0:
        raise Error('There should not be NaNs in the Number_of_holidays column')
    
    #Check that every month has only one value for the `Number_of_holiday` column
    number_of_error = 0
    for month in range(12):
        for year in [2018, 2019]:
            sub_df = output_data[(output_data['month'] == month+1) & (output_data['year'] == year)]
            if len(sub_df['Number_of_holidays'].unique()) > 1:
                number_of_error += 1 
    if not number_of_error == 0:
        raise Error('Every month should have the same value for Number_of_holidays')
    
    
               
    return output_data

In [7]:
def clean_categorical(input_data, to_drop=['income_class', 'density_class', 'climate']):
    """
    Given the original dataframe, uses One-Hot-Encoding to encode the categorical variables
    
    
    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame
    to_drop : list
        The list of the categorical variables on which we want to apply OHE
    
    Returns
    -------
    output_data : pandas.core.frame.DataFrame
    
    """
    
    output_data = input_data.copy()

    #Apply One-Hot-Encoding to each one of the categorical variable
    for col in to_drop:
        ohe = OneHotEncoder(sparse=False, dtype=int)
        sub_df = pd.DataFrame(ohe.fit_transform(input_data[[col]]), columns=ohe.categories_[0])
        output_data = pd.concat((output_data, sub_df), axis=1)
    #Drop the columns for which we used OHE
    output_data.drop(columns = to_drop, inplace=True)
    
    return output_data

### Running preprocessing and data split

In [9]:
df = pd.read_csv('../data/train_data.zip')
clean_df = preprocess_neighbour(df) 
clean_df = biba_pp(clean_df)
clean_df = preprocess_weather(clean_df)
#clean_df.loc[:, clean_df.dtypes == "object"].columns
clean_df = clean_categorical(clean_df)
# filling 0 in 'days_since_first_sess'
clean_df['days_since_first_sess'] = clean_df['days_since_first_sess'].fillna(0)


In [10]:
X = clean_df.drop(['unacast_session_count','external_id', 'state'],axis=1)
y = clean_df['unacast_session_count']

In [11]:
#clean_df.drop('unacast_session_count',axis=1)

In [12]:
#clean_df.columns[clean_df.isna().any()].tolist()

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)

### Running single model fit to test

In [91]:
rf = RandomForestRegressor(n_estimators=1000, max_depth = 15, min_samples_split = 100, max_features = 0.10, bootstrap = True)

In [93]:
t0 = time.time()
rf.fit(X_train, y_train)
t1 = time.time()
tr_time = t1-t0

In [94]:
tr_time/60

19.015274806817374

In [95]:
rf.score(X_train,y_train)

0.37345521902158674

In [96]:
rf.score(X_test,y_test)

0.18648354515924348

### setting hyperparameter dictionary for optimization

In [126]:
max_depth = [1+i*4 for i in range(1,5)]
min_samples_leaf = [50+i*50 for i in range(2,8)]
max_features = [0.05*i for i in range(1,8)]
bootstrap = True
max_samples = [0.05*i for i in range(14,21)]
d = {"max_depth":max_depth, "min_samples_leaf":min_samples_leaf, "max_features":max_features}

In [124]:
d.values()

dict_values([[5, 9, 13, 17], [150, 200, 250, 300, 350, 400], [0.05, 0.1, 0.15000000000000002, 0.2, 0.25, 0.30000000000000004, 0.35000000000000003]])

In [125]:
c = 1
for val in d.values():
    c *= len(val)
c

168

### Running Randomized grid search for optimization

In [130]:
t0 = time.time()
rf_cv = RandomForestRegressor(n_estimators=750, bootstrap = True)
rgscv = RandomizedSearchCV(rf_cv,param_distributions=d,return_train_score=True ,n_iter=15 ,scoring=['neg_root_mean_squared_error'], refit=False)
search = rgscv.fit(X, y)
t1 = time.time()
cv_time = t1-t0

In [131]:
cv_time/60

599.4347654660543

In [158]:
d = search.cv_results_

In [168]:
#d

### Extracting and printing RGSCV results

In [175]:
rf_rgscv_results = pd.DataFrame(data=d).sort_values(by='rank_test_neg_root_mean_squared_error')
rf_rgscv_results=rf_rgscv_results.iloc[:,:22]

In [176]:
rf_rgscv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_features,param_max_depth,split0_test_neg_root_mean_squared_error,split1_test_neg_root_mean_squared_error,split2_test_neg_root_mean_squared_error,...,mean_test_neg_root_mean_squared_error,std_test_neg_root_mean_squared_error,rank_test_neg_root_mean_squared_error,split0_train_neg_root_mean_squared_error,split1_train_neg_root_mean_squared_error,split2_train_neg_root_mean_squared_error,split3_train_neg_root_mean_squared_error,split4_train_neg_root_mean_squared_error,mean_train_neg_root_mean_squared_error,std_train_neg_root_mean_squared_error
1,1016.926774,3.763795,0.587974,0.011191,150,0.3,13,-464.515447,-302.730338,-629.580331,...,-455.094796,159.378756,1,-481.215124,-512.288838,-433.786685,-435.254739,-519.565405,-476.422158,36.560282
6,1197.509361,8.609965,0.597603,0.012514,150,0.35,13,-464.434362,-302.716681,-629.421185,...,-455.160011,159.24192,2,-481.438013,-512.404457,-433.465274,-435.162117,-519.539372,-476.401847,36.678772
14,773.598454,0.646408,0.293532,0.005846,200,0.3,9,-469.116574,-308.985715,-632.90214,...,-460.035437,157.964256,3,-486.421906,-517.017877,-439.159816,-440.049705,-524.077024,-481.345266,36.357245
0,170.13329,1.54478,0.78284,0.004134,150,0.05,17,-469.553148,-310.363621,-633.059468,...,-460.885842,158.57553,4,-486.676589,-517.196068,-439.685469,-440.214125,-525.015812,-481.757612,36.461703
10,326.690148,2.641367,0.684469,0.002193,200,0.1,17,-471.175422,-313.376754,-634.705612,...,-462.867679,157.827871,5,-489.015941,-519.653478,-442.311804,-442.694866,-526.770448,-484.089307,36.248093
2,626.061902,3.289099,0.29434,0.006392,250,0.25,9,-471.471676,-313.456268,-635.223201,...,-463.548105,157.226865,6,-489.649715,-520.223689,-442.509509,-444.068146,-527.718017,-484.833815,36.243282
13,452.836061,2.845391,0.520202,0.003601,250,0.15,13,-472.613154,-315.70658,-635.963329,...,-464.642483,157.018427,7,-490.870122,-521.446642,-444.231599,-444.53498,-528.691804,-485.955029,36.24006
12,297.650893,2.421671,0.532775,0.007687,250,0.1,13,-474.514704,-318.194485,-636.889917,...,-466.500964,156.712289,8,-493.145293,-523.082607,-445.983079,-445.964647,-530.364993,-487.708124,36.287801
7,128.459541,0.376328,0.356902,0.003815,200,0.05,9,-475.063084,-319.331639,-637.667586,...,-467.553307,156.666474,9,-493.409384,-523.399621,-446.76338,-446.792077,-530.814128,-488.235718,36.09354
3,557.549278,4.709319,0.420704,0.00359,350,0.2,17,-476.147388,-321.006546,-638.344953,...,-468.683273,155.995321,10,-495.083086,-525.064618,-448.74342,-449.001338,-532.649134,-490.108319,35.936752


In [177]:
rf_rgscv_results.to_csv("../results/RF_v1_gscv_results.csv", index = False)

In [148]:
# min_sample_leaf = []
# max_features = []
# max_depth = []
# for i in search.cv_results_['params']:
#     min_sample_leaf.append(i['min_sample_leaf'])
#     max_features.append(i['max_features'])
#     max_depth.append(i['max_depth'])

[{'min_samples_leaf': 150, 'max_features': 0.05, 'max_depth': 17},
 {'min_samples_leaf': 150,
  'max_features': 0.30000000000000004,
  'max_depth': 13},
 {'min_samples_leaf': 250, 'max_features': 0.25, 'max_depth': 9},
 {'min_samples_leaf': 350, 'max_features': 0.2, 'max_depth': 17},
 {'min_samples_leaf': 400, 'max_features': 0.25, 'max_depth': 17},
 {'min_samples_leaf': 350,
  'max_features': 0.15000000000000002,
  'max_depth': 5},
 {'min_samples_leaf': 150,
  'max_features': 0.35000000000000003,
  'max_depth': 13},
 {'min_samples_leaf': 200, 'max_features': 0.05, 'max_depth': 9},
 {'min_samples_leaf': 350, 'max_features': 0.1, 'max_depth': 17},
 {'min_samples_leaf': 250, 'max_features': 0.05, 'max_depth': 17},
 {'min_samples_leaf': 200, 'max_features': 0.1, 'max_depth': 17},
 {'min_samples_leaf': 300, 'max_features': 0.1, 'max_depth': 13},
 {'min_samples_leaf': 250, 'max_features': 0.1, 'max_depth': 13},
 {'min_samples_leaf': 250,
  'max_features': 0.15000000000000002,
  'max_depth':

## Summary

- Preliminary round of modeling with SKlearns RandomForestRegressor to minimize the RMSE (finding the mean)
    - MAE minimization in SKlearn's random forest is problmatic due to implementation, which leads to **very** long run times 

- Input data
    - Missing values were mostly imputed with 0s
    - Columns with a high proportion of missing values were dropped
    
- Tuning and optimization
    - Assuming that the quality of the random forest increases linearly with the number of trees, a medium-ish number was selected to save time (n=750)
    - optomized over `max_depth`, `min_samples_leaf`, and `max_features`. Explnation on each can be found [here](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)
    - train RMSE was generally between 470 to 490, and validation RMSE between 455 and 470, which suggests slight under fitting.
    - Generally runs with higher percentage of columns used per tree performed better, however this is expected due to slight undefitting.