In [1]:
import time
import pandas as pd
import altair as alt
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler


#Download libraries

alt.data_transformers.disable_max_rows()

# import other functions
from imputer import *
from feature_eng import *
from drop import *
from preprocessing import *

In [2]:
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
def report_performance(model, X_train, y_train, X_valid, y_valid, 
                       mode='mean'):
    """
    Evaluate train and validation performance on a fitted model.
    
    Parameters
    ---------     
    model: sklearn.ensemble._gb.GradientBoostingRegressor
        scikit-learn model
    X_train: pandas.core.frame.DataFrame
        X of training set
    y_train: pandas.core.series.Series
        y of training set
    X_valid: pandas.core.frame.DataFrame        
        X of validation set
    y_valid: pandas.core.series.Series
        y of validation set     
    mode: string
        'mean' or 'median'
    
    Returns
    -------
    errors: list
        
    """
    if mode == 'mean':
        errors = [(mean_squared_error(y_train, model.predict(X_train)))**0.5, 
                  (mean_squared_error(y_valid, model.predict(X_valid)))**0.5]
        metric = 'squared'
        
    elif mode == 'median':
        errors = [mean_absolute_error(y_train, model.predict(X_train)), 
                  mean_absolute_error(y_valid, model.predict(X_valid))]
        metric= 'absolute'
    
    print('Training mean', metric, 'error:', errors[0])
    print('Validation mean', metric, 'error:', errors[1])

In [4]:
df = pd.read_csv('../data/train_data.zip')

In [5]:
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1804425,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,244.2,0.157475,0.009783,0.147692
1,1812706,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
2,1812706,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
3,1812706,11,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,24.0,258.95,0.157475,0.009783,0.147692
4,1812706,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,258.95,0.157475,0.009783,0.147692


In [6]:
# Remove playgrounds with 'external_id' == 'CA00070678'
df = df.query("external_id != 'CA00070678'")

In [8]:
# drop rows missing target variable
df = drop_missing_unacast(df)
# create X and y
X = df.drop('unacast_session_count', axis=1)
y = df.loc[:, 'unacast_session_count']
# split the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                    test_size=0.2,
                                                      random_state=2020)
# impute NaN values
result = impute_data(X_train, X_valid)
X_train = result[0]
X_valid = result[1] 
# perform feature eng
X_train = comb_cols(X_train)
X_valid = comb_cols(X_valid)
# perform dropping
X_train = drop_columns(X_train)
X_valid = drop_columns(X_valid)

In [9]:
X_train = clean_categorical(X_train)
X_valid = clean_categorical(X_valid)

In [10]:
X_train.isna().sum().sum()

0

In [19]:
rf = RandomForestRegressor(n_estimators=1500, max_depth = 15, min_samples_split = 100, max_features = 0.7,n_jobs=-1, bootstrap = True)

In [20]:
t0 = time.time()
rf.fit(X_train, y_train)
t1 = time.time()
tr_time = t1-t0

In [21]:
tr_time/60

16.52150979042053

In [22]:
report_performance(rf, X_train, y_train, X_valid, y_valid)

Training mean squared error: 138.82634321074957
Validation mean squared error: 151.0121505087009


In [23]:
rf.score(X_train,y_train)

0.7368927783281327

In [24]:
rf.score( X_valid, y_valid)

0.6797252667872987

In [25]:
importances = rf.feature_importances_
n = 10
indices = (-importances).argsort()[:n]
X_train.iloc[:, list(indices)].columns.to_list()

['houses_per_sq_km',
 'B08301e10',
 'walk_score',
 'Republicans_2016',
 'B17020e6',
 'state_amount_per_capita',
 'Libertarians_2016',
 'B25012e17',
 'B08301e6',
 'B19101e8']

In [11]:
max_depth = [1+i*4 for i in range(1,5)]
min_samples_leaf = [50+i*50 for i in range(2,8)]
max_features = [0.1*i for i in range(1,8)]
bootstrap = True
#max_samples = [0.05*i for i in range(14,21)]
d = {"max_depth":max_depth, "min_samples_leaf":min_samples_leaf, "max_features":max_features}

In [12]:
c = 1
for val in d.values():
    c *= len(val)
c

168

In [13]:
d.values()

dict_values([[5, 9, 13, 17], [150, 200, 250, 300, 350, 400], [0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6000000000000001, 0.7000000000000001]])

In [27]:
t0 = time.time()
rf_cv = RandomForestRegressor(n_estimators=750, bootstrap = True,n_jobs=-1)
rgscv = RandomizedSearchCV(rf_cv,n_iter=30,param_distributions=d,return_train_score=True,n_jobs=-1  ,scoring=['neg_root_mean_squared_error'], refit=False)
search = rgscv.fit(X_train, y_train)
t1 = time.time()
cv_time = t1-t0

ValueError: Invalid parameter std_train_neg_root_mean_squared_error for estimator RandomForestRegressor(n_estimators=750, n_jobs=-1). Check the list of available parameters with `estimator.get_params().keys()`.

In [28]:
cv_time/60

40.88478633562724

In [29]:
d = search.cv_results_

pd.DataFrame(data=d)#.sort_values(by='rank_test_neg_root_mean_squared_error	')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_features,param_max_depth,params,split0_test_neg_root_mean_squared_error,split1_test_neg_root_mean_squared_error,...,mean_test_neg_root_mean_squared_error,std_test_neg_root_mean_squared_error,rank_test_neg_root_mean_squared_error,split0_train_neg_root_mean_squared_error,split1_train_neg_root_mean_squared_error,split2_train_neg_root_mean_squared_error,split3_train_neg_root_mean_squared_error,split4_train_neg_root_mean_squared_error,mean_train_neg_root_mean_squared_error,std_train_neg_root_mean_squared_error
0,504.885567,0.162856,0.631539,0.069793,250,0.3,5,"{'min_samples_leaf': 250, 'max_features': 0.30...",-194.26392,-197.868361,...,-202.556477,6.620606,4,-202.620411,-202.154907,-199.784299,-202.097054,-202.076401,-201.746614,1.001247
1,332.93329,1.642642,1.276994,0.040026,150,0.1,13,"{'min_samples_leaf': 150, 'max_features': 0.1,...",-184.12602,-186.538728,...,-191.680349,6.679094,2,-191.48114,-189.568512,-187.849579,-189.529552,-189.860065,-189.65777,1.154085
2,667.47659,0.763314,0.764687,0.086409,300,0.4,5,"{'min_samples_leaf': 300, 'max_features': 0.4,...",-198.516093,-201.261732,...,-206.550643,6.529373,5,-207.30843,-204.928381,-203.832061,-206.301958,-206.192399,-205.712646,1.206288
3,334.303211,18.019675,0.787085,0.031489,150,0.2,5,"{'min_samples_leaf': 150, 'max_features': 0.2,...",-187.006045,-190.488742,...,-194.997859,6.607441,3,-195.430319,-193.653737,-191.582538,-194.573927,-194.087219,-193.865548,1.285025
4,1311.831599,46.421283,0.624695,0.269612,150,0.6,13,"{'min_samples_leaf': 150, 'max_features': 0.60...",-176.468589,-181.111405,...,-185.049616,6.412804,1,-184.842594,-183.509201,-180.074524,-183.184137,-183.435514,-183.009194,1.577086


In [62]:
min_sample_leaf = []
max_features = []
max_depth = []
for i in search.cv_results_['params']:
    min_sample_leaf.append(i['min_samples_leaf'])
    max_features.append(i['max_features'])
    max_depth.append(i['max_depth'])
    
# for i in [max_depth,max_features,min_samples_leaf]:
#     d[str(i)]=i
# del d['params']

In [63]:
rf_rgscv_results = pd.DataFrame(data=d).sort_values(by='rank_test_neg_root_mean_squared_error')
rf_rgscv_results=rf_rgscv_results.iloc[:,:22]
rf_rgscv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_features,param_max_depth,params,split0_test_neg_root_mean_squared_error,split1_test_neg_root_mean_squared_error,...,split4_test_neg_root_mean_squared_error,mean_test_neg_root_mean_squared_error,std_test_neg_root_mean_squared_error,rank_test_neg_root_mean_squared_error,split0_train_neg_root_mean_squared_error,split1_train_neg_root_mean_squared_error,split2_train_neg_root_mean_squared_error,split3_train_neg_root_mean_squared_error,split4_train_neg_root_mean_squared_error,mean_train_neg_root_mean_squared_error
0,1.958995,0.005861,0.0,0.0,250,0.5,17,"{'min_samples_leaf': 250, 'max_features': 0.5,...",,,...,,,,1,,,,,,
27,1.952336,0.029851,0.0,0.0,350,0.6,13,"{'min_samples_leaf': 350, 'max_features': 0.60...",,,...,,,,2,,,,,,
28,1.946589,0.03044,0.0,0.0,250,0.6,9,"{'min_samples_leaf': 250, 'max_features': 0.60...",,,...,,,,3,,,,,,
29,1.954135,0.00696,0.0,0.0,350,0.4,5,"{'min_samples_leaf': 350, 'max_features': 0.4,...",,,...,,,,4,,,,,,
30,1.944766,0.020059,0.0,0.0,250,0.4,9,"{'min_samples_leaf': 250, 'max_features': 0.4,...",,,...,,,,5,,,,,,
31,1.952235,0.006146,0.0,0.0,250,0.4,17,"{'min_samples_leaf': 250, 'max_features': 0.4,...",,,...,,,,6,,,,,,
32,1.958204,0.014899,0.0,0.0,350,0.3,5,"{'min_samples_leaf': 350, 'max_features': 0.30...",,,...,,,,7,,,,,,
33,1.960084,0.01624,0.0,0.0,400,0.3,5,"{'min_samples_leaf': 400, 'max_features': 0.30...",,,...,,,,8,,,,,,
34,1.966102,0.008459,0.0,0.0,250,0.2,9,"{'min_samples_leaf': 250, 'max_features': 0.2,...",,,...,,,,9,,,,,,
35,1.976299,0.016576,0.0,0.0,350,0.1,17,"{'min_samples_leaf': 350, 'max_features': 0.1,...",,,...,,,,10,,,,,,
