In [1]:
import time
import pandas as pd
import altair as alt
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler


#Download libraries

alt.data_transformers.disable_max_rows()

# import other functions
from imputer import *
from feature_eng import *
from drop import *
from preprocessing import *

In [2]:
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
def report_performance(model, X_train, y_train, X_valid, y_valid, 
                       mode='mean'):
    """
    Evaluate train and validation performance on a fitted model.
    
    Parameters
    ---------     
    model: sklearn.ensemble._gb.GradientBoostingRegressor
        scikit-learn model
    X_train: pandas.core.frame.DataFrame
        X of training set
    y_train: pandas.core.series.Series
        y of training set
    X_valid: pandas.core.frame.DataFrame        
        X of validation set
    y_valid: pandas.core.series.Series
        y of validation set     
    mode: string
        'mean' or 'median'
    
    Returns
    -------
    errors: list
        
    """
    if mode == 'mean':
        errors = [(mean_squared_error(y_train, model.predict(X_train)))**0.5, 
                  (mean_squared_error(y_valid, model.predict(X_valid)))**0.5]
        metric = 'squared'
        
    elif mode == 'median':
        errors = [mean_absolute_error(y_train, model.predict(X_train)), 
                  mean_absolute_error(y_valid, model.predict(X_valid))]
        metric= 'absolute'
    
    print('Training mean', metric, 'error:', errors[0])
    print('Validation mean', metric, 'error:', errors[1])

In [4]:
df = pd.read_csv('../data/train_data.zip')

In [6]:
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1804425,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,244.2,0.157475,0.009783,0.147692
1,1812706,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
2,1812706,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
3,1812706,11,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,24.0,258.95,0.157475,0.009783,0.147692
4,1812706,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,258.95,0.157475,0.009783,0.147692


In [15]:
# Remove playgrounds with 'external_id' == 'CA00070678'
df = df.query("external_id != 'CA00070678'")

In [5]:
# drop rows missing target variable
df = drop_missing_unacast(df)
### until Sirine's PR gets merged
df = df.query("external_id != 'CA00070678'")
# create X and y
X = df.drop('unacast_session_count', axis=1)
y = df.loc[:, 'unacast_session_count']
# split the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                    test_size=0.2,
                                                      random_state=2020)
# impute NaN values
result = impute_data(X_train, X_valid)
X_train = result[0]
X_valid = result[1] 
# perform feature eng
X_train = comb_cols(X_train)
X_valid = comb_cols(X_valid)
# perform dropping
X_train = drop_columns(X_train)
X_valid = drop_columns(X_valid)

In [None]:
X_train_valid = clean_categorical(X_train, X_valid)
X_train = X_train_valid[0]
X_valid = X_train_valid[1]

In [10]:
X_train.isna().sum().sum()

0

In [19]:
rf = RandomForestRegressor(n_estimators=1500, max_depth = 15, min_samples_split = 100, max_features = 0.7,n_jobs=-1, bootstrap = True)

In [20]:
t0 = time.time()
rf.fit(X_train, y_train)
t1 = time.time()
tr_time = t1-t0

In [21]:
tr_time/60

16.52150979042053

In [22]:
report_performance(rf, X_train, y_train, X_valid, y_valid)

Training mean squared error: 138.82634321074957
Validation mean squared error: 151.0121505087009


In [23]:
rf.score(X_train,y_train)

0.7368927783281327

In [24]:
rf.score( X_valid, y_valid)

0.6797252667872987

In [25]:
importances = rf.feature_importances_
n = 10
indices = (-importances).argsort()[:n]
X_train.iloc[:, list(indices)].columns.to_list()

['houses_per_sq_km',
 'B08301e10',
 'walk_score',
 'Republicans_2016',
 'B17020e6',
 'state_amount_per_capita',
 'Libertarians_2016',
 'B25012e17',
 'B08301e6',
 'B19101e8']

In [4]:
max_depth = [3,5,7]
min_samples_leaf = [50,125,200]
max_features = [0.1*i for i in range(1,8)]
bootstrap = True
#max_samples = [0.05*i for i in range(14,21)]
d = {"max_depth":max_depth, "min_samples_leaf":min_samples_leaf, "max_features":max_features}

In [5]:
c = 1
for val in d.values():
    c *= len(val)
c

63

In [6]:
d.values()

dict_values([[3, 5, 7], [50, 125, 200], [0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6000000000000001, 0.7000000000000001]])

In [None]:
t0 = time.time()
rf_cv = RandomForestRegressor(n_estimators=500, bootstrap = True,n_jobs=-1)
rgscv = RandomizedSearchCV(rf_cv,n_iter=5,param_distributions=d,return_train_score=True,n_jobs=-1  ,scoring=['neg_root_mean_squared_error'], refit=False)
search = rgscv.fit(X_train, y_train)
t1 = time.time()
cv_time = t1-t0

In [None]:
cv_time/60

In [None]:
d = search.cv_results_

pd.DataFrame(data=d)#.sort_values(by='rank_test_neg_root_mean_squared_error	')

In [None]:
# min_sample_leaf = []
# max_features = []
# max_depth = []
# for i in search.cv_results_['params']:
#     min_sample_leaf.append(i['min_samples_leaf'])
#     max_features.append(i['max_features'])
#     max_depth.append(i['max_depth'])
    
# # for i in [max_depth,max_features,min_samples_leaf]:
# #     d[str(i)]=i
# # del d['params']

In [None]:
# rf_rgscv_results = pd.DataFrame(data=d).sort_values(by='rank_test_neg_root_mean_squared_error')
# rf_rgscv_results=rf_rgscv_results.iloc[:,:22]
# rf_rgscv_results