In [1]:
import time
import pandas as pd
import altair as alt
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler


# Notebook-wide plotting configuration (nothing is downloaded here).

# Turn off Altair's max-rows check so charts can be built from the full frames used below.
alt.data_transformers.disable_max_rows()

# import other functions
from imputer import *
from feature_eng import *
from drop import *

In [2]:
# NOTE(review): same call as the one at the end of the imports cell; it appears redundant here.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
def report_performance(model, X_train, y_train, X_valid, y_valid,
                       mode='mean'):
    """
    Evaluate train and validation performance on a fitted model.

    Parameters
    ---------
    model: fitted estimator
        Any fitted model exposing ``predict(X)`` (e.g. a scikit-learn regressor)
    X_train: pandas.core.frame.DataFrame
        X of training set
    y_train: pandas.core.series.Series
        y of training set
    X_valid: pandas.core.frame.DataFrame
        X of validation set
    y_valid: pandas.core.series.Series
        y of validation set
    mode: string
        'mean'   -> root mean squared error (square root of ``mean_squared_error``)
        'median' -> mean absolute error (``mean_absolute_error``)

    Returns
    -------
    errors: list
        ``[training_error, validation_error]`` for the selected metric.

    Raises
    ------
    ValueError
        If ``mode`` is neither 'mean' nor 'median'.
    """
    if mode == 'mean':
        # The reported value is the square root of the MSE, so label it as RMSE.
        metric = 'root mean squared'

        def score(y_true, y_pred):
            return mean_squared_error(y_true, y_pred) ** 0.5

    elif mode == 'median':
        metric = 'mean absolute'
        score = mean_absolute_error

    else:
        # Previously an unknown mode fell through to an unbound-variable NameError.
        raise ValueError("mode must be 'mean' or 'median', got {!r}".format(mode))

    errors = [score(y_train, model.predict(X_train)),
              score(y_valid, model.predict(X_valid))]

    print('Training', metric, 'error:', errors[0])
    print('Validation', metric, 'error:', errors[1])

    # The docstring always promised this list; the original implicitly returned None.
    return errors

In [52]:
def _resid_charts(model, X, y, label):
    """Build the (distance, proportion) residual scatter charts for one data split."""
    pred_col = 'Predicted ' + label
    true_col = 'True ' + label
    dist_col = label + ' Error Distance'
    prop_col = label + ' Error Proportion'

    resid_df = pd.DataFrame({pred_col: model.predict(X), true_col: y})
    resid_df[dist_col] = resid_df[pred_col] - resid_df[true_col]
    # Ratio predicted/true; rows whose true value is 0 yield inf or NaN.
    resid_df[prop_col] = resid_df[pred_col] / resid_df[true_col]

    def scatter(y_col):
        # Field names are derived from the same variables used to create the
        # columns, so an encoding can never point at a column that does not exist.
        return alt.Chart(resid_df).mark_circle().encode(
            alt.X(true_col + ':Q'), y=alt.Y(y_col + ':Q'))

    return scatter(dist_col), scatter(prop_col)


def plot_resid(model, X_train=None, y_train=None, X_valid=None, y_valid=None, plot = 'both'):
    """
    Build residual scatter charts for a fitted model.

    Parameters
    ---------
    model: fitted estimator
        Any fitted model exposing ``predict(X)``
    X_train, y_train:
        Training features / targets; required unless ``plot == 'valid'``
    X_valid, y_valid:
        Validation features / targets; required unless ``plot == 'train'``
    plot: string
        'train' -> training charts only, 'valid' -> validation charts only,
        anything else (default 'both') -> charts for both splits

    Returns
    -------
    d: dict
        Always has the keys 'Train_Distance', 'Train_Proportion',
        'Valid_Distance' and 'Valid_Proportion'. Each value is an Altair chart
        (true value on x; predicted - true, or predicted / true, on y), or a
        placeholder message when that split was not requested.
    """
    d = dict()

    if plot != 'valid':
        d["Train_Distance"], d["Train_Proportion"] = _resid_charts(
            model, X_train, y_train, 'Train')
    else:
        # Fixed: these placeholders used to be written to the Valid_* keys,
        # leaving the Train_* keys missing from the result.
        d["Train_Distance"] = "No training set inputted"
        d["Train_Proportion"] = "No training set inputted"

    if plot != 'train':
        # Fixed: the proportion chart used to encode 'True Validation' and
        # 'Validation Error Proportion', which are not columns of the frame.
        d["Valid_Distance"], d["Valid_Proportion"] = _resid_charts(
            model, X_valid, y_valid, 'Valid')
    else:
        d["Valid_Distance"] = "No validation set inputted"
        d["Valid_Proportion"] = "No validation set inputted"

    return d

In [5]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/train_data.zip')

In [6]:
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1804425,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,244.2,0.157475,0.009783,0.147692
1,1812706,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
2,1812706,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
3,1812706,11,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,24.0,258.95,0.157475,0.009783,0.147692
4,1812706,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,258.95,0.157475,0.009783,0.147692


In [7]:
# NOTE(review): this cell rebinds `df`, `X_train` and `X_valid` in place, so it is
# meant to run exactly once after the load cell.
# drop rows missing target variable
df = drop_missing_unacast(df)
# Temporary exclusion of a single external_id until Sirine's PR gets merged.
# NOTE(review): the df.head() preview shows numeric-looking external_id values;
# confirm the column holds strings so this comparison filters what is intended.
df = df.query("external_id != 'CA00070678'")
# create X and y
X = df.drop('unacast_session_count', axis=1)
y = df.loc[:, 'unacast_session_count']
# split the data: 80% train / 20% validation, with a fixed seed
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                    test_size=0.2,
                                                      random_state=2020)
# impute NaN values; the helper receives both splits and returns them in the same order
# (presumably the imputer is fitted on the training split only — TODO confirm in imputer.py)
result = impute_data(X_train, X_valid)
X_train = result[0]
X_valid = result[1] 
# perform feature eng (same helper applied to each split separately)
X_train = comb_cols(X_train)
X_valid = comb_cols(X_valid)
# perform dropping (same helper applied to each split separately)
X_train = drop_columns(X_train)
X_valid = drop_columns(X_valid)

In [8]:
# Run the project's clean_categorical helper on both splits, then unpack the
# first two elements of its result back into the split variables.
X_train_valid = clean_categorical(X_train, X_valid)
X_train, X_valid = X_train_valid[0], X_train_valid[1]

In [9]:
# Sanity check: number of missing target values in the training split.
# NOTE(review): if y_train is a Series the second .sum() adds nothing — confirm and simplify.
y_train.isna().sum().sum()

0

In [19]:
# Baseline random forest. The seed (same value as the train/validation split)
# makes the bootstrap samples and feature sub-sampling repeatable, so the
# reported scores can be reproduced on a fresh kernel.
rf = RandomForestRegressor(
    n_estimators=1500,
    max_depth=15,
    min_samples_split=100,
    max_features=0.7,
    bootstrap=True,
    n_jobs=-1,
    random_state=2020,
)

In [20]:
# Fit the forest and record the wall-clock duration; tr_time is in seconds.
t0 = time.time()
rf.fit(X_train, y_train)
t1 = time.time()
tr_time = t1-t0

In [21]:
tr_time/60

16.52150979042053

In [22]:
report_performance(rf, X_train, y_train, X_valid, y_valid)

Training mean squared error: 138.82634321074957
Validation mean squared error: 151.0121505087009


In [23]:
rf.score(X_train,y_train)

0.7368927783281327

In [24]:
rf.score( X_valid, y_valid)

0.6797252667872987

In [25]:
# Names of the `n` features with the largest importances, most important first.
importances = rf.feature_importances_
n = 10
# Negating the array makes the ascending argsort return the largest values first.
indices = (-importances).argsort()[:n]
X_train.columns[indices].to_list()

['houses_per_sq_km',
 'B08301e10',
 'walk_score',
 'Republicans_2016',
 'B17020e6',
 'state_amount_per_capita',
 'Libertarians_2016',
 'B25012e17',
 'B08301e6',
 'B19101e8']

In [8]:
# Candidate values for the random-forest hyperparameter search.
max_depth = [3, 20, 15, 5, 8]
min_samples_leaf = [50, 125, 200]
# 0.1, 0.2, ..., 0.7 (binary floats, so some values carry rounding noise)
max_features = [0.1 * tenth for tenth in range(1, 8)]
bootstrap = True
#max_samples = [0.05*i for i in range(14,21)]
d = dict(
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    max_features=max_features,
)

In [11]:
# Size of the full grid: the product of the number of candidates per hyperparameter.
c = 1
for candidates in d.values():
    c = c * len(candidates)
c

105

In [6]:
d.values()

dict_values([[3, 5, 7], [50, 125, 200], [0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6000000000000001, 0.7000000000000001]])

In [12]:
# Randomized search over the grid in `d`: 10 sampled candidates, scored by
# negative RMSE, no refit (only cv_results_ is used afterwards).
# Both the forest and the search are seeded so the sampled candidates and the
# resulting scores can be reproduced on a fresh kernel.
# NOTE(review): the forest and the search both use n_jobs=-1; check whether this
# nested parallelism oversubscribes the machine.
t0 = time.time()
rf_cv = RandomForestRegressor(
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    random_state=2020,
)
rgscv = RandomizedSearchCV(
    rf_cv,
    param_distributions=d,
    n_iter=10,
    scoring=['neg_root_mean_squared_error'],
    refit=False,
    return_train_score=True,
    n_jobs=-1,
    random_state=2020,
)
search = rgscv.fit(X_train, y_train)
t1 = time.time()
cv_time = t1-t0  # seconds

In [None]:
cv_time/60/60

In [14]:
# Keep the search results under their own name. `d` is the hyperparameter grid
# that the later RandomizedSearchCV cells pass as `param_distributions`;
# rebinding it to cv_results_ here breaks those cells on a top-to-bottom run.
cv_results = search.cv_results_

pd.DataFrame(data=cv_results)#.sort_values(by='rank_test_neg_root_mean_squared_error')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_features,param_max_depth,params,split0_test_neg_root_mean_squared_error,split1_test_neg_root_mean_squared_error,...,mean_test_neg_root_mean_squared_error,std_test_neg_root_mean_squared_error,rank_test_neg_root_mean_squared_error,split0_train_neg_root_mean_squared_error,split1_train_neg_root_mean_squared_error,split2_train_neg_root_mean_squared_error,split3_train_neg_root_mean_squared_error,split4_train_neg_root_mean_squared_error,mean_train_neg_root_mean_squared_error,std_train_neg_root_mean_squared_error
0,1606.684599,2.491543,1.162499,0.156037,50,0.5,20,"{'min_samples_leaf': 50, 'max_features': 0.5, ...",-146.28467,-152.649436,...,-155.746717,7.935287,1,-152.745745,-151.785304,-146.831546,-151.556381,-151.023166,-150.788428,2.055559
1,144.309173,1.215526,0.608651,0.043203,125,0.2,3,"{'min_samples_leaf': 125, 'max_features': 0.2,...",-193.719139,-196.625389,...,-201.431892,6.330096,10,-202.047059,-200.732452,-197.491112,-201.711613,-200.538896,-200.504226,1.610606
2,176.695714,0.767955,0.827943,0.080414,125,0.1,8,"{'min_samples_leaf': 125, 'max_features': 0.1,...",-182.425875,-185.751417,...,-190.263214,6.719719,6,-189.591023,-188.326295,-185.911613,-188.628986,-188.707452,-188.233074,1.234802
3,892.502714,8.388668,0.878335,0.013495,50,0.5,8,"{'min_samples_leaf': 50, 'max_features': 0.5, ...",-158.126367,-163.919682,...,-166.536875,7.413346,3,-164.903213,-163.403168,-158.620236,-163.869281,-162.763158,-162.711811,2.161646
4,426.849827,2.661796,0.564516,0.052117,50,0.6,3,"{'min_samples_leaf': 50, 'max_features': 0.600...",-182.441978,-185.942054,...,-189.691303,6.841661,5,-190.12511,-188.046731,-184.55863,-189.781298,-187.934059,-188.089166,1.974787
5,3776.644686,517.590457,3.688427,0.385812,200,0.7,8,"{'min_samples_leaf': 200, 'max_features': 0.70...",-183.970463,-188.792381,...,-193.035813,6.562291,8,-192.334685,-191.708563,-189.604299,-192.00822,-191.490675,-191.429288,0.955746
6,2840.053057,338.792729,3.801004,2.157962,125,0.3,15,"{'min_samples_leaf': 125, 'max_features': 0.30...",-172.883108,-178.030591,...,-181.339516,6.449368,4,-180.543068,-180.04876,-175.935475,-179.229405,-179.585761,-179.068494,1.627548
7,1778.256419,11.925694,4.335996,1.653517,200,0.2,20,"{'min_samples_leaf': 200, 'max_features': 0.2,...",-185.61191,-188.55162,...,-193.369914,6.474536,9,-193.201163,-192.120075,-189.496255,-191.99127,-191.94506,-191.750765,1.218068
8,868.253258,45.537747,2.392829,0.893235,50,0.3,3,"{'min_samples_leaf': 50, 'max_features': 0.300...",-184.241727,-185.755599,...,-190.55844,7.121209,7,-191.107707,-188.932243,-186.115328,-190.722719,-189.023593,-189.180318,1.764885
9,829.860981,123.569716,1.621974,1.458029,50,0.1,20,"{'min_samples_leaf': 50, 'max_features': 0.1, ...",-152.073831,-157.344081,...,-160.830755,7.588249,2,-158.061459,-156.685437,-153.071706,-157.253711,-157.014424,-156.417347,1.733453


In [11]:
# Repeat of the randomized search (500 trees, 10 sampled candidates, negative
# RMSE, no refit). Seeded so the sampled candidates and scores are reproducible;
# this run has its own search seed.
# NOTE(review): `d` must be the hyperparameter grid when this cell runs — verify
# that no earlier cell has rebound it.
t0 = time.time()
rf_cv = RandomForestRegressor(
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    random_state=2020,
)
rgscv1 = RandomizedSearchCV(
    rf_cv,
    param_distributions=d,
    n_iter=10,
    scoring=['neg_root_mean_squared_error'],
    refit=False,
    return_train_score=True,
    n_jobs=-1,
    random_state=2021,
)
search1 = rgscv1.fit(X_train, y_train)
t1 = time.time()
cv_time = t1-t0  # seconds
d1 = search1.cv_results_

In [12]:
pd.DataFrame(data=d1)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_features,param_max_depth,params,split0_test_neg_root_mean_squared_error,split1_test_neg_root_mean_squared_error,...,mean_test_neg_root_mean_squared_error,std_test_neg_root_mean_squared_error,rank_test_neg_root_mean_squared_error,split0_train_neg_root_mean_squared_error,split1_train_neg_root_mean_squared_error,split2_train_neg_root_mean_squared_error,split3_train_neg_root_mean_squared_error,split4_train_neg_root_mean_squared_error,mean_train_neg_root_mean_squared_error,std_train_neg_root_mean_squared_error
0,5536.621997,19.954894,3.499835,0.623025,200,0.6,15,"{'min_samples_leaf': 200, 'max_features': 0.60...",-181.454794,-186.23447,...,-190.461063,6.624129,4,-189.671666,-189.220101,-186.812901,-189.057651,-189.086111,-188.769686,1.002845
1,5067.753646,61.576164,3.504749,0.201787,125,0.7,8,"{'min_samples_leaf': 125, 'max_features': 0.70...",-177.97281,-183.164258,...,-186.550786,6.384466,3,-186.5276,-185.274983,-181.820103,-184.682967,-184.582822,-184.577695,1.542968
2,2395.255626,8.373793,3.383983,0.244566,125,0.5,5,"{'min_samples_leaf': 125, 'max_features': 0.5,...",-183.868052,-189.378081,...,-192.352498,6.249743,5,-192.397946,-191.987361,-187.562348,-191.500771,-191.549996,-190.999684,1.749277
3,2058.006819,18.018362,3.351321,0.337233,125,0.7,3,"{'min_samples_leaf': 125, 'max_features': 0.70...",-192.714838,-197.400518,...,-200.799172,5.95032,10,-202.192824,-200.765795,-197.508543,-200.482655,-199.896787,-200.169321,1.529872
4,1867.865213,45.371774,2.927546,1.180799,200,0.4,5,"{'min_samples_leaf': 200, 'max_features': 0.4,...",-188.343625,-193.48308,...,-197.437788,6.696089,8,-197.025975,-196.997772,-194.195141,-196.932983,-196.944542,-196.419282,1.112593
5,4379.777742,53.856829,2.848914,1.54991,125,0.6,8,"{'min_samples_leaf': 125, 'max_features': 0.60...",-177.707792,-183.163354,...,-186.193732,6.336114,2,-185.873417,-185.40486,-181.164363,-184.604588,-184.075367,-184.224519,1.651867
6,4166.188082,938.1825,2.002912,1.789938,200,0.7,8,"{'min_samples_leaf': 200, 'max_features': 0.70...",-183.760395,-188.837142,...,-192.897731,6.551268,6,-192.219186,-191.725557,-189.861107,-191.881469,-191.39378,-191.41622,0.821679
7,488.751607,24.033742,2.86514,1.12016,50,0.1,5,"{'min_samples_leaf': 50, 'max_features': 0.1, ...",-176.466246,-179.817985,...,-183.642707,6.974695,1,-183.158964,-181.04947,-177.916449,-183.369207,-181.825041,-181.463826,1.969265
8,605.040898,10.290444,3.517885,0.297434,50,0.2,3,"{'min_samples_leaf': 50, 'max_features': 0.2, ...",-187.572379,-187.974343,...,-193.270796,6.825195,7,-194.063556,-191.729866,-188.288236,-193.533616,-191.95895,-191.914845,2.021179
9,245.913585,56.319772,1.752811,0.957482,50,0.1,3,"{'min_samples_leaf': 50, 'max_features': 0.1, ...",-194.009646,-192.425506,...,-199.236771,6.78678,9,-200.179715,-197.688328,-193.995733,-199.580306,-197.201753,-197.729167,2.174868


In [10]:
t0 = time.time()
rf_cv = RandomForestRegressor(n_estimators=1000, bootstrap = True,n_jobs=-1)
rgscv2 = RandomizedSearchCV(rf_cv,n_iter=5,param_distributions=d,return_train_score=True,n_jobs=-1  ,scoring=['neg_root_mean_squared_error'], refit=False)
search2 = rgscv2.fit(X_train, y_train)
t1 = time.time()
cv_time = t1-t0
d2 = search2.cv_results_

In [11]:
pd.DataFrame(data=d2)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_features,param_max_depth,params,split0_test_neg_root_mean_squared_error,split1_test_neg_root_mean_squared_error,...,mean_test_neg_root_mean_squared_error,std_test_neg_root_mean_squared_error,rank_test_neg_root_mean_squared_error,split0_train_neg_root_mean_squared_error,split1_train_neg_root_mean_squared_error,split2_train_neg_root_mean_squared_error,split3_train_neg_root_mean_squared_error,split4_train_neg_root_mean_squared_error,mean_train_neg_root_mean_squared_error,std_train_neg_root_mean_squared_error
0,856.498512,0.403155,0.679534,0.037446,50,0.6,3,"{'min_samples_leaf': 50, 'max_features': 0.600...",-182.113708,-185.856621,...,-189.552145,6.906814,1,-189.79029,-188.13336,-184.785702,-189.808338,-188.100206,-188.123579,1.83076
1,1149.25289,1.294673,0.902257,0.095812,125,0.5,5,"{'min_samples_leaf': 125, 'max_features': 0.5,...",-184.274431,-189.193007,...,-192.421129,6.140397,3,-192.711922,-191.822793,-187.493221,-191.539452,-191.769016,-191.067281,1.831116
2,1375.77882,0.896225,0.83849,0.02173,125,0.6,5,"{'min_samples_leaf': 125, 'max_features': 0.60...",-184.770366,-190.235264,...,-193.050712,6.099773,4,-193.54125,-192.958607,-188.259516,-192.088629,-191.916753,-191.752951,1.84365
3,979.230218,23.538915,0.753645,0.110153,50,0.7,3,"{'min_samples_leaf': 50, 'max_features': 0.700...",-182.358169,-187.283885,...,-190.321842,6.904825,2,-190.311495,-189.216369,-185.558852,-190.376237,-188.394686,-188.771528,1.766792
4,703.786022,216.524648,0.590293,0.166373,200,0.4,5,"{'min_samples_leaf': 200, 'max_features': 0.4,...",-188.642619,-193.326993,...,-197.530619,6.607228,5,-197.282939,-196.73075,-194.15529,-197.277656,-196.980327,-196.485392,1.183072


In [12]:
# Refit with max_depth=20 / max_features=0.5 and time the fit (tr_time in seconds).
# Seeded (same value as the train/validation split) so the scores are reproducible.
# NOTE(review): the randomized searches tune `min_samples_leaf`, while this model
# sets `min_samples_split` — confirm that is the intended parameter.
rf = RandomForestRegressor(
    n_estimators=1500,
    max_depth=20,
    min_samples_split=50,
    max_features=0.5,
    bootstrap=True,
    n_jobs=-1,
    random_state=2020,
)
t0 = time.time()
rf.fit(X_train, y_train)
t1 = time.time()
tr_time = t1-t0

In [13]:
report_performance(rf, X_train, y_train, X_valid, y_valid)

Training mean squared error: 122.616826254174
Validation mean squared error: 139.73050716408426


In [14]:
# Refit with deeper trees (max_depth=30) and a smaller split threshold (25), and
# time the fit (tr_time in seconds).
# Seeded (same value as the train/validation split) so the scores are reproducible.
# NOTE(review): the randomized searches tune `min_samples_leaf`, while this model
# sets `min_samples_split` — confirm that is the intended parameter.
rf = RandomForestRegressor(
    n_estimators=1500,
    max_depth=30,
    min_samples_split=25,
    max_features=0.5,
    bootstrap=True,
    n_jobs=-1,
    random_state=2020,
)
t0 = time.time()
rf.fit(X_train, y_train)
t1 = time.time()
tr_time = t1-t0

In [15]:
report_performance(rf, X_train, y_train, X_valid, y_valid)

Training mean squared error: 101.2009516264235
Validation mean squared error: 129.74548851998952


In [10]:
# Refit with max_depth=25 and a split threshold of 25; this is the model that the
# residual cells below inspect. tr_time is in seconds.
# Seeded (same value as the train/validation split) so the scores are reproducible.
# NOTE(review): the randomized searches tune `min_samples_leaf`, while this model
# sets `min_samples_split` — confirm that is the intended parameter.
rf = RandomForestRegressor(
    n_estimators=1500,
    max_depth=25,
    min_samples_split=25,
    max_features=0.5,
    bootstrap=True,
    n_jobs=-1,
    random_state=2020,
)
t0 = time.time()
rf.fit(X_train, y_train)
t1 = time.time()
tr_time = t1-t0

In [13]:
tr_time/60

17.91033939123154

In [28]:
report_performance(rf, X_train, y_train, X_valid, y_valid)

Training mean squared error: 102.73709137777978
Validation mean squared error: 130.4186745486596


In [53]:
# Build the residual charts for both splits (`plot` is left at its default, 'both').
p = plot_resid(rf, X_train, y_train, X_valid, y_valid)

In [30]:
p

{'Train_Distance': alt.Chart(...),
 'Train_Proportion': alt.Chart(...),
 'Valid_Distance': alt.Chart(...),
 'Valid_Proportion': alt.Chart(...)}

In [54]:
p['Valid_Distance']

In [38]:
# Rebuild the validation residual table by hand to inspect the numbers behind the chart.
pred_col, true_col = 'Predicted Valid', 'True Valid'
valid_df = pd.DataFrame({pred_col: rf.predict(X_valid), true_col: y_valid})
# Signed error (predicted - true) and ratio (predicted / true) per row.
valid_df['Valid Error Distance'] = valid_df[pred_col] - valid_df[true_col]
valid_df['Valid Error Proportion'] = valid_df[pred_col] / valid_df[true_col]
valid_df

Unnamed: 0,Predicted Valid,True Valid,Valid Error Distance,Valid Error Proportion
12721,251.854696,387.0,-135.145304,0.650787
17871,60.851852,56.0,4.851852,1.086640
46441,46.475018,31.0,15.475018,1.499194
48833,122.663007,131.0,-8.336993,0.936359
50069,96.940607,32.0,64.940607,3.029394
...,...,...,...,...
36902,75.937002,62.0,13.937002,1.224790
5882,56.639207,71.0,-14.360793,0.797735
22158,26.966143,29.0,-2.033857,0.929867
35672,109.599462,156.0,-46.400538,0.702561


In [60]:
len(y_train)

39592

In [58]:
len(np.unique(valid_df['Predicted Valid']))

9898

In [59]:
len(np.unique(rf.predict(X_train)))

39592

In [41]:
alt.Chart(valid_df).mark_circle().encode(alt.X("True Valid:Q"), y=alt.Y('Valid Error Distance:Q'))


In [32]:
p['Valid_Distance']

In [None]:
# min_sample_leaf = []
# max_features = []
# max_depth = []
# for i in search.cv_results_['params']:
#     min_sample_leaf.append(i['min_samples_leaf'])
#     max_features.append(i['max_features'])
#     max_depth.append(i['max_depth'])
    
# # for i in [max_depth,max_features,min_samples_leaf]:
# #     d[str(i)]=i
# # del d['params']

In [None]:
# rf_rgscv_results = pd.DataFrame(data=d).sort_values(by='rank_test_neg_root_mean_squared_error')
# rf_rgscv_results=rf_rgscv_results.iloc[:,:22]
# rf_rgscv_results