In [1]:
# This notebook runs a grid search over random forest (RF) hyperparameters.
# The 70% NYC data of Dataset 3 is used as the example.

In [5]:
# Packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [6]:
# Load the training features and the training targets; both CSV files are
# indexed by their GEOID column.
X_sociodemo_train = pd.read_csv(
    "../Data/04 Data for RF and DNN Optimal Parameters Search/02 70%_NYC_1000/X_sociodemo_train.csv",
    index_col='GEOID',
)
y_train = pd.read_csv(
    "../Data/04 Data for RF and DNN Optimal Parameters Search/02 70%_NYC_1000/y_train.csv",
    index_col='GEOID',
)

In [10]:
# Collapse the target frame into a flat 1-D NumPy array for use as the
# regression target (the GEOID index is dropped in the process).
y_train = np.asarray(y_train).ravel()

In [11]:
# Preview the first rows of the sociodemographic feature table
X_sociodemo_train.head()

Unnamed: 0_level_0,% Black,% Ame Indi and AK Native,% Asian,% Nati Hawa and Paci Island,% Hispanic or Latino,% male,% married,% age 18-29,% age 30-39,% age 40-49,...,% age >=60,% <highschool,median income,% unemployment,% below poverty line,% food stamp/SNAP,median value units built,median year units built,% renter-occupied housing units,population density
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
36081040700,0.045621,0.00701,0.035591,0.0,0.917494,0.541954,0.414732,0.172131,0.205242,0.137295,...,0.090703,0.456792,50993,0.039768,0.202918,0.302752,636600,1947,0.863405,46498.33254
36085030301,0.120352,0.002544,0.209198,0.0,0.308611,0.49002,0.515054,0.237573,0.123483,0.136008,...,0.193151,0.12733,76641,0.058712,0.087323,0.047671,355100,1977,0.317808,7109.130015
36081030600,0.855427,0.0,0.032726,0.0,0.06773,0.459818,0.457799,0.142709,0.158244,0.114954,...,0.19449,0.151437,75529,0.052151,0.152285,0.171363,417100,1958,0.345011,3575.395826
36081099705,0.0,0.0,0.437407,0.0,0.104694,0.511923,0.574672,0.156855,0.09538,0.13301,...,0.253353,0.078818,108009,0.015395,0.077124,0.032227,624400,1981,0.275829,539.352236
36081027400,0.81986,0.0,0.037556,0.0,0.04774,0.355188,0.32287,0.125398,0.122852,0.150223,...,0.218332,0.140323,65938,0.051546,0.092298,0.196181,347600,1964,0.189236,10936.60805


In [12]:
# Sanity check: features and targets must contain the same number of samples.
# Fail fast here instead of relying on eyeballing the displayed tuple.
assert len(X_sociodemo_train) == len(y_train), \
    "X_sociodemo_train and y_train have different numbers of rows"
len(X_sociodemo_train), len(y_train)

(1396, 1396)

In [13]:
# Display the flattened target array
y_train

array([31.9, 30.9, 34.1, ..., 32.1, 35.3, 28.6])

In [14]:
# Number of sociodemographic features = number of columns in the training frame
number_sociodemo = X_sociodemo_train.shape[1]
number_sociodemo

21

In [15]:
# Candidate values of max_features to be searched: all features, one half,
# one third, plus the 'sqrt' and 'log2' settings.
# Integer division replaces int(n / k), and the fractional candidates are
# clamped to at least 1: with fewer than 3 features the original expression
# evaluates to 0, which is not a valid integer max_features for the forest.
max_features = [
    number_sociodemo,
    max(1, number_sociodemo // 2),
    max(1, number_sociodemo // 3),
    'sqrt',
    'log2',
]

In [11]:
# Grid search - sociodemographic features
# Base random forest regressor; the random seed is fixed (42) for reproducibility.
rf = RandomForestRegressor(random_state=42)

RandomForestRegressor()

# Two-step pipeline: standardize the data set ('scale'), then fit the
# random forest ('model'). The step names are the prefixes used in param_grid.
pipe = Pipeline(steps=[('scale', StandardScaler()), ('model', rf)])

# Two sub-grids are searched; the 'model__' prefix routes each parameter to the
# 'model' step of the pipeline.
param_grid = [
    # Sub-grid 1: `bootstrap` is not listed, so the forest keeps its own default for it
    {
        'model__n_estimators': range(10, 1010, 10),  # 10, 20, ..., 1000 trees
        'model__max_features': max_features,
    },
    # Sub-grid 2: the same candidates, but with bootstrap sampling switched off
    {
        'model__bootstrap': [False],
        'model__n_estimators': range(10, 1010, 10),
        'model__max_features': max_features,
    },
]

# Exhaustive search over param_grid with 10-fold cross-validation, scored by
# negative mean squared error; training-fold scores are recorded as well.
rf_sociodemo_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
    return_train_score=True,
)
rf_sociodemo_search.fit(X_sociodemo_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('model',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid=[{'model__max_features': [21, 10, 7, 'sqrt', 'log2'],
                          'model__n_estimators': range(10, 1010, 10)},
                         {'model__bootstrap': [False],
                          'model__max_features': [21, 10, 7, 'sqrt', 'log2'],
                          'model__n_estimators': range(10, 1010, 10)}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [13]:
# Optimal parameters for sociodemographic features
# (the parameter combination selected by the fitted grid search)
print(rf_sociodemo_search.best_params_)

{'model__bootstrap': False, 'model__max_features': 7, 'model__n_estimators': 560}


In [14]:
# Turn the best-parameter dict into a one-row table and save it as CSV.
# NOTE(review): the table is stored as an ad-hoc attribute on the GridSearchCV
# object (not a scikit-learn attribute); kept as-is so existing references to
# `rf_sociodemo_search.best_params_results` continue to work.
rf_sociodemo_search.best_params_results = pd.DataFrame(
    data=rf_sociodemo_search.best_params_,
    index=[0],  # the dict holds scalar values, so an explicit one-row index is required
)
rf_sociodemo_search.best_params_results.to_csv(
    "../Data/04 Data for RF and DNN Optimal Parameters Search/02 70%_NYC_1000/rf_sociodemo_search.best_params_results.csv",
    index=False,
)

In [15]:
# Obtain all the searching results
sociodemo_cvres = rf_sociodemo_search.cv_results_

# Full cross-validation table, saved as-is
df_sociodemo_all_results = pd.DataFrame(sociodemo_cvres)
df_sociodemo_all_results.to_csv(
    "../Data/04 Data for RF and DNN Optimal Parameters Search/02 70%_NYC_1000/rf_sociodemo_all_results.csv",
    index=False,
)

# Compact table: parameter combination -> error on the RMSE scale.
# The search was scored with 'neg_mean_squared_error', so mean_test_score holds
# the negated MSE; flipping the sign and taking the square root converts it to
# a root-mean-squared error.
df_sociodemo_param_score = pd.DataFrame(
    {
        'param': sociodemo_cvres['params'],
        'rmts': np.sqrt(-sociodemo_cvres['mean_test_score']),
    }
)
df_sociodemo_param_score.to_csv(
    "../Data/04 Data for RF and DNN Optimal Parameters Search/02 70%_NYC_1000/rf_sociodemo_param_score.csv",
    index=False,
)