In [1]:
# This notebook runs a grid search over random forest (RF) hyperparameters.
# The 70% Dataset 3 (LA) data are used as the example.

In [2]:
# Packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the training predictors and the training target; the GEOID column becomes the index of each frame
X_sociodemo_train = pd.read_csv(
    "../Data/04 Data for RF and DNN Optimal Parameters Search/03 70%_LA_1000/X_sociodemo_train.csv",
    index_col='GEOID',
)
y_train = pd.read_csv(
    "../Data/04 Data for RF and DNN Optimal Parameters Search/03 70%_LA_1000/y_train.csv",
    index_col='GEOID',
)

In [4]:
# Collapse y_train into a one-dimensional array (the shape the regressor's fit expects for a single target)
y_train = np.asanyarray(y_train).ravel()

In [5]:
# Preview the first five rows of the sociodemographic training features
X_sociodemo_train.head(n=5)

Unnamed: 0_level_0,% Black,% Ame Indi and AK Native,% Asian,% Nati Hawa and Paci Island,% Hispanic or Latino,% male,% married,% age 18-29,% age 30-39,% age 40-49,...,% age >=60,% <highschool,median income,% unemployment,% below poverty line,% food stamp/SNAP,median value units built,median year units built,% renter-occupied housing units,population density
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6037127605,0.018509,0.0,0.056028,0.0,0.801151,0.531016,0.461859,0.2001,0.210105,0.113557,...,0.071786,0.302725,41701,0.102301,0.285248,0.185833,469400,1976,0.927323,6165.975475
6037271701,0.12619,0.003116,0.23507,0.0,0.156656,0.480872,0.385144,0.274364,0.180197,0.136057,...,0.203393,0.02113,91567,0.068697,0.091652,0.019502,1045700,1968,0.726616,5796.226082
6037276000,0.047294,0.013307,0.133969,0.0,0.230714,0.506923,0.517106,0.118684,0.13217,0.136666,...,0.270455,0.033132,124904,0.019415,0.051789,0.008768,875400,1949,0.216284,2379.01718
6037213202,0.013926,0.005624,0.269416,0.0,0.656936,0.500536,0.363458,0.207552,0.186931,0.115694,...,0.189877,0.351434,37656,0.092937,0.275576,0.117331,577800,1943,0.829755,11054.79436
6037239601,0.184073,0.017636,0.002204,0.016533,0.793331,0.510885,0.403396,0.185726,0.187379,0.114081,...,0.081565,0.470883,36029,0.050236,0.321369,0.171806,338600,1953,0.752203,8597.610959


In [6]:
# Row count of the features next to the length of the target; the two should match
X_sociodemo_train.shape[0], len(y_train)

(663, 663)

In [7]:
# Number of target values
y_train.shape[0]

663

In [9]:
# Number of sociodemographic features, i.e. the column count of the training frame
number_sociodemo = X_sociodemo_train.shape[1]
number_sociodemo

21

In [10]:
# Candidate values for max_features: every feature, a half and a third of them
# (both rounded down), plus the 'sqrt' and 'log2' options
max_features = [
    number_sociodemo,
    number_sociodemo // 2,
    number_sociodemo // 3,
    'sqrt',
    'log2',
]

In [9]:
# Grid search on the sociodemographic features: base random forest with a fixed seed
rf = RandomForestRegressor(
    random_state=42,
)

RandomForestRegressor()

# Two-step pipeline: standardise the features, then fit the forest
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', rf),
])

# Settings shared by both sub-grids: 10, 20, ..., 1000 trees crossed with every max_features candidate
shared_grid = {
    'model__n_estimators': range(10, 1010, 10),
    'model__max_features': max_features,
}

# First sub-grid leaves `bootstrap` at the estimator's default; the second sets it to False
param_grid = [
    dict(shared_grid),
    {'model__bootstrap': [False], **shared_grid},
]

# 10-fold cross-validated search scored by negative MSE, using all available cores
rf_sociodemo_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
rf_sociodemo_search.fit(X_sociodemo_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('model',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid=[{'model__max_features': [21, 10, 7, 'sqrt', 'log2'],
                          'model__n_estimators': range(10, 1010, 10)},
                         {'model__bootstrap': [False],
                          'model__max_features': [21, 10, 7, 'sqrt', 'log2'],
                          'model__n_estimators': range(10, 1010, 10)}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [10]:
# Report the best hyperparameter combination found for the sociodemographic features
best_sociodemo_params = rf_sociodemo_search.best_params_
print(best_sociodemo_params)

{'model__bootstrap': False, 'model__max_features': 10, 'model__n_estimators': 890}


In [14]:
# Save the best hyperparameters as a one-row CSV with one column per parameter.
# The table lives in a plain variable rather than as an attribute set on the
# fitted search object, so the estimator itself is left unmodified.
# index=[0] is required because the dict values are scalars, not sequences.
rf_sociodemo_search_best_params_results = pd.DataFrame(
    rf_sociodemo_search.best_params_,
    index=[0],
)
rf_sociodemo_search_best_params_results.to_csv(
    "../Data/04 Data for RF and DNN Optimal Parameters Search/03 70%_LA_1000/rf_sociodemo_search.best_params_results.csv",
    index=False,
)

In [15]:
# Collect every result of the search (one entry per parameter combination)
sociodemo_cvres = rf_sociodemo_search.cv_results_

# Full cv_results_ table, written out unchanged
df_sociodemo_all_results = pd.DataFrame(sociodemo_cvres)
df_sociodemo_all_results.to_csv(
    "../Data/04 Data for RF and DNN Optimal Parameters Search/03 70%_LA_1000/rf_sociodemo_all_results.csv",
    index=False,
)

# Compact table: parameter combination next to its error.
# The search was scored with 'neg_mean_squared_error', so mean_test_score is a
# negated MSE; flipping the sign recovers the MSE and the square root puts it
# back on the target's scale.
mean_neg_mse = sociodemo_cvres['mean_test_score']
df_sociodemo_param_score = pd.DataFrame({
    'param': sociodemo_cvres['params'],
    'rmts': np.sqrt(-mean_neg_mse),
})
df_sociodemo_param_score.to_csv(
    "../Data/04 Data for RF and DNN Optimal Parameters Search/03 70%_LA_1000/rf_sociodemo_param_score.csv",
    index=False,
)