In [1]:
# This notebook runs the grid search for the Random Forest (RF) hyperparameters.
# The 70% Dataset 3 (BUF) data is used as the example.

In [2]:
# Packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the training features and the training target; both CSVs are indexed by their GEOID column
X_sociodemo_train = pd.read_csv(
    "../Data/04 Data for RF and DNN Optimal Parameters Search/04 70%_Buf_1000/X_sociodemo_train.csv",
    index_col='GEOID',
)
y_train = pd.read_csv(
    "../Data/04 Data for RF and DNN Optimal Parameters Search/04 70%_Buf_1000/y_train.csv",
    index_col='GEOID',
)

In [4]:
# Collapse the target into a flat 1-D NumPy array (the GEOID index is dropped here)
y_train = np.asanyarray(y_train).ravel()

In [5]:
# Preview the first five rows of the feature table
X_sociodemo_train.head(n=5)

Unnamed: 0_level_0,% Black,% Ame Indi and AK Native,% Asian,% Nati Hawa and Paci Island,% Hispanic or Latino,% male,% married,% age 18-29,% age 30-39,% age 40-49,...,% age >=60,% <highschool,median income,% unemployment,% below poverty line,% food stamp/SNAP,median value units built,median year units built,% renter-occupied housing units,population density
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
36029001100,0.057063,0.01531,0.032359,0.0,0.105428,0.525052,0.324919,0.186152,0.134308,0.092206,...,0.217815,0.170852,37886,0.071886,0.213184,0.23905,74300,1939,0.54343,1885.467988
36029005801,0.110658,0.004622,0.04323,0.0,0.24932,0.450517,0.265847,0.161773,0.127243,0.091898,...,0.156879,0.206461,26111,0.085294,0.422784,0.471989,70500,1939,0.621148,5311.666893
36029006602,0.196508,0.004872,0.023143,0.0,0.041007,0.485181,0.320053,0.218433,0.116931,0.131141,...,0.331709,0.097209,52774,0.049477,0.131878,0.267767,286700,1939,0.685571,5519.723543
36029000500,0.06295,0.0,0.0,0.0,0.153477,0.432854,0.361005,0.118106,0.126499,0.083933,...,0.220624,0.228101,26605,0.063401,0.414311,0.331218,50800,1939,0.376904,266.853532
36029006902,0.156107,0.006745,0.139484,0.0,0.229824,0.506384,0.34893,0.189593,0.197543,0.100458,...,0.184775,0.257278,26226,0.095465,0.397037,0.381648,123600,1939,0.654849,6411.702013


In [6]:
# Row counts of features and target, shown side by side (they should match)
X_sociodemo_train.shape[0], y_train.shape[0]

(54, 54)

In [7]:
# Display the flattened target values
y_train

array([33.4, 37.4, 28.2, 38.6, 33.9, 30.2, 42.8, 42.4, 29.1, 37.3, 47.2,
       39. , 36.6, 29.8, 41.1, 44. , 30.2, 47. , 49.1, 45.1, 46. , 46.6,
       47.5, 26.2, 46.1, 30.5, 44.1, 46.9, 44.7, 37.8, 40.8, 26.7, 33.5,
       39. , 43.5, 36.1, 46.3, 27. , 39.8, 32.4, 28.6, 31.6, 26.2, 43.5,
       41.7, 34.3, 47. , 30.8, 29.4, 42.3, 33.6, 30. , 26.1, 41.7])

In [8]:
# Count the sociodemographic feature columns and display the result
number_sociodemo = X_sociodemo_train.shape[1]
number_sociodemo

21

In [12]:
# Candidate values of max_features to be searched: all features, one half,
# one third, plus the 'sqrt' and 'log2' heuristics.
# The integer candidates are clamped to at least 1: for very small feature
# counts the half / third would round down to 0, and scikit-learn rejects an
# integer max_features of 0. Floor division replaces int(n / k) (same result
# for positive counts, no float round-trip).
_int_candidates = [max(1, number_sociodemo // divisor) for divisor in (1, 2, 3)]

# dict.fromkeys drops repeated candidates while keeping their order, so the
# grid search never refits an identical configuration twice.
max_features = list(dict.fromkeys(_int_candidates + ['sqrt', 'log2']))

In [13]:
# Grid search - sociodemographic features

# Base estimator; the fixed random_state keeps the forests reproducible.
# (A stray bare `RandomForestRegressor()` expression that built and discarded
# an unused estimator was removed from this cell.)
rf = RandomForestRegressor(random_state=42)

# Standardize the features, then fit the forest. Keeping the scaler inside the
# pipeline lets the search fit it together with the model on each CV split.
pipe = Pipeline([
    ('scale', StandardScaler()),  # standardization of the data set
    ('model', rf)
])

# Search space: bootstrapping disabled, 100 forest sizes (10, 20, ..., 1000)
# crossed with every max_features candidate.
param_grid = [
    {'model__bootstrap': [False],
     'model__n_estimators': range(10, 1010, 10),
     'model__max_features': max_features}
]

# 10-fold cross-validation on all available cores. Scores are negated MSE
# (higher is better); training scores are kept as well for later inspection.
rf_sociodemo_search = GridSearchCV(
    pipe,
    param_grid,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    return_train_score=True,
)
rf_sociodemo_search.fit(X_sociodemo_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('model',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid=[{'model__bootstrap': [False],
                          'model__max_features': [21, 10, 7, 'sqrt', 'log2'],
                          'model__n_estimators': range(10, 1010, 10)}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [14]:
# Optimal parameters for sociodemographic features:
# the parameter combination selected by the grid search as best
print(rf_sociodemo_search.best_params_)

{'model__bootstrap': False, 'model__max_features': 'sqrt', 'model__n_estimators': 160}


In [14]:
# Save the selected hyper-parameters as a one-row CSV. index=[0] is required
# because the values in best_params_ are scalars.
# NOTE(review): the frame is stored as an ad-hoc attribute on the GridSearchCV
# object rather than in its own variable; kept unchanged in case other cells
# read it - consider a plain variable instead.
rf_sociodemo_search.best_params_results = pd.DataFrame(
    data=rf_sociodemo_search.best_params_,
    index=[0],
)
rf_sociodemo_search.best_params_results.to_csv(
    "../Data/04 Data for RF and DNN Optimal Parameters Search/04 70%_Buf_1000/rf_sociodemo_search.best_params_results.csv",
    index=False,
)

In [15]:
# Collect the complete cross-validation log of the grid search
sociodemo_cvres = rf_sociodemo_search.cv_results_

# Full dump: every cv_results_ field, written out as a CSV
df_sociodemo_all_results = pd.DataFrame(sociodemo_cvres)
df_sociodemo_all_results.to_csv(
    "../Data/04 Data for RF and DNN Optimal Parameters Search/04 70%_Buf_1000/rf_sociodemo_all_results.csv",
    index=False,
)

# Compact summary: each parameter dict next to 'rmts', the square root of the
# negated mean test score. The search is scored with 'neg_mean_squared_error',
# so the stored score is -MSE and the sign must be flipped before the root.
df_sociodemo_param_score = pd.DataFrame(
    {
        'param': sociodemo_cvres['params'],
        'rmts': np.sqrt(np.negative(sociodemo_cvres['mean_test_score'])),
    }
)
df_sociodemo_param_score.to_csv(
    "../Data/04 Data for RF and DNN Optimal Parameters Search/04 70%_Buf_1000/rf_sociodemo_param_score.csv",
    index=False,
)