## Data Cleaning & Preprocessing

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import statsmodels.api as sm

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import VotingRegressor

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor


In [2]:
df = pd.read_csv('Data/Cleaned/county_df2.csv')
df.head()

Unnamed: 0,FIPS_x,County,Years of Potential Life Lost Rate (premature death),YPLL Rate (Black),YPLL Rate (Hispanic),YPLL Rate (White),% Fair/Poor Health,percent_smokers,percent_obese,Food Environment Index,...,cases_2020,cases_2021,cases_2022,deaths_2020,deaths_2021,deaths_2022,Masks,FIPS_y,Administered_Dose1_Pop_Pct,Administered_Dose1_Recip_65PlusPop_Pct
0,1001,Autauga,8824.0,10471.0,,8707.0,18,19,38,7.2,...,4190.0,11018.0,18961.0,48.0,160.0,230.0,267.0,1001,42.2,73.8
1,1003,Baldwin,7225.0,10042.0,3087.0,7278.0,18,17,31,8.0,...,13601.0,39911.0,67496.0,161.0,593.0,719.0,267.0,1003,53.2,89.9
2,1005,Barbour,9586.0,11333.0,,7310.0,26,22,44,5.6,...,1514.0,3860.0,7027.0,32.0,81.0,103.0,267.0,1005,44.5,75.3
3,1007,Bibb,11784.0,14813.0,,11328.0,20,20,38,7.6,...,1834.0,4533.0,7692.0,46.0,95.0,108.0,267.0,1007,36.6,64.2
4,1009,Blount,10908.0,,5620.0,11336.0,21,20,34,8.5,...,4641.0,11256.0,17731.0,63.0,198.0,260.0,267.0,1009,31.9,56.6


In [3]:
# Drop rows that we will not be using 
df.drop(columns = ['County', 'YPLL Rate (Black)', 'YPLL Rate (Hispanic)', 'YPLL Rate (White)', 'Number Uninsured', 'Number Primary Care Physicians', 'FIPS_y', 'Number pre-mature Deaths',
                        'Preventable Hosp. Rate (Black)', 'Preventable Hosp. Rate (Hispanic)', 'Preventable Hosp. Rate (White)',  'Percent Vaccinated Flu (Black)', 'Percent Uninsured',
                        'Percent  Vaccinated (Hispanic) Flu', 'Percent Vaccinated (White) Flu', 'Number Some College', 'Number Unemployed', 'Labor Force', 'PCP Ratio', 
                        '80th Percentile Income', '20th Percentile Income', '95% CI - Low', '95% CI - High', 'Life Expectancy (Black)', 'Life Expectancy (Hispanic)', 
                        'Life Expectancy (White)', 'Number HIV Cases', 'Household income (Black)', 'Household income (Hispanic)', 'Household income (White)'], inplace = True)

In [4]:
# Make FIPS index 
df.set_index('FIPS_x', inplace=True)

# Dummify  Presence of water violation
df['water'] = df['Presence of water violation'].map({'No': 0, 'Yes': 1})
df.drop(columns = ['Presence of water violation', 'State'], inplace = True)

# Set case and deaths
df['cases'] = df['cases_2022']
df['deaths'] = df['deaths_2022']
df.drop(columns = ['cases_2020', 'cases_2021', 'cases_2022', 'deaths_2020', 'deaths_2021', 'deaths_2022'], inplace = True)

# Drop NA values
df.dropna(inplace=True)
df.shape

(1828, 47)

## Random Forest (RF) w/ Random Search CV - **County (Cases)**

In [46]:
y = df['cases']

X = df.drop(columns = ['cases', 'deaths'])

# TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [47]:
params = {
    'max_features': np.arange(5, X.shape[1] + 1),
    'max_depth': np.append(np.arange(1, 50), None),    
    'n_estimators': [50, 100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

rf = RandomForestRegressor(
    random_state = 42,
)


rf_rs_county_cases = RandomizedSearchCV(rf, params, n_iter=100, cv = 5, n_jobs = 8)
# rf_rs_county_cases.get_params()

In [48]:
%%time
rf_rs_county_cases.fit(X_train, y_train)

CPU times: total: 56.4 s
Wall time: 25min 23s


In [49]:
print(f'Train Score: {rf_rs_county_cases.score(X_train, y_train)}')
print(f'Test Score: {rf_rs_county_cases.score(X_test, y_test)}')

Train Score: 0.9500416947494468
Test Score: 0.8198483742785813


In [50]:
rf_rs_county_cases.best_params_

{'n_estimators': 1200, 'max_features': 45, 'max_depth': 48}

In [140]:
pd.DataFrame({'Features': X.columns, 'Importance': rf_rs_county_cases.feature_importances_}).sort_values('Importance', ascending = False)

Unnamed: 0,Features,Importance
5,Population,0.923528
0,% Physically Inactive,0.016651
6,percent Asian,0.014197
4,Percent Uninsured Adults,0.013008
3,Percent Insufficient Sleep,0.008314
2,Average Daily PM2.5,0.008033
7,percent Not Proficient in English,0.006464
1,Percent Unemployed,0.006415
8,Masks,0.00339


## Random Forest (RF) w/ Random Search CV - **County (Deaths)**

In [41]:
y = df['deaths']

X = df.drop(columns = ['cases', 'deaths'])

# TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [42]:
params = {
    'max_features': np.arange(5, X.shape[1] + 1),
    'max_depth': np.append(np.arange(1, 50), None),
    'n_estimators': [50, 100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

rf = RandomForestRegressor(
    random_state = 42,
)


rf_rs_county_deaths = RandomizedSearchCV(rf, params, n_iter=100, cv = 5, n_jobs = 8)
# rf_rs_county_deaths.get_params()

In [43]:
%%time
rf_rs_county_deaths.fit(X_train, y_train)

CPU times: total: 2.02 s
Wall time: 25min 19s


In [44]:
print(f'Train Score: {rf_rs_county_deaths.score(X_train, y_train)}')
print(f'Test Score: {rf_rs_county_deaths.score(X_test, y_test)}')

Train Score: 0.9207419098172177
Test Score: 0.7684792892632633


In [45]:
rf_rs_county_deaths.best_params_

{'n_estimators': 50, 'max_features': 37, 'max_depth': 25}

## Extra Trees (ET) w/ Random Search CV - **County (Deaths)**

In [21]:
y = df['deaths']

X = df.drop(columns = ['cases', 'deaths'])

# TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [22]:
params = {
    'max_features': np.arange(5, X.shape[1] + 1),
    'min_samples_leaf': [2, 3],
  
    'max_depth': np.append(np.arange(1, 50), None),
    'n_estimators': [50, 100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

et = ExtraTreesRegressor(random_state = 42)

et_rs_county_deaths = RandomizedSearchCV(et, params, n_iter=100, cv = 5, n_jobs = 8)
# et_rs_county_deaths.get_params()

In [23]:
%%time
et_rs_county_deaths.fit(X_train, y_train)

CPU times: total: 2.88 s
Wall time: 7.56 s


In [24]:
print(f'Train Score: {et_rs_county_deaths.score(X_train, y_train)}')
print(f'Test Score: {et_rs_county_deaths.score(X_test, y_test)}')

Train Score: 0.8381905366896447
Test Score: 0.754295836725502


In [25]:
et_rs_county_deaths.best_params_

{'n_estimators': 800,
 'min_samples_leaf': 3,
 'max_features': 22,
 'max_depth': 14}

In [27]:
pd.DataFrame({'Features': X.columns, 'Importance': et_rs_county_deaths.best_estimator_.feature_importances_}).sort_values('Importance', ascending = False)

Unnamed: 0,Features,Importance
29,Population,0.398346
15,Percent Severe Housing Problems,0.094239
34,percent Asian,0.073256
16,Severe Housing Cost Burden,0.053883
38,percent Not Proficient in English,0.052232
17,Overcrowding,0.03394
14,Average Daily PM2.5,0.027853
37,percent Non-Hispanic White,0.019717
21,HIV Prevalence Rate,0.019321
40,number Rural,0.015762
