## Data Cleaning & Preprocessing

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import statsmodels.api as sm

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import VotingRegressor

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor


In [2]:
df = pd.read_csv('Data/Cleaned/county_df2.csv')
df.head()

Unnamed: 0,FIPS_x,County,Years of Potential Life Lost Rate (premature death),YPLL Rate (Black),YPLL Rate (Hispanic),YPLL Rate (White),% Fair/Poor Health,percent_smokers,percent_obese,Food Environment Index,...,cases_2020,cases_2021,cases_2022,deaths_2020,deaths_2021,deaths_2022,Masks,FIPS_y,Administered_Dose1_Pop_Pct,Administered_Dose1_Recip_65PlusPop_Pct
0,1001,Autauga,8824.0,10471.0,,8707.0,18,19,38,7.2,...,4190.0,11018.0,18961.0,48.0,160.0,230.0,267.0,1001,42.2,73.8
1,1003,Baldwin,7225.0,10042.0,3087.0,7278.0,18,17,31,8.0,...,13601.0,39911.0,67496.0,161.0,593.0,719.0,267.0,1003,53.2,89.9
2,1005,Barbour,9586.0,11333.0,,7310.0,26,22,44,5.6,...,1514.0,3860.0,7027.0,32.0,81.0,103.0,267.0,1005,44.5,75.3
3,1007,Bibb,11784.0,14813.0,,11328.0,20,20,38,7.6,...,1834.0,4533.0,7692.0,46.0,95.0,108.0,267.0,1007,36.6,64.2
4,1009,Blount,10908.0,,5620.0,11336.0,21,20,34,8.5,...,4641.0,11256.0,17731.0,63.0,198.0,260.0,267.0,1009,31.9,56.6


In [3]:
# Drop rows that we will not be using 
df.drop(columns = ['County', 'YPLL Rate (Black)', 'YPLL Rate (Hispanic)', 'YPLL Rate (White)', 'Number Uninsured', 'Number Primary Care Physicians', 'FIPS_y', 'Number pre-mature Deaths',
                        'Preventable Hosp. Rate (Black)', 'Preventable Hosp. Rate (Hispanic)', 'Preventable Hosp. Rate (White)',  'Percent Vaccinated Flu (Black)', 'Percent Uninsured',
                        'Percent  Vaccinated (Hispanic) Flu', 'Percent Vaccinated (White) Flu', 'Number Some College', 'Number Unemployed', 'Labor Force', 'PCP Ratio', 
                        '80th Percentile Income', '20th Percentile Income', '95% CI - Low', '95% CI - High', 'Life Expectancy (Black)', 'Life Expectancy (Hispanic)', 
                        'Life Expectancy (White)', 'Number HIV Cases', 'Household income (Black)', 'Household income (Hispanic)', 'Household income (White)'], inplace = True)

In [4]:
# Make FIPS index 
df.set_index('FIPS_x', inplace=True)

# Dummify  Presence of water violation
df['water'] = df['Presence of water violation'].map({'No': 0, 'Yes': 1})
df.drop(columns = ['Presence of water violation', 'State'], inplace = True)

# Set case and deaths
df['cases'] = df['cases_2022']
df['deaths'] = df['deaths_2022']
df.drop(columns = ['cases_2020', 'cases_2021', 'cases_2022', 'deaths_2020', 'deaths_2021', 'deaths_2022'], inplace = True)

# Drop NA values
df.dropna(inplace=True)
df.shape

(1828, 47)

## Linear Regression (LR) - **County (Cases)**

### All variables

In [27]:
y = df['cases']
X = df.drop(columns = ['cases', 'deaths'])

# TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [28]:
lr_county_cases = LinearRegression()
lr_county_cases.fit(X_train, y_train)

In [29]:
print(f'Train Score: {lr_county_cases.score(X_train, y_train)}')
print(f'Test Score: {lr_county_cases.score(X_test, y_test)}')

Train Score: 0.9649955996688843
Test Score: 0.9301377525966599


In [30]:
pd.DataFrame(set(zip(X.columns, lr_county_cases.coef_)), columns = ['Variable Name', 'Coefficient']).sort_values('Coefficient').round(1).head(5)

Unnamed: 0,Variable Name,Coefficient
27,percent Native Hawaiian/Other Pacific Islander,-3673.5
37,Percent Unemployed,-1178.9
0,Average Daily PM2.5,-1033.1
11,percent_smokers,-771.1
41,percent Excessive Drinking,-409.1


### 8 variables

In [31]:
y = df['cases']
X = df[['% Physically Inactive', 'Percent Unemployed', 'Average Daily PM2.5', 'Percent Insufficient Sleep', 'Percent Uninsured Adults',
       'Population', 'percent Asian', 'percent Not Proficient in English', 'Masks', 'Administered_Dose1_Pop_Pct']]

# TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [32]:
lr_county_cases_lim = LinearRegression()
lr_county_cases_lim.fit(X_train, y_train)

In [33]:
print(f'Train Score: {lr_county_cases_lim.score(X_train, y_train)}')
print(f'Test Score: {lr_county_cases_lim.score(X_test, y_test)}')

Train Score: 0.9639786858046525
Test Score: 0.932502060671604


In [34]:
pd.DataFrame(set(zip(X.columns, lr_county_cases_lim.coef_)), columns = ['Variable Name', 'Coefficient']).sort_values('Coefficient').round(1).head(5)

Unnamed: 0,Variable Name,Coefficient
2,percent Asian,-2293.4
3,Average Daily PM2.5,-1333.1
7,Percent Unemployed,-1221.3
6,Percent Uninsured Adults,-401.5
4,Masks,-3.8


## Linear Regression (LR) - **County (Deaths)**

### All variables

In [19]:
y = df['deaths']
X = df.drop(columns = ['cases', 'deaths'])

# TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [20]:
lr_county_deaths = LinearRegression()
lr_county_deaths.fit(X_train, y_train)

In [21]:
print(f'Train Score: {lr_county_deaths.score(X_train, y_train)}')
print(f'Test Score: {lr_county_deaths.score(X_test, y_test)}')

Train Score: 0.9311904155340461
Test Score: 0.9076985158526915


In [22]:
pd.DataFrame(set(zip(X.columns, lr_county_deaths.coef_)), columns = ['Variable Name', 'Coefficient']).sort_values('Coefficient').round(1).head(5)

Unnamed: 0,Variable Name,Coefficient
18,percent Native Hawaiian/Other Pacific Islander,-196.4
43,Food Environment Index,-30.6
7,Percent Food Insecure,-23.0
17,% Fair/Poor Health,-17.7
22,Inadequate Facilities,-11.3


### 8 X-variables

In [23]:
y = df['deaths']
X = df[['% Physically Inactive', 'Percent Unemployed', 'Average Daily PM2.5', 'Percent Insufficient Sleep', 'Percent Uninsured Adults',
       'Population', 'percent Asian', 'percent Not Proficient in English', 'Masks', 'Administered_Dose1_Pop_Pct']]

# TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [24]:
lr_county_deaths_lim = LinearRegression()
lr_county_deaths_lim.fit(X_train, y_train)

In [25]:
print(f'Train Score: {lr_county_deaths_lim.score(X_train, y_train)}')
print(f'Test Score: {lr_county_deaths_lim.score(X_test, y_test)}')

Train Score: 0.9234030729783924
Test Score: 0.9044371813655786


In [26]:
pd.DataFrame(set(zip(X.columns, lr_county_deaths_lim.coef_)), columns = ['Variable Name', 'Coefficient']).sort_values('Coefficient').round(1).head(5)

Unnamed: 0,Variable Name,Coefficient
8,percent Asian,-41.2
2,Average Daily PM2.5,-19.5
9,Percent Uninsured Adults,-5.4
6,Percent Unemployed,-3.6
0,Administered_Dose1_Pop_Pct,-0.2


## Random Forest (RF) w/ GradBoost & Random Search CV - **County (Cases)**

In [92]:
y = df['cases']

X = df.drop(columns = ['cases', 'deaths'])

# TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [93]:
params = {
    'max_features': np.arange(5, X.shape[1] + 1),
    'max_depth': np.append(np.arange(1, 50), None),
    'min_samples_leaf': [2, 3],   
    'n_estimators': [50, 100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

rf_gb = GradientBoostingRegressor(random_state = 42)

rf_gb_county_cases = RandomizedSearchCV(rf_gb, params, n_iter=100, cv = 5, n_jobs = 8)
# rf_gb_county_cases.get_params()

In [94]:
%%time
rf_gb_county_cases.fit(X_train, y_train)

CPU times: total: 4.58 s
Wall time: 35min 42s


In [95]:
print(f'Train Score: {rf_gb_county_cases.score(X_train, y_train)}')
print(f'Test Score: {rf_gb_county_cases.score(X_test, y_test)}')

Train Score: 0.9998258580194476
Test Score: 0.9089670660992167


In [96]:
rf_gb_county_cases.best_params_

{'n_estimators': 400,
 'min_samples_leaf': 3,
 'max_features': 36,
 'max_depth': 4}

In [None]:
pd.DataFrame({'Features': X.columns, 'Importance': rf_gb_county_cases.feature_importances_}).sort_values('Importance', ascending = False)

## Random Forest (RF) w/ GradBoost & Random Search CV - **County (Deaths)**

In [56]:
y = df['deaths']

X = df.drop(columns = ['cases', 'deaths'])

# TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [57]:
params = {
    'max_features': np.arange(5, X.shape[1] + 1),
    'max_depth': np.append(np.arange(1, 50), None),
    'n_estimators': [50, 100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

rf_gb = GradientBoostingRegressor(random_state = 42)

rf_gb_county_deaths = RandomizedSearchCV(rf_gb, params, n_iter=100, cv = 5, n_jobs = 8)
# rf_gb_county_deaths.get_params()

In [58]:
%%time
rf_gb_county_deaths.fit(X_train, y_train)

CPU times: total: 10.5 s
Wall time: 9min 59s


In [59]:
print(f'Train Score: {rf_gb_county_deaths.score(X_train, y_train)}')
print(f'Test Score: {rf_gb_county_deaths.score(X_test, y_test)}')

Train Score: 0.99973869062371
Test Score: 0.939742882722045


In [60]:
rf_gb_county_deaths.best_params_

{'n_estimators': 1800, 'max_features': 40, 'max_depth': 2}

In [None]:
pd.DataFrame({'Features': X.columns, 'Importance': rf_gb_county_deaths.feature_importances_}).sort_values('Importance', ascending = False)

## Extra Trees (ET) w/ Random Search CV - **County (Cases)**

In [97]:
y = df['cases']

X = df.drop(columns = ['cases', 'deaths'])

# TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [98]:
params = {
    'max_features': np.arange(5, X.shape[1] + 1),
    'min_samples_leaf': [2, 3],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'n_estimators': [50, 100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

et = ExtraTreesRegressor(n_estimators = 500, random_state = 42)

et_rs_county_cases = RandomizedSearchCV(et, params, n_iter=100, cv = 5, n_jobs = 8)
# et_rs_county_cases.get_params()

In [99]:
%%time
et_rs_county_cases.fit(X_train, y_train)

CPU times: total: 15.1 s
Wall time: 6min 48s


In [100]:
print(f'Train Score: {et_rs_county_cases.score(X_train, y_train)}')
print(f'Test Score: {et_rs_county_cases.score(X_test, y_test)}')

Train Score: 0.9015879124098862
Test Score: 0.9021278426774058


In [101]:
et_rs_county_cases.best_params_

{'n_estimators': 1400,
 'min_samples_leaf': 2,
 'max_features': 44,
 'max_depth': 50}