# Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# State Level Models

In [2]:
# Read the merged state-level table from disk and preview its first rows.
state_csv_path = '../Data/merged_state_final.csv'
state_data = pd.read_csv(state_csv_path)
state_data.head()

Unnamed: 0,Location,Employment_2020,Employment_2021,Employment_2022,Inc_Per_Cap_2020,Inc_Per_Cap_2021,Inc_Per_CAp_2022,Life_Exp_2020,Life_Exp_2019,Life_Exp_2018,...,"Yes, 50 or more people",all_causes_2020,all_causes_2021,all_causes_2022,covid_2020,covid_2021,covid_2022,Covid_pop_perce_2020,Covid_pop_perce_2021,Covid_pop_perce_2022
0,Alabama,2671005,2769464,2869931,45887,50059,50916,73.2,75.2,75.1,...,1.0,9021,13018,6246,6337,9771,3933,0.001261,0.001945,0.000783
1,Alaska,430840,443047,457687,61898,65662,68635,76.6,77.7,78.0,...,0.0,545,1429,686,213,804,275,0.00029,0.001096,0.000375
2,Arizona,3920033,4086802,4287595,52133,56420,58442,76.3,78.8,78.7,...,2.89,13186,17961,8835,8603,13536,5849,0.001203,0.001893,0.000818
3,Arkansas,1639829,1686444,1755536,47147,51636,52618,73.8,75.7,75.6,...,1.0,4992,6908,3854,3691,5333,2593,0.001226,0.001771,0.000861
4,California,23154091,23934549,25300974,70061,76991,77036,79.0,80.9,80.8,...,1.18,41279,60680,36786,29962,48834,21158,0.000758,0.001235,0.000535


### Y = Excess Deaths

In [3]:
# Targets: the three pandemic-year excess-death columns (multi-output regression).
excess_death_targets = ['Exc_deaths_2020', 'Exc_deaths_2021', 'Exc_deaths_2022']

# Columns kept out of the feature matrix: the state label, the mask-mandate
# column, every excess-death column (earlier years and the targets), and all
# all-cause / COVID death outcome columns.
non_feature_columns = [
    'Location', 'Mask_Mandate',
    'Exc_deaths_2017', 'Exc_deaths_2018', 'Exc_deaths_2019',
    *excess_death_targets,
    'all_causes_2020', 'all_causes_2021', 'all_causes_2022',
    'covid_2020', 'covid_2021', 'covid_2022',
    'Covid_pop_perce_2020', 'Covid_pop_perce_2021', 'Covid_pop_perce_2022',
]

# NOTE(review): 'Unnamed: 0' (visible in the state_data preview) is not in the
# drop list, so it stays in X1 as a feature. It looks like a saved row index;
# confirm whether it should be excluded.
X1 = state_data.drop(columns=non_feature_columns)
y1 = state_data[excess_death_targets]

In [4]:
# Partition the states into train/test sets using the default split proportion;
# the fixed random_state keeps the partition reproducible across runs.
state_split = train_test_split(X1, y1, random_state=42)
X1_train, X1_test, y1_train, y1_test = state_split

In [5]:
# Fit a linear regression for excess deaths on the training states, then
# display the (train score, test score) pair as the cell output.
state_lr = LinearRegression().fit(X1_train, y1_train)
state_train_score = state_lr.score(X1_train, y1_train)
state_test_score = state_lr.score(X1_test, y1_test)
state_train_score, state_test_score

(1.0, 0.8032811801244987)

### Y = Covid Deaths

In [6]:
# Targets: yearly COVID death counts. The state feature matrix X1 is reused
# as-is, with the same seed as the excess-deaths split.
covid_death_targets = ['covid_2020', 'covid_2021', 'covid_2022']
y2 = state_data[covid_death_targets]

covid_split = train_test_split(X1, y2, random_state=42)
X2_train, X2_test, y2_train, y2_test = covid_split

In [7]:
# Fit a linear regression for COVID deaths on the training states, then
# display the (train score, test score) pair as the cell output.
state_lr2 = LinearRegression().fit(X2_train, y2_train)
covid_train_score = state_lr2.score(X2_train, y2_train)
covid_test_score = state_lr2.score(X2_test, y2_test)
covid_train_score, covid_test_score

(1.0, 0.8804713015012814)

# County Level Models

In [8]:
# Read the cleaned county-level table from disk and preview its first rows.
county_csv_path = '../Data/Cleaned/county_df3.csv'
county_data = pd.read_csv(county_csv_path)
county_data.head()

Unnamed: 0,FIPS_x,County,Years of Potential Life Lost Rate (premature death),YPLL Rate (Black),YPLL Rate (Hispanic),YPLL Rate (White),% Fair/Poor Health,percent_smokers,percent_obese,Food Environment Index,...,cases_2020,cases_2021,cases_2022,deaths_2020,deaths_2021,deaths_2022,Masks,FIPS_y,Administered_Dose1_Pop_Pct,Administered_Dose1_Recip_65PlusPop_Pct
0,1001,Autauga,8824.0,10471.0,,8707.0,18,19,38,7.2,...,4190.0,11018.0,18961.0,48.0,160.0,230.0,267.0,1001,42.2,73.8
1,1003,Baldwin,7225.0,10042.0,3087.0,7278.0,18,17,31,8.0,...,13601.0,39911.0,67496.0,161.0,593.0,719.0,267.0,1003,53.2,89.9
2,1005,Barbour,9586.0,11333.0,,7310.0,26,22,44,5.6,...,1514.0,3860.0,7027.0,32.0,81.0,103.0,267.0,1005,44.5,75.3
3,1007,Bibb,11784.0,14813.0,,11328.0,20,20,38,7.6,...,1834.0,4533.0,7692.0,46.0,95.0,108.0,267.0,1007,36.6,64.2
4,1009,Blount,10908.0,,5620.0,11336.0,21,20,34,8.5,...,4641.0,11256.0,17731.0,63.0,198.0,260.0,267.0,1009,31.9,56.6


### Y = Number of Cases

In [None]:
# Target is the 'cases' column; features are every other column except the
# two outcome columns 'cases' and 'deaths'.
# NOTE(review): the county_data preview only shows year-suffixed columns
# (cases_2020, deaths_2020, ...); confirm that plain 'cases' and 'deaths'
# exist among the truncated columns, otherwise these lookups raise KeyError.
# NOTE(review): the preview also shows a text column (County) and NaN values
# in the YPLL columns, all of which stay in X3 -- verify the estimator
# accepts them or add encoding / imputation first.
county_outcome_columns = ['cases', 'deaths']
y3 = county_data['cases']
X3 = county_data.drop(columns=county_outcome_columns)

# Seeded train/test partition for the case-count model.
county_cases_split = train_test_split(X3, y3, random_state=42)
X3_train, X3_test, y3_train, y3_test = county_cases_split

In [None]:
# Fit a linear regression for county case counts and report its scores.
lr_county_cases = LinearRegression()
lr_county_cases.fit(X3_train, y3_train)

# Score against this section's own X3/y3 splits. The unsuffixed
# X_train / y_train / X_test / y_test names used previously are not defined
# anywhere in the visible notebook, so on a fresh kernel they raise NameError
# (or silently score against stale variables left in the kernel).
print(f'Train Score: {lr_county_cases.score(X3_train, y3_train)}')
print(f'Test Score: {lr_county_cases.score(X3_test, y3_test)}')

In [None]:
# Pair each feature name with its fitted coefficient and show the five lowest
# coefficients, rounded to one decimal place.
# A list (not a set) keeps the rows in column order, so the index labels and
# the ordering of tied coefficients are reproducible from run to run.
county_cases_coefs = pd.DataFrame(
    list(zip(X3.columns, lr_county_cases.coef_)),
    columns=['Variable Name', 'Coefficient'],
)
county_cases_coefs.sort_values('Coefficient').round(1).head(5)

### Y = Deaths

In [None]:
# Target is the 'deaths' column; features are every other column except the
# two outcome columns 'cases' and 'deaths'.
# NOTE(review): the county_data preview only shows year-suffixed columns
# (cases_2020, deaths_2020, ...); confirm that plain 'cases' and 'deaths'
# exist among the truncated columns, otherwise these lookups raise KeyError.
# NOTE(review): the preview also shows a text column (County) and NaN values
# in the YPLL columns, all of which stay in X4 -- verify the estimator
# accepts them or add encoding / imputation first.
death_model_excluded = ['cases', 'deaths']
y4 = county_data['deaths']
X4 = county_data.drop(columns=death_model_excluded)

# TTS: seeded train/test partition for the deaths model.
county_deaths_split = train_test_split(X4, y4, random_state=42)
X4_train, X4_test, y4_train, y4_test = county_deaths_split

In [None]:
# Fit a linear regression for county deaths and report its scores.
# Fit and score on this section's own X4/y4 splits. The unsuffixed
# X_train / y_train / X_test / y_test names used previously are not defined
# anywhere in the visible notebook, so on a fresh kernel they raise NameError
# (or silently train and score on stale variables left in the kernel).
lr_county_deaths = LinearRegression()
lr_county_deaths.fit(X4_train, y4_train)

print(f'Train Score: {lr_county_deaths.score(X4_train, y4_train)}')
print(f'Test Score: {lr_county_deaths.score(X4_test, y4_test)}')

In [None]:
# Pair each feature name with its fitted coefficient and show the five lowest
# coefficients, rounded to one decimal place.
# A list (not a set) keeps the rows in column order, so the index labels and
# the ordering of tied coefficients are reproducible from run to run.
county_deaths_coefs = pd.DataFrame(
    list(zip(X4.columns, lr_county_deaths.coef_)),
    columns=['Variable Name', 'Coefficient'],
)
county_deaths_coefs.sort_values('Coefficient').round(1).head(5)