### Imports

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

### 2020 Census Population data

In [2]:
url = 'https://www.census.gov/quickfacts/quickfacts/geo/dashboard/US/POP010220'
res = requests.get(url)

In [3]:
soup = BeautifulSoup(res.content)

In [4]:
table = soup.find(attrs={'class':'qf-graph-scroll'})
# soup.find_all(attrs={'class':'qf-positive'})
tbody = table.find_all(attrs={'class':"qf-graph-geo"})
tr = tbody[1]
state = tr.find('a').attrs['data-title']
tr.find(attrs={'class':'qf-positive'}).attrs['data-value']

'29145505'

In [5]:
pop_2020 = []
for tr in tbody:
    state = tr.find('a').attrs['data-title']
    pop = tr.find(attrs={'class':'qf-positive'}).attrs['data-value']
    
    state_pop = {'State': state, '2020 Population': pop}
    pop_2020.append(state_pop)

In [6]:
pop_2020 = pd.DataFrame(pop_2020)

### 2010 Census Population Data

In [7]:
url2 = 'https://www.census.gov/quickfacts/quickfacts/geo/dashboard/US/POP010210'
res2 = requests.get(url2)

In [8]:
soup2 = BeautifulSoup(res2.content)
table2 = soup2.find(attrs={'class':'qf-graph-scroll'})
tbody2 = table2.find_all(attrs={'class':"qf-graph-geo"})

In [9]:
pop_2010 = []
for tr in tbody2:
    state = tr.find('a').attrs['data-title']
    pop = tr.find(attrs={'class':'qf-positive'}).attrs['data-value']
    
    state_pop = {'State': state, '2010 Population': pop}
    pop_2010.append(state_pop)

pop_2010 = pd.DataFrame(pop_2010)

In [10]:
population = pd.merge(pop_2010, pop_2020)
population.head()

Unnamed: 0,State,2010 Population,2020 Population
0,California,37253956,39538223
1,Texas,25145561,29145505
2,New York,19378102,20201249
3,Florida,18801310,21538187
4,Illinois,12830632,12812508


In [11]:
population.to_csv('Data/Population_data_2010_&_2020.csv')

### Excess Deaths Data

In [12]:
deaths = pd.read_csv('Ignore/Excess_Deaths_Associated_with_COVID-19.csv')
deaths.head()

Unnamed: 0,Week Ending Date,State,Observed Number,Upper Bound Threshold,Exceeds Threshold,Average Expected Count,Excess Estimate,Total Excess Estimate,Percent Excess Estimate,Year,Type,Outcome,Suppress,Note
0,2017-01-07,Alabama,1121.0,1136,False,1059,62,29601,5.8527,2017,Predicted (weighted),All causes,,
1,2017-01-14,Alabama,1130.0,1140,False,1067,63,29601,5.906102,2017,Predicted (weighted),All causes,,
2,2017-01-21,Alabama,1048.0,1142,False,1071,0,29601,0.0,2017,Predicted (weighted),All causes,,
3,2017-01-28,Alabama,1026.0,1142,False,1070,0,29601,0.0,2017,Predicted (weighted),All causes,,
4,2017-02-04,Alabama,1036.0,1142,False,1068,0,29601,0.0,2017,Predicted (weighted),All causes,,


In [13]:
# getting sum of each states excess deaths 
def excess_deaths(deaths, start_year=2017, end_year=2022):
    state_totals = {state: [0] * (end_year - start_year + 1) for state in 
                    deaths['State'].unique()}
    
    for index, row in deaths.iterrows():
        state = row['State']
        year = row['Year']
        excess_estimate = row['Excess Estimate']
        
        if start_year <= year <= end_year:
            state_totals[state][year - start_year] += excess_estimate

    result = pd.DataFrame(state_totals).T.reset_index()
    result.columns = ['State'] + [f'Exc_deaths_{year}' for year in 
                                  range(start_year, end_year + 1)]
    
    return result


excess_deaths = excess_deaths(deaths)
excess_deaths.head()


Unnamed: 0,State,Exc_deaths_2017,Exc_deaths_2018,Exc_deaths_2019,Exc_deaths_2020,Exc_deaths_2021,Exc_deaths_2022
0,Alabama,2649,4062,945,20726,29283,14805
1,Alaska,417,492,639,1422,3483,1783
2,Arizona,3522,4281,1785,30955,40347,20653
3,Arkansas,3054,2199,1434,11285,15391,8969
4,California,17241,13434,1029,93875,133206,89195


In [14]:
# Getting count for each year and each state where they exceed their threshold
def count_exceeds_threshold(deaths, start_year=2017, end_year=2022):
    state_counts = {state: [0] * (end_year - start_year + 1) for state in 
                    deaths['State'].unique()}
    
    for index, row in deaths.iterrows():
        state = row['State']
        year = row['Year']
        exceeds_threshold = row['Exceeds Threshold']
        
        if start_year <= year <= end_year and exceeds_threshold:
            state_counts[state][year - start_year] += 1

    result = pd.DataFrame(state_counts).T.reset_index()
    result.columns = ['State'] + [f'Exc_count_{year}' for year in 
                                  range(start_year, end_year + 1)]
    
    return result
exceeds_threshold = count_exceeds_threshold(deaths)
exceeds_threshold.head()

Unnamed: 0,State,Exc_count_2017,Exc_count_2018,Exc_count_2019,Exc_count_2020,Exc_count_2021,Exc_count_2022
0,Alabama,3,15,0,99,106,87
1,Alaska,6,6,3,34,75,42
2,Arizona,9,15,0,89,114,77
3,Arkansas,12,12,0,66,93,64
4,California,18,12,0,94,104,106


In [15]:
# Merging the data
finaldeaths = pd.merge(excess_deaths, exceeds_threshold, how='left')
finaldeaths.head()

Unnamed: 0,State,Exc_deaths_2017,Exc_deaths_2018,Exc_deaths_2019,Exc_deaths_2020,Exc_deaths_2021,Exc_deaths_2022,Exc_count_2017,Exc_count_2018,Exc_count_2019,Exc_count_2020,Exc_count_2021,Exc_count_2022
0,Alabama,2649,4062,945,20726,29283,14805,3,15,0,99,106,87
1,Alaska,417,492,639,1422,3483,1783,6,6,3,34,75,42
2,Arizona,3522,4281,1785,30955,40347,20653,9,15,0,89,114,77
3,Arkansas,3054,2199,1434,11285,15391,8969,12,12,0,66,93,64
4,California,17241,13434,1029,93875,133206,89195,18,12,0,94,104,106


In [16]:
finaldeaths.to_csv('Data/Excess_deaths&Exceeds_Threshold_data.csv')

### Population Density Data

In [17]:
url3 = 'https://wisevoter.com/state-rankings/population-density-by-state/'
res3 = requests.get(url3)
soup3 = BeautifulSoup(res3.content)

In [18]:
table3 = soup3.find('table', attrs={'id': 'shdb-on-page-table'})
tbody = table3.find('tbody')
trs = tbody.find_all('tr')

In [19]:
pop_density = []
for tr in trs:
    state = tr.find(attrs={'class':'shdb-on-page-table-body-Geo'}).text
    density = tr.find(attrs={'class':'shdb-on-page-table-body-Data'}).text.split()[0]
    
    state_density = {'State':state, 'Population Density per mi²': density}
    pop_density.append(state_density)
Pop_density = pd.DataFrame(pop_density)
Pop_density.head()

Unnamed: 0,State,Population Density per mi²
0,New Jersey,1283.4
1,Rhode Island,1074.3
2,Massachusetts,919.82
3,Connecticut,746.7
4,Maryland,648.84


In [20]:
Pop_density.to_csv('Data/Population_Density_data.csv')

### Merging the data

In [26]:
mask = pd.read_csv('Data/cleaned_mask.csv')

In [24]:
mask

Unnamed: 0.1,Unnamed: 0,Location,Mask_Mandate,Mandatory
0,0,Alabama,07/16/2020,Yes
1,1,Alaska,,No
2,2,Arizona,,No
3,3,Arkansas,07/20/2020,Yes
4,4,California,07/18/2020,Yes
5,5,Colorado,07/17/2020,Yes
6,6,Connecticut,04/17/2020,Yes
7,7,Delaware,09/16/2020,Yes
8,8,District of Columbia,07/22/2020,Yes
9,9,Florida,,No


### Modeling 

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression