### Imports

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# State Level Data

### Census Population

##### 2020

In [2]:
# Download the Census QuickFacts dashboard page for indicator POP010220 (2020 section).
url = 'https://www.census.gov/quickfacts/quickfacts/geo/dashboard/US/POP010220'
# A timeout stops the cell from hanging indefinitely on a stalled connection.
res = requests.get(url, timeout=30)
# Fail here on an HTTP error status rather than parsing an error page in the next cell.
res.raise_for_status()

In [3]:
# Parse the downloaded page. Naming the parser silences bs4's "no parser was
# explicitly specified" warning and keeps the parse independent of which
# optional parser libraries happen to be installed.
soup = BeautifulSoup(res.content, 'html.parser')
table = soup.find(attrs={'class':'qf-graph-scroll'})
# Every element tagged 'qf-graph-geo' inside the chart container.
tbody = table.find_all(attrs={'class':"qf-graph-geo"})

# Spot-check one element: the name is read from the link's data-title attribute
# and the figure from the data-value attribute of the 'qf-positive' element.
tr = tbody[1]
state = tr.find('a').attrs['data-title']
tr.find(attrs={'class':'qf-positive'}).attrs['data-value']

'29145505'

In [4]:
# Build one {'State', '2020 Population'} record per scraped element, then tabulate.
# Values are kept exactly as scraped (attribute strings).
rows_2020 = [
    {
        'State': row.find('a').attrs['data-title'],
        '2020 Population': row.find(attrs={'class':'qf-positive'}).attrs['data-value'],
    }
    for row in tbody
]

pop_2020 = pd.DataFrame(rows_2020)
pop_2020.head()

Unnamed: 0,State,2020 Population
0,California,39538223
1,Texas,29145505
2,Florida,21538187
3,New York,20201249
4,Pennsylvania,13002700


##### 2010

In [5]:
# Download the Census QuickFacts dashboard page for indicator POP010210 (2010 section).
url2 = 'https://www.census.gov/quickfacts/quickfacts/geo/dashboard/US/POP010210'
# A timeout stops the cell from hanging indefinitely on a stalled connection.
res2 = requests.get(url2, timeout=30)
# Fail here on an HTTP error status rather than parsing an error page in the next cell.
res2.raise_for_status()

In [6]:
# Parse the 2010 page. The parser is named explicitly so bs4 does not guess one
# (which emits a warning and can differ between environments).
soup2 = BeautifulSoup(res2.content, 'html.parser')
table2 = soup2.find(attrs={'class':'qf-graph-scroll'})
# Every element tagged 'qf-graph-geo' inside the chart container.
tbody2 = table2.find_all(attrs={'class':"qf-graph-geo"})

In [7]:
# Build one {'State', '2010 Population'} record per scraped element, then tabulate.
rows_2010 = [
    {
        'State': row.find('a').attrs['data-title'],
        '2010 Population': row.find(attrs={'class':'qf-positive'}).attrs['data-value'],
    }
    for row in tbody2
]

pop_2010 = pd.DataFrame(rows_2010)

# Inner join of the two years. The key is spelled out so the join cannot silently
# change if either frame ever gains another column with a shared name.
population = pd.merge(pop_2010, pop_2020, on='State')
population.head()

Unnamed: 0,State,2010 Population,2020 Population
0,California,37253956,39538223
1,Texas,25145561,29145505
2,New York,19378102,20201249
3,Florida,18801310,21538187
4,Illinois,12830632,12812508


In [8]:
population.dtypes

State              object
2010 Population    object
2020 Population    object
dtype: object

In [10]:
# Both population columns are object dtype after scraping; cast them to int
# before writing the combined table to disk.
population = population.astype({'2010 Population': int, '2020 Population': int})
population.to_csv('../Data/Population_data_2010_&_2020.csv', index=False)

### Asthma

CSV files with incidence and prevalence rates for asthma, diabetes, cancer, COPD, heart disease, and kidney disease were downloaded from the CDC website. Each file is approximately 40 MB and contains responses to various questions, rates by demographic group, and overall population totals spanning more than a decade. Each dataset must be cleaned, and unnecessary information dropped, to reduce the files to a manageable size.

In [None]:
asthma = pd.read_csv('Ignore/Asthma.csv')
asthma.shape

In [None]:
asthma.head()

In [None]:
asthma.Question.value_counts()

In [None]:
asthma.describe()

In [None]:
# Only a single year of pre-existing-condition data is needed, so keep the 2019 rows.
asthma = asthma.loc[asthma['YearStart'] == 2019]

In [None]:
asthma.shape

In [None]:
asthma.YearStart.value_counts()

In [None]:
# Narrow the frame to the seven columns of interest.
asthma = asthma[['YearStart', 'LocationAbbr', 'LocationDesc', 'Question', 'DataValue',
                 'Stratification1', 'DataValueType']]
# The original line referenced a misspelled name (`astha`) and raised NameError.
# Print the shape and leave the preview as the cell's displayed value.
print(asthma.shape)
asthma.head()

In [None]:
asthma['LocationDesc'].nunique()

In [None]:
# Drop exact duplicate rows by reassignment: `asthma` was produced by slicing an
# earlier frame, and an in-place drop on such a frame can raise SettingWithCopyWarning.
asthma = asthma.drop_duplicates()
# Keep only the rows whose stratification is 'Overall'.
asthma = asthma[asthma['Stratification1'] == 'Overall']
asthma.shape

### Cardiovascular

In [None]:
heart = pd.read_csv('Ignore/Cardiovascular.csv')
heart.head()

In [None]:
# Split the cardiovascular file by question text: one subset for total
# cardiovascular mortality, one for awareness of high blood pressure.
mortality_question = 'Mortality from total cardiovascular diseases'
cardiac_mortality = heart.loc[heart['Question'] == mortality_question]
print(cardiac_mortality.shape)

hbp_question = 'Awareness of high blood pressure among adults aged >= 18 years'
high_blood_pressure = heart.loc[heart['Question'] == hbp_question]
print(high_blood_pressure.shape)

In [None]:
copd = pd.read_csv('Ignore/COPD.csv')
copd.shape

In [None]:
# Only a cell's last expression is displayed, so the question counts are printed
# explicitly; previously the result was computed and silently discarded.
print(copd.Question.value_counts())
# Keep only the adult COPD prevalence question.
copd = copd[copd['Question']== 'Prevalence of chronic obstructive pulmonary disease among adults >= 18']
copd.shape

In [None]:
diabetes = pd.read_csv('Ignore/Diabetes.csv')
diabetes.shape

In [None]:
# Only a cell's last expression is displayed, so the question counts are printed
# explicitly; previously the result was computed and silently discarded.
print(diabetes['Question'].value_counts())
# Keep only the adult diagnosed-diabetes prevalence question.
diabetes = diabetes[diabetes['Question']== 'Prevalence of diagnosed diabetes among adults aged >= 18 years']
diabetes.shape

In [None]:
kidney = pd.read_csv('Ignore/Kidney.csv')

In [None]:
# Only a cell's last expression is displayed, so the question counts are printed
# explicitly; previously the result was computed and silently discarded.
print(kidney['Question'].value_counts())
# Keep only the adult chronic-kidney-disease prevalence question.
kidney = kidney[kidney['Question']== 'Prevalence of chronic kidney disease among adults aged >= 18 years']
kidney.shape

### Population Density

In [11]:
# Download and parse the population-density-by-state ranking page.
url3 = 'https://wisevoter.com/state-rankings/population-density-by-state/'
# A timeout stops the cell from hanging indefinitely on a stalled connection.
res3 = requests.get(url3, timeout=30)
# Fail here on an HTTP error status rather than parsing an error page.
res3.raise_for_status()
# The parser is named explicitly so bs4 does not guess one (which emits a warning
# and can differ between environments).
soup3 = BeautifulSoup(res3.content, 'html.parser')

In [12]:
# Locate the on-page ranking table and collect its body rows.
table3 = soup3.find('table', attrs={'id': 'shdb-on-page-table'})
# Uses its own name instead of rebinding `tbody`, which the Census cells earlier
# in the notebook use for a list of scraped elements; reusing it would break
# those cells if they were re-run after this one.
density_tbody = table3.find('tbody')
trs = density_tbody.find_all('tr')

In [13]:
# Build one record per table row: the state name and the first whitespace-separated
# token of the data cell's text.
density_col = 'Population Density per mi²'
pop_density = []
for tr in trs:
    state = tr.find(attrs={'class':'shdb-on-page-table-body-Geo'}).text
    density = tr.find(attrs={'class':'shdb-on-page-table-body-Data'}).text.split()[0]

    pop_density.append({'State': state, density_col: density})

# Explicit columns keep the frame well-formed even if no rows were scraped.
Pop_density = pd.DataFrame(pop_density, columns=['State', density_col])
# The scraped values are text. Convert them here so a fresh Restart & Run All
# reproduces the numeric column shown by the dtypes check that follows; no other
# visible cell performs this conversion. Commas are removed first in case the
# page formats numbers with thousands separators.
Pop_density[density_col] = pd.to_numeric(
    Pop_density[density_col].str.replace(',', '', regex=False)
)
Pop_density.head()

Unnamed: 0,State,Population Density per mi²
0,New Jersey,1283.4
1,Rhode Island,1074.3
2,Massachusetts,919.82
3,Connecticut,746.7
4,Maryland,648.84


In [18]:
Pop_density.dtypes

State                          object
Population Density per mi²    float64
dtype: object

In [21]:
Pop_density.to_csv('../Data/Population_Density_data.csv', index=False)

# County Level Data

## Covid Vaccination Rates by county

In [None]:
# Load the county-level COVID-19 vaccination file.
covid_vax = pd.read_csv('Ignore/COVID-19_Vaccinations_by_county.csv')
# `shape` is an attribute (a tuple), not a method; calling it raised TypeError.
covid_vax.shape

In [None]:
covid_vax.head()

In [None]:
# Keep two snapshot dates: 9/18/2021 (early) and 12/28/2022 (late).
# NOTE(review): this is an exact string comparison against the 'Date' column —
# confirm the file does not zero-pad months (e.g. '09/18/2021'), which would match nothing.
early_date, late_date = '9/18/2021', '12/28/2022'
# .copy() gives each snapshot its own frame, independent of `covid_vax`.
early_vax = covid_vax.loc[covid_vax['Date'] == early_date].copy()
late_vax = covid_vax.loc[covid_vax['Date'] == late_date].copy()

In [None]:
# Remove the two completion-percentage columns from the early snapshot.
# Reassigning (rather than inplace=True) together with errors='ignore' makes the
# cell safe to re-run: a second run no longer raises KeyError for columns that
# are already gone.
early_vax = early_vax.drop(
    columns=['Series_Complete_Pop_Pct', 'Series_Complete_65PlusPop_Pct'],
    errors='ignore',
)
early_vax.head()

In [None]:
# Remove the date/week, 65+ completion-percentage and 2019 census columns from
# the late snapshot.
# Reassigning (rather than inplace=True) together with errors='ignore' makes the
# cell safe to re-run: a second run no longer raises KeyError for columns that
# are already gone.
late_vax = late_vax.drop(
    columns=['Date', 'MMWR_week', 'Series_Complete_65PlusPop_Pct',
             'Census2019', 'Census2019_65PlusPop'],
    errors='ignore',
)
late_vax.head()

In [None]:
# Write both vaccination snapshots to the raw-data folder, without the index column.
early_vax.to_csv('../Data/Raw/early_vax_2021.csv', index=False)
# The method is `to_csv`; the original `tocsv` raised AttributeError, so the
# late snapshot was never written.
late_vax.to_csv('../Data/Raw/late_vax_2022.csv', index=False)

## Covid Cases

In [None]:
cases = pd.read_csv('Ignore/covid_confirmed_usafacts.csv')
cases.head()

In [None]:
# Remove the standalone word "County" from each name, then trim edge whitespace.
county_label = cases['County Name'].str.replace(r'\bCounty\b', '', regex=True)
cases['County'] = county_label.str.strip()

## Population

In [1]:
# Load the county population file (USAFacts, per the file name) and summarise its
# numeric columns.
# NOTE(review): the saved output of this cell is "NameError: name 'pd' is not defined" —
# it was last run in a fresh kernel before the imports cell at the top of the notebook.
pop = pd.read_csv('./Ignore/covid_county_population_usafacts.csv')
pop.describe()

NameError: name 'pd' is not defined

In [None]:
pop.head()

In [None]:
# Lookup table: two-letter postal abbreviation -> full name, covering the
# 50 states plus the District of Columbia.
state_name = {
    "AL": "Alabama",
    "AK": "Alaska",
    "AZ": "Arizona",
    "AR": "Arkansas",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "DC": "District of Columbia",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland",
    "MA": "Massachusetts",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MS": "Mississippi",
    "MO": "Missouri",
    "MT": "Montana",
    "NE": "Nebraska",
    "NV": "Nevada",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NY": "New York",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VT": "Vermont",
    "VA": "Virginia",
    "WA": "Washington",
    "WV": "West Virginia",
    "WI": "Wisconsin",
    "WY": "Wyoming",
}

In [None]:
# Total the 'population' column per 'State' value in one grouped pass instead of
# accumulating row by row with iterrows().
# sort=False keeps the keys in order of first appearance, as the row loop did.
state_pop_sum = pop.groupby('State', sort=False)['population'].sum().to_dict()

for state, pop_sum in state_pop_sum.items():
    print(f"{state}: {pop_sum}")

In [None]:
# Turn the per-state totals dict into a two-column frame (key -> 'State',
# value -> 'Total Population').
tot_pop = pd.DataFrame({
    'State': list(state_pop_sum.keys()),
    'Total Population': list(state_pop_sum.values()),
})
tot_pop.head()

In [None]:
# Attach each state's total to every row of the county table (inner join on 'State').
merged_pop = pop.merge(tot_pop, on='State')
merged_pop.head()

In [None]:
# Each row's population as a percentage of its state's total.
county_share = merged_pop['population'].div(merged_pop['Total Population'])
merged_pop['pop %'] = county_share.mul(100)
merged_pop.head()

In [None]:
# Save the population table with its percentage column, without the index.
# NOTE(review): this writes under './Data/' while the other exports in this notebook
# write under '../Data/' — confirm which folder is intended.
merged_pop.to_csv('./Data/population_w_percent.csv', index = False)

## Deaths by County

In [None]:
# Load the county-level deaths file.
deaths = pd.read_csv('./Ignore/covid_deaths_usafacts.csv')
# display.max_rows is set to None at the top of this notebook, so a bare `deaths`
# would render every row of the file; show a preview instead.
deaths.head()

In [None]:
# Replace every column from position 4 onward with its running left-to-right
# total (column 4 itself is unchanged; each later column becomes the sum of
# itself and all columns from position 4 up to it).
# skipna=False makes a missing value propagate to the columns after it, exactly
# as the chained `+` in the per-column loop did.
# NOTE: this cell rewrites `deaths` in place, so running it twice accumulates
# twice — reload the CSV before re-running.
running_cols = deaths.columns[4:]
deaths[running_cols] = deaths[running_cols].cumsum(axis=1, skipna=False)

deaths.head()

In [None]:
# Remove the standalone word "County" from each name, then trim edge whitespace.
county_label = deaths['County Name'].str.replace(r'\bCounty\b', '', regex=True)
deaths['County'] = county_label.str.strip()
deaths.head()