### Imports

In [2]:
import pandas as pd
import numpy as np

# State Level Data

## Pre-Existing Health Conditions Datasets:

#### For pre-existing conditions & immunizations:
1. choose crude or adj
2. drop old index
3. rename data value to question
4. drop year and question columns
5. For datasets that report total incident counts: after concatenating with population data, convert the counts to rates

*Crude values are the raw numbers. Adjusted (adj) values are those numbers standardized to the population's age distribution. We can retain the crude numbers; however, the age-adjusted numbers will probably be better for cross-comparison.*

In [None]:
# Load the asthma table and rename its value/location columns so the frame
# can later be joined with the other condition tables on 'Location'.
asthma = pd.read_csv('Data/Raw/asthma_adj.csv').rename(
    columns={'DataValue': 'asthma_prevalence', 'LocationDesc': 'Location'}
)

In [None]:
# Remove the year and question-text columns, then preview the result.
asthma = asthma.drop(columns=['YearStart', 'Question'])
asthma.head()

In [None]:
# Load the high-blood-pressure table ("adj" file, per its name) and preview
# the raw columns before they are renamed.
high_bp = pd.read_csv('Data/Raw/high_blood_pressure_adj.csv')
high_bp.head()

In [None]:
# Rename the value/location columns, drop the year and question-text
# columns, and preview the cleaned frame.
high_bp = (
    high_bp
    .rename(columns={'DataValue': 'high_bp_prevalence', 'LocationDesc': 'Location'})
    .drop(columns=['YearStart', 'Question'])
)
high_bp.head()

In [None]:
# Load the cardiac-mortality table, rename the value/location columns, drop
# the year and question-text columns, and preview the cleaned frame.
cardiac_mortality = (
    pd.read_csv('Data/Raw/cardiac_mortality_adj.csv')
    .rename(columns={'DataValue': 'cardiac_mortality_rate', 'LocationDesc': 'Location'})
    .drop(columns=['YearStart', 'Question'])
)
cardiac_mortality.head()

In [None]:
# Load the diabetes table and rename its value/location columns.
# The previous mid-cell `diabetes.head()` was removed: under the default
# notebook settings only the last expression of a cell is displayed, so that
# call produced no output. The cleaned frame is previewed in the next cell.
diabetes = pd.read_csv('Data/Raw/diabetes_adj.csv').rename(
    columns={'DataValue': 'diabetes_prevalence', 'LocationDesc': 'Location'}
)

In [None]:
# Remove the year and question-text columns, then preview the result.
diabetes = diabetes.drop(columns=['YearStart', 'Question'])
diabetes.head()

In [None]:
# Load the kidney-disease table ("adj" file, per its name) and preview the
# raw columns before they are renamed.
kidney = pd.read_csv('Data/Raw/kidney_adj.csv')
kidney.head()

In [None]:
# Give the value and location columns the names used by the merge on 'Location'.
kidney = kidney.rename(
    columns={'DataValue': 'kidney_disease_prevalence', 'LocationDesc': 'Location'}
)

In [None]:
# Remove the year and question-text columns, then preview the result.
kidney = kidney.drop(columns=['YearStart', 'Question'])
kidney.head()

In [None]:
# Load the COPD table, rename the value/location columns, drop the year and
# question-text columns, and preview the cleaned frame.
copd = (
    pd.read_csv('Data/Raw/copd_adj.csv')
    .rename(columns={'DataValue': 'copd_prevalence', 'LocationDesc': 'Location'})
    .drop(columns=['YearStart', 'Question'])
)
copd.head()

In [None]:
# Load the immunization table, rename the value/location columns, drop the
# year and question-text columns, and preview the cleaned frame.
immun = (
    pd.read_csv('Data/Raw/immun_adj.csv')
    .rename(columns={'DataValue': 'flu_vaccination_rate_2019', 'LocationDesc': 'Location'})
    .drop(columns=['YearStart', 'Question'])
)
immun.head()

In [None]:
# Combine every condition table into one wide frame keyed by 'Location'.
# Inner joins mean a location is kept only if it appears in all tables.
pre_con = immun.merge(asthma, on='Location', how='inner')

dataframes_to_merge = [cardiac_mortality, high_bp, copd, kidney, diabetes]

for condition_table in dataframes_to_merge:
    pre_con = pre_con.merge(condition_table, on='Location', how='inner')

In [None]:
# Preview the merged pre-existing-conditions frame.
pre_con.head()

In [None]:
# Persist the merged frame. index=True also writes the DataFrame index as an
# extra leading column in the CSV.
# NOTE(review): 'condtions' in the file name looks like a misspelling of
# 'conditions'; left unchanged because other code may read this exact path.
pre_con.to_csv('Data/cleaned_pre_condtions.csv', index = True)

## Insurance rates by state

In [None]:
# Load the 2019 insurance table and preview its columns.
insur_2019 = pd.read_csv('Data/Raw/2019_insurance.csv')
insur_2019.head()

In [None]:
# Load the 2021 insurance table; it is joined with the 2019 table on 'Location'.
insur_2021 = pd.read_csv('Data/Raw/2021_insurance.csv')

In [None]:
# Join the two insurance years side by side on 'Location'; columns that
# exist in both inputs are disambiguated with a _2019 / _2021 suffix.
insur = insur_2019.merge(
    insur_2021,
    on='Location',
    how='inner',
    suffixes=('_2019', '_2021'),
)
insur.head()

In [None]:
# The suffixed year columns are not needed once the two years sit side by side.
insur = insur.drop(columns=['Year_2019', 'Year_2021'])

In [None]:
# Persist the combined insurance table without writing the index column.
insur.to_csv('Data/cleaned_insur.csv', index = False)

### Covid Deaths by State

In [None]:
deaths_covid = pd.read_csv('

In [None]:
# Reshape the long deaths table to one row per state, with one column per
# (measure, 'Location' value) pair.
# NOTE(review): the pivoted 'Location' column presumably holds the year, since
# the renames below expect suffixes 2020-2022 — confirm against the raw file.
deaths_wide = deaths_covid.pivot(index='State', columns='Location',
                                 values=['All causes', 'covid'])

# Flatten the two-level column index into '<measure>_<value>' strings.
deaths_wide.columns = ['_'.join(str(level) for level in column_key)
                       for column_key in deaths_wide.columns]

# Bring 'State' back as a column named 'Location' and normalise the
# 'All causes_*' names. The 'covid_*' names are already in their final form,
# so the former identity renames for them were dropped.
deaths_covid = deaths_wide.reset_index().rename(columns={
    'State': 'Location',
    'All causes_2020': 'all_causes_2020',
    'All causes_2021': 'all_causes_2021',
    'All causes_2022': 'all_causes_2022',
})
deaths_covid.head()

# Merging the Cleaned State Data into 1 df

In [None]:
# NOTE(review): `merged_data` is not assigned in any cell visible in this part
# of the notebook — confirm the cell that builds it exists and runs before
# this one on a fresh kernel.
# Drop the leftover 'Unnamed: 0' column and preview the result.
merged_data = merged_data.drop(columns=['Unnamed: 0'])
merged_data.head()

# County level Data

### Merging Data

In [6]:
# Load the county-level table and preview it.
# NOTE(review): this path starts with '../Data/', while the state-level cells
# read from 'Data/...'; the two cannot both resolve from the same working
# directory — confirm which one is intended.
df = pd.read_csv('../Data/Cleaned/county_df2.csv')
df.head()

Unnamed: 0,FIPS_x,County,Years of Potential Life Lost Rate (premature death),YPLL Rate (Black),YPLL Rate (Hispanic),YPLL Rate (White),% Fair/Poor Health,percent_smokers,percent_obese,Food Environment Index,...,cases_2020,cases_2021,cases_2022,deaths_2020,deaths_2021,deaths_2022,Masks,FIPS_y,Administered_Dose1_Pop_Pct,Administered_Dose1_Recip_65PlusPop_Pct
0,1001,Autauga,8824.0,10471.0,,8707.0,18,19,38,7.2,...,4190.0,11018.0,18961.0,48.0,160.0,230.0,267.0,1001,42.2,73.8
1,1003,Baldwin,7225.0,10042.0,3087.0,7278.0,18,17,31,8.0,...,13601.0,39911.0,67496.0,161.0,593.0,719.0,267.0,1003,53.2,89.9
2,1005,Barbour,9586.0,11333.0,,7310.0,26,22,44,5.6,...,1514.0,3860.0,7027.0,32.0,81.0,103.0,267.0,1005,44.5,75.3
3,1007,Bibb,11784.0,14813.0,,11328.0,20,20,38,7.6,...,1834.0,4533.0,7692.0,46.0,95.0,108.0,267.0,1007,36.6,64.2
4,1009,Blount,10908.0,,5620.0,11336.0,21,20,34,8.5,...,4641.0,11256.0,17731.0,63.0,198.0,260.0,267.0,1009,31.9,56.6


In [7]:
# Drop the columns (not rows) that the analysis does not use.
# NOTE(review): the earlier comment also mentioned 'Segregation index black/white',
# which is not in this list — confirm whether it should be dropped as well.
columns_to_drop = [
    'County',
    'YPLL Rate (Black)', 'YPLL Rate (Hispanic)', 'YPLL Rate (White)',
    'Number Uninsured',
    'Number Primary Care Physicians',
    'FIPS_y',
    'Number pre-mature Deaths',
    'Preventable Hosp. Rate (Black)', 'Preventable Hosp. Rate (Hispanic)', 'Preventable Hosp. Rate (White)',
    'Percent Vaccinated Flu (Black)',
    'Percent Uninsured',
    'Percent  Vaccinated (Hispanic) Flu', 'Percent Vaccinated (White) Flu',
    'Number Some College',
    'Number Unemployed',
    'Labor Force',
    'PCP Ratio',
    '80th Percentile Income', '20th Percentile Income',
    '95% CI - Low', '95% CI - High',
    'Life Expectancy (Black)', 'Life Expectancy (Hispanic)', 'Life Expectancy (White)',
    'Number HIV Cases',
    'Household income (Black)', 'Household income (Hispanic)', 'Household income (White)',
]
df = df.drop(columns=columns_to_drop)

In [8]:
# Final county-level shaping, done as one chain:
#   1. index the frame by FIPS_x
#   2. encode 'Presence of water violation' as 0/1 in a new 'water' column
#      (any value other than 'No'/'Yes' maps to NaN)
#   3. copy the 2022 case/death columns into 'cases' and 'deaths'
#   4. drop the source columns that are no longer needed, plus 'State'
#   5. drop every row that still has a missing value
df = (
    df.set_index('FIPS_x')
    .assign(
        water=lambda frame: frame['Presence of water violation'].map({'No': 0, 'Yes': 1}),
        cases=lambda frame: frame['cases_2022'],
        deaths=lambda frame: frame['deaths_2022'],
    )
    .drop(columns=['Presence of water violation', 'State'])
    .drop(columns=['cases_2020', 'cases_2021', 'cases_2022',
                   'deaths_2020', 'deaths_2021', 'deaths_2022'])
    .dropna()
)
df.shape

(1828, 47)

In [9]:
# Persist the cleaned county table; with the default index=True the frame's
# index is written as the first CSV column.
df.to_csv('../Data/Cleaned/county_df3.csv')