In [14]:
#imports
import pandas as pd

In [27]:
# Load the four raw CSV inputs, in this order: vaccinations, confirmed cases,
# deaths, county GDP. Every file is read with low_memory=False.
vaccinations, covid_cases, death_cases, gdp_by_county = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "COVID-19_Vaccinations.csv",
        "covid_confirmed_usafacts.csv",
        "covid_deaths_usafacts.csv",
        "GDP by County.csv",
    )
)


In [35]:
def clean_gdp_by_county(df):
    '''
    Reformat the raw county-GDP table and remove state-level rows.

    Steps, in order:
      1. Drop the first row of ``df`` unconditionally.
      2. Drop every row whose cells are all NaN.
      3. If a "County" column is still present, move it into the index so
         that step 4 compares county names rather than row numbers.
      4. Drop rows whose index label is one of the 50 US state names.

    Parameters
    ----------
    df : pandas.DataFrame
        GDP table with county names either in a "County" column or already
        in the index.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by county name; ``df`` itself is not modified.

    Notes
    -----
    * Not idempotent: step 1 removes one more row each time the function is
      applied to its own output, so call it exactly once per freshly loaded
      frame.
    * NOTE(review): step 4 matches on the label alone, so a county whose
      label equals a state name (e.g. a county labelled "Washington") is
      removed too -- confirm this is acceptable for the analysis.
    '''
    states = [
        "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
        "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois",
        "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland",
        "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana",
        "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York",
        "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
        "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah",
        "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"
    ]

    # Drop the leading row, then any row that carries no data at all.
    trimmed = df.iloc[1:].dropna(how='all')

    # Straight after read_csv the index is a plain RangeIndex, so the state
    # filter below would match nothing; index by county name first.
    if "County" in trimmed.columns:
        trimmed = trimmed.set_index("County")

    # Drop rows where the index (county) is actually a state name.
    return trimmed[~trimmed.index.isin(states)]

# Rebinds `gdp_by_county` to its cleaned form. clean_gdp_by_county drops the
# first row on every call, so re-running this cell without reloading the CSV
# removes one more row each time.
gdp_by_county = clean_gdp_by_county(gdp_by_county)


Unnamed: 0_level_0,GDP_20,GDP_21,GDP_22,GDP_23
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Baldwin,8109210,8706236,9145388,9435720
Barbour,732151,753330,744850,715994
Bibb,460844,460027,457625,466543
Blount,906582,1033615,1031061,987208
Bullock,269007,260950,252515,267926
...,...,...,...,...
Sweetwater,3404245,3339081,3242411,3414965
Teton,2688047,3541564,3543143,3695069
Uinta,781034,808678,836424,870000
Washakie,342040,346039,339019,347700


In [17]:
def clean_vax_df(df_vax):
    '''
    Build the cleaned vaccination table without modifying ``df_vax``.

    Parameters
    ----------
    df_vax : pandas.DataFrame
        Raw vaccination data. Must contain the columns "Date" (strings in
        %m/%d/%Y format), "Recip_County", "Recip_State",
        "Series_Complete_Yes", "Series_Complete_Pop_Pct" and
        "Booster_Doses_Vax_Pct".

    Returns
    -------
    pandas.DataFrame
        New frame indexed by (State, County, Year, Month, Day) holding the
        three vaccination columns. "County" is "Recip_County" with every
        " County" occurrence removed and surrounding whitespace stripped.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If a "Date" value does not match %m/%d/%Y.
    '''
    # Parse once into a standalone Series; writing it back into df_vax would
    # silently alter the caller's frame.
    dates = pd.to_datetime(df_vax["Date"], format="%m/%d/%Y")

    source_columns = [
        "Recip_State", "Recip_County", "Series_Complete_Yes",
        "Series_Complete_Pop_Pct", "Booster_Doses_Vax_Pct",
    ]

    cleaned = (
        df_vax[source_columns]
        # Rename columns to match the indexing convention of the other tables.
        .rename(columns={"Recip_County": "County", "Recip_State": "State"})
        .assign(
            # " County" is a literal substring, not a pattern.
            County=lambda frame: frame["County"]
            .str.replace(" County", "", regex=False)
            .str.strip(),
            Year=dates.dt.year,
            Month=dates.dt.month,
            Day=dates.dt.day,
        )
    )

    # Set the multi-index.
    return cleaned.set_index(["State", "County", "Year", "Month", "Day"])

# Rebinds `vaccinations` to the cleaned, multi-indexed frame. The cleaned frame
# no longer has a "Date" column, so reload the CSV before re-running this cell.
vaccinations = clean_vax_df(vaccinations)

In [18]:
def clean_cases_df(df_cases):
    '''
    Reshape the wide confirmed-cases table into a long, multi-indexed frame
    without modifying ``df_cases``.

    Parameters
    ----------
    df_cases : pandas.DataFrame
        Wide table with "County Name" and "State" columns, optional
        "countyFIPS" / "StateFIPS" columns, and one column per date whose
        header is a date string parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by (State, County, Year, Month, Day) with a single
        "Cases" column. Rows whose county is "Statewide Unallocated" are
        excluded.

    Raises
    ------
    KeyError
        If "County Name" or "State" is missing.
    '''
    index_levels = ["State", "County", "Year", "Month", "Day"]

    wide = (
        df_cases
        # FIPS codes are not used as keys; tolerate their absence, as
        # clean_deaths_df does.
        .drop(columns=["countyFIPS", "StateFIPS"], errors="ignore")
        .rename(columns={"County Name": "County"})
        # Standardize county names (remove " County", trim whitespace).
        .assign(
            County=lambda frame: frame["County"]
            .str.replace(" County", "", regex=False)
            .str.strip()
        )
    )

    # Filter before melting: same result as filtering afterwards, but the
    # unallocated rows are never multiplied by the number of date columns.
    wide = wide[wide["County"] != "Statewide Unallocated"]

    # Convert wide format (dates as columns) to long format.
    long = wide.melt(id_vars=["State", "County"], var_name="Date", value_name="Cases")

    # Split the date into Year / Month / Day and drop the original column.
    dates = pd.to_datetime(long["Date"])
    long = long.assign(
        Year=dates.dt.year,
        Month=dates.dt.month,
        Day=dates.dt.day,
    ).drop(columns=["Date"])

    return long.set_index(index_levels)
# Rebinds `covid_cases` to the cleaned, multi-indexed frame. The cleaned frame
# has no "County Name" column, so reload the CSV before re-running this cell.
covid_cases = clean_cases_df(covid_cases)

In [19]:
def clean_deaths_df(df_deaths):
    '''
    Reshape the wide deaths table into a long frame keyed by place and date.

    The FIPS columns are dropped when present, "County Name" becomes
    "County" (with " County" removed and whitespace stripped), rows for
    "Statewide Unallocated" are discarded, and the per-date columns are
    melted into a single "Deaths" column. Date headers must be in
    %Y-%m-%d format.

    Returns a new DataFrame indexed by (State, County, Year, Month, Day).
    '''
    key_levels = ["State", "County", "Year", "Month", "Day"]

    # Trim to the identifying columns and normalise the county names.
    by_county = (
        df_deaths
        .drop(columns=["countyFIPS", "StateFIPS"], errors="ignore")
        .rename(columns={"County Name": "County", "State": "State"})
        .assign(County=lambda frame: frame["County"].str.replace(" County", "").str.strip())
    )

    # Keep only rows attributed to an actual county.
    is_allocated = by_county["County"] != "Statewide Unallocated"

    # One row per (State, County, date) instead of one column per date.
    stacked = by_county[is_allocated].melt(
        id_vars=["State", "County"],
        var_name="Date",
        value_name="Deaths",
    )

    # Break the date into its calendar parts; the Date column itself is not kept.
    when = pd.to_datetime(stacked["Date"], format="%Y-%m-%d")
    stacked = stacked.assign(
        Year=when.dt.year,
        Month=when.dt.month,
        Day=when.dt.day,
    ).drop(columns=["Date"])

    return stacked.set_index(key_levels)
# Rebinds `death_cases` to the cleaned, multi-indexed frame. In the cleaned
# frame "County" is an index level rather than a column, so reload the CSV
# before re-running this cell.
death_cases = clean_deaths_df(death_cases)

In [20]:
def merge_datasets(cases=None, deaths=None, vax=None):
    '''
    Outer-merge the cases, deaths and vaccination tables on their shared keys.

    Parameters
    ----------
    cases, deaths, vax : pandas.DataFrame, optional
        Frames carrying State, County, Year, Month and Day as index levels
        (or columns). Any argument left as None falls back to the notebook
        globals ``covid_cases``, ``death_cases`` and ``vaccinations``
        respectively, so a bare ``merge_datasets()`` call works as before.

    Returns
    -------
    pandas.DataFrame
        One row per key found in any input; columns from a table that lacks
        that key are NaN. When the inputs carry the keys as index levels the
        result keeps them as index levels -- call ``.reset_index()`` on the
        result if flat columns are needed.
    '''
    keys = ["State", "County", "Year", "Month", "Day"]

    # Resolve defaults lazily so the globals are only touched when needed.
    cases = covid_cases if cases is None else cases
    deaths = death_cases if deaths is None else deaths
    vax = vaccinations if vax is None else vax

    # The previous bare `df_combined.reset_index()` discarded its result, so
    # it copied the whole frame without changing what was returned; removed.
    return (
        cases
        .merge(deaths, on=keys, how="outer")
        .merge(vax, on=keys, how="outer")
    )
# The merged frame is only displayed here, not bound to a name; assign the
# return value if later cells need to reuse it.
merge_datasets()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Cases,Deaths,Series_Complete_Yes,Series_Complete_Pop_Pct,Booster_Doses_Vax_Pct
State,County,Year,Month,Day,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AK,Aleutians East Borough,2020,1,22,0.0,0.0,,,
AK,Aleutians East Borough,2020,1,23,0.0,0.0,,,
AK,Aleutians East Borough,2020,1,24,0.0,0.0,,,
AK,Aleutians East Borough,2020,1,25,0.0,0.0,,,
AK,Aleutians East Borough,2020,1,26,0.0,0.0,,,
...,...,...,...,...,...,...,...,...,...
,Unknown,2022,7,6,,,515491.0,,34.0
,Unknown,2022,7,13,,,516527.0,,34.2
,Unknown,2022,7,20,,,517912.0,,34.5
,Unknown,2022,11,16,,,489303.0,,31.9
