In [1]:
import re
import pandas as pd
import numpy as np
import altair as alt
from pathlib import Path
from datetime import date
from ETLForElectionAndVaccinationData import *

# uses intermediate json files to speed things up
#alt.data_transformers.enable('json')

## Colour palette (hex codes) for the party-transition segments
TO_OTHER = "#556B2F"
TO_DEMOCRAT = "#11A3D6"
TO_REPUBLICAN = "#8C1616"
STAYED_DEMOCRAT = "#0015BC"
STAYED_REPUBLICAN = "#FF0000"
STAYED_OTHER = "#B4D3B2"

# Segment identifier -> hex colour
segment_color_dict = dict(
    TO_OTHER=TO_OTHER,
    TO_DEMOCRAT=TO_DEMOCRAT,
    TO_REPUBLICAN=TO_REPUBLICAN,
    STAYED_DEMOCRAT=STAYED_DEMOCRAT,
    STAYED_REPUBLICAN=STAYED_REPUBLICAN,
    STAYED_OTHER=STAYED_OTHER,
)

# Hex colour -> human-readable segment label
color_segment_dict = dict([
    (TO_OTHER, "To other"),
    (TO_DEMOCRAT, "To Democrat"),
    (TO_REPUBLICAN, "To Republican"),
    (STAYED_DEMOCRAT, "Stayed Democrat"),
    (STAYED_REPUBLICAN, "Stayed Republican"),
    (STAYED_OTHER, "Stayed Other"),
])

# Root folder holding every input dataset read by this notebook
DataFolder = Path("../DataForPresidentialElectionsAndCovid/")


In [141]:
def getUnemploymentRate(level="county"):
    """
        THIS FUNCTION reads the county level unemployment rate published by the BLS
        (laucntycur14.xlsx) and merges it, month by month, with the Covid case and
        death rates, the county mask-use survey and the 2020 election winner.

        Functions called: getCasesRollingAveragePer100K()
                          getElectionData()                 (county level)
                          getStateLevelElectionData2020()   (state level)

        Input: level -- "state" aggregates the counties of each state per month;
                        any other value (default "county") keeps one row per
                        county and month.
        Returns: Dataframe unemployment_covid_df.

            County level (Granularity = COUNTYFP x month):
                unemployment_rate
                COUNTYFP               (state FIPS followed by 3-digit county FIPS)
                CTYNAME                (full name)
                state                  (2-letter abbreviation)
                month                  (first day of the month)
                cases_avg_per_100k     (sum of the daily values over the month)
                deaths_avg_per_100k    (sum of the daily values over the month)
                columns of mask-use-by-county.csv other than COUNTYFP
                party                  (party_winner_2020)

            State level (Granularity = state x month):
                month
                state                  (2-letter abbreviation)
                unemployment_rate      (mean over the state's counties)
                cases_avg_per_100k     (sum over the state's counties)
                deaths_avg_per_100k    (sum over the state's counties)
                party                  (party_simplified)
              Note: the mask-use columns are not carried into the state level.

        Rows without a match in the Covid, mask or election data are kept
        (left merges) with missing values in the corresponding columns.
    """

    #
    # Prepare unemployment Data
    #
    unemployment_df = pd.read_excel(DataFolder / r"laucntycur14.xlsx",
                                    names=["LAUS_code","state_FIPS","county_FIPS","county_name_and_state_abbreviation","Period","labor_force","employed","unemployed","unemployment_rate"],
                                    header=5,
                                    skipfooter=3)

    # County FIPS = state FIPS followed by the zero-padded 3-digit county FIPS
    unemployment_df["COUNTYFP"] = (
        unemployment_df["state_FIPS"].astype(int) * 1000
        + unemployment_df["county_FIPS"].astype(int)
    )
    # Drop every area whose FIPS is 57000 or above (codes beyond Wyoming, 56xxx).
    # Copy so that the column assignments below do not act on a slice.
    unemployment_df = unemployment_df[unemployment_df["COUNTYFP"] < 57000].copy()

    # Split "<county name>, <ST>" into its two parts
    extract_names_regex = r"^(?P<CTYNAME>.*),\s(?P<state>[A-Z]{2})$"

    def _split_area_name(area_name):
        """Return (county name, 2-letter state code) for one BLS area label."""
        if area_name == "District of Columbia":
            # The only label without a ", ST" suffix. Use the postal code so the
            # state column stays a 2-letter code and can be merged on state_po.
            return "District of Columbia", "DC"
        match = re.search(extract_names_regex, area_name)
        return match.group("CTYNAME"), match.group("state")

    parsed_names = [_split_area_name(label)
                    for label in unemployment_df["county_name_and_state_abbreviation"]]
    unemployment_df["CTYNAME"] = [county for county, _ in parsed_names]
    unemployment_df["state"] = [state for _, state in parsed_names]

    # The preliminary (latest) month is suffixed with " p": strip it before parsing
    period = unemployment_df["Period"]
    period = period.where(~period.str.endswith(" p"), period.str[:-2])
    unemployment_df["month"] = pd.to_datetime(period, format="%b-%y")
    unemployment_df["unemployment_rate"] = unemployment_df["unemployment_rate"].astype("float64")
    unemployment_df = unemployment_df.drop(columns=["state_FIPS", "county_FIPS", "LAUS_code","county_name_and_state_abbreviation","Period","labor_force","employed","unemployed"])

    #
    # Prepare and merge Covid case and death rates data
    #
    covid_df = getCasesRollingAveragePer100K()
    # Same FIPS filter as for the unemployment data
    covid_df = covid_df[covid_df["COUNTYFP"] < 57000].copy()
    # Map every date to the first day of its month, then total per month and county
    covid_df["month"] = covid_df["date"].dt.to_period("M").dt.to_timestamp()
    covid_df = (covid_df.drop(columns=["date"])
                        .groupby(["month", "COUNTYFP"])
                        .sum()
                        .reset_index())

    unemployment_covid_df = pd.merge(unemployment_df, covid_df, how="left", on=["month", "COUNTYFP"])

    #
    # Merge the county mask-use survey
    #
    # The frame is loaded here rather than taken from (and written back by) a
    # global that this notebook never defines.
    county_mask_df = pd.read_csv(DataFolder / "Dataset 7 Covid" / "mask-use-by-county.csv")
    # Ignore a stray index column left behind if the file was ever re-saved with its index
    county_mask_df = county_mask_df.loc[:, ~county_mask_df.columns.str.startswith("Unnamed")]
    unemployment_covid_df = pd.merge(
        unemployment_covid_df, county_mask_df, how="left", on=["COUNTYFP"])

    #
    # Merge election data at the state or county level
    #
    if level == "state":
        unemployment_covid_df = (
            unemployment_covid_df.groupby(["month", "state"])
            .agg({
                "unemployment_rate": "mean",
                "cases_avg_per_100k": "sum",
                "deaths_avg_per_100k": "sum",
            })
            .reset_index()
        )
        election_df = getStateLevelElectionData2020()[["state_po", "party_simplified"]].rename(
            columns={"state_po": "state", "party_simplified": "party"})
        merge_key = "state"
    else:
        election_df = getElectionData()[["COUNTYFP", "party_winner_2020"]].rename(
            columns={"party_winner_2020": "party"})
        merge_key = "COUNTYFP"

    unemployment_covid_df = pd.merge(unemployment_covid_df, election_df, how="left", on=merge_key)

    return unemployment_covid_df

In [142]:
# County-level table restricted to fully populated rows.
# dropna keeps the original row labels, so the index may contain gaps.
unemployment_covid_df = getUnemploymentRate("county").dropna()
unemployment_covid_df

Unnamed: 0,unemployment_rate,COUNTYFP,CTYNAME,state,month,cases_avg_per_100k,deaths_avg_per_100k,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS,party
0,10.9,1001,Autauga County,AL,2020-04-01,64.41,7.56,0.053,0.074,0.134,0.295,0.444,REPUBLICAN
1,14.5,1003,Baldwin County,AL,2020-04-01,68.61,1.35,0.083,0.059,0.098,0.323,0.436,REPUBLICAN
2,9.3,1005,Barbour County,AL,2020-04-01,142.38,1.16,0.067,0.121,0.120,0.201,0.491,REPUBLICAN
3,17.1,1007,Bibb County,AL,2020-04-01,200.16,0.00,0.020,0.034,0.096,0.278,0.572,REPUBLICAN
4,9.1,1009,Blount County,AL,2020-04-01,51.42,0.00,0.053,0.114,0.180,0.194,0.459,REPUBLICAN
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43969,6.7,56037,Sweetwater County,WY,2021-05-01,686.55,4.76,0.061,0.295,0.230,0.146,0.268,REPUBLICAN
43970,5.9,56039,Teton County,WY,2021-05-01,277.79,0.00,0.095,0.157,0.160,0.247,0.340,DEMOCRAT
43971,6.1,56041,Uinta County,WY,2021-05-01,498.66,4.97,0.098,0.278,0.154,0.207,0.264,REPUBLICAN
43972,4.7,56043,Washakie County,WY,2021-05-01,318.38,0.00,0.204,0.155,0.069,0.285,0.287,REPUBLICAN


In [17]:
# Percentage of missing values per column
unemployment_covid_df.isnull().mean() * 100

unemployment_rate      0.0
COUNTYFP               0.0
CTYNAME                0.0
state                  0.0
month                  0.0
cases_avg_per_100k     0.0
deaths_avg_per_100k    0.0
party                  0.0
dtype: float64

In [18]:
# Rows for which no election winner could be matched
unemployment_covid_df.loc[lambda frame: frame["party"].isnull()].reset_index()

Unnamed: 0,index,unemployment_rate,COUNTYFP,CTYNAME,state,month,cases_avg_per_100k,deaths_avg_per_100k,party


In [19]:
# Alaska rows of the first month in the table that have no election winner.
# .iloc[0] takes the first row by position: the frame keeps its original row
# labels after dropna, so the label 0 is not guaranteed to exist.
unemployment_covid_df[
    unemployment_covid_df["party"].isnull()
    & (unemployment_covid_df["state"] == "AK")
    & (unemployment_covid_df["month"] == unemployment_covid_df["month"].iloc[0])
].reset_index()

Unnamed: 0,index,unemployment_rate,COUNTYFP,CTYNAME,state,month,cases_avg_per_100k,deaths_avg_per_100k,party


In [20]:
# Spot-check one county of the election data by its FIPS code
election_df = getElectionData()
election_df.query("COUNTYFP == 51800")

Unnamed: 0,state,state_po,CTYNAME,COUNTYFP,party_winner_2020,totalvotes_2020,fractionalvotes_2020,party_winner_2016,totalvotes_2016,fractionalvotes_2016
2950,VIRGINIA,VA,SUFFOLK CITY,51800,DEMOCRAT,49642.0,0.577656,DEMOCRAT,43240.0,0.53839


In [21]:
# Display the county-level election table loaded by getElectionData()
election_df

Unnamed: 0,state,state_po,CTYNAME,COUNTYFP,party_winner_2020,totalvotes_2020,fractionalvotes_2020,party_winner_2016,totalvotes_2016,fractionalvotes_2016
0,ALABAMA,AL,AUTAUGA,1001,REPUBLICAN,27770.0,0.714368,REPUBLICAN,24973.0,0.727666
1,ALABAMA,AL,BALDWIN,1003,REPUBLICAN,109679.0,0.761714,REPUBLICAN,95215.0,0.765457
2,ALABAMA,AL,BARBOUR,1005,REPUBLICAN,10518.0,0.534512,REPUBLICAN,10469.0,0.520967
3,ALABAMA,AL,BIBB,1007,REPUBLICAN,9595.0,0.784263,REPUBLICAN,8819.0,0.764032
4,ALABAMA,AL,BLOUNT,1009,REPUBLICAN,27588.0,0.895716,REPUBLICAN,25588.0,0.893348
...,...,...,...,...,...,...,...,...,...,...
3148,WYOMING,WY,SWEETWATER,56037,REPUBLICAN,16698.0,0.732363,REPUBLICAN,17130.0,0.709515
3149,WYOMING,WY,TETON,56039,DEMOCRAT,14787.0,0.665990,DEMOCRAT,12627.0,0.579235
3150,WYOMING,WY,UINTA,56041,REPUBLICAN,9459.0,0.792473,REPUBLICAN,8470.0,0.726564
3151,WYOMING,WY,WASHAKIE,56043,REPUBLICAN,4032.0,0.804812,REPUBLICAN,3814.0,0.763241


# Plot the data

In [146]:
# Altair raises an error for datasets above 5k rows; uncomment to lift the limit
#alt.data_transformers.disable_max_rows()

# Colour scale: one fixed colour per winning party
party_domain = ["DEMOCRAT", "REPUBLICAN"]
party_range = ["blue", "red"]

# Axis domains start at 0 and end at the first multiple of the step
# (1000 for the Covid rates, 10 for the unemployment rate) above the maximum.
covid_cases_domain, covid_deaths_domain = (
    [0, int(unemployment_covid_df[rate_column].max() / 1000 + 1) * 1000]
    for rate_column in ("cases_avg_per_100k", "deaths_avg_per_100k")
)
unemployment_domain = [
    0,
    int(unemployment_covid_df["unemployment_rate"].max() / 10 + 1) * 10,
]

def timestamp(t):
    """Convert a date-like value to epoch milliseconds.

    `t` is parsed with pd.to_datetime; the resulting POSIX timestamp
    (seconds) is scaled by 1000 and returned as a float.
    """
    moment = pd.to_datetime(t)
    epoch_seconds = moment.timestamp()
    return epoch_seconds * 1000

In [144]:
# Slider bounds in epoch milliseconds, taken from the earliest and latest month
# present in the data. min()/max() are used instead of looking rows up by the
# labels 0 and len-1: after dropna the index has gaps, so those labels may be
# missing or may not point at the first/last row.
min_timestamp = timestamp(unemployment_covid_df["month"].min())
max_timestamp = timestamp(unemployment_covid_df["month"].max())
# Create the slider for the month
month_slider = alt.binding_range(name="Month: ", min=min_timestamp, max=max_timestamp)
month_selector = alt.selection_single(name="month_selector",
                                    fields=["month"],
                                    bind=month_slider,
                                    init={"month": timestamp("2020-04-01")})
# Create the selector for the column
columns=["cases_avg_per_100k", "deaths_avg_per_100k"]
x_axis_select_box = alt.binding_select(options=columns, name="x_axis")
x_axis_selector = alt.selection_single(name="x axis", fields=["x_axis"], bind=x_axis_select_box, init={"x_axis": "cases_avg_per_100k"})

## Average Covid Cases per 100k

In [147]:
# Scatter plot of unemployment rate against monthly Covid cases, one point per
# county, coloured by the 2020 winning party and filtered to the month chosen
# with the slider.
# No fold transform is applied: none of its output fields were encoded, so it
# only duplicated every row and drew each point twice.
unemployment_vs_covid_plot = alt.Chart(
    unemployment_covid_df, width=800, height=400
).mark_point(filled=True, size=30).encode(
    y=alt.Y(
        "unemployment_rate:Q",
        title="Unemployment Rate",
        scale=alt.Scale(domain=unemployment_domain)),
    x=alt.X(
        "cases_avg_per_100k:Q",
        title="Monthly Total of Daily Average Covid Cases per 100k",
        scale=alt.Scale(domain=covid_cases_domain)
    ),
    color=alt.Color("party:N", scale=alt.Scale(domain=party_domain, range=party_range),
                   title="Party", legend=None)
).add_selection(
    month_selector
).transform_filter(
    # Keep only the rows whose year and month match the slider value
    "(year(datum.month) == year(month_selector.month[0])) && "
    "(month(datum.month) == month(month_selector.month[0]))"
)

unemployment_vs_covid_plot

## Average Covid Deaths per 100k

In [140]:
# Scatter plot of unemployment rate against monthly Covid deaths, one point per
# county, coloured by the 2020 winning party and filtered to the month chosen
# with the slider.
# No fold transform is applied: none of its output fields were encoded, so it
# only duplicated every row and drew each point twice.
unemployment_vs_covid_plot = alt.Chart(
    unemployment_covid_df, width=800, height=400
).mark_point(filled=True, size=30).encode(
    y=alt.Y(
        "unemployment_rate:Q",
        title="Unemployment Rate",
        scale=alt.Scale(domain=unemployment_domain)),
    x=alt.X(
        "deaths_avg_per_100k:Q",
        # The axis shows deaths, so the title says deaths (it used to say cases)
        title="Monthly Total of Daily Average Covid Deaths per 100k",
        scale=alt.Scale(domain=covid_deaths_domain)
    ),
    color=alt.Color("party:N", scale=alt.Scale(domain=party_domain, range=party_range),
                   title="Party", legend=None)
).add_selection(
    month_selector
).transform_filter(
    # Keep only the rows whose year and month match the slider value
    "(year(datum.month) == year(month_selector.month[0])) && "
    "(month(datum.month) == month(month_selector.month[0]))"
)

unemployment_vs_covid_plot

## Mask

In [133]:
# X-axis domain for the mask chart: from 0 up to the first tenth above the
# largest value. NOTE: despite the variable name, it is derived from the
# ALWAYS column.
always_share_max = unemployment_covid_df["ALWAYS"].max()
never_wear_mask_domain = [0, int(always_share_max * 10 + 1) / 10]

In [134]:
# Scatter plot of unemployment rate against the ALWAYS mask-use column, one
# point per county, coloured by the 2020 winning party and filtered to the
# month chosen with the slider.
# No fold transform is applied: none of its output fields were encoded, so it
# only duplicated every row and drew each point twice.
unemployment_vs_mask_plot = alt.Chart(
    unemployment_covid_df, width=800, height=400
).mark_point(filled=True, size=30).encode(
    y=alt.Y(
        "unemployment_rate:Q",
        title="Unemployment Rate",
        scale=alt.Scale(domain=unemployment_domain)),
    x=alt.X(
        "ALWAYS:Q",
        # The plotted field is ALWAYS, so the title names it (it used to say "Never")
        title="Always wear a mask",
        scale=alt.Scale(domain=never_wear_mask_domain)
    ),
    color=alt.Color("party:N", scale=alt.Scale(domain=party_domain, range=party_range),
                   title="Party", legend=None)
).add_selection(
    month_selector
).transform_filter(
    # Keep only the rows whose year and month match the slider value
    "(year(datum.month) == year(month_selector.month[0])) && "
    "(month(datum.month) == month(month_selector.month[0]))"
)

unemployment_vs_mask_plot