In [64]:
import re
import pandas as pd
import numpy as np
import altair as alt
from vega_datasets import data
from pathlib import Path
from datetime import date
from ETLForElectionAndVaccinationData import *

# Switch Altair to its 'json' data transformer: chart data goes through
# intermediate JSON files instead of being embedded in every chart, which
# speeds things up for the large frames used in this notebook.
alt.data_transformers.enable('json')

## Color global variables
# One hex colour per segment name; TO_* and STAYED_* mirror the keys of
# segment_color_dict below.
TO_OTHER = "#556B2F"
TO_DEMOCRAT = "#11A3D6"
TO_REPUBLICAN = "#8C1616"
STAYED_DEMOCRAT = "#0015BC"
STAYED_REPUBLICAN = "#FF0000"
STAYED_OTHER = "#B4D3B2"

# Segment identifier -> hex colour.
segment_color_dict = dict(
    TO_OTHER=TO_OTHER,
    TO_DEMOCRAT=TO_DEMOCRAT,
    TO_REPUBLICAN=TO_REPUBLICAN,
    STAYED_DEMOCRAT=STAYED_DEMOCRAT,
    STAYED_REPUBLICAN=STAYED_REPUBLICAN,
    STAYED_OTHER=STAYED_OTHER,
)

# Hex colour -> human-readable label (same ordering as segment_color_dict).
color_segment_dict = dict([
    (TO_OTHER, "To other"),
    (TO_DEMOCRAT, "To Democrat"),
    (TO_REPUBLICAN, "To Republican"),
    (STAYED_DEMOCRAT, "Stayed Democrat"),
    (STAYED_REPUBLICAN, "Stayed Republican"),
    (STAYED_OTHER, "Stayed Other"),
])

# Folder holding the input data files, relative to the notebook's directory.
DataFolder = Path("../DataForPresidentialElectionsAndCovid/")


In [53]:
def getUnemploymentRate():
    """
        THIS FUNCTION reads the county level unemployment rates from the BLS workbook
        laucntycur14.xlsx (inside DataFolder) and derives county name, state and
        FIPS columns from it.

        Functions called: None

        Input: None
        Returns: Dataframe unemployment_df with the following set of columns.
                 Note: Granularity = COUNTYFP.

            unemployment_rate      (as read from the workbook)
            CTYNAME                (county name, e.g. "Autauga County")
            state                  (2-letter abbreviation; "District of Columbia" for DC)
            COUNTYFP               (string: state FIPS followed by the zero-padded 3-digit county FIPS)
            month                  (datetime parsed from the "Period" column, format "%b-%y")

        Raises: ValueError if an area label is neither "<county>, <ST>" nor
                "District of Columbia".
    """
    # Only the first 3141 data rows below the header row are read.
    # NOTE(review): the original comment implies this is how the all-NaN
    # Puerto Rico rows are left out - confirm against the workbook layout.
    unemployment_df = pd.read_excel(
        DataFolder / r"laucntycur14.xlsx",
        names=[
            "LAUS_code",
            "state_FIPS",
            "county_FIPS",
            "county_name_and_state_abbreviation",
            "Period",
            "labor_force",
            "employed",
            "unemployed",
            "unemployment_rate",
        ],
        header=5,
        nrows=3141,
    )

    # Split "<county>, <ST>" labels into county name and state abbreviation.
    area_labels = unemployment_df["county_name_and_state_abbreviation"]
    extract_names_regex = r"^(?P<CTYNAME>.*),\s(?P<state>[A-Z]{2})$"
    name_parts = area_labels.str.extract(extract_names_regex)

    # "District of Columbia" has no ", XX" suffix; it is used verbatim for both
    # the county name and the state, exactly as before.
    is_dc = area_labels == "District of Columbia"
    unparsed = name_parts["CTYNAME"].isna() & ~is_dc
    if unparsed.any():
        # Fail with the offending labels instead of an opaque AttributeError.
        raise ValueError(
            "Unrecognised area label(s): {}".format(area_labels[unparsed].unique().tolist()[:5])
        )
    unemployment_df["CTYNAME"] = name_parts["CTYNAME"].mask(is_dc, "District of Columbia")
    unemployment_df["state"] = name_parts["state"].mask(is_dc, "District of Columbia")

    # Format the county FIPS as the state FIPS followed by the county FIPS.
    # NOTE(review): the result is a string (e.g. "1001"), whereas the COVID frame
    # shown in this notebook reports COUNTYFP as int64 - cast before merging.
    unemployment_df["COUNTYFP"] = (
        unemployment_df["state_FIPS"].astype(str)
        + unemployment_df["county_FIPS"].map("{:03d}".format)
    )

    # Convert period to datetime
    unemployment_df["month"] = pd.to_datetime(unemployment_df["Period"], format="%b-%y")

    # The monthly COVID aggregate that used to be computed here was never
    # returned or merged, so it has been removed.
    return unemployment_df[["unemployment_rate", "CTYNAME", "state", "COUNTYFP", "month"]]

In [54]:
# Build the unemployment table with getUnemploymentRate() and display it.
unemployment_df = getUnemploymentRate()
unemployment_df

Unnamed: 0,unemployment_rate,CTYNAME,state,COUNTYFP,datetime
0,10.9,Autauga County,AL,1001,2020-04-01
1,14.5,Baldwin County,AL,1003,2020-04-01
2,9.3,Barbour County,AL,1005,2020-04-01
3,17.1,Bibb County,AL,1007,2020-04-01
4,9.1,Blount County,AL,1009,2020-04-01
...,...,...,...,...,...
3136,6.3,Sweetwater County,WY,56037,2020-04-01
3137,12.5,Teton County,WY,56039,2020-04-01
3138,5.5,Uinta County,WY,56041,2020-04-01
3139,4.3,Washakie County,WY,56043,2020-04-01


In [70]:
# Load the daily per-county COVID series and display it to inspect its raw
# shape before aggregating. getCasesRollingAveragePer100K is not defined in
# this notebook - presumably it comes from the star-imported ETL module; verify.
covid_df = getCasesRollingAveragePer100K()
covid_df

Unnamed: 0,date,cases_avg_per_100k,deaths_avg_per_100k,COUNTYFP
10833,2020-03-24,0.26,0.0,1001
12367,2020-03-25,1.02,0.0,1001
14029,2020-03-26,1.53,0.0,1001
15812,2020-03-27,1.53,0.0,1001
17699,2020-03-28,1.53,0.0,1001
...,...,...,...,...
1446877,2021-06-22,0.00,0.0,56045
1450123,2021-06-23,0.00,0.0,56045
1453369,2021-06-24,0.00,0.0,56045
1456614,2021-06-25,0.00,0.0,56045


In [72]:
%%timeit
# Benchmark, variant 1: aggregate per (year, month, county) by grouping on the
# calendar year and month numbers, then rebuild a first-of-month date row by row.
# NOTE: names assigned inside a %%timeit cell are not exported to the notebook
# namespace, so this cell only yields the timing printed below it.
covid_df = getCasesRollingAveragePer100K()
covid_df["datetime"] = pd.to_datetime(covid_df["date"], format="%Y-%m-%d")
# Keep FIPS codes below 57000, i.e. state FIPS 01-56 (the 50 states plus DC).
# NOTE(review): this keeps Alaska and Hawaii, so what is dropped is the
# territories rather than "non mainland" states.
covid_df = covid_df[covid_df["COUNTYFP"] < 57000]
# group cases and averages by year, month and county (mean within each group)
covid_df = covid_df.groupby([covid_df["datetime"].dt.year, covid_df["datetime"].dt.month, covid_df["COUNTYFP"]]).mean()
covid_df.index.set_names(["year", "month", "COUNTYFP"], inplace=True)
covid_df.reset_index(inplace=True)
# Rebuild a first-of-month datetime.date for every row (Python-level apply).
recreate_datetime = lambda x : date(int(x["year"]), int(x["month"]),1)
covid_df["datetime"] = covid_df.apply(recreate_datetime, axis=1)
covid_df

3.53 s ± 73.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [93]:
%%timeit
covid_df = getCasesRollingAveragePer100K()
# Remove non mainland US states
covid_df = covid_df[covid_df["COUNTYFP"] < 57000]
#Change period to month and average cases per 100K per month and county
covid_df["year_month"] = covid_df["date"].dt.to_period('M')
covid_df.drop(columns=["date"], inplace=True)
covid_df = covid_df.groupby(["year_month", "COUNTYFP"]).sum()
covid_df.reset_index(inplace=True)
covid_df["datetime"] = covid_df["year_month"].apply(lambda x: x.to_timestamp(freq="D", how="start"))
covid_df

2.89 s ± 52.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [94]:
# Check the column types of the monthly aggregate: year_month should be
# period[M] and datetime should be datetime64[ns].
# NOTE(review): this needs covid_df to hold the aggregated frame in the kernel
# namespace - confirm on Restart & Run All.
covid_df.dtypes

year_month                  period[M]
COUNTYFP                        int64
cases_avg_per_100k            float64
deaths_avg_per_100k           float64
datetime               datetime64[ns]
dtype: object

In [None]:
# "<county>, <ST>": everything before the final ", XX" is the county name and
# XX is a two-capital-letter state abbreviation.
extract_names_regex = r"^(?P<CTYNAME>.*),\s(?P<state>[A-Z]{2})$"


def extract_county_names(x):
    """Return the county part of a "<county>, <ST>" area label.

    "District of Columbia" has no state suffix and is returned unchanged,
    matching the handling inside getUnemploymentRate.

    Raises:
        ValueError: if the label is in neither of the two accepted forms.
    """
    if x == "District of Columbia":
        return x
    match = re.search(extract_names_regex, x)
    if match is None:
        # Name the offending label instead of failing on None.group(...).
        raise ValueError("Unrecognised area label: {!r}".format(x))
    return match.group("CTYNAME")

In [54]:
# Smoke-check the extractor on a synthetic "<name>, <ST>" label; the expected
# result is 'fdsafsda'.
extract_county_names("fdsafsda, AA")

'fdsafsda'