In [247]:
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
%config InlineBackend.print_figure_kwargs = {'bbox_inches':None}
import seaborn as sns
import datetime
import numpy as np
import os
from pathlib import Path, PurePath
from tqdm import tqdm
from collections import defaultdict
import re

## Setup

### One-time directory setup

In [248]:
# Root directory that every data location below is resolved against.
base_loc = '.'

# Directory that holds county-populations.csv (read by load_population_data).
population_loc = base_loc + '/resources'

# Root directory of the local copy of the JHU data repository.
jhu_loc = base_loc + '/COVID-19'

# Directory of the per-day report CSVs inside the JHU repository.
csse_loc = jhu_loc + '/csse_covid_19_data/csse_covid_19_daily_reports'

## Functions for loading data

### Load population and region data

Note: Regions are only supported for Pennsylvania and New York in the CSV file.

In [1]:
def load_population_data():
    """Read the county population/region table.

    The table is read from ``county-populations.csv`` inside the directory
    named by the module-level ``population_loc``.

    Returns:
        The CSV contents as a DataFrame, exactly as parsed by ``pd.read_csv``.
    """
    csv_path = f'{population_loc}/county-populations.csv'
    return pd.read_csv(csv_path)

### Load JHU data

In [250]:
def get_data():
    """Read every daily-report CSV under ``csse_loc`` into a single data frame.

    Each file is read with FIPS kept as a string, old-style column names are
    mapped onto the current ones, and the per-file frames are concatenated.
    The ``Lat``, ``Long_``, ``Combined_Key`` and ``FIPS`` columns are then
    dropped, ``Last_Update`` is truncated to the calendar day, and rows whose
    ``Active`` is exactly 0 get ``Confirmed - Recovered - Deaths`` instead.

    Returns:
        The combined DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: from ``pd.concat`` when the directory holds no CSV files.
        KeyError: from ``drop`` when one of the dropped columns never appears.
    """
    # At some point JHU renamed some columns; map the old names onto the new.
    renamed_columns = {'Province/State': 'Province_State',
                       'Last Update': 'Last_Update',
                       'Country/Region': 'Country_Region',
                       'Latitude': 'Lat',
                       'Longitude': 'Long_'}

    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of the combined frame reproducible from run to run.
    csv_files = sorted(fname for fname in os.listdir(csse_loc) if fname.endswith('.csv'))
    frames = [
        pd.read_csv(f'{csse_loc}/{csv_file}', dtype={"FIPS": str}).rename(columns=renamed_columns)
        for csv_file in csv_files
    ]
    df = pd.concat(frames, ignore_index=True)

    # Remove unneeded columns
    df = df.drop(columns=['Lat', 'Long_', 'Combined_Key', 'FIPS'])

    # Standardize all dates to midnight of their calendar day by formatting
    # the timestamp as a date-only string and parsing it back.
    df.Last_Update = pd.to_datetime(df.Last_Update)
    df.Last_Update = df.Last_Update.dt.strftime('%m/%d/%Y')
    df.Last_Update = pd.to_datetime(df.Last_Update)

    # Fix the "Active" column where it is 0: substitute the value computed
    # from the other counts. Done column-wise instead of with a per-row apply;
    # NaN entries compare unequal to 0 and are therefore left untouched.
    computed_active = df.Confirmed - df.Recovered - df.Deaths
    df = df.assign(Active=df.Active.where(df.Active != 0, computed_active))

    return df

In [251]:
def get_series_data():
    """Read the US confirmed-cases time series from the JHU directory.

    Returns:
        The contents of ``time_series_covid19_confirmed_US.csv`` as a
        DataFrame, with the FIPS column kept as strings.
    """
    series_dir = f'{jhu_loc}/csse_covid_19_data/csse_covid_19_time_series'
    csv_path = f'{series_dir}/time_series_covid19_confirmed_US.csv'
    return pd.read_csv(csv_path, dtype={"FIPS": str})

## Locality Selection

#### Merge and filter all data for just one state

In [252]:
def merge_state(df, state, popdf):
    """Collapse all rows of one state into a single row per Last_Update date.

    Args:
        df: daily-report frame with Province_State, Country_Region,
            Last_Update, Deaths, Confirmed, Recovered and Active columns.
        state: Province_State value to merge.
        popdf: population table; the state's population is the sum of its rows.

    Returns:
        A frame with one row per date holding the summed counts, with
        Admin2 set to 'All' and Population set to the state total.
    """
    populations = state_populations(popdf)

    in_state = df[df.Province_State == state]
    count_cols = ['Deaths', 'Confirmed', 'Recovered', 'Active']
    totals = in_state.groupby(in_state.Last_Update)[count_cols].sum()

    # Column order here is the column order of the returned frame.
    merged = pd.DataFrame({
        'Last_Update': totals.index,
        'Admin2': 'All',
        'Province_State': state,
        # taken from the first row of the frame passed in, not per state
        'Country_Region': df.Country_Region.unique()[0],
        'Deaths': totals['Deaths'].to_numpy(),
        'Confirmed': totals['Confirmed'].to_numpy(),
        'Recovered': totals['Recovered'].to_numpy(),
        'Active': totals['Active'].to_numpy(),
        'Population': populations[state],
    })

    return merged

In [253]:
def get_state_data(state, df):
    """Return the rows of ``df`` whose Province_State equals ``state``.

    The result is a new DataFrame re-indexed from 0; ``df`` is not modified.
    """
    in_state = df.Province_State == state
    return pd.DataFrame(df[in_state].reset_index(drop=True))

#### Merge for just one region

In [254]:
def merge_region(df, region, popdf):
    """Collapse all rows of one region into a single row per Last_Update date.

    Args:
        df: daily-report frame that already carries a Region column.
        region: Region value to merge.
        popdf: population table; the region's population is the sum of its rows.

    Returns:
        A frame with one row per date holding the summed counts, with
        Admin2 set to the region name and Population set to the region total.
    """
    populations = region_populations(popdf)

    in_region = df[df.Region == region]
    count_cols = ['Deaths', 'Confirmed', 'Recovered', 'Active']
    totals = in_region.groupby(in_region.Last_Update)[count_cols].sum()

    # Column order here is the column order of the returned frame.
    merged = pd.DataFrame({
        'Last_Update': totals.index,
        'Admin2': region,
        # state and country are taken from the first row of the frame passed in
        'Province_State': df.Province_State.unique()[0],
        'Country_Region': df.Country_Region.unique()[0],
        'Deaths': totals['Deaths'].to_numpy(),
        'Confirmed': totals['Confirmed'].to_numpy(),
        'Recovered': totals['Recovered'].to_numpy(),
        'Active': totals['Active'].to_numpy(),
        'Population': populations[region],
    })

    return merged

#### Filter all data for just one county

In [255]:
def get_county_data(state, county, df):
    """Return the rows of ``df`` for one county (Admin2) of one state.

    The result is a new DataFrame re-indexed from 0; ``df`` is not modified.
    """
    is_county = (df.Province_State == state) & (df.Admin2 == county)
    return pd.DataFrame(df[is_county].reset_index(drop=True))

#### Annotate the dataframe with region information, if available

In [256]:
# Disabled earlier version of annotate_regions. It is wrapped in a string
# literal, so nothing in it is defined or executed; the live definition is in
# the following cell. The trailing `pass` is a no-op statement.
'''
def annotate_regions(df, popdf):
    """ For counties in Pennsylvania, annotate the dataframe with the region """
    
    def annotator(row):
        poprow = popdf[(popdf.State==row.Province_State)&(popdf.County==row.Admin2)]
        if len(poprow) == 0:
            return np.nan
        else:
            return poprow.Region.values[0]

    df['Region'] = df.apply(annotator, axis=1)
'''
pass

In [257]:
def annotate_regions(df, popdf):
    """Annotate ``df`` in place with a ``Region`` column looked up from ``popdf``.

    Args:
        df: frame with ``Province_State`` and ``Admin2`` columns; it gains
            (or has overwritten) a ``Region`` column.
        popdf: population table with ``State``, ``County`` and ``Region``
            columns. When a (State, County) pair occurs more than once, the
            last occurrence wins.

    Rows whose (Province_State, Admin2) pair is not in ``popdf`` get NaN.
    Returns None.
    """
    region_by_county = {(rec['State'], rec['County']): rec['Region']
                        for rec in popdf.to_dict('records')}

    # Pair the two key columns directly instead of a row-wise apply that
    # indexes each row Series by position (x[0], x[1]); .get also avoids
    # growing the lookup table with every missing key.
    df['Region'] = [region_by_county.get(key, np.nan)
                    for key in zip(df['Province_State'], df['Admin2'])]

#### Annotate the dataframe with populations, if available

In [258]:
def annotate_populations(df, popdf):
    """Annotate ``df`` in place with a ``Population`` column looked up from ``popdf``.

    Args:
        df: frame with ``Province_State`` and ``Admin2`` columns; it gains
            (or has overwritten) a ``Population`` column.
        popdf: population table with ``State``, ``County`` and ``Population``
            columns. When a (State, County) pair occurs more than once, the
            last occurrence wins.

    Rows whose (Province_State, Admin2) pair is not in ``popdf`` get NaN.
    Returns None.
    """
    population_by_county = {(rec['State'], rec['County']): rec['Population']
                            for rec in popdf.to_dict('records')}

    # Pair the two key columns directly instead of a row-wise apply that
    # indexes each row Series by position (x[0], x[1]); .get also avoids
    # growing the lookup table with every missing key.
    df['Population'] = [population_by_county.get(key, np.nan)
                        for key in zip(df['Province_State'], df['Admin2'])]

In [259]:
def state_populations(popdf):
    """Return a dict mapping each State in ``popdf`` to its summed Population."""
    totals = popdf.groupby(popdf.State)['Population'].sum()
    return {state: total for state, total in totals.items()}


In [260]:
def region_populations(popdf):
    """Return a dict mapping each Region in ``popdf`` to its summed Population."""
    totals = popdf.groupby(popdf.Region)['Population'].sum()
    return {region: total for region, total in totals.items()}
    

#### Select the appropriate locality

In [261]:
def select_locality(all_df, popdf, query_type, query_state=None, query_region=None, query_county=None):
    """Select the daily-report rows for one state, region or county.

    Args:
        all_df: daily-report frame covering (at least) ``query_state``.
        popdf: population/region table.
        query_type: 'State', 'Region' or 'County'.
        query_state: state to select (used by every query type).
        query_region: region to select when ``query_type`` is 'Region'.
        query_county: county to select when ``query_type`` is 'County'.

    Returns:
        ``(df, label)``: the selected (and, for states and regions, merged)
        frame and a human-readable label for plot titles.

    Raises:
        ValueError: if ``query_type`` is not one of the three supported values
            (previously this surfaced as an UnboundLocalError at the return).
    """
    if query_type == 'State':
        label = f'{query_state} State'
        df = get_state_data(query_state, all_df)
        df = merge_state(df, query_state, popdf)
    elif query_type == 'Region':
        label = f'{query_region} Region, {query_state}'
        df = get_state_data(query_state, all_df)
        # A frame that already carries regions has them copied into Admin2;
        # otherwise the Region column is looked up from the population table.
        if 'Region' in df.columns:
            df['Admin2'] = df['Region']
        else:
            annotate_regions(df, popdf)
        df = merge_region(df, query_region, popdf)
    elif query_type == 'County':
        label = f'{query_county} County, {query_state}'
        df = get_county_data(query_state, query_county, all_df)
        annotate_populations(df, popdf)
    else:
        raise ValueError(f"query_type must be 'State', 'Region' or 'County', got {query_type!r}")

    return df, label

### Compute daily and average new cases

In [262]:
def new_cases(df):
    """Add a ``New_Cases`` column holding the day-over-day change in ``Confirmed``.

    The first row has no predecessor, so its value is its own ``Confirmed``
    count (the missing previous value is treated as 0). ``df`` is modified in
    place and also returned.
    """
    previous_day = df.Confirmed.shift(1)
    df['New_Cases'] = df.Confirmed.sub(previous_day, fill_value=0)
    return df

In [263]:
def average_new_cases(df, days, centered=False):
    """Add the ``days``-day moving average of ``New_Cases`` as ``day_avg_{days}``.

    Args:
        df: frame with a ``New_Cases`` column; modified in place.
        days: size of the rolling window, in rows.
        centered: when False (default) each value is the trailing average
            ending on that row; when True the window is centered on the row.

    Windows that are only partly filled (at the edges of the series) are
    averaged over the rows that are available. Returns None.
    """
    field = f'day_avg_{days}'
    # The window follows `days`; it used to be fixed at 14 regardless of the
    # argument, so e.g. a "7 day" column actually held a 14-day average.
    df[field] = df.New_Cases.rolling(window=days, min_periods=1, center=centered).mean()

In [264]:
def date_avg(dates):
    """Return the mean of a non-empty sized collection of datetimes.

    The mean is computed on offsets from a fixed reference date
    (2019-01-01), because datetimes themselves cannot be summed.
    """
    refdate = datetime.datetime(2019, 1, 1)
    total_offset = datetime.timedelta()
    for date in dates:
        total_offset = total_offset + (date - refdate)
    return refdate + total_offset / len(dates)

## Functions for graphing

### Daily new cases and 7-day moving average

In [265]:
def new_case_plot(df, label, days=7, centered=False, output=None):
    """Plot daily new cases together with their ``days``-day moving average.

    Args:
        df: frame with ``Last_Update`` and ``New_Cases`` columns; the moving
            average column ``day_avg_{days}`` is added to it in place.
        label: locality name, shown on the second line of the title.
        days: moving-average window.
        centered: use a centered rather than trailing average.
        output: 'inline' (or None) to show the figure; otherwise a ``.png``
            path, which is saved with a ``_new_cases.png`` suffix after
            stripping apostrophes.
    """
    average_new_cases(df, days, centered=centered)

    # A centered average is plotted against a pre-computed Centered_Date_{days}
    # column only when the frame actually has one; otherwise the row's own
    # date is used (this used to raise KeyError for centered=True).
    centered_field = f'Centered_Date_{days}'
    if centered and centered_field in df.columns:
        date_field = centered_field
    else:
        date_field = 'Last_Update'

    # x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
    g = sns.lineplot(x=df['Last_Update'], y=df['New_Cases'], label="Daily new cases")
    sns.lineplot(x=df[date_field], y=df[f'day_avg_{days}'], ax=g, label=f"{days} day moving average")
    g.set(xlabel="\nDate", ylabel="New Cases", title=f"New Cases Per Day\n{label}")
    g.legend(loc='upper left', frameon=False)
    plt.xticks(rotation=90)

    # The default output=None is treated like 'inline' instead of crashing on
    # None.replace(...).
    if output is None or output == 'inline':
        plt.show()
    else:
        output = output.replace("'","").replace('.png', '_new_cases.png')
        plt.savefig(output, bbox_inches='tight')


### Yellow target: 50 new cases over 14 days per 100K people

In [266]:
def newcase_sum(df, days, perpop=1):
    """Add the rolling ``days``-day total of ``New_Cases`` as ``sum_{days}``.

    Args:
        df: frame with a ``New_Cases`` column; modified in place.
        days: size of the rolling window, in rows.
        perpop: scale factor applied to every total, e.g. 100000/population
            to express the total per 100K people.

    Partly filled windows at the start of the series are summed over the
    rows that are available. Returns None.
    """
    rolling_total = df.New_Cases.rolling(window=days, min_periods=1).sum()
    df[f'sum_{days}'] = rolling_total * perpop

In [267]:
def yellow_target(df, label, output=None):
    """Plot the 14-day caseload per 100K people against the target of 50.

    Args:
        df: frame with ``Last_Update``, ``New_Cases`` and ``Population``
            columns; a ``sum_14`` column is added to it in place.
        label: locality name, shown on the second line of the title.
        output: 'inline' (or None) to show the figure; otherwise a ``.png``
            path, which is saved with a ``_yellow_target.png`` suffix after
            stripping apostrophes.
    """
    # Population is expected to be the same on every row; one value is taken.
    population = set(df.Population).pop()
    newcase_sum(df, 14, perpop=100000/population)
    target = 50

    # x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
    g = sns.lineplot(x=df['Last_Update'], y=df['sum_14'], label="14 day caseload per 100K")
    sns.lineplot(x=df['Last_Update'], y=[target]*len(df), label="Yellow Target", ax=g)
    g.set(xlabel="\nDate", ylabel="14 days cases per 100K", title=f"Progress towards yellow target\n{label}")
    g.legend(loc='lower right', frameon=False)
    plt.xticks(rotation=90)

    # The default output=None is treated like 'inline' instead of crashing on
    # None.replace(...).
    if output is None or output == 'inline':
        plt.show()
    else:
        output = output.replace("'","").replace('.png', '_yellow_target.png')
        plt.savefig(output, bbox_inches='tight')

### Days trending downward in 14 days

In [268]:
def limit_xticks(labels, num=5):
    """Blank out all but roughly ``num`` evenly spaced tick labels.

    The first and last labels are always kept; the others that are kept sit
    at rounded multiples of len(labels)/(num-1). Every other label has its
    text set to "" in place.

    Returns:
        The same ``labels`` sequence that was passed in.
    """
    keep = {0, len(labels) - 1}
    keep.update(int(round(len(labels)/(num-1)*i, 0)) for i in range(1, num-1))

    for position, tick in enumerate(labels):
        if position not in keep:
            tick.set_text("")
    return labels

In [269]:
def fit(period):
    """Return the slope of the least-squares line through ``period``.

    Points are placed at x = 0, 1, 2, ...; a single point has no slope and
    yields 0.
    """
    if len(period) != 1:
        slope, _intercept = np.polyfit(np.arange(len(period)), period, 1)
        return slope
    return 0

In [270]:
def trend(df, days):
    """Add the rolling trend-line slope and a count of worsening days.

    Two columns are added to ``df`` in place:

    * ``slope_{days}``: slope of the line fitted to ``New_Cases`` over the
      last ``days`` rows (fewer at the start of the series).
    * ``trend_{days}``: how many of the last ``days`` slopes were positive
      (worsening). NaN until ``days`` slopes are available.

    Returns:
        The same ``df``.
    """
    # Get the slope of the trend line for the past {days} days.
    sfield = f'slope_{days}'
    df[sfield] = df.New_Cases.rolling(window=days, min_periods=1).apply(fit)

    # Get the number of times the slope was positive in the last {days} days.
    # The window follows `days`; it used to be fixed at 14.
    field = f'trend_{days}'
    df[field] = df[sfield].rolling(window=days, min_periods=days).apply(lambda x: (x>0).sum())

    return df

In [271]:
def trending(df, label, days=14, output=None):
    """Plot how many of the last ``days`` days had a rising or falling trend.

    Red bars count the days with a positive slope, green bars (drawn below
    zero) the remaining days; the slope itself is drawn as a black line on a
    second y axis.

    Args:
        df: frame with ``Last_Update`` and ``New_Cases`` columns; the
            ``slope_{days}`` and ``trend_{days}`` columns are added in place.
        label: locality name, shown on the second line of the title.
        days: window used for both the slope and the count. It used to be
            ignored in favour of a fixed 14, which made any other value fail.
        output: 'inline' (or None) to show the figure; otherwise a ``.png``
            path, which is saved with a ``_trend.png`` suffix after stripping
            apostrophes.
    """
    df = trend(df, days)

    tfield = f'trend_{days}'
    sfield = f'slope_{days}'

    # x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
    formatted_dates = df['Last_Update'].apply(lambda x: x.strftime('%Y-%m-%d'))
    g = sns.barplot(x=formatted_dates, y=df[tfield], label="increasing trends", color='red')
    sns.barplot(x=formatted_dates, y=df[tfield]-days, label="decreasing trends", color='green', ax=g)
    t = g.twinx()

    # Bar plots are categorical, so the line is drawn against bar positions.
    sns.lineplot(x=np.arange(len(df)), y=df[sfield], color="black", label=f"{days}-day slope", ax=t)

    labels = limit_xticks(g.get_xticklabels())
    g.set_xticklabels(labels,rotation=90)

    g.set_ylim(-days,days)
    period = "two weeks" if days == 14 else f"{days} days"
    title=f"Number of days in the past {period} with a positive or negative trend\n{label}"
    g.set(xlabel="\nDate", ylabel="Number of days", title=title)
    t.set(ylabel=f"slope of {days}-day trend")

    # Symmetric limits with 10% headroom; fall back to +/-1 when there is no
    # usable slope (empty, all-NaN or all-zero column).
    slope_lim = df[sfield].abs().max()*1.1
    if not np.isfinite(slope_lim) or slope_lim == 0:
        slope_lim = 1.0
    t.set_ylim(-slope_lim,slope_lim)
    t.legend(loc='lower left', frameon=False)

    # The default output=None is treated like 'inline' instead of crashing on
    # None.replace(...).
    if output is None or output == 'inline':
        plt.show()
    else:
        output = output.replace("'","").replace('.png', '_trend.png')
        plt.savefig(output, bbox_inches='tight')
    

## Read data

#### Issues

* States allocate cases to "Unassigned" if county is unknown
* "Out of CO", "Out of GA", "Out of MI", "Out of OK" and "Out of TN" are listed as counties
* Dukes, MA and Nantucket, MA -> "Dukes and Nantucket"
* Federal Correctional Institution (FCI), MI; Michigan Department of Corrections (MDOC), MI
* Kansas City, MO reported as a standalone county when it actually appears in multiple counties
* New York City, NY is reported but counties are Richmond, Queens, New York, Kings and Bronx
* Counties in Utah don't align

### Read county population data

In [272]:
# County population/region table; passed to every pipeline call further down.
popdf = load_population_data()

### Read JHU data

In [273]:
# Selects the JHU data source used by the cells below:
# True -> time_series file (all_sdf), False -> daily reports (all_df).
series_data = True

#### Read JHU daily_reports data

In [274]:
# Daily reports are read only when that source is selected; all_df is left
# undefined otherwise.
if not series_data:
    all_df = get_data()

#### Read JHU time_series data

In [275]:
# The time series file is read only when that source is selected; all_sdf is
# left undefined otherwise.
if series_data:
    all_sdf = get_series_data()

## Output all graphs for specified state, region or county

In [276]:
def clip_at_date(df, date):
    """
    Start the time series on this date

    NOTE: not implemented yet. The body contains no statements, so a call
    ignores both arguments and returns None.
    """
    # TODO

In [277]:
def clip_at_zero_series(df):
    """Drop the leading date columns in which no row has a confirmed case.

    Assumes the data frame is in time_series format, i.e. one column per
    date named like ``1/22/20``. The result starts at the first date column
    whose total is non-zero; columns that are not dates are always kept.

    If every date column sums to zero there is no first case to clip at, and
    the frame is returned unchanged (previously all date columns were
    dropped, because the guard compared against the total column count).

    Returns:
        A frame without the dropped columns; ``df`` itself is not modified.
    """
    rgx = re.compile(r'\d+/\d+/\d+')
    date_cols = [c for c in df.columns if rgx.search(c)]

    drops = []
    for c in date_cols:
        if df[c].sum() == 0:
            drops.append(c)
        else:
            # first date with any cases: everything from here on is kept
            break

    if len(drops) < len(date_cols):
        df = df.drop(columns=drops)
    return df


In [278]:
def clip_at_zero(df):
    """Start the data at the row before the first confirmed case.

    Assumes the data frame is in daily_reports format, i.e. it has a
    ``Confirmed`` column with one row per report.

    The first row with ``Confirmed > 0`` is located by position, so the
    result does not depend on the frame's index labels. When no row has any
    confirmed case the whole frame is kept (this used to fail with an
    unbound local variable).

    Returns:
        A new frame, re-indexed from 0; ``df`` itself is not modified.
    """
    has_cases = (df.Confirmed > 0).to_numpy()
    if has_cases.any():
        # argmax of a boolean array is the position of its first True
        start = max(0, int(has_cases.argmax()) - 1)
    else:
        start = 0
    cdf = pd.DataFrame(df.iloc[start:])
    return cdf.reset_index(drop=True)

In [279]:
def pipeline_helper(df, label, output, output_directory):
    """Sort the data by date and produce the three standard graphs.

    Args:
        df: daily-format frame with ``Last_Update``, ``Confirmed`` and
            ``Population`` columns.
        label: locality name; also the basis of the output file name.
        output: 'png' to save files, anything else to show inline.
        output_directory: directory for saved files; None means 'png'.

    Returns:
        The sorted frame, including the columns added by the plot helpers.
    """
    df = df.sort_values(by='Last_Update', ignore_index=True)

    if output != 'png':
        output = 'inline'
    else:
        # "Montour County, Pennsylvania" -> "Montour_County_Pennsylvania.png"
        filename = (label.replace(' ','_') + '.png').replace(',','')
        directory = 'png' if output_directory is None else output_directory
        output = f"{directory}/{filename}"

    new_cases(df) # add a new_cases column to the dataframe

    # Each figure is closed right after it is produced so the next plot
    # starts from a clean canvas.
    new_case_plot(df, label, days=14, centered=False, output=output)
    plt.close()
    yellow_target(df, label, output=output)
    plt.close()
    trending(df, label, output=output)
    plt.close()

    return df


In [280]:
def run_pipeline(all_df, popdf, query_type, query_state=None, query_region=None, query_county=None, output='inline', 
                output_directory=None, clip=False):
    """
    Run pipeline on daily_reports data

    Selects the requested state, region or county, optionally clips the
    leading rows without cases, and produces the standard graphs.
    Returns the frame returned by pipeline_helper.
    """
    assert query_type in ('State', 'County', 'Region')
    assert output in ('inline', 'png')

    selected, label = select_locality(all_df, popdf, query_type, query_state, query_region, query_county)

    # Start at the day before the first case
    clipped = clip_at_zero(selected) if clip else selected

    return pipeline_helper(clipped, label, output, output_directory)

## Functions to support time series data

In [281]:
# FOR time_series DATA ONLY
def run_pipeline_series(all_sdf, popdf, query_type, query_state=None, query_region=None, query_county=None, output='inline', 
                output_directory=None, clip=False):
    """
    Run pipeline on time_series data

    Selects the requested state, region or county (already converted to the
    daily table layout), optionally clips the leading rows without cases,
    and produces the standard graphs. The plot label is the selection's
    Combined_Key. Returns the frame returned by pipeline_helper.
    """
    assert query_type in ('State', 'County', 'Region')
    assert output in ('inline', 'png')

    selected = select_locality_series(all_sdf, popdf, query_type, query_state, query_region, query_county)

    # Start at the day before the first case
    clipped = clip_at_zero(selected) if clip else selected

    label = clipped.Combined_Key.values[0]
    return pipeline_helper(clipped, label, output, output_directory)
    

In [282]:
# FOR time_series DATA ONLY
def simplify_columns(df, date_cols=None):
    """Keep only the identifying columns and the date columns, in that order.

    Args:
        df: time_series-format frame.
        date_cols: list of date column names to keep; when empty or None,
            every column whose name contains a d/d/d pattern is used.

    Returns:
        ``df`` restricted to Admin2, Province_State, Country_Region,
        Combined_Key, Population and the date columns.
    """
    if not date_cols:
        # find which columns are dates
        date_pattern = re.compile(r'\d+/\d+/\d+')
        date_cols = [name for name in df.columns if date_pattern.search(name)]

    leading = ['Admin2', 'Province_State', 'Country_Region', 'Combined_Key', 'Population']
    return df[leading + date_cols]

In [283]:
# FOR time_series DATA ONLY
def merge_state_series(sdf, popdf, state=None):
    """Merge the time-series rows of one state into a single row.

    Args:
        sdf: time_series-format frame with Province_State, Country_Region
            and Population columns plus one column per date.
        popdf: unused; kept for signature compatibility.
        state: state to select when ``sdf`` holds more than one state. When
            ``sdf`` holds a single state, that state is used instead.

    Returns:
        A one-row frame with the summed date columns and Population,
        Admin2 set to 'All' and Combined_Key set to '<state> State'.

    Raises:
        AssertionError: if no rows are left to merge. The check now runs
            before the first row is read, so an empty ``sdf`` no longer
            fails with an IndexError instead.
    """
    # verify there is only one state here --> if not, select it using the parameter
    if len(set(sdf['Province_State'])) > 1:
        sdf = get_state_data(state, sdf)

    # verify there is at least one row here
    assert len(sdf) > 0
    state = sdf['Province_State'].values[0]

    # find which columns are dates
    rgx = re.compile(r'\d+/\d+/\d+')
    date_cols = [c for c in sdf.columns if rgx.search(c)]

    # Merge confirmed case totals. All date columns are summed in one
    # operation; inserting them one at a time fragments a frame that has
    # hundreds of date columns.
    grouped = sdf.groupby(sdf['Province_State'])
    merged = grouped[date_cols].sum()
    merged['Population'] = grouped['Population'].sum()
    merged['Province_State'] = state
    merged['Admin2'] = 'All'
    merged['Country_Region'] = sdf['Country_Region'].values[0]
    merged['Combined_Key'] = f'{state} State'

    merged = simplify_columns(merged, date_cols)

    return merged.reset_index(drop=True)

In [284]:
# FOR time_series DATA ONLY
def merge_region_series(sdf, popdf, region=None):
    """Merge the time-series rows of one region into a single row.

    Args:
        sdf: time_series-format frame for a single state, with Region,
            Province_State, Country_Region and Population columns plus one
            column per date.
        popdf: unused; kept for signature compatibility.
        region: region to select when ``sdf`` holds more than one Region
            value. When it holds a single one, that value is used instead.

    Returns:
        A one-row frame with the summed date columns and Population,
        Admin2 set to the region and Combined_Key set to
        '<region> Region, <state>'.

    Raises:
        AssertionError: if no rows are left to merge. The check now runs
            before the first row is read, so an empty selection no longer
            fails with an IndexError instead.
    """
    # verify there is only one region here --> if not, select it using the parameter
    if len(set(sdf['Region'])) > 1:
        sdf = sdf[sdf.Region == region].reset_index(drop=True)

    # verify there is at least one row here
    assert len(sdf) > 0
    region = sdf['Region'].values[0]
    state = sdf['Province_State'].values[0]

    # find which columns are dates
    rgx = re.compile(r'\d+/\d+/\d+')
    date_cols = [c for c in sdf.columns if rgx.search(c)]

    # Merge confirmed case totals. The rows left are all in one region, so
    # grouping by state yields the region total. All date columns are summed
    # in one operation; inserting them one at a time fragments a frame that
    # has hundreds of date columns.
    grouped = sdf.groupby(sdf['Province_State'])
    merged = grouped[date_cols].sum()
    merged['Population'] = grouped['Population'].sum()
    merged['Province_State'] = state
    merged['Admin2'] = region
    merged['Country_Region'] = sdf['Country_Region'].values[0]
    merged['Combined_Key'] = f'{region} Region, {state}'

    merged = simplify_columns(merged, date_cols)

    return merged.reset_index(drop=True)

In [285]:
# FOR time_series DATA ONLY
def get_county_data_series(state, county, df):
    """Return the time-series row(s) of one county, reduced to the standard columns.

    Combined_Key is overwritten with '<county> County, <state>'; ``df``
    itself is not modified. The result is re-indexed from 0.
    """
    is_county = (df.Province_State == state) & (df.Admin2 == county)
    county_rows = pd.DataFrame(df[is_county])
    county_rows['Combined_Key'] = f'{county} County, {state}'

    return simplify_columns(county_rows).reset_index(drop=True)

In [286]:
# FOR time_series DATA ONLY
def transpose(sdf):
    """Convert the single-row time series JHU data to the table format.

    Args:
        sdf: one-row frame (index label 0) with one column per date and any
            number of other columns.

    Returns:
        A frame with one row per date: ``Last_Update`` (midnight of that
        date), ``Confirmed`` (the value from the date column), followed by
        every non-date column repeated on each row.
    """
    # Assumes a single row
    assert len(sdf) == 1

    # Remember the non-date values so they can be restored after the flip
    row_values = sdf.to_dict('records')[0]
    is_date = re.compile(r'\d+/\d+/\d+').search
    non_date_cols = [c for c in sdf.columns if not is_date(c)]

    # Flip the date columns into rows; the single data column is labelled 0
    df = sdf.drop(columns=non_date_cols).transpose()
    df['Confirmed'] = df[0]

    # Build Last_Update from the index: parse, render as a date-only string
    # and parse again, which leaves every timestamp at midnight of its day
    df['Last_Update'] = df.index
    df['Last_Update'] = pd.to_datetime(df['Last_Update'])
    df['Last_Update'] = df['Last_Update'].dt.strftime('%m/%d/%Y')
    df['Last_Update'] = pd.to_datetime(df['Last_Update'])

    # Restore the non-date values into the columns
    for col in non_date_cols:
        df[col] = row_values[col]

    # Reindex from 0 and put the columns in their final order
    df = df.reset_index(drop=True)
    return df[['Last_Update', 'Confirmed'] + non_date_cols]



In [287]:
# FOR time_series DATA ONLY
def select_locality_series(sdf_all, popdf, query_type, query_state=None, query_region=None, query_county=None):
    """Select one state, region or county from time_series data.

    Args:
        sdf_all: time_series-format frame. Region and Population columns
            are added to it in place when they are missing.
        popdf: population/region table.
        query_type: 'State', 'Region' or 'County'.
        query_state: state to select (used by every query type).
        query_region: region to select when ``query_type`` is 'Region'.
        query_county: county to select when ``query_type`` is 'County'.

    Returns:
        The selection converted to the daily table layout (one row per date).

    Raises:
        ValueError: if ``query_type`` is not one of the three supported
            values (previously this surfaced as an UnboundLocalError). The
            check runs first, so ``sdf_all`` is left untouched in that case.
    """
    if query_type not in ('State', 'Region', 'County'):
        raise ValueError(f"query_type must be 'State', 'Region' or 'County', got {query_type!r}")

    if 'Region' not in sdf_all:
        annotate_regions(sdf_all, popdf)
    if 'Population' not in sdf_all:
        annotate_populations(sdf_all, popdf)

    if query_type == 'State':
        df = get_state_data(query_state, sdf_all)
        df = merge_state_series(df, popdf)
    elif query_type == 'Region':
        df = get_state_data(query_state, sdf_all)
        df = merge_region_series(df, popdf, region=query_region)
    else:
        df = get_county_data_series(query_state, query_county, sdf_all)

    return transpose(df)

### Support moving files around

## Generate state graphs

In [288]:
# A STATEPLOT environment variable selects the single state to plot; without
# it no state graphs are generated. Selections that have been used by hand:
#states = ['Pennsylvania', 'Georgia', 'New York', 'Florida', 'New Jersey']
#states = ['Pennsylvania']
#states = ['Georgia', 'New York', 'Florida', 'New Jersey']
states = [os.environ['STATEPLOT']] if 'STATEPLOT' in os.environ else []
print(f'STATES = {states}')

# Graphs are written below {tempdir} first and end up below {statedir}.
statedir = 'states'
tempdir = 'staging'

# Make {tempdir} if it doesn't exist
Path(tempdir).mkdir(parents=True, exist_ok=True)

STATES = []


In [289]:
# Generate every county, region and state graph from the daily_reports data.
# Runs only when that data source is selected.
if not series_data:
    for state in states:
        outdir = f'{tempdir}/{state}'.replace(' ','_')
        # Make {outdir} if it doesn't exist
        Path(outdir).mkdir(parents=True, exist_ok=True)

        counties = set(popdf[popdf.State==state].County)

        if state == 'New York': # remove NYC counties; JHU conflates into a single county
            counties -= set(['Bronx', 'New York', 'Kings', 'Queens', 'Richmond', 'New York City'])

        state_df = get_state_data(state, all_df)
        state_df = clip_at_zero(state_df)

        # COUNTIES
        pbar = tqdm(sorted(counties))
        for county in pbar:
            pbar.set_description(f"{state}:{county:20}")
            run_pipeline(state_df, popdf, query_type="County", query_state=state, query_county=county, output="png",
                        output_directory=outdir)

        # REGIONS: add the Region column with the shared helper rather than
        # an inline copy of its lookup code.
        annotate_regions(state_df, popdf)

        regions = set(popdf.Region[(popdf.Region.notnull()) & (popdf.State==state)])

        pbar = tqdm(sorted(regions))
        for region in pbar:
            pbar.set_description(f"{state}:{region:20}")
            df = run_pipeline(state_df, popdf, query_type="Region", query_state=state, query_region=region, output="png",
                         output_directory=outdir)

        # STATE
        run_pipeline(state_df, popdf, query_type="State", query_state=state, output='png',
                     output_directory=outdir)

    print("All states completed.")

In [290]:
def movefiles(olddir, newdir, glob='*.png', chmod=None):
    """Move matching files from each subdirectory of ``olddir`` to ``newdir``.

    For every immediate subdirectory of ``olddir`` a subdirectory of the
    same name is created under ``newdir`` (if needed) and the files matching
    ``glob`` are renamed into it. Files directly inside ``olddir`` and files
    that do not match are left where they are.

    Args:
        olddir: directory to move from.
        newdir: directory to move to.
        glob: pattern selecting the files to move.
        chmod: permission bits to apply to each moved file; skipped when falsy.
    """
    source_root, target_root = Path(olddir), Path(newdir)

    for source_dir in source_root.iterdir():
        if not source_dir.is_dir():
            continue

        # Be sure the subdirectory exists in newdir
        target_dir = Path(target_root.joinpath(source_dir.name))
        target_dir.mkdir(parents=True, exist_ok=True)

        for source_file in source_dir.glob(glob):
            moved = target_dir.joinpath(source_file.name)
            source_file.rename(moved)
            if chmod:
                moved.chmod(chmod)

In [291]:
# Generate every county, region and state graph from the time_series data.
# Runs only when that data source is selected.
if series_data:
    for state in states:
        outdir = f'{tempdir}/{state}'.replace(' ','_')
        # Make {outdir} if it doesn't exist
        Path(outdir).mkdir(parents=True, exist_ok=True)

        counties = set(popdf[popdf.State==state].County)
        
        if state == 'New York': # remove NYC counties; JHU conflates into a single county
            counties -= set(['Bronx', 'New York', 'Kings', 'Queens', 'Richmond', 'New York City'])
        elif state in ['District of Columbia', 'Guam', 'Virgin Islands', 'Northern Mariana Islands']:
            # no per-county graphs are produced for these entries
            counties = set()
            
        # Restrict to this state and drop the leading dates without any cases
        state_df = get_state_data(state, all_sdf)
        state_df = clip_at_zero_series(state_df)


        # COUNTIES
        pbar = tqdm(sorted(counties))
        for county in pbar:
            pbar.set_description(f"{state}:{county:20}")
            pipedf = run_pipeline_series(state_df, popdf, query_type="County", query_state=state, query_county=county, output="png",
                        output_directory=outdir)

        # REGIONS
        regions = set(popdf.Region[(popdf.Region.notnull()) & (popdf.State==state)])
        pbar = tqdm(sorted(regions))
        for region in pbar:
            pbar.set_description(f"{state}:{region:20}")
            df = run_pipeline_series(state_df, popdf, query_type="Region", query_state=state, query_region=region, output="png",
                         output_directory=outdir)

        # STATE
        run_pipeline_series(state_df, popdf, query_type="State", query_state=state, output='png',
                     output_directory=outdir)

    # Move the finished graphs from {tempdir} to {statedir}; this runs even
    # when `states` is empty.
    movefiles(tempdir, statedir, glob='*.png', chmod=0o644)
    print(f"All states completed.")


All states completed.


## One-off graphs

### Setup variables for this run

In [292]:
# Set one_off to True to graph the single locality configured below.
one_off = False
if one_off:
    series_data = True
    q_type='County'
    q_state='Pennsylvania'
    q_region='South East'
    q_county='Montour'
    output='inline'
    output_dir='png'
    # output_dir (set above) is passed on; the calls used to pass `outdir`,
    # a leftover loop variable of the state-graph cells that does not exist
    # when no states were processed.
    if series_data:
        df = run_pipeline_series(all_sdf, popdf, q_type, query_state=q_state, query_county=q_county, 
                                 query_region=q_region, output=output, output_directory=output_dir)
    else:
        df = run_pipeline(all_df, popdf, q_type, query_state=q_state, query_county=q_county, 
                                 query_region=q_region, output=output, output_directory=output_dir)


# Covid Tracking

In [441]:
#ct_df = get_covidtracking()

In [361]:
# Two-letter abbreviation -> full name, for the 50 states plus DC.
abbrevs = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
    'DC': 'District of Columbia', 'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii',
    'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa',
    'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine',
    'MD': 'Maryland', 'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota',
    'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska',
    'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico',
    'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio',
    'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island',
    'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas',
    'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington',
    'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming',
}

# Reverse lookup: full name -> two-letter abbreviation.
rabbrevs = {name: code for code, name in abbrevs.items()}

In [343]:
def get_covidtracking():
    """Read the covidtracking daily state file.

    Returns:
        The contents of ``covidtracking/states/daily.csv`` with the ``date``
        column parsed from YYYYMMDD values into datetimes.
    """
    tracking_loc = 'covidtracking/states'
    csv_file = 'daily.csv'
    tracking = pd.read_csv(f'{tracking_loc}/{csv_file}')
    tracking['date'] = pd.to_datetime(tracking['date'], format='%Y%m%d')
    return tracking
    
def filter_covidtracking(df, state):
    """Return the rows of one state, ordered by date.

    ``state`` is compared against the ``state`` column. The result is
    re-indexed from 0, and the previous index is kept as an ``index``
    column. ``df`` is not modified.
    """
    state_rows = df[df.state == state].sort_values(by='date')
    return pd.DataFrame(state_rows).reset_index()
    
def augment_covidtracking(state_df, window=7):
    """Add positive-rate and test-volume columns to one state's tracking data.

    Only works for a single state at a time, with rows in date order.

    Args:
        state_df: frame with cumulative ``positive``, ``negative`` and
            ``pending`` columns; modified in place.
        window: number of rows used for the rolling columns.

    Columns added:
        positive_rate: cumulative positive / (positive + negative).
        daily_positive, daily_negative: row-over-row change of the counts.
        daily_positive_{window}, daily_negative_{window}: rolling sums.
        daily_positive_rate_{window}: rolling positive share of the tests.
        tests, new_tests: cumulative and row-over-row test counts.
        new_tests_{window}: rolling mean of new_tests.

    Rates are NaN wherever the number of tests in the denominator is 0.

    Returns:
        The same ``state_df``.
    """
    # Missing counts are treated as 0. The columns are assigned back rather
    # than filled through state_df[col].fillna(..., inplace=True): that form
    # acts on a temporary column object and is not guaranteed to reach the
    # frame.
    count_cols = ['positive', 'negative', 'pending']
    state_df[count_cols] = state_df[count_cols].fillna(0)

    # cumulative
    state_df['positive_rate'] = state_df.positive / (state_df.positive + state_df.negative)
    state_df['daily_positive'] = state_df.positive.subtract(state_df.positive.shift(1), fill_value=0)
    state_df['daily_negative'] = state_df.negative.subtract(state_df.negative.shift(1), fill_value=0)

    # {window}-day daily test rate
    dp = f'daily_positive_{window}'
    dn = f'daily_negative_{window}'
    dpr= f'daily_positive_rate_{window}'
    state_df[dp] = state_df.daily_positive.rolling(window=window, min_periods=1, center=False).sum()
    state_df[dn] = state_df.daily_negative.rolling(window=window, min_periods=1, center=False).sum()
    state_df[dpr]= state_df[dp]/(state_df[dp] + state_df[dn])

    # {window}-day daily number of tests average
    state_df['tests'] = state_df.positive + state_df.negative
    state_df['new_tests'] = state_df.tests.subtract(state_df.tests.shift(1), fill_value=0)
    nt = f'new_tests_{window}'
    state_df[nt] = state_df.new_tests.rolling(window=window, min_periods=1, center=False).mean()
    return state_df

In [430]:
def positive_test_rate(df, label, window=7, mindate="2020-04-01", output=None):
    """Plot the ``window``-day positive test rate over time.

    Run on covidtracking data that already carries the
    ``daily_positive_rate_{window}`` column.

    Args:
        df: frame with ``date`` and ``daily_positive_rate_{window}`` columns.
        label: locality name, shown on the second line of the title.
        window: which rolling-rate column to plot.
        mindate: only rows with a later date are plotted; None plots all.
        output: the figure is shown when this is 'inline'. Nothing is saved
            for any other value.
    """
    if mindate is not None:
        df = df[df.date > mindate]

    dpr = f'daily_positive_rate_{window}'
    # x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
    g = sns.lineplot(x=df['date'], y=df[dpr], label=f"positive test rate: {window} day average")
    g.set(xlabel="\nDate", ylabel="Positive test rate", title=f"Positive test rate over time\n{label}")

    # The axis covers at least 0..0.5. The limit is taken from the column
    # being plotted (it used to read the 7-day column whatever the window);
    # Series.max skips NaN, and 0.5 also wins when there is no value at all.
    ymax = max(0.5, df[dpr].max())
    g.set_ylim(0, ymax)

    g.legend(loc='best', frameon=False)
    plt.xticks(rotation=90)
    if output == 'inline':
        plt.show()

In [431]:
def ptr_plus(df, label, window=7, mindate="2020-04-01", output=None):
    """Plot the ``window``-day test volume (bars) with the positive rate (line).

    Run on covidtracking data that already carries the ``new_tests_{window}``
    and ``daily_positive_rate_{window}`` columns.

    Args:
        df: frame with ``date`` and the two rolling columns.
        label: locality name, shown on the second line of the title.
        window: which rolling columns to plot.
        mindate: only rows with a later date are plotted; None plots all.
        output: 'inline' (or None) to show the figure; otherwise a ``.png``
            path, which is saved with a ``_trend.png`` suffix after stripping
            apostrophes.
    """
    dpr = f'daily_positive_rate_{window}'
    nt = f'new_tests_{window}'

    if mindate is not None:
        df = df[df.date > mindate]

    # x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
    formatted_dates = df['date'].apply(lambda x: x.strftime('%Y-%m-%d'))
    g = sns.barplot(x=formatted_dates, y=df[nt], label="number of tests", color='green')
    t = g.twinx()

    # Bar plots are categorical, so the line is drawn against bar positions.
    sns.lineplot(x=np.arange(len(df)), y=df[dpr], color="black", label="positive test rate", ax=t)

    labels = limit_xticks(g.get_xticklabels())
    g.set_xticklabels(labels,rotation=90)

    title=f"Number of tests and positive test rate: {window}-day average\n{label}"
    g.set(xlabel="\nDate", ylabel="Number of tests", title=title)
    t.set(ylabel="Positive test rate")

    # The rate axis covers at least 0..0.5. The limit is taken from the
    # column being plotted (it used to read the 7-day column whatever the
    # window); Series.max skips NaN.
    ymax = max(0.5, df[dpr].max())
    t.set_ylim(0, ymax)

    t.legend(loc='best', frameon=False)

    # The default output=None is treated like 'inline' instead of crashing on
    # None.replace(...).
    if output is None or output == 'inline':
        plt.show()
    else:
        # NOTE(review): this is the same suffix the case-trend graph uses, so
        # saving both for one locality overwrites one file - confirm intent.
        output = output.replace("'","").replace('.png', '_trend.png')
        plt.savefig(output, bbox_inches='tight')
    

In [443]:
#state = 'Oregon'
#state_df = filter_covidtracking(ct_df, rabbrevs[state])
#augment_covidtracking(state_df)
#ptr_plus(state_df, state, output='inline')
