In [None]:
# `display` is imported from the public IPython.display module: importing it
# from IPython.core.display has been deprecated since IPython 7.14 and is not
# available in newer IPython releases.
from IPython.display import display, HTML, Markdown
import matplotlib.pyplot as plt
import pandas as pd
import dateparser
import seaborn as sns
from tqdm import tqdm

# Allow up to 100 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)

# Inject CSS that makes the notebook's `.container` element span the full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
%matplotlib inline

In [None]:
# Read the state- and county-level CSV files; in both tables the `cases`
# column is renamed to `total_cases`.
_CASES_RENAME = {"cases": "total_cases"}
df_states = pd.read_csv("covid-19-data/us-states.csv").rename(columns=_CASES_RENAME)
df_counties = pd.read_csv("covid-19-data/us-counties.csv").rename(columns=_CASES_RENAME)

In [None]:
# Assign fips 100000 to the rows whose county is 'New York City' in state
# 'New York', so these rows share one key in the fips-based grouping and
# lookups used later in the notebook.
# NOTE(review): presumably the source data carries no FIPS code for this
# entry — confirm against the dataset.
df_counties.loc[(df_counties.county == 'New York City') &(df_counties.state == 'New York'), 'fips'] = 100000

In [None]:
def calc(df):
    """Add a `new_cases` column to `df` in place.

    `new_cases` is the row-to-row difference of `total_cases` within each
    `fips` group, so the first row of every group has no value. Nothing is
    returned; the frame passed in is modified.
    """
    totals_by_fips = df['total_cases'].groupby(df['fips'])
    df['new_cases'] = totals_by_fips.diff()

In [None]:
# Add the `new_cases` column to both tables; calc modifies each frame in place.
calc(df_counties)
calc(df_states)

In [None]:
# Two-way lookup between a state's name and its fips value, taken from the
# first row of each state group.
fips_to_state = {
    fips: state
    for state, fips in df_states.groupby(["state"]).first()["fips"].items()
}
state_to_fips = {
    state: fips
    for state, fips in df_states.groupby(["state"]).first()["fips"].items()
}

In [None]:
# Two-way lookup between a (county, state) pair and its fips value, taken from
# the first row of each (county, state) group.
fips_to_county = {
    fips: county_state
    for county_state, fips in df_counties.groupby(["county", "state"]).first()["fips"].items()
}
county_to_fips = {
    county_state: fips
    for county_state, fips in df_counties.groupby(["county", "state"]).first()["fips"].items()
}

# Overview

The New York Times publishes its COVID-19 dataset daily on GitHub: `https://github.com/nytimes/covid-19-data.git`

This notebook was inspired by the Minute Physics YouTube video: https://www.youtube.com/watch?v=54XLXg4fYsc

I wanted to recreate, at the local county level, the plots shown on this site: https://aatishb.com/covidtrends/

# Plots

In [None]:
# Log-log plot in the style of Aatish B's covidtrends chart
def ll_plot(df, geos, xlim=None, max_pop=None):
    """Plot daily new cases against total cases on log-log axes.

    Parameters
    ----------
    df : DataFrame
        Must provide `fips`, `total_cases` and `new_cases` columns.
    geos : iterable
        Each entry is either a state name (resolved through `state_to_fips`)
        or a `(county, state)` tuple (resolved through `county_to_fips`).
    xlim : number, optional
        Upper x-axis limit; when given, the axis runs from 1 to `xlim`.
    max_pop : tuple, optional
        `(label, population)`; draws a dashed red vertical line at the
        population value with a rotated "<label> Population" caption.

    Draws onto the current matplotlib axes and returns nothing.
    """
    ax = plt.gca()

    for geo in geos:
        is_county = isinstance(geo, tuple)
        fip = county_to_fips[geo] if is_county else state_to_fips[geo]
        label = f"{geo[0]}, {geo[1]}" if is_county else geo
        rows_for_geo = df.query(f"fips=={fip}")
        rows_for_geo.plot(x='total_cases', y='new_cases', label=label, ax=ax)

    ax.set_yscale("log")
    ax.set_xscale("log")
    ax.set_ylabel("Daily New Cases")

    if xlim:
        ax.set_xlim(1, xlim)

    if max_pop:
        population = max_pop[1]
        # Vertical marker at the population size, spanning y = 1 .. 1e6.
        ax.plot([population, population], [1, 1e6],
                color="red", linestyle="--")
        ax.text(population*.75, 1e1, f"{max_pop[0]} Population", rotation=90)

In [None]:
def decade_time(county, state, end_value=None):
    """Return the number of days it took a county's total cases to grow tenfold.

    The result is the difference, in whole days, between
    the last row whose `total_cases` is below `end_value` and
    the last row whose `total_cases` is below `end_value / 10`.

    Parameters
    ----------
    county, state : str
        Matched exactly against the `county` and `state` columns of the
        module-level `df_counties` frame. Names containing quotes need no
        escaping because the rows are selected with a boolean mask.
    end_value : number, optional
        Case count to measure against. When omitted (None), the county's
        maximum `total_cases` is used. An explicit 0 is honoured as 0.

    Returns
    -------
    int or None
        Day count, or None when no row lies below either threshold (this
        includes an unknown county and a NaN `end_value`).

    Assumes the rows of each county appear in chronological order, since
    "last row" is taken positionally — TODO confirm for the data source.
    """
    df = df_counties
    # Select the county once instead of re-filtering the full frame per lookup.
    in_county = df[(df.state == state) & (df.county == county)]

    # `is None` rather than truthiness, so end_value=0 is not replaced by the max.
    if end_value is None:
        end_value = in_county.total_cases.max()

    below_end = in_county[in_county.total_cases < end_value]
    below_tenth = in_county[in_county.total_cases < end_value / 10]
    if below_end.empty or below_tenth.empty:
        return None

    # pd.Timestamp parses the date strings directly, replacing the much slower
    # general-purpose dateparser.parse used before.
    delta = pd.Timestamp(below_end.date.iloc[-1]) - pd.Timestamp(below_tenth.date.iloc[-1])
    return delta.days

The `log-log` plot does not show time, so one might wonder how long it takes for the total cases to increase by a factor of 10 (I call this the _decade time_).
For each county and each day, we compute how many days have elapsed since the total number of cases was 1/10 of that day's total.

Social distancing should be slowing the rate at which the total cases are growing.
We calculate the _decade time_ for every day and for every county.

**Note that this calculation may take more than 5 minutes to complete**

In [None]:
# Enable tqdm's `progress_apply` on pandas objects.
tqdm.pandas()
# Compute the decade time for every row, using that row's own total as the
# end value. The guard skips this slow pass when the column already exists,
# e.g. when the cell is re-run in the same kernel.
if 'decade_time' not in df_counties.columns:
    df_counties['decade_time'] = df_counties.progress_apply(lambda row: decade_time(row.county, row.state, row.total_cases), axis=1)

## State Plots

In [None]:
# Compare five states; only rows with more than 50 total cases are plotted.
ll_plot(df_states.query("total_cases>50"), geos=[
    'California',
    'New York',
    'Washington',
    'Colorado',
    "Michigan"
])

## New York City

In [None]:
# New York City against Santa Clara (rows with more than 50 total cases);
# the dashed red line marks the 8.6e6 population value passed as max_pop.
ll_plot(df_counties.query("total_cases>50"), geos=[
    ('Santa Clara', 'California'),
    ('New York City', 'New York')
],
    max_pop=('New York City', 8.6e6)
)

In [None]:
# Report the decade time measured against the county's maximum total
# (no end_value is passed). decade_time may return None, which would be
# rendered literally as "None days".
county='New York City'
state='New York'
display(Markdown(f"{county}'s current decade time is {decade_time(county, state)} days"))

## Boulder, Colorado

In [None]:
# Boulder against Santa Clara (rows with more than 50 total cases);
# the dashed red line marks the 333e3 population value passed as max_pop.
ll_plot(df_counties.query("total_cases>50"), geos=[
    ('Santa Clara', 'California'),
    ('Boulder', 'Colorado')
],
    max_pop=('Boulder', 333e3)
)

In [None]:
# Report the decade time measured against the county's maximum total
# (no end_value is passed). decade_time may return None, which would be
# rendered literally as "None days".
county='Boulder'
state='Colorado'
display(Markdown(f"{county}'s current decade time is {decade_time(county, state)} days"))

## Weld, Colorado

In [None]:
# Weld against Boulder (rows with more than 50 total cases);
# the dashed red line marks the 252e3 population value passed as max_pop.
ll_plot(df_counties.query("total_cases>50"), geos=[
    ('Weld', 'Colorado'),
    ('Boulder', 'Colorado')
],
    max_pop=('Weld Co', 252e3)
)

In [None]:
# Report the decade time measured against the county's maximum total
# (no end_value is passed). decade_time may return None, which would be
# rendered literally as "None days".
county='Weld'
state='Colorado'
display(Markdown(f"{county}'s current decade time is {decade_time(county, state)} days"))

## Santa Clara, California

In [None]:
# Santa Clara against Boulder (rows with more than 50 total cases);
# the dashed red line marks the 2e6 population value passed as max_pop.
ll_plot(df_counties.query("total_cases>50"), geos=[
    ('Santa Clara', 'California'),
    ('Boulder', 'Colorado')
],
    max_pop=('Santa Clara', 2e6)
)

In [None]:
# Report the decade time measured against the county's maximum total
# (no end_value is passed). decade_time may return None, which would be
# rendered literally as "None days".
county='Santa Clara'
state='California'
display(Markdown(f"{county}'s current decade time is {decade_time(county, state)} days"))

In [None]:
# Suffolk (New York) against Santa Clara (rows with more than 50 total cases);
# the dashed red line marks the 1.5e6 population value passed as max_pop.
ll_plot(df_counties.query("total_cases>50"), geos=[
    ('Santa Clara', 'California'),
    ('Suffolk', 'New York')
],
    max_pop=('Suffolk', 1.5e6)
)

In [None]:
# Report the decade time measured against the county's maximum total
# (no end_value is passed). decade_time may return None, which would be
# rendered literally as "None days".
county='Suffolk'
state='New York'
display(Markdown(f"{county}'s current decade time is {decade_time(county, state)} days"))

# Tables of "Decade Times"

In [None]:
def show_decade(county, state, tail=14):
    """Display a heading and the last `tail` rows of `df_counties` for one county.

    Parameters
    ----------
    county, state : str
        Matched exactly against the `county` and `state` columns. Rows are
        selected with a boolean mask, so names containing quotes
        (e.g. an apostrophe) work without escaping.
    tail : int, default 14
        Number of trailing rows to show. The heading reads "Past Two Weeks"
        for the default of 14 and states the row count otherwise.
    """
    # Keep the heading truthful when the caller asks for a different window.
    period = "Past Two Weeks" if tail == 14 else f"Last {tail} Rows"
    display(Markdown(f"## {county} {state} Decade Times ({period})"))
    county_rows = df_counties[(df_counties.state == state) & (df_counties.county == county)]
    display(county_rows.tail(tail))

In [None]:
# Last 14 rows (the default `tail`) for Boulder, Colorado.
show_decade('Boulder', 'Colorado')

In [None]:
# Last 14 rows (the default `tail`) for Weld, Colorado.
show_decade('Weld', 'Colorado')

In [None]:
# Last 14 rows for Santa Clara, California (tail=14 equals the default).
show_decade('Santa Clara', 'California', tail=14)

In [None]:
# Last 14 rows (the default `tail`) for Suffolk, New York.
show_decade('Suffolk', 'New York')

# Calculate Decade Time For All Counties

In [None]:
# Decade time for every (county, state) key, measured against each county's
# maximum total because no end_value is passed.
county_decade_time = {}
for k in tqdm(county_to_fips):
    county_decade_time[k] = decade_time(*k)

In [None]:
# One row per (county, state) key with its decade time.
df_decade = pd.DataFrame({
    "county": list(county_decade_time),
    "dec_time": list(county_decade_time.values()),
})

In [None]:
# The 20 entries with the smallest `dec_time`, i.e. the fastest tenfold growth.
df_decade.sort_values('dec_time', ascending=True).head(20)

## Decade Plots

In [None]:
# (county, state) pairs to compare in the decade-time plot below.
counties = [
    ('Boulder', 'Colorado'),
    ('Santa Clara', 'California'),
    ('Suffolk', 'New York')
]

In [None]:
def decade_plot(counties):
    """Draw `decade_time` over `date` for the given counties.

    `counties` is an iterable of `(county, state)` keys of `county_to_fips`.
    Rows of `df_counties` with a matching fips are plotted as seaborn lines
    coloured by the `county` column; x tick labels are rotated and only every
    fifth label is left visible.
    """
    selected_fips = [county_to_fips[pair] for pair in counties]
    selected_rows = df_counties[df_counties.fips.isin(selected_fips)]
    axes = sns.lineplot(x='date',
                        y='decade_time',
                        hue='county',
                        data=selected_rows)
    plt.xticks(rotation=55)
    # Thin out the x tick labels: keep positions 0, 5, 10, ... and hide the rest.
    for position, tick_label in enumerate(axes.xaxis.get_ticklabels()):
        tick_label.set_visible(position % 5 == 0)

In [None]:
# Plot the per-day decade time for the counties selected above.
decade_plot(counties)