<a href="https://colab.research.google.com/github/srikanthrc/running-to-stand-still/blob/master/_notebooks/2020-03-26-covid19_compare_us_trajectories.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Case Trajectories / US States
> Comparing state trajectories of total cases

- toc: false
- comments: false
- author: Srikanth Chinmay (inspired by Pratap Vardhan @ covid19dashboards.com) 
- categories: [covid-19]
- image: images/covid-compare-state-trajectories.png
- permalink: /compare-state-trajectories/




In [0]:
#hide
import pandas as pd
import altair as alt
from IPython.display import HTML

In [2]:
#hide
from pathlib import Path
# Download the helper module only when it is not already present in the working
# directory, so re-running this cell does not fetch it again.
# NOTE(review): the file comes from the `master` branch and is imported in a
# later cell; consider pinning the URL to a commit so re-runs stay reproducible.
if not Path('covid_overview.py').exists():
    ! wget https://raw.githubusercontent.com/srikanthrc/covid-19/master/covid_overview.py

--2020-05-03 13:56:22--  https://raw.githubusercontent.com/srikanthrc/covid-19/master/covid_overview.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9419 (9.2K) [text/plain]
Saving to: ‘covid_overview.py’


2020-05-03 13:56:22 (78.2 MB/s) - ‘covid_overview.py’ saved [9419/9419]



In [0]:
#hide
import covid_overview as covid

In [0]:
#hide

# Country-level confirmed cases from the JHU CSSE time series (wide format:
# one row per province/country, one column per date labelled m/d/yy).
url = ('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/'
       'csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
df = pd.read_csv(url)

# Normalise the country label used by the charts, then drop the ship rows.
df['Country/Region'] = df['Country/Region'].replace({'Korea, South': 'South Korea'})
df = df[df['Country/Region'] != 'Cruise Ship']

# Everything that is not a metadata column is a date column.
dt_cols = df.columns[~df.columns.isin(['Province/State', 'Country/Region', 'Lat', 'Long'])]

# Wide -> long: sum provinces per country, then one row per (country, date).
dff = (
    df.groupby('Country/Region')[dt_cols].sum()
    .stack()
    .reset_index(name='Cases')
    .rename(columns={'Country/Region': 'Key', 'level_1': 'Date'})
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%m/%d/%y'))
)
# display(dff.head(5))

In [5]:
#hide
# get States data from covidtracking
# NOTE(review): `covid.get_covidtracking_data` lives in covid_overview.py, which is
# not visible here. The code below assumes it returns a wide frame with a
# 'Province/State' column and one column per date labelled m/d/yy -- TODO confirm.
df_s = covid.get_covidtracking_data('positive')
dt_cols_s = df_s.columns[~df_s.columns.isin(['Province/State', 'Country/Region','Lat','Long'])]

# Wide -> long: one row per (state, date) holding the positive count.
dff_s = (df_s.groupby('Province/State')[dt_cols_s].sum()
          .stack().reset_index(name='positive')
          .rename(columns={'level_1': 'Date', 'Province/State': 'Key', 'positive': 'Cases'}))

# Same reshaping for deaths; the date columns found in the 'positive' frame are reused.
df_s = covid.get_covidtracking_data('death')
t_s = (df_s.groupby('Province/State')[dt_cols_s].sum()
        .stack().reset_index(name = 'death')
        .rename(columns={'level_1': 'Date', 'Province/State': 'Key', 'death': 'Deaths'}))

# Inner join: keeps only the (state, date) pairs present in both metrics.
dff_s = pd.merge(dff_s, t_s, on=['Key','Date'])
dff_s['Date'] = pd.to_datetime(dff_s['Date'], format="%m/%d/%y")

# Sanity check: national totals for the last five days.
# Only the numeric columns are selected: on pandas >= 2.0 a bare
# `groupby(...).sum()` would also "sum" (concatenate) the string column 'Key'.
dff_s.groupby(by='Date')[['Cases', 'Deaths']].sum().tail(5)

Unnamed: 0_level_0,Cases,Deaths
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-28,1005592.0,52525.0
2020-04-29,1033157.0,55225.0
2020-04-30,1062134.0,57316.0
2020-05-01,1095681.0,59059.0
2020-05-02,1125719.0,60710.0


In [0]:
#hide
# Minimum case count (on the latest date) for a country/state to be charted.
MIN_CASES = 300

# The newest JHU column is sometimes empty: walk backwards from the last
# column and take the first one holding any non-zero value. Falls back to the
# last column when every column is empty.
LAST_DATE = next(
    (c for c in dt_cols[::-1] if df[c].fillna(0).ne(0).any()),
    dt_cols[-1],
)

# LAST_DATE is a JHU column label (m/d/yy); parse it explicitly instead of
# relying on pandas' implicit string-to-datetime comparison.
last_ts = pd.to_datetime(LAST_DATE, format='%m/%d/%y')

# Countries of interest that have at least MIN_CASES on the latest JHU date,
# ordered by case count (largest first).
countries = dff[
    dff['Date'].eq(last_ts)
    & dff['Cases'].ge(MIN_CASES)
    & dff['Key'].isin(('Japan','South Korea','Italy','Germany','Spain','United Kingdom','US','India'))
].sort_values(by='Cases', ascending=False)
countries = countries['Key'].values

# The states data comes from a different source whose latest date need not
# match the JHU one; filtering on the JHU date could leave `states` empty.
# Use the states frame's own latest date instead.
states_last_ts = dff_s['Date'].max()
states = dff_s[
    dff_s['Date'].eq(states_last_ts) & dff_s['Cases'].ge(MIN_CASES)
].sort_values(by='Cases', ascending=False)
states = states['Key'].values

In [0]:
#hide
# Threshold that defines "day 0" of each trajectory.
SINCE_CASES_NUM = 100

# Work on a copy restricted to the selected countries.
dff2 = dff.loc[dff['Key'].isin(countries)].copy()

# For every row, the date on which its country first reached the threshold:
# idxmax over the boolean flag (indexed by Date) yields the first True date.
days_since = (
    dff2
    .assign(F=lambda frame: frame['Cases'] >= SINCE_CASES_NUM)
    .set_index('Date')
    .groupby('Key')['F']
    .transform('idxmax')
)

# Positional (not index-aligned) subtraction, hence the raw arrays.
dff2['Days since 100 cases'] = dff2['Date'].sub(days_since.values).dt.days.values

# Keep only the part of each trajectory from day 0 onwards.
dff2 = dff2.loc[dff2['Days since 100 cases'] >= 0]
# display(dff2[dff2['Key'].eq("US")])

In [0]:
#hide
# Date on which each state first reached SINCE_CASES_NUM cases, repeated on
# every row of that state. NOTE(review): for a state that never reaches the
# threshold all flags are False, so idxmax returns that state's first date.
days_since = (
    dff_s
    .assign(F=lambda frame: frame['Cases'] >= SINCE_CASES_NUM)
    .set_index('Date')
    .groupby('Key')['F']
    .transform('idxmax')
)

# Positional (not index-aligned) subtraction, hence the raw arrays.
dff_s['Days since 100 cases'] = dff_s['Date'].sub(days_since.values).dt.days.values

In [0]:
#hide
# A state's death timeline starts on the first date with more than this many deaths.
SINCE_DEATH_NUM = 0

# Date of the first row per state whose death count exceeds the threshold,
# repeated on every row of that state.
days_since = (
    dff_s
    .assign(F=lambda frame: frame['Deaths'] > SINCE_DEATH_NUM)
    .set_index('Date')
    .groupby('Key')['F']
    .transform('idxmax')
)

# Positional (not index-aligned) subtraction, hence the raw arrays.
dff_s['Days since first death'] = dff_s['Date'].sub(days_since.values).dt.days.values

# State rows from day 0 of the "since 100 cases" timeline onwards.
dff_s2 = dff_s.loc[dff_s['Days since 100 cases'] >= 0]

In [0]:
#hide
def get_colors(x):
    """Return the chart colour assigned to a country or state name.

    Names without a dedicated entry fall back to the default blue
    ('#3469B2').
    """
    default_color = '#3469B2'
    palette = {
        # countries
        'Italy': 'grey',
        'South Korea': '#E45756',
        'Japan': '#9467bd',
        'Iran': '#A1BA59',
        'Spain': '#F58518',
        'Germany': '#2495D3',
        'United Kingdom': '#2495D3',
        'US': '#3469B2',
        'India': '#9D755D',
        # states
        'New York': '#2495D3',
        'California': '#F58518',
    }
    if x in palette:
        return palette[x]
    return default_color

In [0]:
#hide_input
# Countries always shown as reference trajectories.
baseline_countries = ['Italy', 'South Korea', 'Japan']
# Latest date in the country data; used to place the end-of-line labels.
max_date = dff2['Date'].max()
# Explicit colour scale: one colour per key, in order of first appearance.
color_domain = dff2['Key'].unique().tolist()
color_range = [get_colors(key) for key in color_domain]

def make_since_chart(highlight_countries=[], baseline_countries=baseline_countries):
    """Build the layered "cases since 100 cases" chart for the country data.

    Parameters
    ----------
    highlight_countries : list of str
        Countries pre-selected in the legend in addition to the baselines;
        also named in the chart title. The shared default list is never
        mutated.
    baseline_countries : list of str
        Reference countries pre-selected in the legend and named in the title.

    Returns
    -------
    A layered Altair chart: a dashed 33%-daily-growth guide with its label,
    one log-scale line per country in ``dff2`` (dimmed unless selected in the
    legend), and a bold name label at each line's latest point.
    """
    # Legend-bound multi-selection, initialised with highlighted + baseline keys.
    preselected = [{'Key': name} for name in highlight_countries + baseline_countries]
    selection = alt.selection_multi(fields=['Key'], bind='legend', init=preselected)

    color_scale = alt.Scale(domain=color_domain, range=color_range)
    base = alt.Chart(dff2, width=550).encode(
        x='Days since 100 cases:Q',
        y=alt.Y('Cases:Q', scale=alt.Scale(type='log')),
        color=alt.Color('Key:N', scale=color_scale, legend=alt.Legend(columns=2)),
        tooltip=list(dff2),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.05)),
    )

    # Reference curve: 100 cases growing 33% per day over the plotted range.
    max_day = dff2['Days since 100 cases'].max()
    ref = pd.DataFrame(
        [[day, 100 * 1.33 ** day] for day in range(max_day + 1)],
        columns=['Days since 100 cases', 'Cases'],
    )
    base_ref = alt.Chart(ref).encode(x='Days since 100 cases:Q', y='Cases:Q')

    guide_line = base_ref.mark_line(color='black', opacity=.5, strokeDash=[3, 3])
    # The label sits on the last point of the reference curve only.
    guide_label = base_ref.transform_filter(
        alt.datum['Days since 100 cases'] >= max_day
    ).mark_text(dy=-6, align='right', fontSize=10, text='33% Daily Growth')

    trajectories = base.mark_line(point=True).add_selection(selection)
    # Name labels only on each country's latest data point (epoch milliseconds).
    end_labels = base.transform_filter(
        alt.datum['Date'] >= int(max_date.timestamp() * 1000)
    ).mark_text(dy=-8, align='right', fontWeight='bold').encode(text='Key:N')

    layered = guide_line + guide_label + trajectories + end_labels
    return layered.properties(
        title=f"Compare {', '.join(highlight_countries)} trajectory with {', '.join(baseline_countries)}"
    )

### Learning from Italy, South Korea & Japan

Italy, South Korea & Japan are three countries that show different growth rates and how those rates evolved over time. 

**South Korea** flattened its growth about 2 weeks after reaching 100 cases. **Italy** continued to grow after the 3rd week.

Where does a Country of interest stand today?

<small>Click (Shift+ for multiple) on Countries legend to filter the visualization.</small>

In [12]:
#hide_input
# Right-aligned caption showing the date held in LAST_DATE.
as_of = pd.to_datetime(LAST_DATE).strftime("%B, %d %Y")
HTML(f'<small class="float-right">Data as of {as_of}</small>')

In [13]:
#hide_input
# US and India highlighted against the default baseline countries.
chart1 = make_since_chart(['US', 'India'])
chart1

In [0]:
#hide 
# Stack country rows and state rows into one frame. Columns that exist only
# on the state side are filled with 0 for the country rows; the death-day
# counter is then converted back to integers.
dff3 = (
    pd.concat([dff2, dff_s2], sort=False)
    .fillna(0)
    .assign(**{
        'Days since first death': lambda frame: frame['Days since first death'].map(int)
    })
)

#display(dff3.info(), dff3[dff3['Key'].eq('New York')])

### State Trajectories

Select a state from the drop-down list to update the visualization.

In [0]:
#hide
# Rebind LAST_DATE to the last non-metadata column of the states frame, so the
# "Data as of" caption rendered in the next cell reflects the states data
# (presumably its newest date -- assumes the columns are in date order).
LAST_DATE = dt_cols_s[-1]

In [16]:
#hide_input 
# Right-aligned caption showing the date held in LAST_DATE.
as_of = pd.to_datetime(LAST_DATE).strftime("%B, %d %Y")
HTML(f'<small class="float-right">Data as of {as_of}</small>')

In [0]:
#hide
# Colour scale and latest date, now over the combined countries + states frame.
color_domain = dff3['Key'].unique().tolist()
color_range = [get_colors(key) for key in color_domain]
max_date = dff3['Date'].max()

# Drop-down selections; the baseline countries are excluded from the options.
country_selection = alt.selection_single(
    name='Select',
    fields=['Key'],
    init={'Key': 'US'},
    bind=alt.binding_select(
        options=sorted(set(countries).difference(baseline_countries))
    ),
)

state_selection = alt.selection_single(
    name='Select',
    fields=['Key'],
    init={'Key': 'New York'},
    bind=alt.binding_select(
        options=sorted(set(states).difference(baseline_countries))
    ),
)

# Keeps only rows on the latest date (compared as epoch milliseconds).
date_filter = alt.datum['Date'] >= int(max_date.timestamp() * 1000)

# Reference curve: 100 cases growing 33% per day, capped at 70 days.
max_day = min(70, dff3['Days since 100 cases'].max())
ref = pd.DataFrame(
    [[day, 100 * 1.33 ** day] for day in range(max_day + 1)],
    columns=['Days since 100 cases', 'Cases'],
)
base_ref = alt.Chart(ref).encode(x='Days since 100 cases:Q', y='Cases:Q')
# Last point of the reference curve only; used to anchor its text label.
base_ref_f = base_ref.transform_filter(alt.datum['Days since 100 cases'] >= max_day)

#display(dff3[dff3['Days since 100 cases'].gt(90)])

In [0]:
#hide
# Shared encoding for the trajectory chart: log-scale cases against days
# since the 100th case, coloured per key, no legend.
base = alt.Chart(dff3).encode(
    x='Days since 100 cases:Q',
    y=alt.Y('Cases:Q', scale=alt.Scale(type='log')),
    color=alt.Color(
        'Key:N',
        scale=alt.Scale(domain=color_domain, range=color_range),
        legend=None,
    ),
    tooltip=['Key', 'Date', 'Cases', 'Days since 100 cases'],
).mark_line(
    point=True,
    tooltip=True,
).properties(
    width=600,
    title=f"State's Trajectory compared to {', '.join(baseline_countries)}",
)

# base2: baseline countries only; base3: the state picked in the drop-down;
# base4: that state's latest-date rows (for the text labels).
base2 = base.transform_filter(alt.FieldOneOfPredicate('Key', baseline_countries))
base3 = base.transform_filter(state_selection)
base4 = base3.transform_filter(date_filter)

In [0]:
#hide_input
# Layers, bottom to top: growth guide + its label, baseline countries (grey)
# + their name labels, the selected state's line, then its name and latest
# case count.
chart2 = alt.layer(
    base_ref.mark_line(color='black', opacity=.5, strokeDash=[3,3]),
    base_ref_f.mark_text(dy=-6, align='right', fontSize=10, text='33% Daily Growth'),
    base2.mark_line(point=True, tooltip=True, color="lightgrey"),
    base2.transform_filter(date_filter).mark_text(dy=-8, align='right').encode(text='Key:N'),
    base3.mark_line(point={'size':30}, tooltip=True),
    base4.mark_text(dx=8, align='left', fontWeight='bold').encode(text='Key:N'),
    base4.mark_text(dx=8, dy=12, align='left', fontWeight='bold').encode(text='Cases:Q'),
)
#chart2.add_selection(state_selection)

In [0]:
#hide
# Day-over-day change in cases, computed within each state.
# The differences must be taken per 'Key': dff_s2 holds all states stacked in
# one frame, so a frame-wide diff()/pct_change() would compare the first row
# of each state with the last row of the previous state.
cases_by_state = dff_s2.groupby('Key')['Cases']
dff_s2_change = pd.DataFrame({'Date': dff_s2['Date'],
                              'Key': dff_s2['Key'],
                              'Cases': dff_s2['Cases'],
                              'DeltaChange': cases_by_state.diff(),
                              'PctChg' : cases_by_state.pct_change(periods=1)})
# Drops each state's first row (no previous day -> NaN) and any day on which
# the reported total went down.
dff_s2_change = dff_s2_change[dff_s2_change['PctChg'].ge(0)]
# dff_s2_change[dff_s2_change['Key'].eq('California')]

In [0]:
#hide
# Grey bars: absolute day-over-day change in cases for the selected state.
chart3 = alt.Chart(dff_s2_change).transform_filter(
    state_selection
).encode(
    x=alt.X('Date:T', axis=alt.Axis(format="%a %m/%d")),
    y=alt.Y('DeltaChange:Q', axis=alt.Axis(titleColor="grey")),
    tooltip=['Key', 'Date', 'DeltaChange'],
).mark_bar(
    color='lightgrey',
    width=10,
    point=True,
    tooltip=True,
).properties(
    width=600,
    title="Day over Day Change",
)
#chart4.add_selection(state_selection)

In [0]:
#hide
# Line: percentage day-over-day change in cases for the selected state.
chart4 = alt.Chart(dff_s2_change).transform_filter(
    state_selection
).encode(
    x=alt.X('Date:T', axis=alt.Axis(format="%a %m/%d")),
    y=alt.Y('PctChg:Q', axis=alt.Axis(format="%")),
    color=alt.Color(
        'Key:N',
        scale=alt.Scale(domain=color_domain, range=color_range),
        legend=None,
    ),
    tooltip=['Key', 'Date', 'PctChg'],
).mark_line(
    point=True,
    tooltip=True,
).properties(
    width=600,
    title="Day over Day Change",
)
#chart3.add_selectioN(state_selection)

In [0]:
#hide
# Rows strictly after the day of the first death. Country rows carry 0 in
# this column (filled when dff3 was built) and are therefore excluded.
dff4 = dff3.loc[dff3['Days since first death'] > 0]
# States always drawn for comparison in the deaths chart.
baseline_states = ['New York', 'California']

# Reference curve: deaths growing 33% per day from 1, capped at 70 days.
max_day = min(70, dff3['Days since first death'].max())
ref = pd.DataFrame(
    [[day, 1.33 ** day] for day in range(max_day + 1)],
    columns=['Days since first death', 'Deaths'],
)
base_ref = alt.Chart(ref).encode(x='Days since first death:Q', y='Deaths:Q')
# Last point of the reference curve only; used to anchor its text label.
base_ref_f = base_ref.transform_filter(alt.datum['Days since first death'] >= max_day)

# Shared encoding for the deaths chart (log scale, coloured per key, no legend).
base = alt.Chart(dff4).properties(width=600).encode(
    x='Days since first death:Q',
    y=alt.Y('Deaths:Q', scale=alt.Scale(type='log')),
    color=alt.Color(
        'Key:N',
        scale=alt.Scale(domain=color_domain, range=color_range),
        legend=None,
    ),
    tooltip=['Key', 'Date', 'Deaths', 'Days since first death'],
)

# base2: baseline states; base3: the state picked in the drop-down;
# base4: that state's latest-date rows (for the text labels).
base2 = base.transform_filter(alt.FieldOneOfPredicate('Key', baseline_states))
base3 = base.transform_filter(state_selection)
base4 = base3.transform_filter(date_filter)

In [0]:
#hide_input
# Layers, bottom to top: growth guide + its label, baseline states with their
# name and latest death count, the selected state's line, then its name and
# latest death count.
chart5 = alt.layer(
    base_ref.mark_line(color='black', opacity=.5, strokeDash=[3,3]),
    base_ref_f.mark_text(dy=-6, align='right', fontSize=10, text='33% Daily Growth'),
    base2.mark_line(point=True, tooltip=True),
    base2.transform_filter(date_filter).mark_text(dx=8, dy=-16, align='right').encode(text='Key:N'),
    base2.transform_filter(date_filter).mark_text(dy=-8, align='right').encode(text='Deaths:Q'),
    base3.mark_line(point={'size':30}, tooltip=True),
    base4.mark_text(dx=8, align='left', fontWeight='bold').encode(text='Key:N'),
    base4.mark_text(dx=8, dy=12, align='left', fontWeight='bold').encode(text='Deaths:Q'),
).properties(
    title=f"Deaths in State compared to {', '.join(baseline_states)}"
)
# chart5.add_selection(state_selection)

In [25]:
#hide_input
# Dashboard: case trajectory, day-over-day change (bars and percentage line
# on independent y scales), and deaths, all driven by the state drop-down.
growth_chart = alt.vconcat(
    chart2,
    alt.layer(chart3, chart4).resolve_scale(y='independent'),
    chart5
).add_selection(
    state_selection
).configure(
    # `configure()` replaces the whole config object, so it has to come
    # before `configure_view()`; in the opposite order the view setting
    # (stroke=None, i.e. no border around each view) would be discarded.
    padding={'left':10, 'bottom':20}
).configure_view(
    stroke=None
)

growth_chart

###### Visualizations by [Pratap Vardhan](https://twitter.com/PratapVardhan)[^1]

[^1]: Sources: ["COVID-19 Data Repository by Johns Hopkins CSSE"](https://systems.jhu.edu/research/public-health/ncov/) [GitHub](https://github.com/CSSEGISandData/COVID-19). [covidtracking.com](https://covidtracking.com/)