In [2]:
# Clone the CSSEGISandData/COVID-19 data repository and install arviz quietly.
# NOTE(review): the clone fails if the COVID-19 directory already exists
# (see the recorded output of this cell); the existing checkout is then used as-is.
!git clone https://github.com/CSSEGISandData/COVID-19.git
!pip3 install -q arviz

fatal: destination path 'COVID-19' already exists and is not an empty directory.


In [3]:
import numpy as np
import pandas as pd
import datetime
import arviz
import tqdm.notebook as tqdm

In [None]:
# Load a single daily report (file 04-01-2020.csv) from the cloned repository.
df = pd.read_csv("COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/04-01-2020.csv")

In [None]:
def csv_filename(date):
    """Return the path of the daily-report CSV for ``date``.

    The file is named after the date formatted as ``MM-DD-YYYY`` and lives in
    the daily-reports directory of the cloned COVID-19 repository.
    """
    stem = date.strftime("%m-%d-%Y")
    return "COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/" + stem + ".csv"

def get_dataframe(date):
    """Load the daily report for ``date`` and normalise it for stacking.

    Steps: read the CSV located by ``csv_filename``, prepend a ``Date`` column
    holding the date as a ``YYYY-MM-DD`` string, replace ``/`` with ``_`` in
    the column names, and replace "Mainland China" with "China" in the
    ``Country_Region`` column.
    """
    report = pd.read_csv(csv_filename(date))
    report.insert(0, "Date", date.strftime("%Y-%m-%d"))
    report.columns = report.columns.str.replace("/", "_")
    report.Country_Region = report.Country_Region.str.replace("Mainland China", "China")
    return report

In [None]:
# Build the list of report dates, from 2020-01-23 up to two days before today.
# NOTE(review): the name `yesterday` does not match the two-day offset —
# presumably this allows for upstream publishing lag; confirm.
yesterday = datetime.datetime.today().date() - datetime.timedelta(days=2)
date_range = pd.date_range(datetime.date(2020, 1, 23), yesterday)
# One normalised DataFrame per daily report.
data_frames = [get_dataframe(date) for date in date_range]

# Stack all daily reports and replace the per-file indices with a fresh one.
df = pd.concat(data_frames).reset_index(drop = True)

In [9]:
def get_dataframe_india(url):
    """Read one raw-data CSV, keeping only three columns.

    Prints the location being read, then returns a DataFrame restricted to
    the 'Patient Number', 'Date Announced' and 'Detected State' columns.
    """
    wanted_columns = ['Patient Number', 'Date Announced', 'Detected State']
    print("Reading: "+url)
    return pd.read_csv(url, usecols=wanted_columns)

# range(13, 14) yields only 13, so a single file (raw_data13.csv) is fetched.
urls = ["https://api.covid19india.org/csv/latest/raw_data"+str(i)+".csv" for i in range(13,14)]
data_frames = [get_dataframe_india(url) for url in urls]

# Stack the downloaded frames, parse the day/month/year date strings into
# datetimes, and save the result locally (without the row index).
df = pd.concat(data_frames).reset_index(drop = True)
df['Date Announced'] = pd.to_datetime(df['Date Announced'], format="%d/%m/%Y")
df.to_csv("today_data.csv",index=False)

Reading: https://api.covid19india.org/csv/latest/raw_data13.csv


In [11]:
df

Unnamed: 0,Date Announced,Detected State,Patient Number
0,2020-08-07,Mizoram,
1,2020-08-07,Telangana,
2,2020-08-07,Telangana,
3,2020-08-07,Telangana,
4,2020-08-07,Telangana,
...,...,...,...
19222,2020-08-18,,
19223,2020-08-18,,
19224,2020-08-18,,
19225,2020-08-18,,


In [None]:
# Parser for day/month/year strings. It is not used below: today_data.csv is
# written from an already-parsed datetime column, so the default date parsing
# of read_csv is sufficient.
dateparse = lambda x: datetime.datetime.strptime(x, '%d/%m/%Y')

# Reload the saved data as a Series indexed by (state, date).
# - parse_dates makes the 'Date Announced' index level datetime-typed; the
#   plotting code later adds pd.Timedelta to these index values and passes
#   them to date2num, which does not work on plain strings.
# - .squeeze("columns") replaces the `squeeze=True` keyword of read_csv,
#   which was deprecated in pandas 1.4 and removed in pandas 2.0.
states = pd.read_csv("today_data.csv",
                     parse_dates=['Date Announced'],
                     index_col=['Detected State', 'Date Announced']
                     ).squeeze("columns").sort_index()

In [None]:
# For index labels that occur more than once, keep only the last occurrence.
states = states[~states.index.duplicated(keep='last')]

In [13]:
# Load the US per-state daily series of the 'positive' column, indexed by
# (state, date) and sorted.
# NOTE(review): this rebinds `states`, replacing the series loaded from
# today_data.csv; cells that look up 'Detected State' / 'Date Announced'
# will not work on this version — confirm the intended execution order.
url_us = 'https://covidtracking.com/api/v1/states/daily.csv'
# .squeeze("columns") replaces the `squeeze=True` keyword of read_csv, which
# was deprecated in pandas 1.4 and removed in pandas 2.0.
states = pd.read_csv(url_us,
                     usecols=['date', 'state', 'positive'],
                     parse_dates=['date'],
                     index_col=['state', 'date']
                     ).squeeze("columns").sort_index()

In [18]:
states

state  date      
AK     2020-03-06       0.0
       2020-03-07       0.0
       2020-03-08       0.0
       2020-03-09       0.0
       2020-03-10       0.0
                      ...  
WY     2020-08-13    3119.0
       2020-08-14    3183.0
       2020-08-15    3227.0
       2020-08-16    3286.0
       2020-08-17    3331.0
Name: positive, Length: 9281, dtype: float64

In [None]:
# Plot the series for one state.
# NOTE(review): needs `states` to contain a 'West Bengal' key in its first
# index level, i.e. the version loaded from today_data.csv — confirm that the
# US-data cell has not replaced it.
states.xs('West Bengal').plot()

## By Sam

In [14]:
def Rt(states: pd.DataFrame, 
       state: str, 
       γ = 0.2, 
       Δt = 1):
    """Point estimate of R_t from the day-over-day growth of new cases.

    Parameters
    ----------
    states : container indexed by state; ``states[state]`` must yield the
        cumulative case series for that state.
    state : key selecting the series.
    γ : rate constant in the growth relation (default 0.2).
    Δt : time step between observations (default 1).

    Returns
    -------
    Series of ``1 + ln(k_t / k_{t-1}) / (γ Δt)`` where ``k_t`` are the daily
    differences of the cumulative counts above 100. The first two entries are
    NaN (no previous difference available). Days with zero or negative new
    cases give infinite or NaN values.

    The ratio is current/previous so that growing case counts give R_t > 1,
    matching the model ``k_t = k_{t-1} * exp(γ (R_t - 1))`` that
    ``get_posteriors`` in this notebook uses. The previous form took the log
    of previous/current, which reported R_t < 1 during growth.
    """
    total_cases = states[state]
    # Only keep observations once the cumulative count exceeds 100.
    good_data = total_cases.where(lambda x: x > 100).dropna()
    new_cases = good_data.diff()
    growth = new_cases / new_cases.shift(1)
    return 1 + 1 / (γ * Δt) * np.log(growth)

In [16]:
import pymc3 as pm
import theano.tensor as tt
from types import SimpleNamespace



In [17]:
def run_model(state):
    """Fit a random-walk growth model to one state's series with PyMC3.

    Reads the module-level ``states`` series (``states.loc[state]``), builds
    the model below and samples it.

    Returns a ``SimpleNamespace`` with:
      * ``state`` – the key that was passed in,
      * ``index`` – the index of the series without its first entry,
      * ``trace`` – the result of ``pm.sample``.

    NOTE(review): no random seed is passed to ``pm.sample``, so repeated runs
    are not expected to give identical draws.
    """
    
    with pm.Model() as model:
        
        result = SimpleNamespace()
        
        cases = states.loc[state]
        result.state = state
        # The first observation is dropped from both the index and the data.
        result.index = cases.index[1:]

        # Random walk magnitude
        step_size = pm.HalfNormal('step_size', sigma = 0.03)

        # Theta random walk
        # One initial value plus len(cases)-2 scaled standard-normal steps,
        # i.e. len(cases)-1 entries in total, accumulated with cumsum.
        theta_raw_init = pm.Normal('theta_raw_init', 0.1, 0.1)
        theta_raw_steps = pm.Normal('theta_raw_steps', shape=len(cases)-2) * step_size
        theta_raw = tt.concatenate([[theta_raw_init], theta_raw_steps])
        theta = pm.Deterministic('theta', theta_raw.cumsum())

        # gamma is the reciprocal of a Gamma(alpha=6, beta=1.5) variable;
        # r_t is derived from theta as theta / gamma + 1.
        inv_gamma = pm.Gamma('inv_gamma', alpha=6, beta=1.5)
        gamma = 1.0 / inv_gamma
        r_t = pm.Deterministic('r_t', theta/gamma + 1)

        # NOTE(review): the expected count is exp(cumsum(theta)) with no
        # explicit initial-count scale, and the observations are the series
        # values themselves (not their differences) — confirm this matches
        # what `states` holds.
        expected_today = pm.Deterministic('expected_today', pm.math.exp(theta.cumsum()))

        mu = pm.math.maximum(0.1, expected_today) # make sure cases stay positive
        observed = cases.round().values[1:]
        # From here on the local name `cases` refers to the observed Poisson
        # variable rather than the input series.
        cases = pm.Poisson('cases', mu=mu, observed=observed)

        # Single chain: 3000 tuning steps followed by 1000 kept draws.
        result.trace = pm.sample(
            chains = 1,
            tune = 3000,
            draws = 1000,
            target_accept = 0.95
        )
    
    return result

In [None]:
from matplotlib import dates as mdates
import arviz as az

def df_from_result(result):
    """Summarise the sampled ``r_t`` values of a model result as a DataFrame.

    ``result`` must expose ``trace`` (indexable by ``'r_t'``), ``state`` and
    ``index``. The returned frame is indexed by (region, date) and has the
    columns ``mean``, ``lower_90``, ``upper_90``, ``lower_50`` and
    ``upper_50``: the mean over the first axis of the samples and the 90% /
    50% highest-density intervals computed by ``az.hdi``.
    """
    samples = result.trace['r_t']
    column_names = ['mean', 'lower_90', 'upper_90', 'lower_50','upper_50']

    # Column-wise stack: mean, then the two bounds of each interval.
    summary = np.c_[
        np.mean(samples, axis = 0),
        az.hdi(samples, hdi_prob = 0.9),
        az.hdi(samples, hdi_prob = 0.5),
    ]

    region_dates = pd.MultiIndex.from_product(
        [[result.state], result.index],
        names=['region', 'date'])

    return pd.DataFrame(data=summary, index=region_dates, columns=column_names)

def plot_rt(df, ax=None, c=(.3,.3,.3,1), ci=(0,0,0,.05)):
    """Plot the mean R_t and its 90% band for a single region.

    Parameters
    ----------
    df : frame indexed by (region, date) with at least the columns ``mean``,
        ``lower_90`` and ``upper_90`` (the layout built by ``df_from_result``).
        The first value of index level 0 is used as the title.
    ax : axes to draw on; a new figure and axes are created when omitted.
    c : colour of the mean line.
    ci : fill colour of the 90% band.

    Returns nothing; the plot is drawn on ``ax``.

    Note: a later cell in this notebook defines another ``plot_rt`` with a
    different signature, which replaces this one once that cell has run.
    """
    if ax is None:
        # pyplot is only imported by a later cell of the notebook, so import
        # it here to keep this function usable on a fresh top-to-bottom run.
        from matplotlib import pyplot as plt
        _, ax = plt.subplots()
    ax.set_ylim(0.5, 1.6)
    ax.set_title(df.index.get_level_values(0)[0])
    # Drop the region level so the x axis is the date level only.
    df = df.droplevel(0)
    ax.plot(df['mean'],
            marker='o',
            markersize=4,
            markerfacecolor='w',
            lw=1,
            c=c,
            markevery=2)
    ax.fill_between(
        df.index,
        df['lower_90'].values,
        df['upper_90'].values,
        color=ci,
        lw=0)
    # Reference line at R_t = 1.
    ax.axhline(1.0, linestyle=':', lw=1)
    
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d'))
    ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))

In [None]:
# Fit the model for one state and plot the summarised R_t estimate.
# NOTE(review): 'West Bengal' must be a key of the current `states` series.
result = run_model('West Bengal')
plot_rt(df_from_result(result))

## Kevin Systrom

In [None]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from matplotlib.dates import date2num, num2date
from matplotlib import dates as mdates
from matplotlib import ticker
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch

from scipy import stats as sps
from scipy.interpolate import interp1d

from IPython.display import clear_output

%config InlineBackend.figure_format = 'retina'

In [None]:
# state_name = 'West Bengal'

# def prepare_cases(cases, cutoff=25):
#     new_cases = cases.diff()

#     smoothed = new_cases.rolling(7,
#         win_type='gaussian',
#         min_periods=1,
#         center=True).mean(std=2).round()
    
#     zeros = smoothed.index[smoothed.eq(0)]
#     if len(zeros) == 0:
#         idx_start = 0
#     else:
#         last_zero = zeros.max()
#         idx_start = smoothed.index.get_loc(last_zero) + 1
#     smoothed = smoothed.iloc[idx_start:]
#     original = new_cases.loc[smoothed.index]
    
#     return original, smoothed

# cases = states.xs(state_name).rename(f"{state_name} cases")
# original, smoothed = prepare_cases(cases)

# original.plot(title=f"{state_name} New Cases per Day",
#                c='k',
#                linestyle=':',
#                alpha=.5,
#                label='Actual',
#                legend=True,
#              figsize=(500/72, 300/72))

# ax = smoothed.plot(label='Smoothed',
#                    legend=True)

# ax.get_figure().set_facecolor('w')

In [None]:
state_name = 'Maharashtra'

def prepare_cases(cases):
    """Turn a cumulative series into daily and smoothed daily counts.

    The series is differenced, smoothed with a centred 7-point Gaussian
    window (std=2, at least one observation per window) and rounded. The
    smoothed series is then cut so that it starts right after the position of
    its largest index label whose smoothed value is zero (or at the beginning
    when there is no zero).

    Returns ``(original, smoothed)``: the unsmoothed daily differences
    restricted to the retained labels, and the retained smoothed values.
    """
    daily = cases.diff()

    windowed = daily.rolling(7,
                             win_type='gaussian',
                             min_periods=1,
                             center=True)
    rounded = windowed.mean(std=2).round()

    zero_labels = rounded.index[rounded.eq(0)]
    start = rounded.index.get_loc(zero_labels.max()) + 1 if len(zero_labels) != 0 else 0

    kept = rounded.iloc[start:]
    return daily.loc[kept.index], kept

# Select the series for `state_name` and label it for plotting.
cases = states.xs(state_name).rename(f"{state_name} cases")

original, smoothed = prepare_cases(cases)

# Dotted line: unsmoothed daily differences.
original.plot(title=f"{state_name} New Cases per Day",
               c='k',
               linestyle=':',
               alpha=.5,
               label='Actual',
               legend=True,
             figsize=(600/72, 400/72))

# Solid line: smoothed series, drawn on the same axes.
ax = smoothed.plot(label='Smoothed',
                   legend=True)
ax.get_figure().set_facecolor('w')

In [None]:
# Rate used by get_posteriors in exp(GAMMA * (R_t - 1)).
# NOTE(review): presumably the reciprocal of the serial interval in days — confirm.
GAMMA = 1/7
# Candidate R_t values: 0 to R_T_MAX inclusive in steps of 0.01.
R_T_MAX = 12
r_t_range = np.linspace(0, R_T_MAX, R_T_MAX*100+1)
# def get_posteriors(sr, sigma=0.15):

#     # (1) Calculate Lambda
#     lam = sr[:-1].values * np.exp(GAMMA * (r_t_range[:, None] - 1))

    
#     # (2) Calculate each day's likelihood
#     likelihoods = pd.DataFrame(
#         data = sps.poisson.pmf(sr[1:].values, lam),
#         index = r_t_range,
#         columns = sr.index[1:])
    
#     # (3) Create the Gaussian Matrix
#     process_matrix = sps.norm(loc=r_t_range,
#                               scale=sigma
#                              ).pdf(r_t_range[:, None]) 

#     # (3a) Normalize all rows to sum to 1
#     process_matrix /= process_matrix.sum(axis=0)
    
#     # (4) Calculate the initial prior
#     #prior0 = sps.gamma(a=4).pdf(r_t_range)
#     prior0 = np.ones_like(r_t_range)/len(r_t_range)
#     prior0 /= prior0.sum()

#     # Create a DataFrame that will hold our posteriors for each day
#     # Insert our prior as the first posterior.
#     posteriors = pd.DataFrame(
#         index=r_t_range,
#         columns=sr.index,
#         data={sr.index[0]: prior0}
#     )
    
#     # We said we'd keep track of the sum of the log of the probability
#     # of the data for maximum likelihood calculation.
#     log_likelihood = 0.0

#     # (5) Iteratively apply Bayes' rule
#     for previous_day, current_day in zip(sr.index[:-1], sr.index[1:]):

#         #(5a) Calculate the new prior
#         current_prior = process_matrix @ posteriors[previous_day]
        
#         #(5b) Calculate the numerator of Bayes' Rule: P(k|R_t)P(R_t)
#         numerator = likelihoods[current_day] * current_prior
        
#         #(5c) Calcluate the denominator of Bayes' Rule P(k)
#         denominator = np.sum(numerator)
        
#         # Execute full Bayes' Rule
#         posteriors[current_day] = numerator/denominator
        
#         # Add to the running sum of log likelihoods
#         log_likelihood += np.log(denominator)
    
#     return posteriors, log_likelihood

# # Note that we're fixing sigma to a value just for the example
# posteriors, log_likelihood = get_posteriors(smoothed, sigma=.25)

In [None]:
def get_posteriors(sr, window=7, min_periods=1):
    """Compute a posterior over ``r_t_range`` for every entry of ``sr``.

    Parameters
    ----------
    sr : Series of (smoothed) daily counts.
    window : number of consecutive days whose log-likelihoods are summed.
    min_periods : minimum number of days required in a window.

    Returns
    -------
    DataFrame indexed by ``r_t_range`` with one column per label of ``sr``;
    each column is normalised to sum to 1.

    Uses the module-level ``GAMMA`` and ``r_t_range``.
    """
    # Expected count of each day given the previous day's count, for every
    # candidate R_t (rows) — shape (len(r_t_range), len(sr) - 1).
    lam = sr.iloc[:-1].values * np.exp(GAMMA * (r_t_range[:, None] - 1))

    # Note: if you want to have a Uniform prior you can use the following line instead.
    # I chose the gamma distribution because of our prior knowledge of the likely value
    # of R_t.
    
    # prior0 = np.full(len(r_t_range), np.log(1/len(r_t_range)))
    prior0 = np.log(sps.gamma(a=3).pdf(r_t_range) + 1e-14)

    likelihoods = pd.DataFrame(
        # Short-hand way of concatenating the prior and likelihoods
        data = np.c_[prior0, sps.poisson.logpmf(sr.iloc[1:].values, lam)],
        index = r_t_range,
        columns = sr.index)

    # Rolling sum of log likelihoods across days, which is the equivalent of
    # multiplying the original distributions. The frame is transposed so the
    # window runs over rows; the `axis=1` argument of rolling() is deprecated
    # in recent pandas releases.
    log_posteriors = likelihoods.T.rolling(window,
                                           min_periods=min_periods).sum().T

    # Shift every day so that its largest log value is 0 before leaving log
    # space. The shift cancels in the normalisation below but keeps exp() from
    # underflowing to 0 for every R_t, which would produce NaN columns.
    posteriors = np.exp(log_posteriors - log_posteriors.max(axis=0))

    # Normalize to 1.0
    posteriors = posteriors.div(posteriors.sum(axis=0), axis=1)
    
    return posteriors

# Posteriors for the smoothed series prepared above, with the default window.
posteriors = get_posteriors(smoothed)

In [None]:
# Overlay every day's posterior (one translucent line per column).
ax = posteriors.plot(title=f'{state_name} - Daily Posterior for $R_t$',
           legend=False, 
           lw=1,
           c='k',
           alpha=.3,
           xlim=(0.4,6))

# Trailing semicolon suppresses the text repr of the returned label object.
ax.set_xlabel('$R_t$');

In [None]:
def highest_density_interval(pmf, p=.9, debug=False):
    """Find the narrowest index range whose probability mass exceeds ``p``.

    For a Series, the cumulative sum is compared pairwise: a pair (low, high)
    qualifies when ``cumsum[high] - cumsum[low] > p``. The qualifying pair
    with the smallest position gap wins (the first such pair on ties) and its
    index labels are returned as a Series labelled ``Low_<pct>`` /
    ``High_<pct>``.

    For a DataFrame the function is applied to every column and the results
    are stacked into a frame indexed by the column labels.

    ``debug`` is accepted but not used. Raises ``ValueError`` when no pair
    qualifies.
    """
    if isinstance(pmf, pd.DataFrame):
        per_column = []
        for col in pmf:
            per_column.append(highest_density_interval(pmf[col], p=p))
        return pd.DataFrame(per_column, index=pmf.columns)

    running = np.cumsum(pmf.values)

    # mass[i, j] == running[j] - running[i] for every (low, high) pair
    mass = running[None, :] - running[:, None]

    starts, stops = np.nonzero(mass > p)

    # Narrowest qualifying range
    narrowest = np.argmin(stops - starts)
    bounds = [pmf.index[starts[narrowest]], pmf.index[stops[narrowest]]]

    return pd.Series(bounds,
                     index=[f'Low_{p*100:.0f}',
                            f'High_{p*100:.0f}'])

def highest_density_interval_BF(pmf, p=.95):
    """Brute-force search for the narrowest range holding more than ``p`` mass.

    For a Series, every start position is scanned forward until the
    cumulative mass gained since that start exceeds ``p``; the narrowest such
    range is kept (the first one found on ties, matching
    ``highest_density_interval``). Returns a Series with the ``Low`` and
    ``High`` index labels of that range.

    For a DataFrame the function is applied to every column with the same
    ``p`` and the results are stacked into a frame indexed by column label.

    Raises
    ------
    ValueError
        If no range of ``pmf`` holds more than ``p`` probability mass.
    """
    # If we pass a DataFrame, just call this recursively on the columns.
    # `p` is forwarded explicitly; otherwise the columns would silently be
    # evaluated with the default of .95.
    if(isinstance(pmf, pd.DataFrame)):
        return pd.DataFrame([highest_density_interval_BF(pmf[col], p=p) for col in pmf],
                            index=pmf.columns)
    
    cumsum = np.cumsum(pmf.values)
    best = None
    for i, value in enumerate(cumsum):
        # `width` is the distance in positions between low and high.
        for width, high_value in enumerate(cumsum[i+1:], start=1):
            if high_value - value > p:
                if best is None or width < best[1] - best[0]:
                    best = (i, i + width)
                # Larger widths from this start can never be narrower.
                break

    if best is None:
        raise ValueError(f"no interval of pmf contains more than {p} probability mass")
            
    low = pmf.index[best[0]]
    high = pmf.index[best[1]]
    return pd.Series([low, high], index=['Low', 'High'])

In [None]:
# Note that this takes a while to execute - it's not the most efficient algorithm
# (it builds a full pairwise matrix over r_t_range for every column).
hdis = highest_density_interval(posteriors, p=.9)

# Most likely R_t per day: the r_t_range value where each posterior peaks.
most_likely = posteriors.idxmax().rename('ML')

# NOTE(review): an earlier comment here asked about a "shift -1", but no shift
# is applied in this cell; the two results are simply joined side by side.
result = pd.concat([most_likely, hdis], axis=1)

result.tail()

In [None]:
def plot_rt(result, ax, state_name):
    """Draw the most-likely R_t series and its 90% band on ``ax``.

    Parameters
    ----------
    result : frame with the columns ``ML``, ``Low_90`` and ``High_90`` whose
        index has a level named 'Date Announced' holding datetimes.
    ax : matplotlib axes to draw on.
    state_name : text used as the axes title.

    The x axis always starts at 2020-03-01 and ends one day after the last
    date in ``result``. The y axis is fixed to [0, 5]. The background of the
    figure that owns ``ax`` is set to white.

    Note: this definition replaces the earlier ``plot_rt(df, ax=None, ...)``
    defined higher up in the notebook.
    """
    
    ax.set_title(f"{state_name}")
    
    # Colors
    ABOVE = [1,0,0]
    MIDDLE = [1,1,1]
    BELOW = [0,0,0]
    cmap = ListedColormap(np.r_[
        np.linspace(BELOW,MIDDLE,25),
        np.linspace(MIDDLE,ABOVE,25)
    ])
    # Map R_t in [0.5, 1.5] onto the [0, 1] range of the colormap.
    color_mapped = lambda y: np.clip(y, .5, 1.5)-.5
    
    index = result['ML'].index.get_level_values('Date Announced')
    values = result['ML'].values
    
    # Plot dots and line
    ax.plot(index, values, c='k', zorder=1, alpha=.25)
    ax.scatter(index,
               values,
               s=40,
               lw=.5,
               c=cmap(color_mapped(values)),
               edgecolors='k', zorder=2)
    
    # Aesthetically, extrapolate credible interval by 1 day either side
    lowfn = interp1d(date2num(index),
                     result['Low_90'].values,
                     bounds_error=False,
                     fill_value='extrapolate')
    
    highfn = interp1d(date2num(index),
                      result['High_90'].values,
                      bounds_error=False,
                      fill_value='extrapolate')
    
    extended = pd.date_range(start=pd.Timestamp('2020-03-01'),
                             end=index[-1]+pd.Timedelta(days=1))
    
    ax.fill_between(extended,
                    lowfn(date2num(extended)),
                    highfn(date2num(extended)),
                    color='k',
                    alpha=.1,
                    lw=0,
                    zorder=3)

    ax.axhline(1.0, c='k', lw=1, label='$R_t=1.0$', alpha=.25);
    
    # Formatting
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
    ax.xaxis.set_minor_locator(mdates.DayLocator())
    
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_formatter(ticker.StrMethodFormatter("{x:.1f}"))
    ax.yaxis.tick_right()
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.margins(0)
    ax.grid(which='major', axis='y', c='k', alpha=.1, zorder=-2)
    ax.set_ylim(0.0, 5.0)
    ax.set_xlim(pd.Timestamp('2020-03-1'), index[-1]+pd.Timedelta(days=1))
    # Use the figure that owns `ax` rather than whatever global `fig` happens
    # to exist when the function is called.
    ax.get_figure().set_facecolor('w')

    
# Plot the single-state result computed above.
fig, ax = plt.subplots(figsize=(600/72,400/72))

plot_rt(result, ax, state_name)
# Override the title and the month-based tick settings applied inside plot_rt
# with a more detailed title and weekly ticks.
ax.set_title(f'Real-time $R_t$ for {state_name}')
ax.xaxis.set_major_locator(mdates.WeekdayLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))

In [None]:
states

In [None]:
# Per-state results: state name -> frame with the ML / Low_90 / High_90 columns.
results = {}
# highest_density_interval seems to fail for China & France, haven't figured out why
# also filter out countries where smoothed data has less than 10 rows
FILTERED_STATES = [np.nan]

states_to_process = states.loc[~states.index.get_level_values('Detected State').isin(FILTERED_STATES)]
failed_states = []

for state_name, cases in states_to_process.groupby(level='Detected State'):
    clear_output(wait=True)
    print(f'Processing {state_name}')
    new, smoothed = prepare_cases(cases)
    if (len(smoothed) < 10):
        print(f"Skipping {state_name}, too few cases from smoothing algorithm")
        failed_states.append(state_name)
        continue

    print('\tGetting Posteriors')
    try:
        posteriors = get_posteriors(smoothed)
    except Exception:
        # Show the offending input and skip this state. Without the skip the
        # code below would reuse the previous state's `posteriors` and store
        # them under this state's name.
        display(cases)
        print(f"Error with {state_name}")
        FILTERED_STATES.append(state_name)
        continue

    print('\tGetting HDIs')
    try:
        hdis = highest_density_interval(posteriors)
    except Exception:
        # Narrowed from a bare `except` so KeyboardInterrupt still stops the loop.
        print(f"Error with {state_name}")
        FILTERED_STATES.append(state_name)
        continue

    print('\tGetting most likely values')
    most_likely = posteriors.idxmax().rename('ML')
    result = pd.concat([most_likely, hdis], axis=1)
    # Drop the state level; the state is already the dictionary key.
    results[state_name] = result.droplevel(0)
    
clear_output(wait=True)
print(f"Countries skipped due to low smoothed counts: {failed_states}")
print(f"Countries skipped due to HDI errors: {FILTERED_STATES}")
print('Done.')

In [None]:
# Grid of small plots: four per row, enough rows to hold every result.
ncols = 4
nrows = int(np.ceil(len(results) / ncols))

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, nrows*3))


# Results with fewer than two rows are not plotted; they are added to
# FILTERED_STATES instead, and their grid position is left empty.
for i, (state_name, result) in enumerate(results.items()):
    if (len(result) >= 2):
      plot_rt(result, axes.flat[i], state_name)
    else:
      FILTERED_STATES.append(state_name)

fig.tight_layout()
fig.set_facecolor('w')



In [None]:
# Combine the per-state results into one frame indexed by (state, date).
# The frames are collected first and concatenated once, instead of calling
# pd.concat inside the loop, which re-copies the accumulated data on every
# iteration.
frames = []

for state_name, result in results.items():
    r = result.copy()
    r.index = pd.MultiIndex.from_product([[state_name], result.index])
    frames.append(r)

# pd.concat raises ValueError if `results` is empty.
overall = pd.concat(frames).sort_index()

In [None]:
# RGB colours for the standings charts.
# FULL_COLOR and ERROR_BAR_COLOR are used by plot_standings.
# NOTE(review): NONE_COLOR and PARTIAL_COLOR are not referenced in the cells
# visible here — confirm whether they are still needed.
FULL_COLOR = [.7,.7,.7]
NONE_COLOR = [179/255,35/255,14/255]
PARTIAL_COLOR = [.5,.5,.5]
ERROR_BAR_COLOR = [.3,.3,.3]

In [None]:
# Drop the states recorded in FILTERED_STATES, then keep the last row of each
# remaining state (its final ML / High_90 / Low_90 values).
filtered = overall.index.get_level_values(0).isin(FILTERED_STATES)
mr = overall.loc[~filtered].groupby(level=0)[['ML', 'High_90', 'Low_90']].last()

def plot_standings(mr, figsize=None, title='Most Recent $R_t$ by County'):
    """Bar chart of the most-likely R_t per row of ``mr`` with 90% error bars.

    Parameters
    ----------
    mr : frame with the columns ``ML``, ``Low_90`` and ``High_90``; its index
        provides the bar positions and tick labels.
    figsize : figure size; when falsy, the width grows with ``len(mr)`` and
        the height is 2.5.
    title : axes title.

    Returns
    -------
    ``(fig, ax)`` of the newly created figure.
    """
    if not figsize:
        figsize = ((15.9/50)*len(mr)+.1,2.5)

    fig, ax = plt.subplots(figsize=figsize)
    ax.set_title(title)

    # Absolute distances from ML down to Low_90 and up to High_90, arranged
    # as two rows (lower, upper) as expected by `yerr`.
    whiskers = mr[['Low_90', 'High_90']].sub(mr['ML'], axis=0).abs().values.T

    ax.bar(mr.index,
           mr['ML'],
           width=.825,
           color=FULL_COLOR,
           ecolor=ERROR_BAR_COLOR,
           capsize=2,
           error_kw={'alpha':.5, 'lw':1},
           yerr=whiskers)

    ax.set_xticklabels(mr.index.to_series(), rotation=90, fontsize=11)
    ax.margins(0)
    ax.set_ylim(0,2.)
    # Reference line at R_t = 1.
    ax.axhline(1.0, linestyle=':', color='k', lw=1)

    fig.set_facecolor('w')
    return fig, ax

#mr.sort_values('ML', inplace=True)
#plot_standings(mr, title = 'Most Likely Recent $R_t$ by State');

In [None]:
# Order the states by the upper bound of their 90% interval and plot.
# Note: the sort mutates `mr` in place, so later cells see this ordering.
mr.sort_values('High_90', inplace=True)
plot_standings(mr, title = 'Most Likely (High) Recent $R_t$ by State');

In [None]:
# Order the states by the lower bound of their 90% interval and plot.
# Note: the sort mutates `mr` in place, so later cells see this ordering.
# NOTE(review): this title says "County" while the previous chart says "State" — confirm.
mr.sort_values('Low_90', inplace=True)
plot_standings(mr, title = 'Most Likely (Low) Recent $R_t$ by County');

In [None]:
# States whose 90% upper bound is at most 1.1, ordered by most-likely R_t.
show = mr[mr.High_90.le(1.1)].sort_values('ML')
fig, ax = plot_standings(show, title='Likely Under Control');

In [None]:
# States whose 90% lower bound is at least 1.05, ordered by that lower bound.
show = mr[mr.Low_90.ge(1.05)].sort_values('Low_90')
fig, ax = plot_standings(show, title='Likely Not Under Control');