In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 
import sklearn

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'notebook' plotting context for figures drawn below.
sns.set_context('notebook')
# Metapack's Jupyter initialisation hook.
# NOTE(review): what init() configures is not visible from this notebook — confirm against metapack.
mp.jupyter.init()


In [2]:

# Open the metapack source package that the data references below are read from.
# NOTE(review): presumably resolved relative to this notebook's location — confirm.
pkg = mp.jupyter.open_source_package()
# Bare expression so the notebook displays the package.
pkg

In [3]:
# Load the package's 'confirmed_ts_source' reference as a DataFrame.
# This is the raw, wide input that is passed to process() further down.
conf_raw = pkg.reference('confirmed_ts_source').dataframe()

In [13]:
def _add_days_since_threshold(t, val_field_name, threshold):
    """Attach ``date_<threshold>`` and ``days_<threshold>`` columns to ``t``.

    ``date_<threshold>`` is, per location, the first date on which
    ``val_field_name`` is at least ``threshold``; ``days_<threshold>`` is the
    signed number of days between each row's date and that first date.

    The merge is an inner join, so locations that never reach ``threshold``
    are dropped from the returned frame.
    """
    date_col = f'date_{threshold}'
    first_date = (
        t.loc[t[val_field_name] >= threshold]
        .groupby('location')['date']
        .min()
        .rename(date_col)
        .reset_index()
    )
    t = t.merge(first_date, on='location')
    t[f'days_{threshold}'] = (t['date'] - t[date_col]).dt.days
    return t


def process(df, val_field_name):
    """Reshape a wide time-series frame into a tidy per-location, per-date frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame with 'Province/State', 'Country/Region', 'Lat' and 'Long'
        columns; every other column is treated as one date holding a count.
        The input frame is not modified.
    val_field_name : str
        Name for the value column in the result (e.g. 'confirmed'). It is also
        the prefix of the derived '<name>_m5d' and '<name>_log' columns.

    Returns
    -------
    pandas.DataFrame
        One row per (location, date) with columns: country, province,
        location, date, <name>, date_10, days_10, date_100, days_100, date_m5,
        <name>_m5d, rate_t5d, <name>_log.

    Notes
    -----
    * Only locations that reach at least 100 are kept, and only dates that have
      an observation exactly 5 days earlier (both joins are inner joins).
    * Missing counts are left out of the tidy frame; when summing provinces
      into a country total they are skipped.
    * NOTE(review): if the input contains both a blank-province row and
      provincial rows for the same country, two series will share the bare
      country name as ``location``. That case is not handled here.

    Raises
    ------
    AssertionError
        If ``rate_t5d`` does not reproduce the observed 5-day change.
    """
    # Fill only the label columns. Filling the whole frame with '' would turn
    # missing counts into strings and break the numeric steps below.
    t = (
        df.drop(columns=['Lat', 'Long'])
        .rename(columns={'Country/Region': 'country', 'Province/State': 'province'})
        .fillna({'country': '', 'province': ''})
    )

    # Sum the provinces of each country into a whole-country row (province '').
    # Only the date columns are summed, never the province labels.
    date_cols = [c for c in t.columns if c not in ('country', 'province')]
    countries = (
        t.loc[t['province'] != '']
        .groupby('country')[date_cols]
        .sum()
        .reset_index()
        .assign(province='')
    )
    t = pd.concat([t, countries], sort=True, ignore_index=True)

    # Location label: "country" or "country - province".
    has_province = t['province'] != ''
    t['location'] = t['country'] + (' - ' + t['province']).where(has_province, '')

    # Turn the date columns into rows. Naming the column axis first gives the
    # stacked level the name 'date' instead of an auto-generated 'level_3'.
    tidy = (
        t.set_index(['country', 'province', 'location'])
        .rename_axis(columns='date')
        .stack()
        .dropna()
        .rename(val_field_name)
        .reset_index()
    )
    tidy['date'] = pd.to_datetime(tidy['date'])

    # Days relative to the first date with at least 10, then at least 100.
    for threshold in (10, 100):
        tidy = _add_days_since_threshold(tidy, val_field_name, threshold)

    # The min in this dataset for China is 548, not 100, so shift it by 6 days
    # to get it to align with other countries. It's a guess, but it looks good.
    china_offset = np.where(tidy['country'] == 'China', 6, 0)
    tidy['days_10'] = tidy['days_10'] + china_offset
    tidy['days_100'] = tidy['days_100'] + china_offset

    # Link in the value of each location from 5 days earlier, to calculate
    # short-run rates. Rows with no observation 5 days earlier are dropped.
    lag_col = val_field_name + '_m5d'
    tidy['date_m5'] = tidy['date'] - pd.Timedelta(days=5)
    past = tidy[['date', 'location', val_field_name]].rename(
        columns={'date': 'date_m5', val_field_name: lag_col}
    )
    tidy = tidy.merge(past, on=['date_m5', 'location'])

    # Compound daily rate over the 5-day window. Rows where the value did not
    # change, or where either value is 0, get NaN rather than a rate.
    current, previous = tidy[val_field_name], tidy[lag_col]
    usable = (current != previous) & (current != 0) & (previous != 0)
    tidy['rate_t5d'] = np.exp((np.log(current[usable]) - np.log(previous[usable])) / 5) - 1

    # Check that the rate calculation is correct. The check is relative and
    # per row, so it does not depend on row count or magnitude. NaN rows
    # compare False and are skipped.
    rebuilt = previous * np.power(1 + tidy['rate_t5d'], 5)
    rel_err = (rebuilt - current).abs() / current.abs()
    assert not (rel_err > 1e-9).any(), 'rate_t5d does not reproduce the 5-day change'

    # The helper columns date_m5 and <name>_m5d are kept in the result.
    tidy[val_field_name + '_log'] = np.log(tidy[val_field_name] + 1)

    return tidy
    
# Build the tidy confirmed-case table from the raw confirmed series.
confirmed_df = process(conf_raw, 'confirmed')

# Preview: the 20 Italy rows with the largest days_100 values.
(
    confirmed_df
    .loc[confirmed_df['country'] == 'Italy']
    .sort_values('days_100')
    .tail(20)
)


Unnamed: 0,country,province,location,date,confirmed,date_10,days_10,date_100,days_100,date_m5,confirmed_m5d,rate_t5d,confirmed_log
736,Italy,,Italy,2020-03-01,1694,2020-02-21,9,2020-02-23,7,2020-02-25,322,0.393835,7.435438
747,Italy,,Italy,2020-03-02,2036,2020-02-21,10,2020-02-23,8,2020-02-26,453,0.350629,7.619233
749,Italy,,Italy,2020-03-03,2502,2020-02-21,11,2020-02-23,9,2020-02-27,655,0.307402,7.825245
750,Italy,,Italy,2020-03-04,3089,2020-02-21,12,2020-02-23,10,2020-02-28,888,0.283161,8.035926
751,Italy,,Italy,2020-03-05,3858,2020-02-21,13,2020-02-23,11,2020-02-29,1128,0.278824,8.258163
752,Italy,,Italy,2020-03-06,4636,2020-02-21,14,2020-02-23,12,2020-03-01,1694,0.223055,8.441823
753,Italy,,Italy,2020-03-07,5883,2020-02-21,15,2020-02-23,13,2020-03-02,2036,0.236415,8.679992
754,Italy,,Italy,2020-03-08,7375,2020-02-21,16,2020-02-23,14,2020-03-03,2502,0.241352,8.905987
755,Italy,,Italy,2020-03-09,9172,2020-02-21,17,2020-02-23,15,2020-03-04,3089,0.243166,9.12402
737,Italy,,Italy,2020-03-10,10149,2020-02-21,18,2020-02-23,16,2020-03-05,3858,0.213423,9.225229


In [14]:
# Compound-growth projection: 10149 (Italy's confirmed count for 2020-03-10 in the
# table above) grown at a rate of 0.195143 per step for 5 steps (presumably days).
# NOTE(review): 0.195143 does not match the rate_t5d of 0.213423 shown for that row —
# presumably left over from an earlier run; confirm which rate is intended.
10149*(1+0.195143)**5

24747.00312803565

In [7]:
# Load the package's 'death_ts_source' reference and run it through process(),
# naming the value column 'death'.
# process() keeps only locations whose series reaches at least 100.
death_raw = pkg.reference('death_ts_source').dataframe()
death_df = process(death_raw, 'death')


In [8]:
# Load the package's 'recov_ts_source' reference and run it through process(),
# naming the value column 'recovered'.
# process() keeps only locations whose series reaches at least 100.
recov_raw = pkg.reference('recov_ts_source').dataframe()
recov_df = process(recov_raw, 'recovered')