In [40]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import os
import plotly.graph_objects as go

import numpy as np

%matplotlib inline
mpl.rcParams['figure.figsize'] = (16, 10)
pd.set_option('display.max_rows', 500)

# Load Data

In [41]:
df_analyse=pd.read_csv('../data/processed/COVID_small_flat_table.csv',sep=';',
                      parse_dates=[0])
df_analyse=df_analyse.drop(columns={'Unnamed: 0'},axis=1)
df_analyse.sort_values('date',ascending=True).tail()

Unnamed: 0,date,India,Germany,France,Spain,US
232,2020-09-10,4562414,258149,392243,554143,6396100
233,2020-09-11,4659984,259735,401890,566326,6443652
234,2020-09-12,4754356,260817,402811,566326,6485123
235,2020-09-13,4846427,261737,402893,566326,6519573
236,2020-09-14,4846427,263222,425870,593730,6553652


# Helper Function

In [42]:
def quick_plot(x_in, df_input,y_scale='log',slider=False):
    fig=go.Figure()
    
    for each in df_input.columns:
        fig.add_trace(go.Scatter(x=x_in,
                             y=df_input[each],
                             name=each,
                             opacity=0.8 
                            )
                  )

    fig.update_layout(autosize=True,
            width=1200,
            height=1000,
            font=dict(
                family="PT Sans, monospace",
                size=18,
                color="#7f7f7f"
                    )
                )
    fig.update_yaxes(type=y_scale),
    fig.update_xaxes(tickangle=-50,
                        nticks=20,
                        tickfont=dict(size=14,color="#7f7f7f")
                                  )
    if slider==True:
        fig.update_layout(xaxis_rangeslider_visible=True)
    fig.show()
                 

In [43]:
quick_plot(df_analyse.date,
           df_analyse.iloc[:,1:],
           y_scale='linear',
           slider=True)

In [44]:
threshold=100

In [45]:
compare_list=[]
for pos,country in enumerate(df_analyse.columns[1:]):
    compare_list.append(np.array(df_analyse[country][df_analyse[country]>threshold])
                       )

In [46]:
pd_sync_timelines=pd.DataFrame(compare_list,index=df_analyse.columns[1:]).T

In [47]:
pd_sync_timelines['date']=np.arange(pd_sync_timelines.shape[0])

In [48]:
pd_sync_timelines.head()

Unnamed: 0,India,Germany,France,Spain,US,date
0,102.0,130.0,130.0,120.0,103.0,0
1,113.0,159.0,191.0,165.0,172.0,1
2,119.0,196.0,204.0,222.0,215.0,2
3,142.0,262.0,288.0,259.0,337.0,3
4,156.0,482.0,380.0,400.0,450.0,4


In [49]:
pd_sync_timelines['date']=np.arange(pd_sync_timelines.shape[0])

In [50]:
quick_plot(pd_sync_timelines.date,
           pd_sync_timelines.iloc[:,:-1],
           y_scale='log',
           slider=True)

# Doubling Rate

In [51]:
def doubling_rate(N_0,t,T_d):
    return N_0*np.power(2,t/T_d)

In [52]:
max_days=34

norm_slopes={
    'doubling every day':doubling_rate(100,np.arange(20),1),
    'doubling every 2 days':doubling_rate(100,np.arange(20),2),
    'doubling every 4 days':doubling_rate(100,np.arange(20),4),
    'doubling every 10 days':doubling_rate(100,np.arange(20),10)
}

In [53]:
pd_sync_timelines_w_slope=pd.concat([pd.DataFrame(norm_slopes),pd_sync_timelines], axis=1)

In [54]:
quick_plot(pd_sync_timelines_w_slope.date,
           pd_sync_timelines_w_slope.iloc[:,:-1],
           y_scale='log',
           slider=True)

In [55]:
pd_sync_timelines_w_slope.to_csv('../data/processed/COVID_small_sync_timeline_table.csv',sep=';',index=False)

# Understanding Linear Regression

In [56]:
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=False)

In [57]:
l_vec=len(df_analyse['Germany'])
X=np.arange(l_vec-5).reshape(-1, 1)
y=np.log(np.array(df_analyse['Germany'][5:]))

In [58]:
reg.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [59]:
X_hat=np.arange(l_vec).reshape(-1, 1)
Y_hat=reg.predict(X_hat)

In [60]:
LR_inspect=df_analyse[['date','Germany']].copy()

In [61]:
LR_inspect['prediction']=np.exp(Y_hat)
LR_inspect

Unnamed: 0,date,Germany,prediction
0,2020-01-22,0,1.0
1,2020-01-23,0,1.079628
2,2020-01-24,0,1.165597
3,2020-01-25,0,1.258412
4,2020-01-26,0,1.358617
5,2020-01-27,1,1.466802
6,2020-01-28,4,1.583601
7,2020-01-29,4,1.709701
8,2020-01-30,4,1.845841
9,2020-01-31,5,1.992823


In [62]:
quick_plot(LR_inspect.date,
           LR_inspect.iloc[:,1:],
           y_scale='log',
           slider=True)

# Doubling Rate - Piecewise Linear Regression

In [63]:
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)



In [64]:
from scipy import signal

In [65]:
df_analyse=pd.read_csv('../data/processed/COVID_small_flat_table.csv',sep=';',
                       parse_dates=[0])  
df_analyse=df_analyse.drop(columns={'Unnamed: 0'},axis=1)
country_list=df_analyse.columns[1:]

In [66]:
for each in country_list:
    df_analyse[each+'_filter']=signal.savgol_filter(df_analyse[each],
                           5,
                           1)

In [67]:
filter_cols=['India_filter','Germany_filter', 'France_filter', 'Spain_filter','US_filter']


In [68]:
start_pos=5
quick_plot(df_analyse.date[start_pos:],
           df_analyse[filter_cols].iloc[start_pos:,:],
           y_scale='log',
           slider=True)

In [69]:
df_analyse.head()

Unnamed: 0,date,India,Germany,France,Spain,US,India_filter,Germany_filter,France_filter,Spain_filter,US_filter
0,2020-01-22,0,0,0,0,1,0.0,0.0,-0.2,0.0,0.4
1,2020-01-23,0,0,0,0,1,0.0,0.0,0.7,0.0,1.3
2,2020-01-24,0,0,2,0,2,0.0,0.0,1.6,0.0,2.2
3,2020-01-25,0,0,3,0,2,0.0,0.2,2.2,0.0,3.0
4,2020-01-26,0,0,3,0,5,0.0,1.0,3.0,0.0,3.8


In [70]:
def get_doubling_time_via_regression(in_array):
    
    
    Y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)
    
    assert len(in_array)==3
    reg.fit(X,Y)
    intercept=reg.intercept_
    slope=reg.coef_
    
    return intercept/slope

In [71]:
def doubling_time(in_array):
    
    y = np.array(in_array)
    return len(y)*np.log(2)/np.log(y[-1]/y[0])

In [72]:
days_back = 3
for pos,country in enumerate(country_list):
    df_analyse[country+'_DR']=df_analyse[country].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_doubling_time_via_regression, raw=False)

In [73]:
days_back = 3 
for pos,country in enumerate(filter_cols):
    df_analyse[country+'_DR']=df_analyse[country].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_doubling_time_via_regression, raw=False)

In [74]:
df_analyse['India_DR_math']=df_analyse['India'].rolling(
                                window=days_back,
                                min_periods=days_back).apply(doubling_time, raw=False)

In [75]:
days_back = 3 
for pos,country in enumerate(filter_cols):
    df_analyse[country+'_DR']=df_analyse[country].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_doubling_time_via_regression, raw=False)

In [76]:
df_analyse.columns

Index(['date', 'India', 'Germany', 'France', 'Spain', 'US', 'India_filter',
       'Germany_filter', 'France_filter', 'Spain_filter', 'US_filter',
       'India_DR', 'Germany_DR', 'France_DR', 'Spain_DR', 'US_DR',
       'India_filter_DR', 'Germany_filter_DR', 'France_filter_DR',
       'Spain_filter_DR', 'US_filter_DR', 'India_DR_math'],
      dtype='object')

In [77]:
start_pos=40
quick_plot(df_analyse.date[start_pos:],
           df_analyse.iloc[start_pos:,[11,12,13,14,15]], 
           y_scale='linear',
           slider=True)

In [78]:
start_pos=40
quick_plot(df_analyse.date[start_pos:],
           df_analyse.iloc[start_pos:,[17,18,19,20,21]], 
           y_scale='linear',
           slider=True)