# DataLoad

In [260]:
import os
import matplotlib as mpl
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from datetime import datetime
mpl.rcParams['figure.figsize']= (16,9)
pd.set_option('display.max_rows', 500)

In [261]:
df_analyse= pd.read_csv(r"../data/processed/COVID_small_flat_table.csv",sep=';',
                       parse_dates=[0])
df_analyse.sort_values("date",ascending=True).tail()

Unnamed: 0,date,Italy,India,US
894,2022-07-04,18805756,43531650,87921849
895,2022-07-05,18938771,43547809,88065902
896,2022-07-06,19048788,43566739,88262908
897,2022-07-07,19157174,43585554,88381589
898,2022-07-08,19259037,43604394,88547882


In [262]:
country_list= df_analyse.columns[1:]

# Helper Function

In [263]:
def quick_plot(x_in, df_input, y_scale= "log", slider = False):

        """ Quick basic plot for quick static evaluation of a time series

        you can push selective columns of your data frame by .iloc[:,[0,6,7,8]]

        Parameters:
        ----------
        x_in : array
            array of date time object, or array of numbers
        df_input : pandas dataframe
            the plotting matrix where each column is plotted
            the name of the column will be used for the legend
        scale: str
            y-axis scale as 'log' or 'linear'
        slider: bool
            True or False for x-axis slider


        Returns:
        ----------

    """
        fig = go.Figure()

        for each in df_input.columns:
            fig.add_trace(go.Scatter(
                            x=x_in,
                            y=df_input[each],
                            name=each,
                            opacity=0.8))

        fig.update_layout(autosize=True,
            width=1024,
            height=768,
            font=dict(
                family="PT Sans, monospace",
                size=18,
                color="#7f7f7f"
                )
            )
        fig.update_yaxes(type=y_scale),
        fig.update_xaxes(tickangle=-45,
                     nticks=20,
                     tickfont=dict(size=14,color="#7f7f7f")
                    )
        if slider==True:
            fig.update_layout(xaxis_rangeslider_visible=True)
        fig.show()


In [264]:
quick_plot(df_analyse.date,
           df_analyse.iloc[:,1:],
           y_scale='linear',
           slider=True)

In [265]:
threshold = 100

In [266]:
compare_list= []
for pos, country in enumerate(df_analyse.columns[1:]):
    compare_list.append(np.array(df_analyse[country][df_analyse[country]>threshold]))


pd_sync_timelines= pd.DataFrame(compare_list,index=df_analyse.columns[1:]).T



In [267]:
pd_sync_timelines['date']=np.arange(pd_sync_timelines.shape[0])

In [268]:
quick_plot(pd_sync_timelines.date,pd_sync_timelines.iloc[:,:-1],
           y_scale="log",
           slider=True)

$N(t) = N_0*2^{t/T}$

In [269]:
def doubling_rate(N_0, t,T_d):
    return N_0*np.power(2, t/T_d)

In [270]:
max_days= 866
norm_slopes={
    #'doubling every day':doubling_rate(100,np.arange(10),1),
    'doubling every two days':doubling_rate(100,np.arange(20),2),
    'doubling every 4 days':doubling_rate(100,np.arange(20),4),
    'doubling every 10 days':doubling_rate(100,np.arange(20),10),}

In [271]:
pd_sync_timelines_w_slope =pd.concat([pd.DataFrame(norm_slopes),pd_sync_timelines], axis=1)

In [272]:
pd_sync_timelines_w_slope

Unnamed: 0,doubling every two days,doubling every 4 days,doubling every 10 days,Italy,India,US,date
0,100.000000,100.000000,100.000000,155.0,102.0,107.0,0
1,141.421356,118.920712,107.177346,229.0,113.0,184.0,1
2,200.000000,141.421356,114.869835,322.0,119.0,237.0,2
3,282.842712,168.179283,123.114441,453.0,142.0,403.0,3
4,400.000000,200.000000,131.950791,655.0,156.0,519.0,4
5,565.685425,237.841423,141.421356,888.0,194.0,594.0,5
6,800.000000,282.842712,151.571657,1128.0,244.0,782.0,6
7,1131.370850,336.358566,162.450479,1694.0,330.0,1147.0,7
8,1600.000000,400.000000,174.110113,2036.0,396.0,1586.0,8
9,2262.741700,475.682846,186.606598,2502.0,499.0,2219.0,9


In [273]:
quick_plot(pd_sync_timelines_w_slope.date,
           pd_sync_timelines_w_slope.iloc[:,0:6],
           y_scale='log',
           slider=True)

# Understanding Linear Regression

In [274]:
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=False)


In [275]:
from scipy import signal

In [276]:
start_pos= 5


In [277]:
l_vec= len(df_analyse["India"])
X= np.arange(l_vec-8).reshape(-1,1)
y= np.log(np.array(df_analyse["India"][8:]))


In [278]:
reg.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
         normalize=False)

In [279]:
X_hat= np.arange(l_vec).reshape(-1,1)
Y_hat= reg.predict(X_hat)

In [280]:
LR_inspect =df_analyse[["date", "India"]].copy()

In [281]:
LR_inspect["prediction"]=np.exp(Y_hat)
LR_inspect

Unnamed: 0,date,India,prediction
0,2020-01-22,0,1.000000e+00
1,2020-01-23,0,1.028807e+00
2,2020-01-24,0,1.058443e+00
3,2020-01-25,0,1.088934e+00
4,2020-01-26,0,1.120302e+00
5,2020-01-27,0,1.152575e+00
6,2020-01-28,0,1.185777e+00
7,2020-01-29,0,1.219935e+00
8,2020-01-30,1,1.255077e+00
9,2020-01-31,1,1.291232e+00


In [282]:
quick_plot(LR_inspect.date,
           LR_inspect.iloc[:,1:],
           y_scale='log',
           slider=True)

# Doubling Rate -Piecewise linear Regression

In [283]:
reg = linear_model.LinearRegression(fit_intercept=True)
l_vec= len(df_analyse["India"])
X= np.arange(l_vec-1).reshape(-1,1)
y= np.array(df_analyse["India"][1:])

In [284]:
def get_rate_via_regression(in_array):
    y= np.array(in_array)
    x= np.arange(-1,2).reshape(-1,1)
    assert len(in_array)==3
    reg.fit(x,y)
    intercept= reg.intercept_
    slope= reg.coef_
    return  intercept/slope

In [285]:
country_list= df_analyse.columns[1:]
for each in country_list:
    df_analyse[each+"_DR"]=df_analyse[each].rolling(window =3, min_periods=3 ).apply(get_rate_via_regression)











In [286]:
quick_plot(df_analyse.date,df_analyse.iloc[100:,[4,5,6]],y_scale='linear')


In [287]:
def doubling_time(in_array):
    y= np.array(in_array)
    return len(y)*np.log(2)/np.log(y[-1]/y[0])

In [288]:
df_analyse["India_DT_wiki"]= df_analyse["India"].rolling(window =3, min_periods=3 ).apply(doubling_time)





In [289]:
quick_plot(df_analyse.date,df_analyse.iloc[40:,[4,5,6,7]],y_scale='linear',slider=True)

In [293]:
df_analyse=pd.read_csv('../data/processed/COVID_small_flat_table.csv',sep=';',
                       parse_dates=[0])
country_list=df_analyse.columns[1:]

In [294]:
for each in country_list:
    df_analyse[each+"_filter"]= signal.savgol_filter(df_analyse[each],5,1)


Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.



In [295]:
filter_cols =[]
for each in country_list:
    filter_cols.append(each+"_filter")
print(filter_cols)

['Italy_filter', 'India_filter', 'US_filter']


In [297]:
start_pos=5
quick_plot(df_analyse.date[start_pos:],
           df_analyse[filter_cols].iloc[start_pos:,:],
           y_scale='log',
           slider=True)

In [298]:
df_analyse.head()

Unnamed: 0,date,Italy,India,US,Italy_filter,India_filter,US_filter
0,2020-01-22,0,0,1,0.0,0.0,0.4
1,2020-01-23,0,0,1,0.0,0.0,1.3
2,2020-01-24,0,0,2,0.0,0.0,2.2
3,2020-01-25,0,0,2,0.0,0.0,3.0
4,2020-01-26,0,0,5,0.0,0.0,3.8


In [299]:
days_back = 3
for pos,country in enumerate(country_list):
    df_analyse[country+'_DR']=df_analyse[country].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_rate_via_regression, raw=False)

In [300]:
days_back = 3
for pos,country in enumerate(filter_cols):
    df_analyse[country+'_DR']=df_analyse[country].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_rate_via_regression, raw=False)

In [302]:
df_analyse.columns

Index(['date', 'Italy', 'India', 'US', 'Italy_filter', 'India_filter',
       'US_filter', 'Italy_DR', 'India_DR', 'US_DR', 'Italy_filter_DR',
       'India_filter_DR', 'US_filter_DR'],
      dtype='object')

In [303]:
start_pos=100
quick_plot(df_analyse.date[start_pos:],
           df_analyse.iloc[start_pos:,[7,8,9]], #
           y_scale='linear',
           slider=True)

In [304]:
start_pos=40
quick_plot(df_analyse.date[start_pos:],
           df_analyse.iloc[start_pos:,[10,11,12]], #
           y_scale='linear',
           slider=True)

In [305]:
start_pos=40
quick_plot(df_analyse.date[start_pos:],
           df_analyse.iloc[start_pos:,[7,10]], #
           y_scale='linear',
           slider=True)