In [123]:
import pandas as pd
import datetime as datetime
import matplotlib as mpl
import plotly.graph_objects as go
import dash
import requests
import json

import numpy as np
from sklearn import linear_model
from scipy import signal
from dash import dcc
from dash import html
from dash.dependencies import Input, Output

In [124]:
%matplotlib inline
mpl.rcParams['figure.figsize'] = (16, 9)
pd.set_option('display.max_rows', 500)

In [125]:
data_dir='../data/processed/Cases_pop.csv'
df_list=pd.read_csv(data_dir, sep=';')

In [126]:
#country_list = ['AUS', 'USA', 'ESP', 'IND', 'CHN', 'DEU', 'AFG']

In [127]:
# Requesting the Covid Cases data from our world in data website as a json object.
data_cases = requests.get(
    'https://covid.ourworldindata.org/data/owid-covid-data.json')
# load json object for the total number of COVID cases
json_object_cases = json.loads(data_cases.content)

countries_list = list(json_object_cases.keys())
country_remove=['OWID_INT','OWID_CYN']
country_list=list(set(countries_list) - set(country_remove))

In [128]:
df_small=df_list[['date','Cases_per_pop_AUS', 'Cases_per_pop_USA', 'Cases_per_pop_ESP', 'Cases_per_pop_IND', 'Cases_per_pop_CHN', 'Cases_per_pop_DEU', 'Cases_per_pop_AFG']]
df_small

Unnamed: 0,date,Cases_per_pop_AUS,Cases_per_pop_USA,Cases_per_pop_ESP,Cases_per_pop_IND,Cases_per_pop_CHN,Cases_per_pop_DEU,Cases_per_pop_AFG
0,2020-01-01,0.045585,0.101435,0.092299,0.014822,0.000129,0.073985,0.002368
1,2020-01-02,0.045585,0.101435,0.092299,0.014822,0.000129,0.073985,0.002368
2,2020-01-03,0.045585,0.101435,0.092299,0.014822,0.000129,0.073985,0.002368
3,2020-01-04,0.045585,0.101435,0.092299,0.014822,0.000129,0.073985,0.002368
4,2020-01-05,0.045585,0.101435,0.092299,0.014822,0.000129,0.073985,0.002368
...,...,...,...,...,...,...,...,...
929,2022-07-18,0.341932,0.266210,0.275665,0.031106,0.000629,0.357921,0.004581
930,2022-07-19,0.343818,0.266561,0.276543,0.031119,0.000629,0.359612,0.004586
931,2022-07-20,0.345940,0.267201,0.276543,0.031135,0.000630,0.361250,0.004590
932,2022-07-21,0.347939,0.267659,0.276543,0.031151,0.000630,0.362542,0.004594


# Helper Function

In [129]:
def quick_plot(x_in, df_input, y_scale='log', slider=False):
    """ Quick basic plot for quick static evaluation of a time series

        you can push selective columns of your data frame by .iloc[:,[0,6,7,8]]

        Parameters:
        ----------
        x_in : array 
            array of date time object, or array of numbers
        df_input : pandas dataframe 
            the plotting matrix where each column is plotted
            the name of the column will be used for the legend
        scale: str
            y-axis scale as 'log' or 'linear'
        slider: bool
            True or False for x-axis slider


        Returns:
        ----------

    """
    fig = go.Figure()

    for each in df_input.columns:
        fig.add_trace(
            go.Scatter(x=x_in, y=df_input[each], name=each, opacity=0.8))

    fig.update_layout(autosize=True,
                      width=1024,
                      height=768,
                      font=dict(family="PT Sans, monospace",
                                size=18,
                                color="#7f7f7f"))
    fig.update_yaxes(type=y_scale),
    fig.update_xaxes(tickangle=-45,
                     nticks=20,
                     tickfont=dict(size=14, color="#7f7f7f"))
    if slider == True:
        fig.update_layout(xaxis_rangeslider_visible=True)
    fig.show()

# Understanding Linear Regression

In [130]:
data_dir='../data/processed/Cases_pop_NoNaN.csv'
df_list=pd.read_csv(data_dir, sep=';')

In [131]:
reg = linear_model.LinearRegression(fit_intercept=True)

In [132]:
df_per=pd.concat([df_list['date'], (df_list.iloc[:,1:]*100)])

In [133]:
df_per.reset_index().drop(columns='index').rename(columns={0:'date'})

Unnamed: 0,date,Cases_per_pop_DOM,Cases_per_pop_KWT,Cases_per_pop_MUS,Cases_per_pop_BTN,Cases_per_pop_GMB,Cases_per_pop_KIR,Cases_per_pop_MYS,Cases_per_pop_MMR,Cases_per_pop_NGA,...,Cases_per_pop_PRK,Cases_per_pop_MCO,Cases_per_pop_FRO,Cases_per_pop_ESP,Cases_per_pop_SMR,Cases_per_pop_ISR,Cases_per_pop_SVK,Cases_per_pop_MLI,Cases_per_pop_POL,Cases_per_pop_LBY
0,2020-01-01,,,,,,,,,,...,,,,,,,,,,
1,2020-01-02,,,,,,,,,,...,,,,,,,,,,
2,2020-01-03,,,,,,,,,,...,,,,,,,,,,
3,2020-01-04,,,,,,,,,,...,,,,,,,,,,
4,2020-01-05,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1863,,5.598480,15.251732,18.219129,7.7312,0.454901,2.510980,13.778752,1.141003,0.121371,...,0.000004,37.559287,65.530933,27.566479,57.052688,48.633161,47.078028,0.142447,15.748244,7.462826
1864,,5.604534,15.251732,18.219129,7.7312,0.455621,2.510980,13.790375,1.141035,0.121595,...,0.000004,37.739192,65.530933,27.654257,57.349019,48.707803,47.107068,0.142447,15.756722,7.462826
1865,,5.614086,15.251732,18.219129,7.7312,0.455621,2.510980,13.807307,1.141065,0.121595,...,0.000004,37.853677,65.530933,27.654257,57.550525,48.774954,47.135723,0.142456,15.764165,7.462826
1866,,5.614086,15.251732,18.219129,7.7312,0.455621,2.597886,13.820970,1.141095,0.121705,...,0.000004,37.927275,65.530933,27.654257,57.707580,48.841868,47.162046,0.142470,15.771991,7.462826


In [134]:
df_per[0] = pd.to_datetime(df_per[0], format='%Y-%m-%d')
df_per.rename(columns={0:'date'})

Unnamed: 0,date,Cases_per_pop_DOM,Cases_per_pop_KWT,Cases_per_pop_MUS,Cases_per_pop_BTN,Cases_per_pop_GMB,Cases_per_pop_KIR,Cases_per_pop_MYS,Cases_per_pop_MMR,Cases_per_pop_NGA,...,Cases_per_pop_PRK,Cases_per_pop_MCO,Cases_per_pop_FRO,Cases_per_pop_ESP,Cases_per_pop_SMR,Cases_per_pop_ISR,Cases_per_pop_SVK,Cases_per_pop_MLI,Cases_per_pop_POL,Cases_per_pop_LBY
0,2020-01-01,,,,,,,,,,...,,,,,,,,,,
1,2020-01-02,,,,,,,,,,...,,,,,,,,,,
2,2020-01-03,,,,,,,,,,...,,,,,,,,,,
3,2020-01-04,,,,,,,,,,...,,,,,,,,,,
4,2020-01-05,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929,NaT,5.598480,15.251732,18.219129,7.7312,0.454901,2.510980,13.778752,1.141003,0.121371,...,0.000004,37.559287,65.530933,27.566479,57.052688,48.633161,47.078028,0.142447,15.748244,7.462826
930,NaT,5.604534,15.251732,18.219129,7.7312,0.455621,2.510980,13.790375,1.141035,0.121595,...,0.000004,37.739192,65.530933,27.654257,57.349019,48.707803,47.107068,0.142447,15.756722,7.462826
931,NaT,5.614086,15.251732,18.219129,7.7312,0.455621,2.510980,13.807307,1.141065,0.121595,...,0.000004,37.853677,65.530933,27.654257,57.550525,48.774954,47.135723,0.142456,15.764165,7.462826
932,NaT,5.614086,15.251732,18.219129,7.7312,0.455621,2.597886,13.820970,1.141095,0.121705,...,0.000004,37.927275,65.530933,27.654257,57.707580,48.841868,47.162046,0.142470,15.771991,7.462826


In [135]:
df_per.dtypes

0                         datetime64[ns]
Cases_per_pop_DOM                float64
Cases_per_pop_KWT                float64
Cases_per_pop_MUS                float64
Cases_per_pop_BTN                float64
Cases_per_pop_GMB                float64
Cases_per_pop_KIR                float64
Cases_per_pop_MYS                float64
Cases_per_pop_MMR                float64
Cases_per_pop_NGA                float64
Cases_per_pop_ATG                float64
Cases_per_pop_TUR                float64
Cases_per_pop_LKA                float64
Cases_per_pop_OWID_ASI           float64
Cases_per_pop_NAM                float64
Cases_per_pop_CMR                float64
Cases_per_pop_CRI                float64
Cases_per_pop_PYF                float64
Cases_per_pop_ECU                float64
Cases_per_pop_TON                float64
Cases_per_pop_CIV                float64
Cases_per_pop_HTI                float64
Cases_per_pop_MAR                float64
Cases_per_pop_PRY                float64
Cases_per_pop_OW

In [136]:
for i in df_per.columns[df_per.isnull().any(axis=0)]:     #---Applying Only on variables with NaN values
    df_per[i].fillna(df_per[i].mean(),inplace=True)

In [137]:
df_small.fillna(df_small.mean())
df_small.drop(df_small.head(100).index,inplace=True)
df_small.drop(df_small.tail(100).index,inplace=True)


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



In [140]:
df_per.set_index(0).rename(columns={0:'date'})
for each in country_list:
    l_vec = len(df_per['Cases_per_pop_'+each])
    X = np.arange(l_vec).reshape(-1, 1)
    y = np.log(np.array(df_per['Cases_per_pop_'+each]))
    y = y[np.logical_not(np.isnan(y))]    

    if y.shape[0]!=0:
        reg.fit(X, y)
        X_hat = np.arange(l_vec).reshape(-1, 1)
        Y_hat = reg.predict(X_hat)
        # LR_inspect = df_per[[0, 'Cases_per_pop_'+each]].copy()
        # LR_inspect['prediction'] = np.exp(Y_hat)
        LR_inspect =df_per[0].copy()
        LR_inspect = pd.concat[LR_inspect,df_per['Cases_per_pop_'+each]]
    else:
        pass
quick_plot(LR_inspect[0], LR_inspect.iloc[:, 1:], y_scale='log', slider=True)

TypeError: 'function' object is not subscriptable

In [None]:
for each in country_list:
    l_vec = len(df_small['Cases_per_pop_'+each])
    X = np.arange(l_vec).reshape(-1, 1)
    y = np.log(np.array(df_small['Cases_per_pop_'+each][:]))
    reg.fit(X, y)
    X_hat = np.arange(l_vec).reshape(-1, 1)
    Y_hat = reg.predict(X_hat)
    LR_inspect = df_small[['date', 'Cases_per_pop_'+each]].copy()
    LR_inspect['prediction'] = np.exp(Y_hat)

    quick_plot(LR_inspect.date, LR_inspect.iloc[:, 1:], y_scale='log', slider=True)

# Doubling Rate - Piecewise Linear Regression

In [None]:
reg = linear_model.LinearRegression(fit_intercept=True)

In [None]:
for each in country_list:
    df_small['Cases_per_pop_'+each + '_filter'] = signal.savgol_filter(
            df_small['Cases_per_pop_'+each],
            19,  # window size used for filtering
            3)  # order of fitted polynomial
df_small

In [None]:
filter_cols = ['Cases_per_pop_AUS_filter', 'Cases_per_pop_USA_filter', 'Cases_per_pop_ESP_filter', 'Cases_per_pop_IND_filter', 'Cases_per_pop_CHN_filter', 'Cases_per_pop_DEU_filter', 'Cases_per_pop_AFG_filter']
start_pos = 0
quick_plot(df_list.date[start_pos:],
           df_list[filter_cols].iloc[start_pos:, :],
           y_scale='log',
           slider=True)


In [None]:
reg = linear_model.LinearRegression(fit_intercept=True)
l_vec=len(df_small['Cases_per_pop_AUS'])
X=np.arange(l_vec-50).reshape(-1,1)
y=np.array(df_small['Cases_per_pop_AUS'][50:])

In [None]:
reg.fit(X,y)

In [None]:
reg.intercept_

In [None]:
reg.coef_

In [None]:
def get_rate_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate'''

    y = np.array(in_array)
    X = np.arange(2, 5).reshape(-1, 1) # Changed the np.arange(-1,2) to the current one

    # Check to ensure that the calculation is based only on 3 data points we use assert == 3.
    assert len(in_array) == 3

    reg.fit(X, y)
    intercept = reg.intercept_
    slope = reg.coef_
    return intercept / slope

In [None]:
df_small['Cases_per_pop_AUS_DR']=df_small['Cases_per_pop_AUS'].rolling(window=3, min_periods=3).apply(get_rate_via_regression)
df_small

In [None]:
quick_plot(df_small.date, df_small.iloc[40:,[15]], y_scale='linear')

In [None]:
def doubling_time(in_array):
    ''' Use a classical doubling time formular, 
     see https://en.wikipedia.org/wiki/Doubling_time '''
    y = np.array(in_array)
    return len(y) * np.log(2) / np.log(y[-1] / y[0])

In [None]:
df_small['AUS_DT']=df_small['Cases_per_pop_AUS'].rolling(window=3, min_periods=3).apply(doubling_time)

In [None]:
quick_plot(df_small.date, df_small.iloc[40:,[15,16]], y_scale='linear')

In [None]:
def doubling_time(in_array):
    ''' Use a classical doubling time formular, 
     see https://en.wikipedia.org/wiki/Doubling_time '''
    y = np.array(in_array)
    return len(y) * np.log(2) / np.log(y[-1] / y[0])


days_back = 3
for pos, country in enumerate(country_list):
    df_list[country + '_DR'] = df_list[country].rolling(
        window=days_back, min_periods=days_back).apply(get_rate_via_regression,
                                                       raw=False)
days_back = 3
for pos, country in enumerate(filter_cols):
    df_list[country + '_DR'] = df_list[country].rolling(
        window=days_back, min_periods=days_back).apply(get_rate_via_regression,
                                                       raw=False)
df_list['Germany_DR_math'] = df_list['Cases_per_pop_DEU'].rolling(
    window=days_back, min_periods=days_back).apply(doubling_time, raw=False)
days_back = 3
for pos, country in enumerate(filter_cols):
    df_list[country + '_DR'] = df_list[country].rolling(
        window=days_back, min_periods=days_back).apply(get_rate_via_regression,
                                                       raw=False)
df_list.columns
start_pos = 40
quick_plot(df_list.date,
           df_list.iloc[start_pos:, [11, 12, 13, 14]],
           y_scale='linear',
           slider=True)


In [None]:
start_pos = 40
quick_plot(df_list.date,
           df_list.iloc[start_pos:, [16, 17, 18, 19]],
           y_scale='linear',
           slider=True)