# One run full walktrhough 

* Do the full walk through on the large data set
* Refactor the source code and bring it to individual scripts
* Ensure a full run with one click

In [1]:
## check some parameters
## depending where you launch your notebook, the relative path might not work
## you should start the notebook server from your base path
## when opening the notebook, typically your path will be ../ads_covid-19/notebooks
import os
if os.path.split(os.getcwd())[-1]=='notebooks':
    os.chdir("../")

'Your base path is at: '+os.path.split(os.getcwd())[-1]

'Your base path is at: Lecture_Covid_19_data_analysis'

## 1 Update all data

In [1]:
# %load src/data/get_data.py

import subprocess
import os

import pandas as pd
import numpy as np

from datetime import datetime

import requests
import json

def get_johns_hopkins():
    ''' Get data by a git pull request, the source code has to be pulled first
        Result is stored in the predifined csv structure
    '''
    git_pull = subprocess.Popen( "/mingw64/bin/git pull" ,
                         cwd = os.path.dirname( 'data/raw/COVID-19/' ),
                         shell = True,
                         stdout = subprocess.PIPE,
                         stderr = subprocess.PIPE )
    (out, error) = git_pull.communicate()


    print("Error : " + str(error))
    print("out : " + str(out))


def get_current_data_germany():
    ''' Get current data from germany, attention API endpoint not too stable
        Result data frame is stored as pd.DataFrame

    '''
    # 16 states
    #data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

    # 400 regions / Landkreise
    data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

    json_object=json.loads(data.content)
    full_list=[]
    for pos,each_dict in enumerate (json_object['features'][:]):
        full_list.append(each_dict['attributes'])

    pd_full_list=pd.DataFrame(full_list)
    pd_full_list.to_csv('data/raw/NPGEO/GER_state_data.csv',sep=';')
    print(' Number of regions rows: '+str(pd_full_list.shape[0]))

if __name__ == '__main__':
    get_johns_hopkins()
    get_current_data_germany()


NotADirectoryError: [WinError 267] The directory name is invalid

## 2. Process pipeline 

In [5]:
# %load src/data/process_JH_data.py
import pandas as pd
import numpy as np

from datetime import datetime


def store_relational_JH_data():
    ''' Transformes the COVID data in a relational data set

    '''

    data_path='..\\data\\raw\\COVID-19\\csse_covid_19_data\\csse_covid_19_time_series\\time_series_covid19_confirmed_global.csv'
    pd_raw=pd.read_csv(data_path)

    pd_data_base=pd_raw.rename(columns={'Country/Region':'country',
                      'Province/State':'state'})

    pd_data_base['state']=pd_data_base['state'].fillna('no')

    pd_data_base=pd_data_base.drop(['Lat','Long'],axis=1)


    pd_relational_model=pd_data_base.set_index(['state','country']) \
                                .T                              \
                                .stack(level=[0,1])             \
                                .reset_index()                  \
                                .rename(columns={'level_0':'date',
                                                   0:'confirmed'},
                                                  )

    pd_relational_model['date']=pd_relational_model.date.astype('datetime64[ns]')

    pd_relational_model.to_csv('..\\data\\processed\\COVID_relational_confirmed.csv',sep=';',index=False)
    print(' Number of rows stored: '+str(pd_relational_model.shape[0]))
    print(' Latest date is: '+str(max(pd_relational_model.date)))
if __name__ == '__main__':

    store_relational_JH_data()


 Number of rows stored: 54530
 Latest date is: 2020-08-13 00:00:00


## 3  Filter and Doubling Rate Calculation

In [7]:
# %load src/features/build_features.py

import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)
import pandas as pd

from scipy import signal


def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate

        Parameters:
        ----------
        in_array : pandas.series

        Returns:
        ----------
        Doubling rate: double
    '''

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    assert len(in_array)==3
    reg.fit(X,y)
    intercept=reg.intercept_
    slope=reg.coef_

    return intercept/slope


def savgol_filter(df_input,column='confirmed',window=5):
    ''' Savgol Filter which can be used in groupby apply function (data structure kept)

        parameters:
        ----------
        df_input : pandas.series
        column : str
        window : int
            used data points to calculate the filter result

        Returns:
        ----------
        df_result: pd.DataFrame
            the index of the df_input has to be preserved in result
    '''

    degree=1
    df_result=df_input

    filter_in=df_input[column].fillna(0) # attention with the neutral element here

    result=signal.savgol_filter(np.array(filter_in),
                           window, # window size used for filtering
                           1)
    df_result[str(column+'_filtered')]=result
    return df_result

def rolling_reg(df_input,col='confirmed'):
    ''' Rolling Regression to approximate the doubling time'

        Parameters:
        ----------
        df_input: pd.DataFrame
        col: str
            defines the used column
        Returns:
        ----------
        result: pd.DataFrame
    '''
    days_back=3
    result=df_input[col].rolling(
                window=days_back,
                min_periods=days_back).apply(get_doubling_time_via_regression,raw=False)



    return result




def calc_filtered_data(df_input,filter_on='confirmed'):
    '''  Calculate savgol filter and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column on the input data frame
    '''

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'

    df_output=df_input.copy() # we need a copy here otherwise the filter_on column will be overwritten

    pd_filtered_result=df_output[['state','country',filter_on]].groupby(['state','country']).apply(savgol_filter)#.reset_index()

    #print('--+++ after group by apply')
    #print(pd_filtered_result[pd_filtered_result['country']=='Germany'].tail())

    #df_output=pd.merge(df_output,pd_filtered_result[['index',str(filter_on+'_filtered')]],on=['index'],how='left')
    df_output=pd.merge(df_output,pd_filtered_result[[str(filter_on+'_filtered')]],left_index=True,right_index=True,how='left')
    #print(df_output[df_output['country']=='Germany'].tail())
    return df_output.copy()





def calc_doubling_rate(df_input,filter_on='confirmed'):
    ''' Calculate approximated doubling rate and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column on the input data frame
    '''

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'


    pd_DR_result= df_input.groupby(['state','country']).apply(rolling_reg,filter_on).reset_index()

    pd_DR_result=pd_DR_result.rename(columns={filter_on:filter_on+'_DR',
                             'level_2':'index'})

    #we do the merge on the index of our big table and on the index column after groupby
    df_output=pd.merge(df_input,pd_DR_result[['index',str(filter_on+'_DR')]],left_index=True,right_on=['index'],how='left')
    df_output=df_output.drop(columns=['index'])


    return df_output


if __name__ == '__main__':
    test_data_reg=np.array([2,4,6])
    result=get_doubling_time_via_regression(test_data_reg)
    print('the test slope is: '+str(result))

    pd_JH_data=pd.read_csv('..\\data\\processed\\COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
    pd_JH_data=pd_JH_data.sort_values('date',ascending=True).copy()

    #test_structure=pd_JH_data[((pd_JH_data['country']=='US')|
    #                  (pd_JH_data['country']=='Germany'))]

    pd_result_larg=calc_filtered_data(pd_JH_data)
    pd_result_larg=calc_doubling_rate(pd_result_larg)
    pd_result_larg=calc_doubling_rate(pd_result_larg,'confirmed_filtered')


    mask=pd_result_larg['confirmed']>100
    pd_result_larg['confirmed_filtered_DR']=pd_result_larg['confirmed_filtered_DR'].where(mask, other=np.NaN)
    pd_result_larg.to_csv('..\\data\\processed\\COVID_final_set.csv',sep=';',index=False)
    print(pd_result_larg[pd_result_larg['country']=='Germany'].tail())


the test slope is: [2.]


  b = a[a_slice]


            date state  country  confirmed  confirmed_filtered  confirmed_DR  \
29720 2020-08-09    no  Germany   217288.0            217687.0    397.061661   
29721 2020-08-10    no  Germany   218508.0            218619.6    271.110696   
29722 2020-08-11    no  Germany   219540.0            219695.2    194.001184   
29723 2020-08-12    no  Germany   220859.0            220928.9    186.844463   
29724 2020-08-13    no  Germany   222281.0            222162.6    161.177186   

       confirmed_filtered_DR  
29720             243.698141  
29721             237.557617  
29722             217.774392  
29723             190.315593  
29724             179.078301  


In [8]:
print(pd_result_larg[pd_result_larg['country']=='US'].tail())

            date state country  confirmed  confirmed_filtered  confirmed_DR  \
51860 2020-08-09    no      US  5044864.0           5044031.2     96.884837   
51861 2020-08-10    no      US  5094400.0           5095162.4    104.606172   
51862 2020-08-11    no      US  5141208.0           5145347.4    105.735503   
51863 2020-08-12    no      US  5197411.0           5196446.5     99.879424   
51864 2020-08-13    no      US  5248854.0           5247545.6     96.535391   

       confirmed_filtered_DR  
51860              94.514426  
51861              98.267616  
51862             100.573196  
51863             101.608290  
51864             101.693503  


## 4 Visual Board

In [1]:
def SIR_model(SIR,beta,gamma, N0):
    ''' Simple SIR model
        S: susceptible population
        I: infected people
        R: recovered people
        beta: 
        
        overall condition is that the sum of changes (differnces) sum up to 0
        dS+dI+dR=0
        S+I+R= N (constant size of population)
    
    '''
    
    S,I,R=SIR
    dS_dt=-beta*S*I/N0          #S*I is the 
    dI_dt=beta*S*I/N0-gamma*I
    dR_dt=gamma*I
    return([dS_dt,dI_dt,dR_dt])


In [2]:
import pandas as pd
import numpy as np
pop = pd.read_csv('..//data/raw//country_populations.csv')[['Country Name','2019 [YR2019]']]
pop.rename(columns={'Country Name': 'country', '2019 [YR2019]': 'count'}, inplace = True)
pop.country = pop.country.map(lambda x: str(x))
df_input_large=pd.read_csv('..\\data\\processed\\COVID_final_set.csv',sep=';')
common_countries = set(df_input_large.country.map(lambda x: x.lower())).intersection(set(pop.country.map(lambda x: x.lower())))
df_input_large_filtered = df_input_large[df_input_large.country.map(lambda x: x.lower()).isin(common_countries)]

pop = pop[pop.country.map(lambda x: x.lower()).isin(common_countries)]
country_vs_pop = {row['country']:row['count'] for row in pop.to_dict('records')}

In [None]:
# %load src/visualization/visualize.py
import pandas as pd
import numpy as np

import dash
dash.__version__
import plotly.express as px
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State

import plotly.graph_objects as go

import os
print(os.getcwd())
df_input_large=pd.read_csv('..\\data\\processed\\COVID_final_set.csv',sep=';')
df_input_large = df_input_large_filtered

fig = go.Figure()

app = dash.Dash()
app.layout = html.Div([

    dcc.Markdown('''
    #  Applied Data Science on COVID-19 data

    Goal of the project is to teach data science by applying a cross industry standard process,
    it covers the full walkthrough of: automated data gathering, data transformations,
    filtering and machine learning to approximating the doubling time, and
    (static) deployment of responsive dashboard.

    '''),

    dcc.Markdown('''
    ## Multi-Select Country for visualization
    '''),


    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label': each,'value':each} for each in df_input_large['country'].unique()],
        value=['US', 'Germany','Italy'], # which are pre-selected
        multi=True
    ),

    

    dcc.Tabs(id='main_tab', value='main_tab', children=[
        dcc.Tab(id='tab1', label='Confirmed Cases Analytics', value='tab1', children=[
                dcc.Markdown('''
            ## Select Timeline of confirmed COVID-19 cases or the approximated doubling time
            '''),
                 dcc.Dropdown(
                        id='doubling_time',
                        options=[
                            {'label': 'Timeline Confirmed ', 'value': 'confirmed'},
                            {'label': 'Timeline Confirmed Filtered', 'value': 'confirmed_filtered'},
                            {'label': 'Timeline Doubling Rate', 'value': 'confirmed_DR'},
                            {'label': 'Timeline Doubling Rate Filtered', 'value': 'confirmed_filtered_DR'},
                        ],
                        value='confirmed',
                        multi=False
                        ),
                    dcc.Graph(figure=fig, id='main_window_slope')
                ]),
        #SIR simulation graph
        dcc.Tab(id='tab2', label='Infection Spread Model - SIR', value='tab2', children=[
              dcc.Markdown('''
              Initialise Infection and Recovery rates
              '''),
           html.Div(["Min Infection Rate: ", dcc.Input(id='beta_min', value = .11,  placeholder='min infection rate', type='number'),
                    html.Br(), html.Br(),
                    "Max Infection Rate: ", dcc.Input(id='beta_max', value = .4, placeholder='max infection rate', type='number'),
                    html.Br(), html.Br(),
                    "# Days without measures: ", dcc.Input(id='t_init', value=28,  placeholder='initial time period ', type='number'),
                    html.Br(), html.Br(),
                     "# Days with hard measures : ", dcc.Input(id='t_intro', value = 14, placeholder='time to introduce measures', type='number'),
                     html.Br(), html.Br(),
                     "# Days held with measures : ", dcc.Input(id='t_hold', value=21,  placeholder='hold time', type='number'),
                     html.Br(), html.Br(),
                     "# Days with relaxed measures : ", dcc.Input(id='t_relax', value = 21, placeholder='relaxed time', type='number'),
                     html.Br(), html.Br(),
                     "# Days repeated with hard measures : ", dcc.Input(id='t_repeat', value = 31, placeholder='repeated hard measures', type='number'),
                     html.Br(), html.Br(),
                     "Recovery Rate : ", dcc.Input(id='gamma', value = .1, placeholder='recovery rate', type='number'),
                     html.Br(), html.Br(),
                     html.Button('Submit', id='submit', n_clicks = 0)
                    ]
                   
                   ),
            dcc.Graph(figure=fig, id='sir_graph')
            
            
        ])
    ])
])

g_country_list = []

@app.callback(
    Output('sir_graph', 'figure'),
    [Input('submit', 'n_clicks')],
    [State('beta_min', 'value'),
    State('beta_max', 'value'),
    State('t_init', 'value'),
    State('t_intro', 'value'),
    State('t_hold', 'value'),
    State('t_relax', 'value'),
    State('t_repeat', 'value'),
    State('gamma', 'value')])
def sir_graph(n_clicks, beta_min, beta_max, t_init, t_intro, t_hold, t_relax, t_repeat,  gamma):
    global g_country_list
    if n_clicks is None or n_clicks ==0:
        return {
            'data': [],
            'layout': dict (
                width=1280,
                height=720,
                xaxis = dict(showticklabels=False, showgrid=False, zeroline = False),
                yaxis = dict(showticklabels=False, showgrid=False, zeroline = False),
        )
    }
    pd_beta=np.concatenate((np.array(int(t_init)*[beta_max]),
                       np.linspace(beta_max,beta_min,int(t_intro)),
                       np.array(int(t_hold)*[beta_min]),
                        np.linspace(beta_min,beta_max,int(t_relax)),
                        np.linspace(beta_min,beta_max,int(t_repeat)),
                       ))
    t_phases=np.array([t_init, t_intro, t_hold, t_relax, t_repeat]).cumsum()
    fig = go.Figure()
    for each in g_country_list:
        N0 = int(country_vs_pop[each])
        #considering confirmed cases as the initial infected people
        df_filter_by_country = df_input_large_filtered[df_input_large_filtered.country == each]
        I0=df_filter_by_country.confirmed.iloc[df_filter_by_country.confirmed.nonzero()[0][0]]
        S0=N0-I0
        R0=0
        propagation_rates=pd.DataFrame(columns  = {'susceptible':S0, 'infected':I0, 'recovered':R0}, index = [0])
        SIR=np.array([S0,I0,R0])
        for each_beta in pd_beta:
            new_delta_vec=SIR_model(SIR,each_beta,gamma, N0)
            SIR=SIR+new_delta_vec
            propagation_rates=propagation_rates.append({'susceptible':SIR[0],
                                                    'infected':SIR[1],
                                                  'recovered':SIR[2]}, ignore_index=True)
#         print(propagation_rates.infected)
        fig.add_trace(go.Scatter(x=propagation_rates.index,y=propagation_rates.infected, name = each ))
                        
                
    fig = update_layout(t_phases, fig)
    return fig

def update_layout(t_phases, fig):
    color = 'red'
    fig.update_layout(
    title='Scenario SIR simulations  (demonstration purposes only)',
    width=1280,
    height=720,
    xaxis = dict(showgrid=False, title = 'time in days'),
    yaxis_type="log",
    yaxis = dict(showgrid=False),
    shapes=[dict(type="rect",
            xref="x",
            yref="paper",
            x0=0,
            y0=0,
            x1=t_phases[0],
            y1=1,
            fillcolor=color,
            opacity=0.0,
            layer="below",
            line_width=0),
        dict(type="rect",
            xref="x",
            yref="paper",
            x0=t_phases[0],
            y0=0,
            x1=t_phases[1],
            y1=1,
            fillcolor=color,
            opacity=0.3,
            layer="below",
            line_width=0),
        dict(type="rect",
            xref="x",
            yref="paper",
            x0=t_phases[1],
            y0=0,
            x1=t_phases[2],
            y1=1,
            fillcolor=color,
            opacity=0.4,
            layer="below",
            line_width=0),
        dict(type="rect",
            xref="x",
            yref="paper",
            x0=t_phases[2],
            y0=0,
            x1=t_phases[3],
            y1=1,
            fillcolor=color,
            opacity=0.1,
            layer="below",
            line_width=0),
          dict(type="rect",
            xref="x",
            yref="paper",
            x0=t_phases[3],
            y0=0,
            x1= t_phases[4],
            y1=1,
            fillcolor=color,
            opacity=0.3,
            layer="below",
            line_width=0)          
    ])
    fig.add_trace(go.Scatter(
    x=[10, t_phases[0]+10, t_phases[1]+10, t_phases[2]+10, t_phases[3]+10],
    y=[10, 100, 200, 175, 10],
    text=["no measures ",
          "hard measures",
          "hold measures",
          "relax measures",
          "repeat hard measures"],
    mode="text",
))
    return fig

    

@app.callback(
    [Output('main_window_slope', 'figure')],
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])
def update_figure(country_list,show_doubling):
    global g_country_list
    if 'doubling_rate' in show_doubling:
        my_yaxis={'type':"log",
               'title':'Approximated doubling rate over 3 days (larger numbers are better #stayathome)'
              }
    else:
        my_yaxis={'type':"log",
                  'title':'Confirmed infected people (source johns hopkins csse, log-scale)'
              }

    
    traces = []
    g_country_list = country_list
    for each in country_list:
        
        df_plot=df_input_large[df_input_large['country']==each]

        if show_doubling=='doubling_rate_filtered':
            df_plot=df_plot[['state','country','confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR','date']].groupby(['country','date']).agg(np.mean).reset_index()
        else:
            df_plot=df_plot[['state','country','confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR','date']].groupby(['country','date']).agg(np.sum).reset_index()
       #print(show_doubling)


        traces.append(dict(x=df_plot.date,
                                y=df_plot[show_doubling],
                                mode='markers+lines',
                                opacity=0.9,
                                name=each
                        )
                )

    return [{
            'data': traces,
            'layout': dict (
                width=400,
                height=200,

                xaxis={'title':'Timeline',
                        'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(
                            size=14,color="#7f7f7f"),
                      },

                yaxis=my_yaxis
        )
    }]

if __name__ == '__main__':

    app.run_server(debug=True, use_reloader=False)


C:\Users\pramod\Desktop\TUKL\course books\sem4\EDS\Lecture_Covid_19_data_analysis\notebooks
Dash is running on http://127.0.0.1:8050/

 in production, use a production WSGI server like gunicorn instead.

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on



Series.nonzero() is deprecated and will be removed in a future version.Use Series.to_numpy().nonzero() instead

