In [1]:
## check some parameters
## depending where you launch your notebook, the relative path might not work
## you should start the notebook server from your base path
## when opening the notebook, typically your path will be ../ads_covid-19/notebooks
import os
# When the kernel was started inside the 'notebooks' folder, step one level up
# so that the relative 'data/...' paths used in the following cells resolve.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir("../")

# last expression of the cell: displays the folder name that is now the base path
'Your base path is at: ' + os.path.basename(os.getcwd())

'Your base path is at: ads_covid-20'

## 1 Update all data

In [2]:
# %load src/data/get_data.py


import subprocess
import os

import pandas as pd
import numpy as np

from datetime import datetime

import requests
import json

def get_johns_hopkins(repo_path='data/raw/COVID-19/'):
    ''' Update the local clone of the Johns Hopkins data by running "git pull" in it

        The repository has to be cloned first; the error and standard output of
        the git call are printed.

        Parameters:
        ----------
        repo_path : str
            folder of the local clone, by default relative to the base path
            of the project (the current working directory)
    '''
    # The argument list is handed to git directly: combining a list with
    # shell=True makes a POSIX shell run a bare "git" and drop the "pull".
    git_pull = subprocess.Popen(["git", "pull"],
                        cwd = repo_path,
                        stdout = subprocess.PIPE,
                        stderr = subprocess.PIPE )
    (out, error) = git_pull.communicate()
    print("Error : " + str(error))
    print("out : " + str(out))



def get_current_data_germany():
    ''' Get current data from germany, attention API endpoint not too stable

        The 'attributes' entry of every returned feature becomes one row;
        the resulting table is stored as csv (separator ';') in
        data/raw/NPGEO/GER_state_data.csv

        Raises:
        ----------
        requests.HTTPError if the endpoint answers with an error status
    '''

    url='https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json'

    # without a timeout a hanging endpoint would block the notebook forever
    data=requests.get(url,timeout=30)
    # fail with a clear HTTP error instead of a confusing JSON/Key error later on
    data.raise_for_status()

    json_object=json.loads(data.content)
    full_list=[each_dict['attributes'] for each_dict in json_object['features']]

    pd_full_list=pd.DataFrame(full_list)
    pd_full_list.to_csv('data/raw/NPGEO/GER_state_data.csv',sep=';') #storing the data to csv file
    print(' Number of regions rows: '+str(pd_full_list.shape[0]))

if __name__ == '__main__':
    # refresh both raw data sources: first the git pull of the Johns Hopkins
    # clone, then the download of the German data
    get_johns_hopkins()
    get_current_data_germany()


Error : b''
out : b'Already up to date.\n'
 Number of regions rows: 412


## 2. Process pipeline 

In [3]:
# %load src/data/process_JH_data.py
# %load src/data/process_JH_data.py
import pandas as pd
import numpy as np

from datetime import datetime

def store_flat_data(data_path='data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
                    out_path='data/processed/COVID_full_flat_table.csv'):
    ''' Transforms the raw time series into a flat table: one row per date, one column per country

        Rows of the same 'Country/Region' (provinces/states) are summed up.

        Parameters:
        ----------
        data_path : str
            raw csv; the first four columns are skipped, every following
            column is treated as one date in the format %m/%d/%y
        out_path : str
            target csv, written with separator ';' and without index
    '''
    DF_raw=pd.read_csv(data_path)

    # one groupby instead of adding the countries column by column;
    # sort=False keeps the countries in their order of first appearance
    country_sums=DF_raw.iloc[:,4:].groupby(DF_raw['Country/Region'],sort=False).sum().T

    EDA_Full_data=country_sums.reset_index(drop=True)
    EDA_Full_data.columns.name=None
    # the date strings of the raw header become a real datetime column
    EDA_Full_data.insert(0,'Date',pd.to_datetime(country_sums.index,format='%m/%d/%y').to_numpy())

    EDA_Full_data.to_csv(out_path,sep=';',index=False)
    print(' Number of rows stored: '+str(EDA_Full_data.shape[0]))




def store_relational_JH_data(path='data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
                             out_path='data/processed/COVID_relational_confirmed.csv'):

    ''' Transforms the COVID data into a relational data set which can be used for modeling

        The result has one row per date, state and country with the columns
        date, state, country and confirmed. A missing 'Province/State' is
        stored as state 'no'.

        Parameters:
        ----------
        path : str
            raw csv of the confirmed cases time series
        out_path : str
            target csv, written with separator ';' and without index
    '''

    raw_data = pd.read_csv(path)

    base_df =raw_data.rename(columns={'Country/Region':'country',
                      'Province/State':'state'})

    base_df['state']=base_df['state'].fillna('no')

    base_df=base_df.drop(['Lat','Long'],axis=1)

    # dates become the rows, then both column levels (state, country) are
    # stacked so that every row carries exactly one confirmed value
    pd_relational_model=(base_df.set_index(['state','country'])
                                .T
                                .stack(level=[0,1])
                                .reset_index()
                                .rename(columns={'level_0':'date',
                                                   0:'confirmed'}))

    # explicit format of the raw column headers instead of relying on
    # date inference
    pd_relational_model['date']=pd.to_datetime(pd_relational_model['date'],format='%m/%d/%y')

    pd_relational_model.to_csv(out_path,sep=';',index=False)
    print(' Number of rows stored: '+str(pd_relational_model.shape[0]))
    print(' Latest date is: '+str(pd_relational_model['date'].max()))

def store_population_data(raw_path='data/raw/Data_Extract_From_World_Development_Indicators/Population.csv',
                          out_path='data/processed/world_population.csv'):
    ''' Transforms the Population data to a required form and matching data to covid data

        Keeps the country name and the 2019 population of the first 217 rows,
        renames countries to the spelling used in the covid data, appends five
        entries that are not part of the extract and stores the result.

        Parameters:
        ----------
        raw_path : str
            csv with the columns 'Country Name' and '2019 [YR2019]'
        out_path : str
            target csv, written with separator ';' and without index
    '''

    # spelling in the population extract -> spelling in the covid data
    country_names={'Bahamas, The':'Bahamas',
                   'Brunei Darussalam':'Brunei',
                   'Myanmar':'Burma',
                   'Congo, Dem. Rep.':'Congo (Kinshasa)',   # Kinshasa is the capital of the Dem. Rep.
                   'Congo, Rep.':'Congo (Brazzaville)',
                   'Czech Republic':'Czechia',
                   'Egypt, Arab Rep.':'Egypt',
                   'Gambia, The':'Gambia',
                   'Iran, Islamic Rep.':'Iran',
                   'Korea, Rep.':'Korea, South',
                   'Kyrgyz Republic':'Kyrgyzstan',
                   'Lao PDR':'Laos',
                   'Russian Federation':'Russia',
                   'St. Kitts and Nevis':'Saint Kitts and Nevis',
                   'St. Lucia':'Saint Lucia',
                   'St. Vincent and the Grenadines':'Saint Vincent and the Grenadines',
                   'Slovak Republic':'Slovakia',
                   'Syrian Arab Republic':'Syria',
                   'United States':'US',
                   'Venezuela, RB':'Venezuela',
                   'Yemen, Rep.':'Yemen'}

    # value used for every '..' (not available) entry of the extract
    # NOTE(review): looks like the population of one specific country - confirm
    missing_population=3214000

    df_pop=pd.read_csv(raw_path)
    df_P=(df_pop[['Country Name', '2019 [YR2019]']] #considering latest population
          .rename(columns={'Country Name':'country',
                           '2019 [YR2019]':'population'})
          .iloc[0:217,:]   # as we just need distinct countries
          .copy())         # own copy, the columns are modified below

    df_P['country']=df_P['country'].replace(country_names)

    population=df_P['population'].replace('..',str(missing_population))
    df_P['population']=pd.to_numeric(population).astype(int)

    df_P2 = pd.DataFrame([['Diamond Princess', 2670], ['Holy See', 825],['MS Zaandam', 1432],['Taiwan', 23780000],['Western Sahara',652271]], columns=['country', 'population'])
    df_P=pd.concat([df_P,df_P2], ignore_index=True) # Adding additional countries
    df_P['population']=df_P.population.astype(int)
    df_P.to_csv(out_path,sep=';',index=False)
    print(str(df_P.shape[0])+ ' countries population information stored: ')



if __name__ == '__main__':

    # write the three processed csv files: flat table, relational table
    # and the population table
    store_flat_data()
    store_relational_JH_data()
    store_population_data()

 Number of rows stored: 236
 Number of rows stored: 62776
 Latest date is: 2020-09-13 00:00:00
222 countries population information stored: 


## 3  Filter and Doubling Rate Calculation

In [4]:
# %load src/features/build_features.py

import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)
import pandas as pd

from scipy import signal


def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate

        A line is fitted through the three values at x = -1, 0, 1 and the
        ratio intercept/slope is returned.

        Parameters:
        ----------
        in_array : pandas.series or array like with exactly three values

        Returns:
        ----------
        Doubling rate: double
            inf or nan (numpy division) if the fitted slope is zero
    '''

    assert len(in_array)==3

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    reg.fit(X,y)     # fitting a curve for  3 points using sklearn(Linear regression)
    intercept=reg.intercept_
    # coef_ holds one entry per feature; take the single slope so that a
    # scalar (and not a one element array) is returned, as documented and
    # as expected by rolling(...).apply
    slope=reg.coef_[0]

    return intercept/slope # this gives the doubling rate w.r.t those 3 points


def savgol_filter(df_input,column='confirmed',window=5):
    ''' Savgol Filter which can be used in groupby apply function (data structure kept)

        parameters:
        ----------
        df_input : pd.DataFrame
            is not modified, the result is a copy
        column : str
            column which is filtered, missing values are treated as 0
        window : int
            used data points to calculate the filter result; passed on as the
            window length of scipy.signal.savgol_filter

        Returns:
        ----------
        df_result: pd.DataFrame
            copy of df_input with the additional column <column>_filtered,
            the index of the df_input is preserved in result
    '''

    degree=1   # polynomial order of the filter
    df_result=df_input.copy()   # do not write into the frame of the caller

    filter_in=df_input[column].fillna(0) # attention with the neutral element here

    result=signal.savgol_filter(np.array(filter_in),
                           window, # window size used for filtering
                           degree)
    df_result[str(column+'_filtered')]=result
    return df_result

def rolling_reg(df_input,col='confirmed'):
    ''' Rolling Regression to approximate the doubling time

        Parameters:
        ----------
        df_input: pd.DataFrame
        col: str
            defines the used column
        Returns:
        ----------
        result: pd.Series
            one value per row of df_input, calculated from the row and its
            two predecessors (the first two rows have no value)
    '''
    window_size=3
    rolling_windows=df_input[col].rolling(window=window_size,
                                          min_periods=window_size)

    # raw=False: every window is handed over as pandas series
    return rolling_windows.apply(get_doubling_time_via_regression,raw=False)




def calc_filtered_data(df_input,filter_on='confirmed'):
    '''  Calculate savgol filter and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
            has to contain the columns state, country and filter_on
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column <filter_on>_filtered
            on a copy of the input data frame
    '''

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'

    df_output=df_input.copy() # we need a copy here otherwise the filter_on column will be overwritten

    # group_keys=False: the filtered frames keep the original row index, which
    # the index merge below relies on.
    # column=filter_on: without it the filter always ran on 'confirmed'.
    pd_filtered_result=(df_output[['state','country',filter_on]]
                        .groupby(['state','country'],group_keys=False)
                        .apply(savgol_filter,column=filter_on))

    df_output=pd.merge(df_output,pd_filtered_result[[str(filter_on+'_filtered')]],left_index=True,right_index=True,how='left')

    return df_output.copy()





def calc_doubling_rate(df_input,filter_on='confirmed'):
    ''' Calculate approximated doubling rate and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
            has to contain the columns state, country and filter_on;
            the index has to be unique
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            copy of the input data frame (same rows, same index) with the
            result as new column <filter_on>_DR
    '''

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Error in calc_doubling_rate not all columns in data frame'

    # one rolling regression per (state, country); every piece keeps the row
    # index of its group, so no generated 'level_2' column is needed and a
    # single group works as well
    pieces=[rolling_reg(group,filter_on)
            for _,group in df_input.groupby(['state','country'])]

    df_output=df_input.copy()
    if pieces:
        # the assignment aligns on the index, rows without a result stay NaN
        df_output[filter_on+'_DR']=pd.concat(pieces)
    else:
        df_output[filter_on+'_DR']=np.nan

    return df_output


if __name__ == '__main__':
    # quick check of the regression helper with three points on a line
    test_data_reg=np.array([2,4,6])
    result=get_doubling_time_via_regression(test_data_reg)
    print('the test slope is: '+str(result))

    pd_relational_model=pd.read_csv('data/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
    pd_relational_model=pd_relational_model.sort_values('date',ascending=True).copy()

    # filter first, then the doubling rate on the raw and on the filtered values
    pd_filtered_data=calc_filtered_data(pd_relational_model)

    pd_doubling_rate_data=calc_doubling_rate(pd_filtered_data)

    pd_result_larg=calc_doubling_rate(pd_doubling_rate_data,'confirmed_filtered')


    # the filtered doubling rate is only kept for rows with more than 100 confirmed cases
    mask=pd_result_larg['confirmed']>100

    # np.nan instead of np.NaN: the upper case alias no longer exists in NumPy 2
    pd_result_larg['confirmed_filtered_DR']=pd_result_larg['confirmed_filtered_DR'].where(mask, other=np.nan)
    pd_result_larg.to_csv('data/processed/COVID_final_set.csv',sep=';',index=False)
    print(pd_result_larg[pd_result_larg['country']=='India'].tail())


the test slope is: [2.]
            date state country  confirmed  confirmed_filtered  confirmed_DR  \
37283 2020-09-09    no   India  4465863.0           4467762.2     47.153948   
37284 2020-09-10    no   India  4562414.0           4562549.0     46.453044   
37285 2020-09-11    no   India  4659984.0           4657808.8     47.009377   
37286 2020-09-12    no   India  4754356.0           4753115.8     48.545060   
37287 2020-09-13    no   India  4846427.0           4848422.8     50.992411   

       confirmed_filtered_DR  
37283              48.421074  
37284              48.089699  
37285              48.016714  
37286              48.883904  
37287              49.871634  


# 4 Modeling SIR



In [5]:
# %load src/models/train_model.py
import pandas as pd
import numpy as np

from datetime import datetime
import pandas as pd

from scipy import optimize
from scipy import integrate

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns


sns.set(style="darkgrid")

mpl.rcParams['figure.figsize'] = (16, 9)
pd.set_option('display.max_rows', 500)

from PIL import Image


def _fit_sir_infected(ydata, population):
    ''' Fit beta and gamma of a simple SIR model to one series and return the fitted infected curve

        Parameters:
        ----------
        ydata : np.array
            observed values, the first one is used as initial number of infected
        population : float
            constant size N of the population

        Returns:
        ----------
        np.array with the fitted number of infected, same length as ydata
    '''
    t=np.arange(len(ydata))
    I0=ydata[0]
    S0=population-I0
    R0=0

    def SIR_model_t(SIR,t,beta,gamma):
        ''' Simple SIR model
            S: susceptible population (population that can be infected)
            I: infected people (population already infected)
            R: recovered people (population recovered from COVID)

            overall condition is that the sum of changes (differences) sum up to 0
            dS+dI+dR=0
            S+I+R= N (constant size of population)
        '''
        S,I,R=SIR
        dS_dt=-beta*S*I/population
        dI_dt=beta*S*I/population-gamma*I
        dR_dt=gamma*I
        return dS_dt,dI_dt,dR_dt

    def fit_odeint(t, beta, gamma):
        ''' helper function for the integration '''
        return integrate.odeint(SIR_model_t, (S0, I0, R0), t, args=(beta, gamma))[:,1] # we only would like to get I

    popt, pcov = optimize.curve_fit(fit_odeint, t, ydata,maxfev=50000)
    return fit_odeint(t, *popt)


def model_sir():
    ''' Fit a SIR model for every country with a significant number of cases

        Reads data/processed/world_population.csv and
        data/processed/COVID_full_flat_table.csv, skips the first 60 rows of
        the flat table, keeps the countries with more than 38 cases in the
        first remaining row and adds one fitted column <country>_SIR per
        country. The result is stored in data/processed/COVID_SIR_model.csv.

        Returns:
        ----------
        df_data: pd.DataFrame
            Date, the selected country columns and their _SIR columns

        Raises:
        ----------
        ValueError if a selected country has no entry in the population table
    '''
    df_pop=pd.read_csv('data/processed/world_population.csv',sep=";")

    df_data=pd.read_csv('data/processed/COVID_full_flat_table.csv',sep=';')
    df_data=df_data.iloc[60:,:] #removing first 60 days of covid spread as the data is inconsistent
    df_data=df_data.drop(columns=['Taiwan*'],errors='ignore') # dropping taiwan as the data is inconsistent
    df_data=df_data.reset_index(drop=True)

    #finding countries with significant number of covid cases i.e,>38 in the first kept row
    first_day=df_data.iloc[0].drop('Date')
    country_list=['Date']+[country for country,cases in first_day.items() if cases>38]

    df_data=df_data[country_list].copy() # confining data frame to those particular countries

    sir_columns={}
    for each in country_list[1:]:
        population=df_pop.loc[df_pop['country']==each,'population']
        if population.empty:
            raise ValueError('No population found for country: '+str(each))

        ydata=np.array(df_data[each],dtype=float)
        # the population has to be a plain number, not a one element series
        sir_columns[each+'_SIR']=_fit_sir_infected(ydata,float(population.iloc[0]))

    # add all fitted columns at once, behind the country columns
    df_data=pd.concat([df_data,pd.DataFrame(sir_columns,index=df_data.index)],axis=1)
    df_data.to_csv('data/processed/COVID_SIR_model.csv',sep=';',index=False)
    print(' SIR simulation for '+str(len(sir_columns))+' countries accomplished')


    return df_data

if __name__ == '__main__':

    # runs the SIR fit and writes data/processed/COVID_SIR_model.csv
    model_sir()




 SIR simulation for 100 countries accomplished


# 5 Visualization


In [None]:
# %load src/visualization/visualize_SIR.py


import pandas as pd
import numpy as np

import dash
dash.__version__
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State

import plotly.graph_objects as go

import os
# the csv paths below are relative, so show the folder they are resolved against
print(os.getcwd())
# inputs of the dashboard: the table with filtered values and doubling rates
# and the table with the SIR fit per country
df_input_large=pd.read_csv('data/processed/COVID_final_set.csv',sep=';')
df_input_SIR=pd.read_csv('data/processed/COVID_SIR_model.csv',sep=';')


# empty figure, used as initial content of both graphs in the layout
fig = go.Figure()

app = dash.Dash()

# colour settings for the figures; only 'background' is read in the callbacks of this cell
colors = {
    'background': '#111111',
    'text': '#7FDBFF' }

# Page layout: headline and introduction, the country and timeline selection
# with the timeline graph, followed by the SIR section with its own country
# selection and graph.
app.layout = html.Div([html.H1(children='Applied Data Science on COVID-20 data', style={'color':'red'}),

    dcc.Markdown('''
    The goal of the project is to track Coronavirus spread across countries as the general information available on the internet is not so relevant and informative and to have a deep dive into local development of the spread and predict the future spread.
    This project has been tackled using CRISP-DM approach and major emphasis has been laid on automating the data gathering process, filtered and transformed the gathered data, using machine learning for calculating the doubling rate, developing a SIR Model for forecasting the future spread and finally deploying on a responsive dashboard

    '''),

    html.Div([dcc.Markdown('''
    ## Select Multiple Country for visualization
    ''', style={'color':'green'}),


    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label': each,'value':each} for each in df_input_large['country'].unique()],
        value=['US', 'Germany','Italy'], # which are pre-selected
        multi=True
    )], style={'width': '30%', 'display': 'inline-block','border':'2px black solid', 'borderRadius':5}),

    html.Div([dcc.Markdown('''
        ## Select Timeline of confirmed COVID-20 cases or the approximated doubling time
        ''', style={'color':'green'}),


    dcc.RadioItems(
    id='doubling_time',
    options=[
        {'label': 'Timeline Confirmed ', 'value': 'confirmed'},
        {'label': 'Timeline Confirmed Filtered', 'value': 'confirmed_filtered'},
        {'label': 'Timeline Doubling Rate', 'value': 'confirmed_DR'},
        {'label': 'Timeline Doubling Rate Filtered', 'value': 'confirmed_filtered_DR'},
    ],
    value='confirmed',

    labelStyle={'display': 'inline-block'}
    )],style={'width': '68%', 'float': 'right', 'display': 'inline-block','border':'2px black solid', 'borderRadius':5}),

    dcc.Graph(figure=fig, id='main_window_slope'),


    html.Div([html.H1(
        children='SIR Simulation Curve',
        style={
            'textAlign': 'center',
            'color': '#FFA37F'
        }
    )]),

    html.Div([dcc.Markdown('''
    ## Select Multiple Country for SIR modeling curve
    ''', style={'color':'green'}),

    dcc.Dropdown(
        id='country_drop_SIR_down',
        # every country column that has a fitted <country>_SIR column, instead
        # of a fixed slice that is only right for exactly 100 countries
        options=[ {'label': each,'value':each} for each in df_input_SIR.columns[1:]
                  if each+'_SIR' in df_input_SIR.columns],
        value=['Australia', 'Brazil','India'], # which are pre-selected
        multi=True
    )], style={'display': 'inline-block','border':'2px black solid', 'borderRadius':5}),


    dcc.Graph(figure=fig, id='main_SIR_slope'),


     ], style={'padding':10})





@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])
def update_figure(country_list,show_doubling):
    ''' Build the timeline figure for the selected countries

        Parameters:
        ----------
        country_list: list of str
            selected values of the country drop down
        show_doubling: str
            selected column of df_input_large: 'confirmed', 'confirmed_filtered',
            'confirmed_DR' or 'confirmed_filtered_DR'

        Returns:
        ----------
        dict with the entries 'data' (one trace per country) and 'layout'
    '''

    # the doubling rate columns are the ones ending with '_DR'; the former
    # checks for 'doubling_rate' never matched any of the radio item values
    is_doubling_rate=show_doubling.endswith('_DR')

    if is_doubling_rate:
        my_yaxis={'type':"log",
               'title':'Approximated doubling rate over 3 days (larger numbers are better #stayathome)'
              }
    else:
        my_yaxis={'type':"log",
                  'title':'Confirmed infected people (source johns hopkins csse, log-scale)'
              }

    # the states of a country are combined per date: case numbers are summed
    # up, doubling rates are averaged
    aggregation='mean' if is_doubling_rate else 'sum'

    traces = []
    for each in country_list:

        df_plot=df_input_large[df_input_large['country']==each]

        # only the plotted column is aggregated, the text column 'state'
        # can neither be summed up nor averaged in a meaningful way
        df_plot=df_plot[['country','date',show_doubling]].groupby(['country','date']).agg(aggregation).reset_index()

        traces.append(dict(x=df_plot.date,
                                y=df_plot[show_doubling],
                                mode='markers+lines',
                                marker={'size': 3, 'opacity': 0.5},
                                line= {'width':2, 'opacity' :0.9,},
                                name=each
                        )
                )

    return {
            'data': traces,
            'layout': dict (
                width=1280,
                height=720,

                xaxis={'title':'Timeline',
                        'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f"),
                      },
                 hovermode='closest',

                yaxis=my_yaxis,
                plot_bgcolor=colors['background'],
                paper_bgcolor= colors['background'],
        )
    }

@app.callback(
    Output('main_SIR_slope', 'figure'),
    [Input('country_drop_SIR_down', 'value')])
def update_figure(country_list):
    ''' Build the SIR figure: per selected country the stored curve and its <country>_SIR fit

        Parameters:
        ----------
        country_list: list of str
            selected values of the SIR country drop down

        Returns:
        ----------
        dict with the entries 'data' (two traces per country) and 'layout'
    '''

    def as_trace(column_name):
        # one line with markers for a single column of df_input_SIR
        return dict(x=df_input_SIR.Date,
                    y=df_input_SIR[column_name],
                    mode='markers+lines',
                    marker={'size': 3, 'opacity': 0.5},
                    line={'width':2, 'opacity' :0.9,},
                    name=column_name)

    # per country first the data column, directly followed by its fit
    traces=[as_trace(column_name)
            for each in country_list
            for column_name in (each, each+'_SIR')]

    x_axis={'title':'Timeline',
            'tickangle':-45,
            'nticks':20,
            'tickfont':dict(size=14,color="#7f7f7f"),
           }
    y_axis={'type':"log",
            'title':'SIR prediction'
           }

    figure_layout=dict(width=1280,
                       height=720,
                       xaxis=x_axis,
                       hovermode='closest',
                       yaxis=y_axis,
                       plot_bgcolor=colors['background'],
                       paper_bgcolor=colors['background'])

    return {'data': traces, 'layout': figure_layout}


if __name__ == '__main__':

    # starts the dash server; the call blocks the cell while the server is running
    # NOTE(review): use_reloader=False is presumably needed because the
    # reloader can not restart a notebook kernel - confirm
    app.run_server(debug=True, use_reloader=False)


C:\Users\sohai\Videos\Python\ads_covid-20
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Debugger PIN: 716-321-720
Debugger PIN: 716-321-720
Debugger PIN: 716-321-720
Debugger PIN: 716-321-720
Debugger PIN: 716-321-720
Debugger PIN: 716-321-720
Debugger PIN: 716-321-720
Debugger PIN: 716-321-720
Debugger PIN: 716-321-720


 * Tip: There are .env or .flaskenv files present. Do "pip install python-dotenv" to use them.


 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on
