# Setting up the full walkthrough

# 1. Update All Data

In [35]:
# %load ../src/data/get_data.py
"""
Created on Fri Aug 21 13:02:59 2020

@author: Sriram
"""

import subprocess
import os
import pandas as pd
import numpy as np
from datetime import datetime

# When started from inside the notebooks folder, move one level up so the
# relative data paths used further down resolve from the project root.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir("../")


# Function to pull the latest data from the Johns Hopkins GitHub repository
def get_john_hopkins():
    """Update the local clone of the Johns Hopkins COVID-19 repository.

    Runs ``git pull`` inside ``data/raw/COVID-19/`` (relative to the current
    working directory), where the data is kept as CSV files. The captured
    stderr and stdout of git are printed as raw bytes.

    git also writes regular progress information to stderr, so the printed
    "Error" line is not empty on a successful pull; a failed pull is
    reported through the process exit code instead.

    Raises
    ------
    FileNotFoundError
        If the ``git`` executable or the repository directory cannot be found.
    """
    # An argument list without a shell avoids any dependency on the
    # platform shell; run() waits for git to finish and captures both pipes.
    git_pull = subprocess.run(['git', 'pull'],
                              cwd=os.path.dirname('data/raw/COVID-19/'),
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)

    print("Error : " + str(git_pull.stderr))
    print("out : " + str(git_pull.stdout))

    # The exit status is the reliable failure signal, because stderr carries
    # output on success as well.
    if git_pull.returncode != 0:
        print("git pull failed with exit code " + str(git_pull.returncode))

# Refresh the raw data only when this code runs as the main module.
if __name__ == "__main__":
    get_john_hopkins()

Error : b'From https://github.com/CSSEGISandData/COVID-19\n   37b2fa02..b6cf765a  master     -> origin/master\n   4fe70bba..f759b906  web-data   -> origin/web-data\n'
out : b'Updating 37b2fa02..b6cf765a\nFast-forward\n csse_covid_19_data/README.md                       |    1 +\n .../csse_covid_19_daily_reports/08-29-2020.csv     | 3951 ++++++++++++\n .../csse_covid_19_daily_reports/08-30-2020.csv     | 3951 ++++++++++++\n .../csse_covid_19_daily_reports_us/08-29-2020.csv  |   59 +\n .../csse_covid_19_daily_reports_us/08-30-2020.csv  |   59 +\n .../time_series_covid19_confirmed_US.csv           | 6682 ++++++++++----------\n .../time_series_covid19_confirmed_global.csv       |  534 +-\n .../time_series_covid19_deaths_US.csv              | 6682 ++++++++++----------\n .../time_series_covid19_deaths_global.csv          |  534 +-\n .../time_series_covid19_recovered_global.csv       |  508 +-\n 10 files changed, 15491 insertions(+), 7470 deletions(-)\n create mode 100644 csse_covid_19_data/c

# 2. Process Pipeline

In [36]:
#checking current working directory
os.getcwd()

'C:\\Users\\Sriram\\eds_covid-19'

In [37]:
# %load src/data/process_JH_data
"""
Created on Fri Aug 21 18:59:53 2020

@author: Sriram
"""

import pandas as pd
import numpy as np
from datetime import datetime

# Convert the raw (wide) JH time series into a relational (long) table
def store_relational_JH_data(data_path='data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
                             out_path='data/processed/COVID_relational_confirmed.csv'):
    """Process raw JH data into a relational data structure.

    The raw table has one row per (Province/State, Country/Region) and one
    column per date. It is reshaped to one row per (date, state, country)
    with the value in column ``confirmed`` and written as a ';'-separated
    CSV without the index. The row count and the latest date are printed.

    Parameters
    ----------
    data_path : str
        Location of the raw confirmed-cases time series CSV. Defaults to the
        file inside the local COVID-19 repository clone.
    out_path : str
        Location of the relational CSV that is written. Its directory must
        already exist.
    """
    pd_raw = pd.read_csv(data_path)
    pd_data_base = pd_raw.rename(columns={'Country/Region': 'country',
                                          'Province/State': 'state'})
    # Missing province names are replaced by the placeholder 'no'
    pd_data_base['state'] = pd_data_base['state'].fillna('no')
    # The coordinates are not part of the relational model
    pd_data_base = pd_data_base.drop(['Lat', 'Long'], axis=1)

    # Transposing puts the dates into the index; stacking both column levels
    # then yields one value per (date, state, country). The unnamed date
    # level comes out of reset_index as 'level_0', the values as column 0.
    pd_relational_model = (pd_data_base.set_index(['state', 'country'])
                           .T
                           .stack(level=[0, 1])
                           .reset_index()
                           .rename(columns={'level_0': 'date',
                                            0: 'confirmed'}))
    pd_relational_model['date'] = pd_relational_model.date.astype('datetime64[ns]')
    pd_relational_model.to_csv(out_path, sep=';', index=False)
    print(' Number of rows stored: ' + str(pd_relational_model.shape[0]))
    print(' Latest date is: ' + str(pd_relational_model.date.max()))
# Run the conversion only when this code runs as the main module.
if __name__ == "__main__":
    store_relational_JH_data()

 Number of rows stored: 59052
 Latest date is: 2020-08-30 00:00:00


# 3. Filtering and Slope Calculation

In [38]:
#checking current working directory
os.getcwd()

'C:\\Users\\Sriram\\eds_covid-19'

In [40]:
# %load src/features/build_features.py
"""
Created on Sat Aug 22 10:32:53 2020

@author: Sriram
"""

import numpy as np
from sklearn import linear_model
import pandas as pd
from scipy import signal

# Shared linear regression object; it is re-fitted on every call below
reg=linear_model.LinearRegression(fit_intercept=True)

def get_doubling_time_via_regression(in_array):
    """Use linear regression to approximate the doubling rate of three values.

    A straight line is fitted through the three values placed at
    x = -1, 0, 1. The result is the fitted intercept divided by the fitted
    slope.

    Parameters
    ----------
    in_array : array-like
        Exactly three consecutive observations.

    Returns
    -------
    float
        ``intercept / slope`` as a scalar.
        NOTE(review): for a flat input the slope is zero; presumably numpy
        then yields inf/nan with a warning rather than raising - confirm.

    Raises
    ------
    AssertionError
        If ``in_array`` does not hold exactly three values.
    """
    # For safety, check the input length before any fitting is done
    assert len(in_array)==3
    y=np.array(in_array)
    X=np.arange(-1,2).reshape(-1,1)
    reg.fit(X,y)
    intercept=reg.intercept_
    # coef_ holds one coefficient per feature column and X has exactly one
    # column; take that element so a scalar is returned instead of a
    # length-1 array (rolling.apply expects a single value per window).
    slope=reg.coef_[0]
    return intercept/slope

def savgol_filter(df_input,column='confirmed',window=5):
    """Smooth one column with a Savitzky-Golay filter of polynomial degree 1.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to smooth.
    window : int
        Window length handed to ``scipy.signal.savgol_filter``.
        NOTE(review): scipy presumably rejects inputs shorter than the
        window in its default mode - confirm for short groups.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_input`` with the additional column
        ``<column>_filtered``; ``df_input`` itself is left unchanged.
    """
    degree=1
    # Work on a copy: writing the new column into df_input directly would
    # silently modify the caller's frame (or the group handed in by
    # groupby.apply).
    df_result=df_input.copy()
    # we fill the missing entries with zero
    filter_in=df_input[column].fillna(0)
    result=signal.savgol_filter(np.array(filter_in),
                        window,
                        degree)
    df_result[str(column+'_filtered')]=result
    return df_result

def rolling_reg(df_input,col='confirmed'):
    """Compute the rolling doubling rate for one column of a dataframe.

    Input is a dataframe; the return value is a single series of doubling
    rates, obtained by applying the regression to every window of three
    consecutive rows. Rows without a full window of three values get no
    result.
    """
    window_len=3
    windows=df_input[col].rolling(window=window_len,min_periods=window_len)
    return windows.apply(get_doubling_time_via_regression,raw=False)

def calc_filtered_data(df_input,filter_on='confirmed'):
    """Apply the SavGol filter per (state, country) and return the merged dataset.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain the columns 'state', 'country' and ``filter_on``.
    filter_on : str
        Column to smooth.

    Returns
    -------
    pandas.DataFrame
        A new frame holding all columns of ``df_input`` plus
        ``<filter_on>_filtered``.

    Raises
    ------
    AssertionError
        If one of the required columns is missing.
    """
    must_contain={'state','country',filter_on}
    assert must_contain.issubset(set(df_input.columns)),'Error in calc_filtered_data not all columns in data Frame'
    df_output=df_input.copy()
    # group_keys=False keeps the original row labels on the per-group
    # results; the index-based merge below relies on them.
    grouped=df_output[['state','country',filter_on]].groupby(['state','country'],group_keys=False)
    # Forward filter_on explicitly; otherwise savgol_filter would always fall
    # back to its default column regardless of the requested one.
    pd_filtered_result=grouped.apply(savgol_filter,filter_on)
    df_output=pd.merge(df_output,pd_filtered_result[[filter_on+'_filtered']],left_index=True,right_index=True,how='left')

    return df_output


def calc_doubling_rate(df_input,filter_on='confirmed'):
    """Calculate the doubling rate per (state, country) and return the dataframe.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain the columns 'state', 'country' and ``filter_on``.
    filter_on : str
        Column the rolling regression is applied to.

    Returns
    -------
    pandas.DataFrame
        All columns of ``df_input`` plus ``<filter_on>_DR``.
        NOTE(review): the row index of the result is whatever the merge
        below produces and may differ from the input's index - confirm
        before relying on it.

    Raises
    ------
    AssertionError
        If one of the required columns is missing.
    """
    must_contain={'state','country',filter_on}
    assert must_contain.issubset(set(df_input.columns)),'Error in calc_doubling_rate not all columns in data Frame'
    # group_keys=True stacks the per-group series under (state, country,
    # original row label); reset_index then exposes the unnamed row label
    # as 'level_2', which is renamed to 'index' for the merge.
    pd_DR_result=df_input[['state','country',filter_on]].groupby(['state','country'],group_keys=True).apply(rolling_reg,filter_on).reset_index()
    pd_DR_result=pd_DR_result.rename(columns={filter_on:filter_on+'_DR','level_2':'index'})
    # Attach the doubling rate to each input row through its row label
    df_output=pd.merge(df_input,pd_DR_result[['index',str(filter_on+'_DR')]],left_index=True,right_on=['index'],how='left')
    df_output=df_output.drop(columns=['index'])
    return df_output


if __name__=='__main__':
    #test_data=np.array([2,4,6])
    #doubling_time=get_doubling_time_via_regression(test_data)
    #print('Test slope is :'+str(doubling_time))
    # We read the data from file; the first column (date) is parsed as datetime
    pd_JH_data=pd.read_csv('data/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
    pd_JH_data=pd_JH_data.sort_values('date',ascending=True).reset_index(drop=True).copy()
    # We process the data calculating filtered data and doubling rate,
    # once on the raw and once on the filtered confirmed cases
    pd_JH_result_large=calc_filtered_data(pd_JH_data)
    pd_JH_result_large=calc_doubling_rate(pd_JH_result_large)
    pd_JH_result_large=calc_doubling_rate(pd_JH_result_large,filter_on='confirmed_filtered')
    # we apply a threshold on confirmed column since if values are small doubling rate goes to infinity
    mask=pd_JH_result_large['confirmed']>100
    # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0
    pd_JH_result_large['confirmed_filtered_DR']=pd_JH_result_large['confirmed_filtered_DR'].where(mask,other=np.nan)
    pd_JH_result_large.to_csv('data/processed/COVID_final_set.csv',sep=';',index=False)
    print(pd_JH_result_large.head())

            date    state       country  confirmed  confirmed_filtered  \
0     2020-01-22  Alberta        Canada        0.0                 0.0   
37518 2020-01-22       no  Korea, South        1.0                 0.8   
37740 2020-01-22       no        Kosovo        0.0                 0.0   
37962 2020-01-22       no        Kuwait        0.0                 0.0   
38184 2020-01-22       no    Kyrgyzstan        0.0                 0.0   

       confirmed_DR  confirmed_filtered_DR  
0               NaN                    NaN  
37518           NaN                    NaN  
37740           NaN                    NaN  
37962           NaN                    NaN  
38184           NaN                    NaN  


# 4. Dynamic Dashboard for COVID-19 Data

In [41]:
os.getcwd()

'C:\\Users\\Sriram\\eds_covid-19'

In [None]:
# %load src/visualization/visualize.py
"""
Created on Mon Aug 24 17:00:40 2020

@author: Sriram
"""

import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input,Output

# Load the processed, ';'-separated data set that the dashboard displays
df_input_large=pd.read_csv('data/processed/COVID_final_set.csv',sep=';')

# Initial empty figure for the graph component
fig=go.Figure()
app=dash.Dash()
# Page layout: intro text, country selector, quantity selector and the graph
app.layout=html.Div([
        dcc.Markdown('''
                     # Applied Datascience on COVID-19 Data
                     Goal of the project is to create a responsive Dashboard 
                     with data from many countries in an automated way through:
                    data gathering , data transformation,
                    filtering and machine learning to approximate doubling time.
                    '''),
        # Country dropdown menu (multi-select); one option per country in the data
        dcc.Markdown(''' ## Multi-Select Country for Visualization'''),
        
        dcc.Dropdown( id='country_drop_down',
                     options=[{'label':each,'value':each} for each in df_input_large['country'].unique()],
                     value=['Germany','India','US'],
                     multi=True),
        # Dropdown menu to choose between confirmed cases and doubling rate;
        # each value is the name of a column in df_input_large
        dcc.Markdown(''' ## Select Timeline of confirmed COVID-19 cases or approximated doubling time'''),
        
        dcc.Dropdown( id='doubling_time',
                     options=[
                             {'label':'Timeline Confirmed','value':'confirmed'},
                             {'label':'Timeline Confirmed Filtered','value':'confirmed_filtered'},
                             {'label':'Timeline Doubling Rate','value':'confirmed_DR'},
                             {'label':'Timeline Doubling Rate Filtered','value':'confirmed_filtered_DR'},
                             ],
                     value='confirmed',
                     multi=False),
        # Graph whose 'figure' property is the output of the update_figure callback
        dcc.Graph(figure=fig,id='main_window_slope')
        
                    ])

@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])
def update_figure(country_list,show_doubling):
    """Build the figure for the selected countries and the selected quantity.

    Parameters
    ----------
    country_list : list of str or None
        Countries chosen in the multi-select dropdown. An empty or missing
        selection produces a figure without traces.
    show_doubling : str
        Name of the column of ``df_input_large`` to plot.

    Returns
    -------
    dict
        Figure description with the keys 'data' (one trace per country) and
        'layout'.
    """
    is_doubling_rate='DR' in show_doubling
    if is_doubling_rate:
        my_yaxis={'type':"log",
               'title':'Approximated doubling rate over 3 days'
                 }
    else:
        my_yaxis={'type':"log",
                  'title':'Confirmed infected people (source: johns hopkins csse, log-scale)'
                 }
    # Case counts of the provinces add up to the country total, whereas
    # doubling rates cannot be added up and are averaged instead. This now
    # holds for both doubling-rate columns, not only the filtered one.
    how='mean' if is_doubling_rate else 'sum'

    #Define the traces for the countries
    traces = []
    for each in (country_list or []):
        df_plot=df_input_large[df_input_large['country']==each]
        # Aggregate only the plotted column: including the text column
        # 'state' makes the mean fail on pandas versions that no longer drop
        # non-numeric columns silently.
        df_plot=df_plot.groupby(['country','date'])[[show_doubling]].agg(how).reset_index()

        traces.append(dict(x=df_plot.date,
                                y=df_plot[show_doubling],
                                mode='markers+lines',
                                opacity=0.9,
                                name=each
                        )
                    )

    return {'data':traces,
            'layout':dict(
                width=1280,
                height=600,
                xaxis={'title':'Timeline',
                        'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f")},
                yaxis=my_yaxis)
            }

# Start the Dash server only when this code runs as the main module;
# the reloader stays off.
if __name__ == "__main__":
    app.run_server(debug=True, use_reloader=False)

Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Debugger PIN: 

 * Tip: There are .env or .flaskenv files present. Do "pip install python-dotenv" to use them.


 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on
