In [1]:
import pandas as pd
from numpy import inf, NaN, where
from datetime import datetime
from os import listdir, path
from pycovid import pycovidfunc as cv

In [2]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# 1 - DATA TO BE READ BY TABLEAU

In [5]:
def world_data_formatter(raw_data):
    '''
    Creates the world data report from the raw data dataframe.
    
    This function works along the raw_data as returned by the
    raw_data_formatter function. Changes in raw_data_formatter
    affect directly this function.
    
    It creates all columns necessary for analysis with Tableau
    from the John Hopkins Data Science Center and it returns a
    new DataFrame object with calculated columns.
    
    Parameters
    ----------
    raw_data: obj, DataFrame
        the raw data DataFrame as returned by the raw_data_formatter
        function.
    '''
    import pandas as pd
    from numpy import inf, NaN, where
    from datetime import datetime
    
    df_by_country = df.groupby(['Country/Region','Date']).sum().reset_index()

    # Calculating the number of days since the 1st case:
    df_by_country['Days_since_1st_case'] = df_by_country['Date']

    countries = df_by_country['Country/Region'].unique()
    for country in countries:
        idx = where(df_by_country['Country/Region'] == country)
        first_date = pd.to_datetime(df_by_country['Date'].loc[min(idx[0])])
        for index in idx[0]:
            date_diff = (pd.to_datetime(df_by_country.at[index,'Days_since_1st_case']) - 
                         first_date).days
            df_by_country.at[index,'Days_since_1st_case'] = date_diff


    # columns over which the calculations will be performed
    root_columns = ['Active','Confirmed','Deaths','Recovered']

    # creating columns of daily percentage of increase in values:
    for col in root_columns:
        col_daily_inc = col + "_daily_%inc_by_country"
        col_new_cases = col + '_new_cases'
        col_new_cases_inc = col + '_new_cases_inc_rate'
        col_new_cases_inc_speed = col + '_new_cases_inc_rate_speed'

        df_by_country[col_new_cases] = (df_by_country[col] - 
                                        df_by_country[col].shift(periods=1)
                                       ).fillna(value=0)
        df_by_country[col_daily_inc] = df_by_country[col].pct_change().replace([inf, NaN], 0)*100

        # 1st derivative of column datas. It represents the rate of change in new cases:
        df_by_country[col_new_cases_inc] = (df_by_country[col_new_cases] - 
                                            df_by_country[col_new_cases].shift(periods=1)
                                           ).fillna(value=0)        

        # 2nd derivative of column datas. It represents the acceleration of the increase rate
        # of the new cases:
        df_by_country[col_new_cases_inc_speed] = (df_by_country[col_new_cases_inc] -
                                                  df_by_country[col_new_cases_inc].shift(periods=1)
                                                  ).fillna(value=0)
        
    return df_by_country

In [6]:
df_by_country.to_json(config.loc['formatted_data'].path,orient='records')

# DATA FOR PYTHON USE

In [5]:
# drop columns that won't be used in the calculations:
column_labels = ['Last Update','Latitude','Longitude','Year','Month','Week','Day','Mortality rate in %']
df_formatted.drop(axis=1,inplace=True,labels=column_labels)

df_by_country = df_formatted.groupby(['Country/Region','Date']).sum().reset_index()

# Formatting dataframe for further calculations:
root_columns = ['Active','Confirmed','Deaths','Recovered']       # columns over which the calculations will be performed
MA = [3,7,15]                                                    # Moving Average intervals for computation

# creating columns of daily percentage of increase in values:
for col in root_columns:
    df_aux = df_by_country.groupby('Country/Region')

    country_daily_inc = col + "_daily_%inc_by_country"
   
    df_by_country[country_daily_inc] = df_aux[col].pct_change().fillna(value=0)*100
    for interval in MA:
        interval_col_name = col + "_" + str(interval) + "day_MA"
        df_by_country[interval_col_name] = df_aux.rolling(interval)[col].mean().fillna(method='bfill').reset_index(drop=True)    
        
        if interval == min(MA):
            column_name_first = col + '_1st_derivative'
            column_name_second = col + '_2nd_derivative'

            df_by_country[column_name_first] = (df_by_country[interval_col_name] - df_by_country[interval_col_name].shift(periods=1)).fillna(method='bfill')
            df_by_country[column_name_second] = (df_by_country[column_name_first] - df_by_country[column_name_first].shift(periods=1)).fillna(method='bfill')        

In [6]:
df_by_country.sort_values(by=['Country/Region','Date']).head(5)

Unnamed: 0,Country/Region,Date,Active,Confirmed,Deaths,Recovered,Active_daily_%inc_by_country,Active_3day_MA,Active_1st_derivative,Active_2nd_derivative,Active_7day_MA,Active_15day_MA,Confirmed_daily_%inc_by_country,Confirmed_3day_MA,Confirmed_1st_derivative,Confirmed_2nd_derivative,Confirmed_7day_MA,Confirmed_15day_MA,Deaths_daily_%inc_by_country,Deaths_3day_MA,Deaths_1st_derivative,Deaths_2nd_derivative,Deaths_7day_MA,Deaths_15day_MA,Recovered_daily_%inc_by_country,Recovered_3day_MA,Recovered_1st_derivative,Recovered_2nd_derivative,Recovered_7day_MA,Recovered_15day_MA
0,Azerbaijan,2020-02-28 00:00:00,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.4,0.0,1.0,0.0,0.0,1.0,1.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Afghanistan,2020-02-24 00:00:00,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.4,0.0,1.0,0.0,0.0,1.0,1.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Afghanistan,2020-02-25 00:00:00,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.4,0.0,1.0,0.0,0.0,1.0,1.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Afghanistan,2020-02-26 00:00:00,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.4,0.0,1.0,0.0,0.0,1.0,1.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Afghanistan,2020-02-27 00:00:00,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.4,0.0,1.0,0.0,0.0,1.0,1.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
df_by_country.to_json('country_report.json',orient='records')

In [1]:
import os

In [11]:
%run pycovid

No existing files were updated
0 new files found. No further action necessary
