In [1]:
import pandas as pd
from os import listdir, path

In [2]:
# Lift pandas' display truncation: None means "no limit" for both columns and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# 1 - DATA TO BE READ BY POWER BI

In [2]:
# config.csv is indexed by its 'var' column; label_map.csv has no header row,
# so its columns are addressed by position (column 0 becomes the index).
config = pd.read_csv('config.csv', index_col='var')
label_map = pd.read_csv('label_map.csv', header=None, index_col=0)

# Rows tagged 'province' in column 2 give the lookup {index value -> column 1 value}.
province_mapping_dict = label_map.loc[label_map[2] == 'province', 1].to_dict()

# Path of the raw data file, taken from the 'raw_data' row of the config table.
file = config.loc['raw_data', 'path']

In [8]:
# Read the raw data CSV located at `file` into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file)

In [12]:
# Drop the rows whose Province/State is '-' or 'Recovered'. The explicit copy
# makes the filtered frame independent of the original, so the column
# assignment below no longer raises SettingWithCopyWarning.
df = df[~df['Province/State'].isin(['-', 'Recovered'])].copy()

# Rename provinces that have an entry in province_mapping_dict; any other
# value is kept unchanged.
df['Province/State'] = df['Province/State'].map(
    lambda name: province_mapping_dict.get(name, name))

columns = ['Province/State','Country/Region','Date','Confirmed',
           'Active','Recovered','Deaths']
# One row per province / country / date; rows sharing the same key are summed.
df = df[columns].groupby(['Province/State','Country/Region','Date']).sum().reset_index()
df.head(5)

# Persist the aggregated table (without the index) for downstream consumption.
df.to_csv('province_report.csv',index=False)

In [43]:
# Appends the rows of `df_province` (no header, no index) to province_report.csv.
# NOTE(review): in the cells visible here `df_province` is only ever a local
# variable inside province_data_formatter; nothing assigns it at notebook level,
# so this cell would raise NameError on a fresh kernel unless another cell
# defines it — confirm where it is supposed to come from.
df_province.to_csv('province_report.csv',index=False,header=False,mode='a')

In [5]:
# `dw` is the datadotworld api_client factory; it is imported in this cell
# rather than with the other imports at the top of the notebook.
from datadotworld import api_client as dw
# Calls sync_files for the dataset key 'psychopresley/covid19tracking'.
# Presumably this asks data.world to refresh the dataset's files from their
# sources and needs a configured API token — confirm against the datadotworld docs.
dw().sync_files('psychopresley/covid19tracking')

In [13]:
def province_data_formatter(raw_data, mapping=None):
    '''
    Creates the province data report from the raw data dataframe.
    
    Rows whose 'Province/State' is '-' or 'Recovered' are discarded,
    province names are renamed through a lookup table, and the
    'Confirmed', 'Active', 'Recovered' and 'Deaths' columns are summed
    for every (Province/State, Country/Region, Date) combination.
    
    The input DataFrame is not modified.
    
    Parameters
    ----------
    raw_data: obj, DataFrame
        the raw data DataFrame as returned by the raw_data_formatter
        function. It must contain the columns 'Province/State',
        'Country/Region', 'Date', 'Confirmed', 'Active', 'Recovered'
        and 'Deaths'; any other column is ignored.
    
    mapping: dict, optional
        lookup {original province name: replacement name}. Names that
        are not keys of the lookup are kept unchanged. When omitted,
        the notebook-level province_mapping_dict is used.
    
    Returns
    -------
    DataFrame
        one row per (Province/State, Country/Region, Date), sorted by
        those keys, with the four summed value columns.
    '''
    if mapping is None:
        # Fall back to the lookup table defined at notebook level.
        mapping = province_mapping_dict

    # Work on an explicit copy of the filtered rows: assigning a column on a
    # plain boolean-filtered frame raises SettingWithCopyWarning.
    keep = ~raw_data['Province/State'].isin(['-', 'Recovered'])
    report = raw_data.loc[keep].copy()
    report['Province/State'] = report['Province/State'].map(
        lambda name: mapping.get(name, name))

    columns = ['Province/State','Country/Region','Date','Confirmed',
               'Active','Recovered','Deaths']
    keys = ['Province/State','Country/Region','Date']
    return report[columns].groupby(keys).sum().reset_index()

In [24]:
# Alternative function: it reuses the name province_data_formatter, so running this cell replaces the definition above.

def province_data_formatter(df, windows=(3, 7, 15)):
    '''
    Creates the province data report, with calculated columns, from
    the raw data dataframe.
    
    The 'Confirmed', 'Active', 'Recovered' and 'Deaths' columns are
    summed for every (Province/State, Country/Region, Date)
    combination. For each province/country series the function then
    adds:
    
    - '<column> new cases': difference to the previous row of the
      same series (0 for the first row of a series);
    - '<column> <n>-day mov avg': rolling mean over n rows of every
      value column and every 'new cases' column (0 until n rows of
      the series are available).
    
    Rows are ordered by the sorted group keys, so the differences and
    rolling means follow the sort order of 'Date'.
    NOTE(review): this assumes 'Date' values sort chronologically -
    confirm against raw_data_formatter.
    
    Parameters
    ----------
    df: obj, DataFrame
        the raw data DataFrame as returned by the raw_data_formatter
        function. It must contain the columns 'Province/State',
        'Country/Region', 'Date', 'Confirmed', 'Active', 'Recovered'
        and 'Deaths'. It is not modified.
    
    windows: iterable of int, optional
        window lengths (in rows) of the moving averages.
    
    Returns
    -------
    DataFrame
        the aggregated columns, followed by the 'new cases' columns
        and one group of moving average columns per window.
    '''
    keys = ['Province/State', 'Country/Region']
    measures = ['Confirmed', 'Active', 'Recovered', 'Deaths']

    # One row per province / country / date; rows sharing the same key are summed.
    report = (df[keys + ['Date'] + measures]
              .groupby(keys + ['Date']).sum().reset_index())

    # Group on province AND country so that two countries sharing a province
    # name are never treated as a single time series.
    new_cases = [name + ' new cases' for name in measures]
    deltas = report.groupby(keys)[measures].diff().fillna(value=0)
    deltas.columns = new_cases
    report = pd.concat([report, deltas], axis=1)

    averaged = measures + new_cases
    frames = [report]
    for day in windows:
        # Select the numeric columns explicitly: rolling().mean() raises on
        # the string columns ('Date', key columns) in recent pandas versions.
        rolled = (report.groupby(keys)[averaged]
                  .rolling(day).mean()
                  .fillna(value=0)
                  # Drop the group levels to recover the original row labels,
                  # so the result is re-attached by index, not by position.
                  .droplevel(keys)
                  .reindex(report.index))
        rolled.columns = [name + ' {}-day mov avg'.format(day) for name in averaged]
        frames.append(rolled)

    # A single concat instead of growing the frame inside the loop.
    return pd.concat(frames, axis=1)