In [2]:
# Start-up marker so the notebook log shows when the run began.
print("Initializing data engineering!")
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import json, requests

Initializing data engineering!


### World data engineering
#### Fetching worldwide data

In [3]:
# Alternative COVID-19 data sources tried earlier (kept for reference only):
#   pd.read_json('https://pomber.github.io/covid19/timeseries.json')
#   pd.read_json('https://covidapi.info/api/v1/country/BRA')
#   pd.read_json('https://api.covid19api.com/dayone/country/brazil/status/confirmed')
#   API docs: https://documenter.getpostman.com/view/10808728/SzS8rjbc?version=latest#cc76052f-6601-4645-80e5-ca7aaa36f8ef

# Country reference table; later cells read its 'country' and 'PopTotal' columns.
df_countries = pd.read_csv('../data/world_countries_2019.csv')

# Download the per-country time series. `j` maps each country name to a list of
# daily entries (later cells read their 'date', 'confirmed', 'deaths' and
# 'recovered' keys).
url = "https://pomber.github.io/covid19/timeseries.json"
# A timeout keeps "Run All" from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of trying to parse
# an error page as JSON.
req = requests.get(url, timeout=30)
req.raise_for_status()
j = req.json()

#### Fetching countries' pandemic data from Pomber's JSON into a dataframe

In [4]:
# Country names are the top-level keys of the downloaded JSON, in source order.
countries = list(j)

# Flatten the nested JSON into one record per (country, date). Entries whose
# cumulative 'confirmed' count is 0 are skipped, so each country's series only
# contains days with at least one confirmed case.
records = [
    {
        'country': country,
        'date': entry['date'],
        'cases': entry['confirmed'],
        'deaths': entry['deaths'],
        'recoveries': entry['recovered'],
    }
    for country in countries
    for entry in j[country]
    if entry['confirmed'] != 0
]

# Explicit columns keep the schema stable even if no record survives the filter.
df = pd.DataFrame(records, columns=['country', 'date', 'cases', 'deaths', 'recoveries'])
df[df['country'] == 'France'].tail()

Unnamed: 0,country,date,cases,deaths,recoveries
2961,France,2020-4-18,149149,19345,36587
2962,France,2020-4-19,154097,19744,37183
2963,France,2020-4-20,156480,20292,38036
2964,France,2020-4-21,159297,20829,39819
2965,France,2020-4-22,157125,21373,41326


#### Feature engineering

In [5]:
# Column plan. Each tuple is (source column, derived column); the tuples are
# listed in the order in which the derived columns get added to `df`.
daily_of = (('cases', 'case_day'), ('deaths', 'death_day'), ('recoveries', 'recovery_day'))
pct_var_of = (('case_day', '%var_case_day'), ('death_day', '%var_death_day'),
              ('recovery_day', '%var_recovery_day'))
per_million_of = (('case_day', 'cases_million'), ('death_day', 'deaths_million'),
                  ('recovery_day', 'recoveries_million'))
avg7_of = (('case_day', 'avg7_cases'), ('death_day', 'avg7_deaths'),
           ('recovery_day', 'avg7_recoveries'),
           ('cases_million', 'avg7_cases_million'), ('deaths_million', 'avg7_deaths_million'),
           ('recoveries_million', 'avg7_recoveries_million'))

for country in countries:
    # Boolean selector for this country's rows; the 'country' column is never
    # modified below, so one mask serves the whole iteration.
    rows = df.country == country

    # Day counter: 1, 2, 3, ... over the country's rows.
    n_rows = int(rows.sum())
    df.loc[rows, 'day'] = np.arange(1, n_rows + 1, 1)

    # Daily values: first difference of each cumulative series.
    for total_col, daily_col in daily_of:
        df.loc[rows, daily_col] = df.loc[rows, total_col].diff()

    # % daily variations relative to the previous row. Infinite results
    # (previous value was 0) and NaN results are both stored as 0.
    for daily_col, pct_col in pct_var_of:
        today = df.loc[rows, daily_col]
        yesterday = today.shift()
        change = (today - yesterday) / yesterday * 100
        df.loc[rows, pct_col] = change.replace([np.inf, -np.inf], 0).replace([np.nan], 0).round(2)

    # diff() leaves NaN on the country's first row, so the first daily value is
    # set to the first cumulative value instead. This happens after the
    # % variation step, which therefore still saw the first row as NaN.
    first_day = rows & (df['day'] == 1)
    for total_col, daily_col in daily_of:
        df.loc[first_day, daily_col] = df.loc[first_day, total_col]

    # Look up the country's population and derive the per-million indicators.
    # Countries absent from df_countries get 0 in every per-million column.
    # NOTE(review): presumably PopTotal is expressed in thousands, so dividing
    # by 1000 yields millions -- confirm against the source dataset.
    population_row = df_countries[df_countries['country'] == country]
    has_population = not population_row.empty
    if has_population:
        million = population_row['PopTotal'].values[0] / 1000
    for daily_col, rate_col in per_million_of:
        if has_population:
            df.loc[rows, rate_col] = (df.loc[rows, daily_col] / million).round(1)
        else:
            df.loc[rows, rate_col] = 0

    # Moving averages over the last 7 rows; the incomplete leading windows
    # (NaN) become 0 and the result is truncated to an integer.
    for source_col, avg_col in avg7_of:
        window_mean = df.loc[rows, source_col].rolling(window=7).mean()
        df.loc[rows, avg_col] = (
            window_mean.replace([np.inf, -np.inf], 0).replace([np.nan], 0).astype('int')
        )


# Ratios and active cases are computed over the whole frame at once.
df['perc_death'] = (df['deaths'] / df['cases'] * 100).round(2)
df['perc_recovery'] = (df['recoveries'] / df['cases'] * 100).round(2)
df['active_cases'] = df['cases'] - df['recoveries'] - df['deaths']

df.fillna(0, inplace=True)

# With the NaNs gone, the counters can be stored as integers.
for int_col in ('day', 'case_day', 'death_day', 'recovery_day'):
    df[int_col] = df[int_col].astype('int')

df.tail()

Unnamed: 0,country,date,cases,deaths,recoveries,day,case_day,death_day,recovery_day,%var_case_day,...,recoveries_million,avg7_cases,avg7_deaths,avg7_recoveries,avg7_cases_million,avg7_deaths_million,avg7_recoveries_million,perc_death,perc_recovery,active_cases
9268,Yemen,2020-4-18,1,0,0,9,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
9269,Yemen,2020-4-19,1,0,0,10,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
9270,Yemen,2020-4-20,1,0,0,11,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
9271,Yemen,2020-4-21,1,0,0,12,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
9272,Yemen,2020-4-22,1,0,0,13,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


#### Saving CSV

In [8]:
# Persist the engineered world dataset; the row index is not written.
df.to_csv('../data/world_corona19_data.csv', index=False)

In [9]:
# Spot-check the engineered columns on the last rows for France.
df.loc[df['country'] == 'France'].tail()

Unnamed: 0,country,date,cases,deaths,recoveries,day,case_day,death_day,recovery_day,%var_case_day,...,recoveries_million,avg7_cases,avg7_deaths,avg7_recoveries,avg7_cases_million,avg7_deaths_million,avg7_recoveries_million,perc_death,perc_recovery,active_cases
2961,France,2020-4-18,149149,19345,36587,86,19,642,1581,-99.07,...,24.3,7755.0,784.0,1417.0,119.0,12.0,21.0,12.97,24.53,93217
2962,France,2020-4-19,154097,19744,37183,87,4948,399,596,25942.11,...,9.2,4626.0,761.0,1387.0,71.0,11.0,21.0,12.81,24.13,97170
2963,France,2020-4-20,156480,20292,38036,88,2383,548,853,-51.84,...,13.1,4440.0,758.0,1433.0,68.0,11.0,22.0,12.97,24.31,98152
2964,France,2020-4-21,159297,20829,39819,89,2817,537,1783,18.21,...,27.4,3990.0,725.0,1531.0,61.0,11.0,23.0,13.08,25.0,98649
2965,France,2020-4-22,157125,21373,41326,90,-2172,544,1507,-177.1,...,23.1,3220.0,597.0,1408.0,49.0,9.0,21.0,13.6,26.3,94426


#### Countries not found in the UN dataset

In [10]:
# Print every country from the pandemic data that has no matching row in
# df_countries (these are the ones that received 0 in the per-million columns).
for country in countries:
    found = df_countries['country'].eq(country).any()
    if not found:
        print(country)

Bosnia and Herzegovina
Brunei
Congo (Brazzaville)
Congo (Kinshasa)
Cote d'Ivoire
Diamond Princess
Eswatini
Holy See
Iran
Korea, South
Moldova
North Macedonia
Russia
Taiwan*
Tanzania
United Arab Emirates
Venezuela
Vietnam
Syria
Laos
Libya
West Bank and Gaza
Kosovo
Burma
MS Zaandam


### Brazil data engineering

In [14]:
# Portuguese column names of the government export -> English names used by
# the cells that follow.
brazil_columns = {
    'regiao': 'region',
    'estado': 'state',
    'data': 'date',
    'casosNovos': 'case_day',
    'casosAcumulados': 'cases',
    'obitosNovos': 'death_day',
    'obitosAcumulados': 'deaths',
}

# From here on `df` holds the Brazilian state-level data (semicolon-separated
# file) and no longer the world frame.
df = pd.read_csv('../data/gov_brazil_corona19_data.csv', sep=';').rename(columns=brazil_columns)
df['date'] = df['date'].astype('datetime64[ns]')

df.tail()

Unnamed: 0,region,state,date,case_day,cases,death_day,deaths
2290,Centro-Oeste,DF,2020-04-19,65,827,0,24
2291,Centro-Oeste,DF,2020-04-20,45,872,0,24
2292,Centro-Oeste,DF,2020-04-21,9,881,0,24
2293,Centro-Oeste,DF,2020-04-22,65,946,1,25
2294,Centro-Oeste,DF,2020-04-23,17,963,0,25


#### Feature engineering

In [16]:
# State list is taken before filtering; rows whose cumulative case count is 0
# are then removed from the frame.
states = df.state.unique()
df.drop(df[df['cases'] == 0].index, axis=0, inplace=True)

# Column plan. Each tuple is (source column, derived column). No recovery
# metrics are derived for this dataset.
state_daily_of = (('cases', 'case_day'), ('deaths', 'death_day'))
state_pct_var_of = (('case_day', '%var_case_day'), ('death_day', '%var_death_day'))
state_avg7_of = (('case_day', 'avg7_cases'), ('death_day', 'avg7_deaths'))

for state in states:
    # Boolean selector for this state's rows; the 'state' column is never
    # modified below, so one mask serves the whole iteration.
    rows = df.state == state

    # Day counter: 1, 2, 3, ... over the state's remaining rows.
    n_rows = int(rows.sum())
    df.loc[rows, 'day'] = np.arange(1, n_rows + 1, 1)

    # Daily values are recomputed as the first difference of the cumulative
    # series, replacing the values loaded from the file.
    # NOTE(review): these columns appear to be loaded as integers and receive
    # float/NaN values here; newer pandas releases may warn about that upcast -- confirm.
    for total_col, daily_col in state_daily_of:
        df.loc[rows, daily_col] = df.loc[rows, total_col].diff()

    # % daily variations relative to the previous row. Infinite results
    # (previous value was 0) and NaN results are both stored as 0.
    for daily_col, pct_col in state_pct_var_of:
        today = df.loc[rows, daily_col]
        yesterday = today.shift()
        change = (today - yesterday) / yesterday * 100
        df.loc[rows, pct_col] = change.replace([np.inf, -np.inf], 0).replace([np.nan], 0).round(2)

    # diff() leaves NaN on the state's first row, so the first daily value is
    # set to the first cumulative value instead. This happens after the
    # % variation step, which therefore still saw the first row as NaN.
    first_day = rows & (df['day'] == 1)
    for total_col, daily_col in state_daily_of:
        df.loc[first_day, daily_col] = df.loc[first_day, total_col]

    # Moving averages over the last 7 rows; the incomplete leading windows
    # (NaN) become 0 and the result is truncated to an integer.
    for daily_col, avg_col in state_avg7_of:
        window_mean = df.loc[rows, daily_col].rolling(window=7).mean()
        df.loc[rows, avg_col] = (
            window_mean.replace([np.inf, -np.inf], 0).replace([np.nan], 0).astype('int')
        )

    # Death percentage and its 7-row moving average (kept at 2 decimals).
    df.loc[rows, 'perc_death'] = (df.loc[rows, 'deaths'] / df.loc[rows, 'cases'] * 100).round(2)
    perc_death_mean = df.loc[rows, 'perc_death'].rolling(window=7).mean()
    df.loc[rows, 'avg7_perc_death'] = (
        perc_death_mean.replace([np.inf, -np.inf], 0).replace([np.nan], 0).round(2)
    )

# Same ratio again over the whole frame; this also creates the column when the
# loop above did not run.
df['perc_death'] = (df['deaths'] / df['cases'] * 100).round(2)

df.fillna(0, inplace=True)

# With the NaNs gone, the counters can be stored as integers.
for int_col in ('day', 'case_day', 'death_day'):
    df[int_col] = df[int_col].astype('int')

df.tail()

Unnamed: 0,region,state,date,case_day,cases,death_day,deaths,day,%var_case_day,%var_death_day,avg7_cases,avg7_deaths,perc_death,avg7_perc_death
2290,Centro-Oeste,DF,2020-04-19,65,827,0,24,44,306.25,-100.0,30.0,1.0,2.9,2.71
2291,Centro-Oeste,DF,2020-04-20,45,872,0,24,45,-30.77,0.0,33.0,1.0,2.75,2.77
2292,Centro-Oeste,DF,2020-04-21,9,881,0,24,46,-80.0,0.0,32.0,1.0,2.72,2.78
2293,Centro-Oeste,DF,2020-04-22,65,946,1,25,47,622.22,0.0,37.0,1.0,2.64,2.8
2294,Centro-Oeste,DF,2020-04-23,17,963,0,25,48,-73.85,-100.0,35.0,0.0,2.6,2.78


In [17]:
# Persist the engineered Brazilian dataset; the row index is not written.
df.to_csv('../data/brazil_corona19_data.csv', index=False)

In [18]:
# Completion marker for the notebook log.
print("Data engineering done!")

Data engineering done!
