# Processing the Italian COVID-19 Data

## Imports

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import os
import pycountry

In [None]:
# papermill parameters
# Location under which the output CSV files of this notebook are written.
# NOTE(review): presumably overridden by papermill at execution time - confirm.
output_folder = '../output/'

## Parameters

In [None]:
# URL of the source CSV that is loaded into a DataFrame.
INPUT_FILE = (
    "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv"
)

# File names (relative to the output folder) of the two CSV exports:
# the detailed per-figure table and the long-format summary table.
OUTPUT_FILE_FULL = "PCM_DPS_COVID19-DETAILS.csv"
OUTPUT_FILE_SUMMARY = "PCM_DPS_COVID19.csv"

## Input data

In [None]:
# Load the source CSV directly from its URL into a DataFrame.
data = pd.read_csv(filepath_or_buffer=INPUT_FILE)

In [None]:
# Show the column labels of the freshly loaded frame (notebook cell output).
data.columns

In [None]:
# English labels applied to the loaded frame by position. The assignment
# only succeeds when the frame has exactly as many columns as there are
# labels here, in this order.
column_labels = [
    'Date', 'State', 'Region_Code', 'Region', 'Lat', 'Long',
    'Hospitalized', 'Intensive_Care', 'Total_Hospitalized',
    'Home_Isolation', 'Total_Positive', 'Variation_Total_Positive',
    'New_Positive', 'Discharged_Healed', 'Deceased', 'Total_Cases',
    'Tested', 'Note_IT', 'Note_EN',
]
data.columns = column_labels

In [None]:
# Lookup from the name of each Italian subdivision known to pycountry to
# its subdivision code with the leading "IT-" country prefix removed.
subdivisions = {
    subdivision.name: subdivision.code.replace("IT-", "")
    for subdivision in pycountry.subdivisions.get(country_code="IT")
}

In [None]:
# Rewrite a few region spellings in place; every cell of the frame whose
# value exactly equals a key is replaced by the mapped value.
# NOTE(review): presumably done so the names match the subdivision lookup
# keys used for the ISO code column - confirm.
region_name_fixes = {
    "P.A. Bolzano": "Bolzano",
    "P.A. Trento": "Trento",
    "Emilia Romagna": "Emilia-Romagna",
    "Friuli Venezia Giulia": "Friuli-Venezia Giulia",
}
data.replace(to_replace=region_name_fixes, inplace=True)

In [None]:
# Constant country code for every row, plus the subdivision code looked up
# by region name (None when the name is not a key of `subdivisions`).
data["ISO3166_1"] = "IT"
data["ISO3166_2"] = data["Region"].map(subdivisions.get)

## Transform data

In [None]:
# Parse the Date column and truncate every timestamp to the start of its day.
parsed_dates = pd.to_datetime(data["Date"])
data["Date"] = parsed_dates.dt.floor("D")

In [None]:
# Order the rows by region and then by date so that the per-region
# day-to-day differences computed afterwards (for all figures except new
# positive) compare consecutive dates.
sort_keys = ['Region_Code', 'Date', 'ISO3166_1', 'ISO3166_2']
data = data.sort_values(by=sort_keys)

In [None]:
# For each figure add a "<figure>_Since_Prev_Day" column holding the change
# relative to the previous row of the same region. The first row of every
# region has no predecessor, so its missing difference is stored as 0.
for figure in (
    'Hospitalized',
    'Intensive_Care',
    'Total_Hospitalized',
    'Home_Isolation',
    'Total_Positive',
    'Discharged_Healed',
    'Deceased',
    'Total_Cases',
    'Tested',
):
    change_per_region = data.groupby(['Region_Code'])[figure].diff()
    data[figure + '_Since_Prev_Day'] = change_per_region.fillna(0).astype(int)

In [None]:
# Switch to the Country/Region + Province/State naming used by the exports
# and overwrite the country column with a constant value.
data.rename(columns={"State": "Country/Region", "Region": "Province/State"}, inplace=True)
data["Country/Region"] = "Italy"
# DataFrame.drop returns a new frame, so the result has to be assigned back
# for the column to actually disappear. errors="ignore" keeps the cell safe
# to re-run after the column is already gone.
data = data.drop(columns="Region_Code", errors="ignore")

In [None]:
# Columns written to the detailed export, in output order.
detail_columns = [
    "Country/Region", "Province/State", "Date",
    "Hospitalized", "Intensive_Care", "Total_Hospitalized",
    'Home_Isolation', 'Total_Positive', 'New_Positive',
    'Discharged_Healed', 'Deceased', 'Total_Cases', 'Tested',
    'Hospitalized_Since_Prev_Day', 'Intensive_Care_Since_Prev_Day',
    'Total_Hospitalized_Since_Prev_Day', 'Home_Isolation_Since_Prev_Day',
    'Total_Positive_Since_Prev_Day', 'Discharged_Healed_Since_Prev_Day',
    'Deceased_Since_Prev_Day', 'Total_Cases_Since_Prev_Day',
    'Tested_Since_Prev_Day', "ISO3166_1", "ISO3166_2", "Note_IT", "Note_EN",
]

# Build the path with os.path.join rather than string concatenation so the
# export also lands in the right place when output_folder is passed without
# a trailing path separator.
data.to_csv(
    os.path.join(output_folder, OUTPUT_FILE_FULL),
    index=False,
    header=True,
    columns=detail_columns,
)

In [None]:
# Column layout shared by every per-case-type frame of the summary export.
columns_summary = ['Country/Region', 'Province/State', 'Date', 'Cases', 'Lat', 'Long', 'Difference', 'ISO3166_1', 'ISO3166_2']


def _case_type_frame(total_column, change_column, case_type):
    """Return a copy of ``data`` reduced to the summary layout for one case type.

    ``total_column`` is relabelled to 'Cases' and ``change_column`` to
    'Difference'; ``case_type`` is stored as a constant 'Case_Type' column.
    """
    selected = [
        'Country/Region', 'Province/State', 'Date', total_column,
        'Lat', 'Long', change_column, 'ISO3166_1', 'ISO3166_2',
    ]
    frame = data[selected].copy()
    frame.columns = columns_summary
    frame['Case_Type'] = case_type
    return frame


data_confirmed = _case_type_frame('Total_Cases', 'Total_Cases_Since_Prev_Day', 'Confirmed')
data_deceased = _case_type_frame('Deceased', 'Deceased_Since_Prev_Day', 'Deceased')
data_recovered = _case_type_frame('Discharged_Healed', 'Discharged_Healed_Since_Prev_Day', 'Recovered')
data_active = _case_type_frame('Total_Positive', 'Total_Positive_Since_Prev_Day', 'Active')

In [None]:
# Stack the four per-case-type frames under a fresh 0..n-1 index, then
# select the columns in export order (Case_Type placed right after Date).
summary_order = [
    'Country/Region', 'Province/State', 'Date', 'Case_Type', 'Cases',
    'Lat', 'Long', 'Difference', 'ISO3166_1', 'ISO3166_2',
]
stacked = pd.concat(
    [data_confirmed, data_deceased, data_recovered, data_active],
    ignore_index=True,
)
data_summary = stacked[summary_order]

In [None]:
# Stamp every row with the current UTC time. datetime.utcnow() is deprecated
# since Python 3.12, so take an aware "now" in UTC and strip the tzinfo: the
# stored value stays a naive UTC timestamp and the CSV text is unchanged.
last_update = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)
data_summary["Last_Update_Date"] = last_update

# os.path.join keeps the path correct even when output_folder has no
# trailing path separator.
data_summary.to_csv(os.path.join(output_folder, OUTPUT_FILE_SUMMARY), index=False)