# Process Italian Covid-19 Data

* Load latest Covid-19 data from [https://github.com/pcm-dpc/COVID-19](https://github.com/pcm-dpc/COVID-19)
* Transform for easy reporting (calcualte day-to-day changes, rename columns)
* Create summary file, similar to international data
* Upload to S3 bucket


## Imports

In [22]:
import numpy as np
import pandas as pd
import datetime as dt
import os

import boto3
from botocore.exceptions import ClientError

## Parameters

In [24]:
INPUT_FILE = 'https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv'
OUTPUT_FILE_FULL = 'dpc-covid19-ita-regions-full.csv'
OUTPUT_FILE_SUMMARY = 'dpc-covid19-ita-regions-summary.csv'



# You need to set up the AWS Access Key ID and AWS Secret Access Key to make it work
# https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html
BUCKET = 'test-covid19'# TODO update when we have the final s3 bucket

## Temp

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


## Input data

In [25]:
data_ita = pd.read_csv(INPUT_FILE)

In [26]:
# data_ita.columns: 
# ['data', 'stato', 'codice_regione', 'denominazione_regione', 'lat', 'long', 'ricoverati_con_sintomi', 'terapia_intensiva', 'totale_ospedalizzati', 
# 'isolamento_domiciliare', 'totale_attualmente_positivi', 'nuovi_attualmente_positivi', 'dimessi_guariti', 'deceduti', 'totale_casi', 'tamponi']

data_ita.columns = ['Date', 'State', 'Region_Code', 'Region', 'Lat', 'Long', 
                    'Hospitalized', 'Intensive_Care', 'Total_Hospitalized', 
                    'Home_Isolation', 'Total_Positive', 'New_Positive', 
                    'Discharged_Healed', 'Deceased', 'Total_Cases', 'Tested']


In [27]:
data_ita.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399 entries, 0 to 398
Data columns (total 16 columns):
Date                  399 non-null object
State                 399 non-null object
Region_Code           399 non-null int64
Region                399 non-null object
Lat                   399 non-null float64
Long                  399 non-null float64
Hospitalized          399 non-null int64
Intensive_Care        399 non-null int64
Total_Hospitalized    399 non-null int64
Home_Isolation        399 non-null int64
Total_Positive        399 non-null int64
New_Positive          399 non-null int64
Discharged_Healed     399 non-null int64
Deceased              399 non-null int64
Total_Cases           399 non-null int64
Tested                399 non-null int64
dtypes: float64(2), int64(11), object(3)
memory usage: 50.0+ KB


In [28]:
# number of regions, number of dates
r = data_ita.Region.nunique()
d = data_ita.Date.nunique()
r, d, r*d

(21, 19, 399)

## Transform data

In [29]:
data_ita.Date = pd.to_datetime(data_ita.Date).dt.floor('d')


In [30]:
# calculate day-to-day changes for all figures (except new positive)
data_ita = data_ita.sort_values(by=['Region_Code', 'Date'])
data_ita['Hospitalized_DIFF'] = data_ita.groupby(['Region_Code'])['Hospitalized'].diff().fillna(0).astype(int)
data_ita['Intensive_Care_DIFF'] = data_ita.groupby(['Region_Code'])['Intensive_Care'].diff().fillna(0).astype(int)
data_ita['Total_Hospitalized_DIFF'] = data_ita.groupby(['Region_Code'])['Total_Hospitalized'].diff().fillna(0).astype(int)
data_ita['Home_Isolation_DIFF'] = data_ita.groupby(['Region_Code'])['Home_Isolation'].diff().fillna(0).astype(int)
data_ita['Total_Positive_DIFF'] = data_ita.groupby(['Region_Code'])['Total_Positive'].diff().fillna(0).astype(int)
data_ita['Discharged_Healed_DIFF'] = data_ita.groupby(['Region_Code'])['Discharged_Healed'].diff().fillna(0).astype(int)
data_ita['Deceased_DIFF'] = data_ita.groupby(['Region_Code'])['Deceased'].diff().fillna(0).astype(int)
data_ita['Total_Cases_DIFF'] = data_ita.groupby(['Region_Code'])['Total_Cases'].diff().fillna(0).astype(int)
data_ita['Tested_DIFF'] = data_ita.groupby(['Region_Code'])['Tested'].diff().fillna(0).astype(int)


In [31]:
#data_ita.head(10)

In [32]:
data_ita.to_csv(OUTPUT_FILE_FULL, index=False)

In [33]:
data_ita.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 399 entries, 12 to 392
Data columns (total 25 columns):
Date                       399 non-null datetime64[ns]
State                      399 non-null object
Region_Code                399 non-null int64
Region                     399 non-null object
Lat                        399 non-null float64
Long                       399 non-null float64
Hospitalized               399 non-null int64
Intensive_Care             399 non-null int64
Total_Hospitalized         399 non-null int64
Home_Isolation             399 non-null int64
Total_Positive             399 non-null int64
New_Positive               399 non-null int64
Discharged_Healed          399 non-null int64
Deceased                   399 non-null int64
Total_Cases                399 non-null int64
Tested                     399 non-null int64
Hospitalized_DIFF          399 non-null int64
Intensive_Care_DIFF        399 non-null int64
Total_Hospitalized_DIFF    399 non-null int64
Home_

In [34]:
columns_summary = ['Country/Region', 'Province/State', 'Date', 'Cases', 'Long', 'Lat', 'Difference']

data_ita_confirmed = data_ita[['State', 'Region', 'Date', 'Total_Positive' , 'Long', 'Lat', 'Total_Positive_DIFF']].copy()
data_ita_confirmed.columns = columns_summary
data_ita_confirmed['Case_Type'] = 'Confirmed'

data_ita_deceased = data_ita[['State', 'Region', 'Date', 'Deceased' , 'Long', 'Lat', 'Deceased_DIFF']].copy()
data_ita_deceased.columns = columns_summary
data_ita_deceased['Case_Type'] = 'Deceased'

data_ita_recovered = data_ita[['State', 'Region', 'Date', 'Discharged_Healed' , 'Long', 'Lat', 'Discharged_Healed_DIFF']].copy()
data_ita_recovered.columns = columns_summary
data_ita_recovered['Case_Type'] = 'Recovered'

data_ita_active = data_ita[['State', 'Region', 'Date', 'Total_Cases' , 'Long', 'Lat', 'Total_Cases_DIFF']].copy()
data_ita_active.columns = columns_summary
data_ita_active['Case_Type'] = 'Active'

In [35]:
data_ita_summary = pd.concat([data_ita_confirmed, data_ita_deceased, data_ita_recovered, data_ita_active], ignore_index = True)

In [36]:
data_ita_summary.to_csv(OUTPUT_FILE_SUMMARY, index=False)

## Upload to S3

**NOT TESTED YET !!!**

In [37]:
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket
    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        response = s3_client.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        print(e)
        return False
    return True

In [38]:
upload_file(OUTPUT_FILE_FULL, BUCKET, object_name=None)
upload_file(OUTPUT_FILE_SUMMARY, BUCKET, object_name=None)

NoCredentialsError: Unable to locate credentials