In [40]:
import csv
import json
import requests
import pandas as pd
from os.path import join

# INFN ISS Scraper

There are no open sources of complete data on new cases, hospitalizations, intensive care occupancy and deaths for the COVID-19 crisis in Italy. However, under an agreement with the ISS (Istituto Superiore di Sanità), the INFN (Istituto Nazionale di Fisica Nucleare) publishes a regularly updated dashboard with this information for every province of Italy.

This is why I have written this small script, which scrapes the information from the dashboard and saves it to a csv file.

## List of units

I have manually written a csv file with the list of available provinces.

In [6]:
# Load the manually compiled list of provinces; the name used in the dashboard
# URLs is taken from the first column of each row.
with open(join('data', 'province.csv'), newline='') as f:
    reader = csv.reader(f)
    # csv.reader yields an empty list for a blank line (e.g. a trailing newline
    # at the end of the file); skip those rows instead of failing with IndexError.
    province = [provincia[0] for provincia in reader if provincia]

## List of datasets

There are four available datasets, all already averaged out on a weekly basis:
- `positivi`, which refers to new positive cases;
- `ricoveri`, which refers to hospitalizations;
- `terapia_intensiva`, which refers to the occupancy of ICUs;
- `deceduti`, which refers to COVID-19 related deaths.

In [67]:
# Dataset identifiers exactly as they appear in the dashboard URLs.
dataset_names = [
    'positivi',           # new positive cases
    'ricoveri',           # hospitalizations
    'terapia_intensiva',  # ICU occupancy
    'deceduti',           # deaths
]

We can cycle through the datasets and scrape the data from the responses to the requests that the dashboard sends to the server. As a first test, we do this for a single province (Agrigento).

In [75]:
# Download the four series for a single province (agrigento) and collect the
# 'y' values of each one, in the same order as dataset_names.
datasets = []
for dataset_name in dataset_names:
    url = f'https://covid19.infn.it/iss/plots/iss_bydate_agrigento_{dataset_name}.div'
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail with a clear HTTP error instead of an obscure IndexError/JSON error
    # when the server answers with an error page.
    response.raise_for_status()
    # Kept under the name `s`: later cells re-parse the last response to get
    # the 'x' values.
    s = response.text
    # NOTE(review): line index 11 and the dropped trailing character are
    # hard-coded and depend on the current layout of the `.div` payload.
    datasets.append(json.loads(s.split('\n')[11].strip()[:-1])[0]['y'])

In [76]:
# Inspect the scraped values: one list per entry of dataset_names.
datasets

[[None,
  None,
  None,
  3.2857142857142856,
  3.142857142857143,
  1.8571428571428572,
  2.0,
  2.0,
  2.142857142857143,
  2.5714285714285716,
  3.4285714285714284,
  4.428571428571429,
  4.571428571428571,
  4.714285714285714,
  5.285714285714286,
  5.428571428571429,
  4.857142857142857,
  5.285714285714286,
  5.142857142857143,
  5.0,
  4.714285714285714,
  4.142857142857143,
  3.4285714285714284,
  3.4285714285714284,
  2.5714285714285716,
  2.2857142857142856,
  2.7142857142857144,
  3.2857142857142856,
  3.857142857142857,
  4.571428571428571,
  4.857142857142857,
  4.285714285714286,
  4.0,
  3.7142857142857144,
  3.2857142857142856,
  2.2857142857142856,
  1.7142857142857142,
  1.2857142857142858,
  1.5714285714285714,
  1.4285714285714286,
  1.4285714285714286,
  1.2857142857142858,
  1.2857142857142858,
  1.4285714285714286,
  1.7142857142857142,
  1.2857142857142858,
  1.2857142857142858,
  1.1428571428571428,
  1.2857142857142858,
  1.5714285714285714,
  1.28571428571428

In [77]:
# `s` still holds the text of the last response fetched in the loop above;
# parse it again and show the 'x' values of its first element.
json.loads(s.split('\n')[11].strip()[:-1])[0]['x']

['2020-03-20T00:00:00',
 '2020-03-21T00:00:00',
 '2020-03-22T00:00:00',
 '2020-03-23T00:00:00',
 '2020-03-24T00:00:00',
 '2020-03-25T00:00:00',
 '2020-03-26T00:00:00',
 '2020-03-27T00:00:00',
 '2020-03-28T00:00:00',
 '2020-03-29T00:00:00',
 '2020-03-30T00:00:00',
 '2020-03-31T00:00:00',
 '2020-04-01T00:00:00',
 '2020-04-02T00:00:00',
 '2020-04-03T00:00:00',
 '2020-04-04T00:00:00',
 '2020-04-05T00:00:00',
 '2020-04-06T00:00:00',
 '2020-04-07T00:00:00',
 '2020-04-08T00:00:00',
 '2020-04-09T00:00:00',
 '2020-04-10T00:00:00',
 '2020-04-11T00:00:00',
 '2020-04-12T00:00:00',
 '2020-04-13T00:00:00',
 '2020-04-14T00:00:00',
 '2020-04-15T00:00:00',
 '2020-04-16T00:00:00',
 '2020-04-17T00:00:00',
 '2020-04-18T00:00:00',
 '2020-04-19T00:00:00',
 '2020-04-20T00:00:00',
 '2020-04-21T00:00:00',
 '2020-04-22T00:00:00',
 '2020-04-23T00:00:00',
 '2020-04-24T00:00:00',
 '2020-04-25T00:00:00',
 '2020-04-26T00:00:00',
 '2020-04-27T00:00:00',
 '2020-04-28T00:00:00',
 '2020-04-29T00:00:00',
 '2020-04-30T00:

In [78]:
# Re-parse the last response (still stored in `s`) and put its 'x' values in
# front of the value lists, so that `datasets` starts with the x column.
parsed_payload = json.loads(s.split('\n')[11].strip()[:-1])
datasets.insert(0, parsed_payload[0]['x'])

We obtain the four datasets as columnar data, then zip the lists together and store them in a Pandas DataFrame.

In [86]:
# Turn the column-wise lists into rows and name the columns.
column_names = ['datetime', 'new_cases', 'hospitalizations', 'intensive', 'deaths']
rows = zip(*datasets)
df = pd.DataFrame.from_records(rows, columns=column_names)
# Drop every row in which at least one value is missing.
df = df.dropna()
# Tag every row with the province the data belongs to.
df['province'] = 'agrigento'

In [87]:
# Display the DataFrame built for the single test province.
df

Unnamed: 0,datetime,new_cases,hospitalizations,intensive,deaths,province
3,2020-03-23T00:00:00,3.285714,0.285714,0.428571,0.428571,agrigento
4,2020-03-24T00:00:00,3.142857,0.142857,0.428571,0.428571,agrigento
5,2020-03-25T00:00:00,1.857143,0.285714,0.428571,0.285714,agrigento
6,2020-03-26T00:00:00,2.000000,0.571429,0.428571,0.285714,agrigento
7,2020-03-27T00:00:00,2.000000,0.571429,0.428571,0.285714,agrigento
...,...,...,...,...,...,...
448,2021-06-11T00:00:00,33.285714,2.857143,0.000000,0.571429,agrigento
449,2021-06-12T00:00:00,31.857143,2.857143,0.000000,0.571429,agrigento
450,2021-06-13T00:00:00,27.428571,2.857143,0.285714,0.571429,agrigento
451,2021-06-14T00:00:00,25.571429,3.142857,0.285714,0.571429,agrigento


In [96]:
# Print the column dtypes, non-null counts and memory usage of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 450 entries, 3 to 452
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   datetime          450 non-null    object 
 1   new_cases         450 non-null    float64
 2   hospitalizations  450 non-null    float64
 3   intensive         450 non-null    float64
 4   deaths            450 non-null    float64
 5   province          450 non-null    object 
dtypes: float64(4), object(2)
memory usage: 24.6+ KB


In [97]:
# Rough size estimate for the full dataset: about 25 KB per province (rounded
# up from the memory usage reported by df.info() above), divided by 1024.
kb_per_province = 25
len(province) * kb_per_province / 1024

2.6123046875

Each province's DataFrame takes roughly 25 KB of memory, so in total the upper bound of the entire dataset should be less than 3 MB, which is a reasonable size.

## Writing the script that automates all of this

We can now cycle through all the provinces and gather the data for all of them. We can then store the data in a csv file.

In [103]:
# NOTE(review): the line index of the JSON data inside each `.div` payload is
# hard-coded and depends on the current layout of the dashboard's responses.
TRACE_LINE = 11
# Seconds to wait for the server; without a timeout a stalled connection
# would block the whole scrape indefinitely.
REQUEST_TIMEOUT = 30
COLUMNS = ['datetime', 'new_cases', 'hospitalizations', 'intensive', 'deaths']


def _fetch_trace(session, provincia, dataset_name):
    """Download one province/dataset payload and return its first element.

    Parameters
    ----------
    session : requests.Session
        Session used for the request, so connections are reused.
    provincia : str
        Province name as used in the dashboard URLs.
    dataset_name : str
        One of the entries of ``dataset_names``.

    Returns
    -------
    dict
        First element of the JSON array found in the payload; the caller reads
        its ``'x'`` and ``'y'`` entries.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    url = f'https://covid19.infn.it/iss/plots/iss_bydate_{provincia}_{dataset_name}.div'
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    # Surface HTTP errors directly instead of failing later while parsing an
    # error page.
    response.raise_for_status()
    # The JSON array sits on one line, followed by one trailing character that
    # is not part of the JSON and has to be dropped.
    payload = response.text.split('\n')[TRACE_LINE].strip()[:-1]
    return json.loads(payload)[0]


prov_dfs = []
tot = len(province)
# One session for all requests: connections to the server are reused.
with requests.Session() as session:
    # Start counting at 1 so the final progress line reads tot/tot.
    for i, provincia in enumerate(province, start=1):
        traces = [_fetch_trace(session, provincia, dataset_name)
                  for dataset_name in dataset_names]
        # As before, the x values of the last dataset provide the datetime column.
        dates = traces[-1]['x']
        # zip() below pairs values purely by position and truncates silently, so
        # make sure every series really covers the same x values.
        for dataset_name, trace in zip(dataset_names, traces):
            if trace['x'] != dates or len(trace['y']) != len(dates):
                raise ValueError(
                    f'Dataset {dataset_name!r} of province {provincia!r} '
                    'does not cover the same dates as the other datasets'
                )
        datasets = [dates] + [trace['y'] for trace in traces]
        prov_df = pd.DataFrame.from_records(zip(*datasets), columns=COLUMNS).dropna()
        prov_df['province'] = provincia
        prov_dfs.append(prov_df)
        print(f'Downloading files and preparing dataframe... {i}/{tot}', end='\r')
# Stack the per-province frames into the final dataset.
df = pd.concat(prov_dfs)
print('\nMade the df dataframe!')

Downloading files and preparing dataframe... 106/107
Made the df dataframe!


In [106]:
# Write the combined dataset to disk, leaving out the DataFrame index.
output_path = join('data', 'covid_iss.csv')
df.to_csv(output_path, index=False)