<h1>Extract Covid Data Dynamically from Romanian Authorities Official Comms</h1>

Because the Romanian authorities do not publish the data directly, we have to scrape it from their official communication site, [here](https://www.mai.gov.ro/).

We intend to retrieve the County-level data for Romania.

# Prepare extraction

In order to extract the data, we will use a utility adapted from [Parsing HTML Tables in Python with BeautifulSoup and pandas](https://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/).


In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

class HTMLTableParser:
    """Turn the ``<table>`` elements of a web page into pandas DataFrames.

    Every cell is stored as the raw text returned by ``get_text()``; no
    numeric conversion is attempted, so callers clean and cast the columns
    they need themselves.
    """

    def parse_url(self, url, timeout=30):
        """Download ``url`` and parse every ``<table>`` found in the page.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the notebook forever.

        Returns:
            A list with one DataFrame per ``<table>`` tag, in the order
            ``find_all`` returns them. A table that could not be parsed is
            represented by an empty DataFrame (see ``parse_html_table``).

        Raises:
            requests.RequestException: If the download fails, times out, or
                the server answers with an HTTP error status.
        """
        response = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page for tables.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html')
        return [self.parse_html_table(table) for table in soup.find_all('table')]

    def parse_html_table(self, table):
        """Convert one parsed ``<table>`` element into a DataFrame of cell text.

        The ``<td>`` cells of each ``<tr>`` form one data row. The first row
        that contains ``<th>`` cells supplies the column names; without one,
        columns are numbered from 0. The column count is taken from the first
        data row: shorter rows are padded with NaN, and cells beyond that
        count are ignored with a printed notice.

        Args:
            table: Any object whose ``find_all('tr')`` yields rows, each row
                offering ``find_all('td')`` / ``find_all('th')`` whose items
                have ``get_text()`` (for example a BeautifulSoup table tag).

        Returns:
            A DataFrame of object dtype indexed 0..n_rows-1. If the number of
            header titles differs from the number of columns, a message is
            printed and an empty DataFrame is returned.
        """
        column_names = []
        body_rows = []

        # Single pass: gather the text of every data row and the header titles.
        for row in table.find_all('tr'):
            cells = [td.get_text() for td in row.find_all('td')]
            if cells:
                body_rows.append(cells)

            # Only the first row that has <th> cells names the columns.
            if not column_names:
                column_names = [th.get_text() for th in row.find_all('th')]

        n_rows = len(body_rows)
        # The first data row fixes the table width.
        n_columns = len(body_rows[0]) if body_rows else 0

        # Safeguard on column titles: report and give up on this table only.
        if column_names and len(column_names) != n_columns:
            print("Column titles do not match the number of columns")
            return pd.DataFrame()

        columns = column_names if column_names else range(n_columns)

        fitted_rows = []
        for cells in body_rows:
            extra = len(cells) - n_columns
            if extra > 0:
                # A wider row must not abort the whole table; keep what fits.
                print(f"Ignoring {extra} extra cell(s) in a table row")
                cells = cells[:n_columns]
            # Pad short rows so every row has exactly n_columns entries.
            fitted_rows.append(cells + [float('nan')] * (n_columns - len(cells)))

        # dtype=object keeps the cell text untouched, whatever it looks like.
        return pd.DataFrame(fitted_rows, columns=columns,
                            index=range(n_rows), dtype=object)


# Run the extraction

We define a small table with the list of dates and corresponding urls.

In [None]:
# (report date, URL of that day's official press release) pairs to scrape.
url_data = [
    (
        '2020-04-02',
        "https://www.mai.gov.ro/informare-covid-19-grupul-de-comunicare-strategica-2-aprilie-ora-13-00/",
    ),
    (
        '2020-04-03',
        "https://www.mai.gov.ro/informare-covid-19-grupul-de-comunicare-strategica-3-aprilie-2020-ora-13-00/",
    ),
    (
        '2020-04-04',
        "https://www.mai.gov.ro/informare-covid-19-grupul-de-comunicare-strategica-4-aprilie-2020-ora-13-00/",
    ),
]

We initialize the HTMLTableParser object. We then parse the tables from every URL in the list, keep only the main payload table of each page (the number of confirmed cases per county), and concatenate these tables.

We also drop the first row (the header, containing the column names) and the last row (the summary, or footer) of each page's table.

In [None]:
# Scrape every page in url_data and stack the per-day county tables.
hp = HTMLTableParser()
daily_frames = []
for current_date, current_url in url_data:
    tables = hp.parse_url(current_url)
    # The first table on the page is the per-county payload.
    payload_table = tables[0]
    print(payload_table.shape)
    # Drop the first row (header) and the last row (summary footer); copy so
    # that adding the date column writes to an independent frame.
    payload_table = payload_table.iloc[1:-1].copy()
    payload_table['date'] = current_date
    daily_frames.append(payload_table)

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
# Row labels of the individual tables are kept, as append did.
all_data_df = pd.concat(daily_frames)
all_data_df.columns = ['No', 'County', 'Confirmed', 'Date']

The errors reported are from parsing one table that we are not interested in.

# Check the data

In [None]:
# Show the (rows, columns) size of the combined frame.
print(all_data_df.shape)

In [None]:
# Distinct values present in the County column.
all_data_df['County'].unique()

In [None]:
# Discard rows whose County, and then whose Confirmed, is the '–' placeholder.
all_data_df = all_data_df[all_data_df['County'] != '–']
all_data_df = all_data_df[all_data_df['Confirmed'] != '–']

In [None]:
# Distinct County values remaining after the placeholder rows were dropped.
all_data_df['County'].unique()

In [None]:
# Distinct values present in the Date column.
all_data_df['Date'].unique()

In [None]:
# The scraped counts are text; cast the Confirmed column to integers.
all_data_df['Confirmed'] = all_data_df['Confirmed'].astype(int)

In [None]:
# Display the last five rows of the combined frame.
all_data_df.tail(n=5)

Our extraction method also picked up the header row of each table; these rows were dropped in the extraction loop above.

# Visualize the data

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns 
import datetime as dt
%matplotlib inline
import datetime as dt
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

def plot_bars_time_variation(d_df, feature_x, feature_y, title, color='Red'):
    """Render an interactive Plotly bar chart of ``feature_y`` against ``feature_x``.

    Args:
        d_df: DataFrame holding the columns named by ``feature_x`` and
            ``feature_y`` as well as 'Date', 'Confirmed' and 'County', which
            are used to build the hover labels.
        feature_x: Name of the column plotted on the x axis.
        feature_y: Name of the column plotted on the y axis.
        title: Chart title; it is also used as the y-axis title.
        color: Fill colour of the bars.

    The chart is displayed with ``iplot``; nothing is returned. ``d_df`` is
    only read: no helper columns are written into the caller's frame, so the
    function can be handed a filtered slice of a larger DataFrame.
    """
    # One hover label per bar, built column-wise instead of via iterrows().
    hover_text = [
        f'Date: {date}<br>Confirmed cases: {confirmed}<br>County: {county}'
        for date, confirmed, county in zip(
            d_df['Date'], d_df['Confirmed'], d_df['County']
        )
    ]

    trace = go.Bar(
        x=d_df[feature_x],
        y=d_df[feature_y],
        name=feature_y,
        marker=dict(color=color),
        text=hover_text,
    )

    layout = dict(
        title=title,
        xaxis=dict(title=feature_x, showticklabels=True),
        yaxis=dict(title=title),
        hovermode='closest',
    )
    fig = dict(data=[trace], layout=layout)
    iplot(fig, filename='cases-covid19')


In [None]:
# Confirmed cases per county for a single reporting day. The date is defined
# once so the row filter and the chart title cannot drift apart.
snapshot_date = '2020-04-04'
# .copy() hands the plotting helper an independent frame rather than a slice
# of all_data_df, so any column it writes cannot trigger chained-assignment
# warnings or touch the full dataset.
data_df = all_data_df.loc[all_data_df.Date == snapshot_date].copy()
plot_bars_time_variation(data_df, 'County', 'Confirmed', f'Confirmed cases / county [{snapshot_date}]')

In [None]:
# Sum the confirmed cases over all rows of each date and chart the totals.
confirmed_per_date = all_data_df.groupby(['Date'])['Confirmed'].sum()
data_df = confirmed_per_date.to_frame().reset_index()
# The plotting helper expects a County column for its hover labels.
data_df['County'] = 'All'
data_df.columns = ['Date', 'Confirmed', 'County']
plot_bars_time_variation(
    data_df,
    'Date',
    'Confirmed',
    'Confirmed cases / date',
)