## WHO Coronavirus disease (COVID-2019) situation reports

This notebook converts the PDF situation reports published at https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports into tabular format.


In [3]:
import re
from datetime import datetime, timezone

import pandas as pd
import pycountry
import requests
import tabula

In [4]:
# papermill parameters
# Folder the resulting CSV is written to. The value is concatenated directly with the
# file name when saving, so it has to end with a path separator.
output_folder = "../output/"

In [5]:
# Links to the individual report PDFs on the WHO index page. The capture group is the
# report name without the ".pdf" extension; the dot is escaped so that only a literal
# ".pdf" suffix matches (an unescaped "." would accept any character there).
pattern = r'/docs/default-source/coronaviruse/situation-reports/(\d+-sitrep-\d+-covid-19)\.pdf'

r = requests.get(
    'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports',
    timeout=60,  # do not hang the whole run if the server never answers
)
# Stop on an HTTP error instead of silently continuing with an empty report list.
r.raise_for_status()

# De-duplicate the names (a report may be linked more than once on the page).
reports_to_fetch = list(set(re.findall(pattern, r.text)))



In [25]:
# Report names start with an 8-digit date, so a descending string sort puts the
# newest report first; keep only the first 7 entries.
reports_to_fetch = sorted(reports_to_fetch, reverse=True)[:7]
reports_to_fetch

['20200324-sitrep-64-covid-19',
 '20200323-sitrep-63-covid-19',
 '20200322-sitrep-62-covid-19',
 '20200321-sitrep-61-covid-19',
 '20200320-sitrep-60-covid-19',
 '20200319-sitrep-59-covid-19',
 '20200318-sitrep-58-covid-19']

In [26]:
# Download each selected report and extract the tables from all of its pages.
# Maps report name -> the result of tabula.read_pdf for that PDF.
all_reports = {}

for report_name in reports_to_fetch:
    pdf_url = "https://www.who.int/docs/default-source/coronaviruse/situation-reports/" + report_name + ".pdf"
    # header=None: the extracted tables get positional column labels (0, 1, 2, ...)
    all_reports[report_name] = tabula.read_pdf(
        pdf_url,
        pages='all',
        pandas_options={'header': None},
        silent=True,
    )


In [27]:
# Names given to the columns of the 7-column tables, keyed by column position.
country_table_columns = {
    0: 'Country',
    1: 'Total_Cases',
    2: 'Cases_New',
    3: 'Deaths',
    4: 'Deaths_New',
    5: 'Transmission_Classification',
    6: 'Days_Since_Last_Reported_Case',
}

# Collect the frames in a list and concatenate once at the end: DataFrame.append
# was removed in pandas 2.0 and copied the accumulated data on every call.
country_frames = []

for report, all_tables in all_reports.items():
    # The first 8 characters of the report name are its date (YYYYMMDD).
    report_date = datetime.strptime(report[0:8], '%Y%m%d')
    report_url = "https://www.who.int/docs/default-source/coronaviruse/situation-reports/" + report + ".pdf"

    for df in all_tables:
        # Skip all tables without 7 columns
        if len(df.columns) != 7:
            continue
        # rename() returns a new frame, so the parsed table in all_reports stays untouched
        df = df.rename(columns=country_table_columns)
        # Empty placeholders; a later cell fills them from the pycountry lookup
        df["ISO3166-1"] = ""
        df['Country/Region'] = ""
        df["Date"] = report_date
        df["Situation_Report_name"] = report
        df["Situation_Report_URL"] = report_url
        country_frames.append(df)

if country_frames:
    country_data = pd.concat(country_frames, ignore_index=True)
else:
    # No 7-column table was found at all: keep the expected columns so the
    # filters below yield an empty frame instead of raising a KeyError.
    country_data = pd.DataFrame(
        columns=list(country_table_columns.values())
        + ["ISO3166-1", 'Country/Region', "Date", "Situation_Report_name", "Situation_Report_URL"]
    )

# Remove rows with null country or cases, the repeated header row (its last cell
# reads 'reported case') and rows without a value in the last column.
days_since_last_case = country_data['Days_Since_Last_Reported_Case']
rows_to_keep = (
    country_data['Country'].notnull()
    & country_data['Total_Cases'].notnull()
    & (days_since_last_case != 'reported case')
    & days_since_last_case.notnull()
)
# .copy() gives later cells an independent frame to assign into
country_data = country_data[rows_to_keep].copy()



In [28]:
# Distinct raw country labels as extracted from the PDFs, in order of first appearance.
pd.unique(country_data["Country"])

array(['China', 'Republic of Korea', 'Australia', 'Malaysia', 'Japan',
       'Singapore', 'Philippines', 'Viet Nam', 'New Zealand',
       'Brunei Darussalam', 'Cambodia', 'Mongolia', 'Fiji',
       'Papua New Guinea', 'Guam', 'French Polynesia', 'New Caledonia',
       'Italy', 'Spain', 'Germany', 'France', 'Switzerland',
       'The United Kingdom', 'Netherlands', 'Austria', 'Belgium',
       'Norway', 'Portugal', 'Sweden', 'Turkey', 'Denmark', 'Israel',
       'Czechia', 'Ireland', 'Luxembourg', 'Poland', 'Finland', 'Greece',
       'Iceland', 'Romania', 'Slovenia', 'Russian Federation', 'Estonia',
       'Croatia', 'Serbia', 'Armenia', 'Bulgaria', 'Slovakia', 'Hungary',
       'San Marino', 'Latvia', 'Lithuania', 'Andorra', 'North Macedonia',
       'Bosnia and\rHerzegovina', 'Albania', 'Cyprus',
       'Republic of Moldova', 'Malta', 'Ukraine', 'Belarus', 'Azerbaijan',
       'Georgia', 'Kazakhstan', 'Liechtenstein', 'Uzbekistan', 'Monaco',
       'Montenegro', 'Kyrgyzstan', 'Hol

In [29]:
# Labels as they appear in the extracted tables (including footnote markers and
# spelling variants) mapped to the names used for the pycountry lookup below.
changed_names = {
    "The United Kingdom": "United Kingdom",
    "Serbia††": "Serbia",
    "Iran (Islamic Republic of)": "Iran",
    "occupied Palestinian territory": "Palestine",
    "occupied Palestinian Territory": "Palestine",
    "Occupied Palestinian Territory": "Palestine",
    "Venezuela (Bolivarian Republic of)": "Venezuela",
    "Bolivia (Plurinational State of)": "Bolivia",
    "Côte d’Ivoire": "Côte d'Ivoire",
    "Cote d’Ivoire": "Côte d'Ivoire",
    "conveyance": "International conveyance (Diamond Princess)",
    "Kosovo[1]": "Kosovo",
    "United States Virgin Islands": "Virgin Islands",
    "Democratic Republic of the Congo": "Congo, The Democratic Republic of the"
}

# Extracted names can contain a carriage return (e.g. 'Bosnia and\rHerzegovina').
# regex=False: replace the literal character, independent of the pandas default.
country_data["Country"] = country_data["Country"].str.replace('\r', ' ', regex=False)
# Series.replace with a dict only substitutes values that match a key exactly.
country_data["Country"] = country_data["Country"].replace(changed_names)

countries = country_data["Country"].unique()

# Country label -> pycountry record, or None when no record is available.
country_dict = {}
# Labels pycountry could not resolve; displayed as the cell output so that they
# can be reviewed and, if needed, added to changed_names.
unresolved_countries = []

for country in countries:
    if "conveyance" in country:
        # The 'International conveyance (Diamond Princess)' entry is deliberately not looked up.
        country_dict[country] = None
        continue
    try:
        country_dict[country] = pycountry.countries.search_fuzzy(country)[0]
    except LookupError:
        # search_fuzzy raises LookupError when nothing matches. Record the label
        # instead of aborting the whole run; such rows keep their original name
        # and get an empty ISO code.
        country_dict[country] = None
        unresolved_countries.append(country)

unresolved_countries

In [30]:
def resolve_iso3166_1_row(row):
    """Fill in the ISO code and the normalised name for a single row.

    Looks up ``row["Country"]`` in the module-level ``country_dict``. When a
    record is found, its ``alpha_2`` code and ``name`` are written to the
    ``"ISO3166-1"`` and ``"Country/Region"`` fields; otherwise the code is left
    empty and the original country label is reused as the name.

    Returns the same row object, modified in place.
    Raises KeyError if the label is missing from ``country_dict``.
    """
    match = country_dict[row["Country"]]

    if not match:
        # No record for this label: keep the label as-is, no ISO code
        row["ISO3166-1"] = ""
        row['Country/Region'] = row["Country"]
        return row

    row["ISO3166-1"] = match.alpha_2
    row['Country/Region'] = match.name
    return row
    

# Resolve every row individually (axis=1 hands each row to the function).
data = country_data.apply(resolve_iso3166_1_row, axis=1)
        


In [None]:
# Show the last five rows as a quick check of the resolved table.
data.tail(5)

## Adding Metadata

Before saving the file locally, we add a `Last_Update_Date` column holding the current time in the `UTC` time zone.


In [None]:
# Timestamp of this run in UTC. datetime.utcnow() is deprecated since Python 3.12,
# so take the timezone-aware current time and drop the tzinfo again: the column
# keeps the same naive UTC values as before.
data["Last_Update_Date"] = datetime.now(timezone.utc).replace(tzinfo=None)


In [None]:
# Write the final table into the configured output folder; the row index is not exported.
output_path = output_folder + "WHO_SITUATION_REPORTS.csv"
data.to_csv(output_path, index=False)