## WHO Coronavirus disease (COVID-2019) situation reports

This notebook extracts the per-country tables from the WHO situation report PDFs published at https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports and converts them into tabular format.


In [55]:
import json
import os
import re
from datetime import datetime, timezone

import pandas as pd
import pycountry
import requests
import tabula

In [56]:
# papermill parameters
# output_folder: directory the final WHO_SITUATION_REPORTS.csv is written to.
output_folder = "../output/"


In [57]:
# Scrape the WHO situation-report index page for links to the report PDFs.
# The capture group keeps only the report name
# (e.g. "20200330-sitrep-70-covid-19"); later cells splice it back into the
# download URL. The dot before "pdf" is escaped so it matches a literal "." only.
pattern = r'/docs/default-source/coronaviruse/situation-reports/(\d+-sitrep-\d+-covid-19)\.pdf'

# The timeout keeps an unresponsive server from hanging the run, and
# raise_for_status() stops here on an HTTP error instead of silently
# producing an empty report list from an error page.
response = requests.get(
    'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports',
    timeout=60,
)
response.raise_for_status()

# set() drops duplicate matches; the resulting order is arbitrary and is
# fixed by the sort in the next cell.
reports_to_fetch = list(set(re.findall(pattern, response.text)))



In [58]:
# Report names start with the publication date (YYYYMMDD), so a descending
# lexical sort puts the most recent report first.
reports_to_fetch = sorted(reports_to_fetch, reverse=True)

# Keep only names that sort after '20200229'. The whole name is compared, so
# the 2020-02-29 report itself is kept: its name is longer than the bare
# date prefix and therefore compares greater.
reports_to_fetch = [name for name in reports_to_fetch if name > '20200229']

# CI runs are limited to a single report: the second entry of the list.
if os.getenv("ENVIRONMENT") == "CI":
    reports_to_fetch = reports_to_fetch[1:2]

reports_to_fetch

['20200330-sitrep-70-covid-19',
 '20200329-sitrep-69-covid-19',
 '20200328-sitrep-68-covid-19',
 '20200327-sitrep-67-covid-19',
 '20200326-sitrep-66-covid-19',
 '20200325-sitrep-65-covid-19',
 '20200324-sitrep-64-covid-19',
 '20200323-sitrep-63-covid-19',
 '20200322-sitrep-62-covid-19',
 '20200321-sitrep-61-covid-19',
 '20200320-sitrep-60-covid-19',
 '20200319-sitrep-59-covid-19',
 '20200318-sitrep-58-covid-19',
 '20200317-sitrep-57-covid-19',
 '20200316-sitrep-56-covid-19',
 '20200315-sitrep-55-covid-19',
 '20200314-sitrep-54-covid-19',
 '20200313-sitrep-53-covid-19',
 '20200312-sitrep-52-covid-19',
 '20200311-sitrep-51-covid-19',
 '20200310-sitrep-50-covid-19',
 '20200309-sitrep-49-covid-19',
 '20200308-sitrep-48-covid-19',
 '20200307-sitrep-47-covid-19',
 '20200306-sitrep-46-covid-19',
 '20200305-sitrep-45-covid-19',
 '20200304-sitrep-44-covid-19',
 '20200303-sitrep-43-covid-19',
 '20200302-sitrep-42-covid-19',
 '20200301-sitrep-41-covid-19',
 '20200229-sitrep-40-covid-19']

In [59]:
# Download every selected report and let tabula extract the tables from all
# of its pages. Result: report name -> the value returned by tabula.read_pdf
# (the next cell iterates over it as a collection of DataFrames).
all_reports = {}
url_prefix = "https://www.who.int/docs/default-source/coronaviruse/situation-reports/"

for report_name in reports_to_fetch:
    pdf_url = url_prefix + report_name + ".pdf"
    all_reports[report_name] = tabula.read_pdf(
        pdf_url,
        pages='all',
        # header=None: no row is used as a header, so the next cell can
        # rename the columns by their integer position.
        pandas_options={'header': None},
        silent=True,
    )


In [60]:
# Build one long table of per-country rows from every table extracted above.
report_url_prefix = "https://www.who.int/docs/default-source/coronaviruse/situation-reports/"

# Positional column index -> column name for the per-country tables.
column_names = {
    0: 'Country',
    1: 'Total_Cases',
    2: 'Cases_New',
    3: 'Deaths',
    4: 'Deaths_New',
    5: 'Transmission_Classification',
    6: 'Days_Since_Last_Reported_Case',
}
metadata_columns = [
    "ISO3166-1",
    'Country/Region',
    "Date",
    "Situation_Report_name",
    "Situation_Report_URL",
]

# Collect the frames and concatenate once at the end: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every call.
country_frames = []

for report, all_tables in all_reports.items():
    for df in all_tables:
        # Only tables with exactly 7 columns are kept; everything else is skipped.
        if len(df.columns) != 7:
            continue
        country_frames.append(
            df.rename(columns=column_names).assign(**{
                # Placeholders, filled in later by resolve_iso3166_1_row.
                "ISO3166-1": "",
                'Country/Region': "",
                # The first 8 characters of the report name are its date (YYYYMMDD).
                "Date": datetime.strptime(report[0:8], '%Y%m%d'),
                "Situation_Report_name": report,
                "Situation_Report_URL": report_url_prefix + report + ".pdf",
            })
        )

if country_frames:
    country_data = pd.concat(country_frames, ignore_index=True)
else:
    # No 7-column table was found: keep the expected columns so the
    # filters below still run and simply yield an empty frame.
    country_data = pd.DataFrame(columns=list(column_names.values()) + metadata_columns)

# Remove rows with a null country, case count or days-since value.
country_data = country_data.dropna(
    subset=['Country', 'Total_Cases', 'Days_Since_Last_Reported_Case']
)

# Remove header rows: fragments of the column heading that ended up as data.
header_fragments = ['reported case', 'last reported']
country_data = country_data[
    ~country_data['Days_Since_Last_Reported_Case'].isin(header_fragments)
]

# Remove `*` from numbers. regex=False makes the literal match explicit,
# since `*` is a regex metacharacter.
# NOTE(review): `.str` methods return NaN for non-string cells - confirm that
# tabula always delivers these two columns as text.
country_data = country_data.assign(
    Total_Cases=country_data["Total_Cases"].str.replace('*', '', regex=False),
    Days_Since_Last_Reported_Case=country_data["Days_Since_Last_Reported_Case"].str.replace('*', '', regex=False),
)



In [65]:
# Diagnostic: show the rows whose country cell is only the fragment 'the)'
# (noted for 20200314-sitrep-54-covid-19). Only the last expression of a
# cell is displayed, so this lookup is the cell's single statement.
country_data[country_data["Country"] == 'the)']


[Country(alpha_2='MP', alpha_3='MNP', name='Northern Mariana Islands', numeric='580', official_name='Commonwealth of the Northern Mariana Islands'),
 Country(alpha_2='US', alpha_3='USA', name='United States', numeric='840', official_name='United States of America')]

In [62]:
# WHO spellings and PDF-extraction fragments -> names that pycountry resolves.
changed_names = {
    "The United Kingdom": "United Kingdom",
    "Serbia††": "Serbia",
    "Iran (Islamic Republic of)": "Iran",
    "occupied Palestinian territory": "Palestine",
    "occupied Palestinian Territory": "Palestine",
    "Occupied Palestinian Territory": "Palestine",
    "Venezuela (Bolivarian Republic of)": "Venezuela",
    "Bolivia (Plurinational State of)": "Bolivia",
    "State of)": "Bolivia", # 20200314-sitrep-54-covid-19
    "Republic of)": "Venezuela", # 20200314-sitrep-54-covid-19
    "Côte d’Ivoire": "Côte d'Ivoire",
    "Cote d’Ivoire": "Côte d'Ivoire",
    "Cote d Ivoire": "Côte d'Ivoire",
    "conveyance": "International conveyance (Diamond Princess)",
    "Kosovo[1]": "Kosovo",
    "United States Virgin Islands": "Virgin Islands",
    "Democratic Republic of the Congo": "Congo, The Democratic Republic of the",
    "Kingdom¶": "United Kingdom",
    "the United Kingdom": "United Kingdom",
    "the)": "Northern Mariana Islands"
}

# Normalise the raw names, then apply the exact-match renames above.
# regex=False: both patterns are meant literally ('^' is a regex anchor).
country_data = country_data.assign(
    Country=(
        country_data["Country"]
        .str.replace('\r', ' ', regex=False)
        .str.replace('^', '', regex=False)
        .replace(changed_names)
    )
)

countries = country_data["Country"].unique()

# Country name -> pycountry match, or None for the "conveyance" entry,
# which is not a country and is never looked up.
country_dict = {}
unresolved_countries = []

for country in countries:
    if "conveyance" in country:
        country_dict[country] = None
        continue
    try:
        # search_fuzzy returns a list of candidates; the first one is used.
        country_dict[country] = pycountry.countries.search_fuzzy(country)[0]
    except LookupError:
        # Keep going so that every unknown name is reported in one run.
        unresolved_countries.append(country)

if unresolved_countries:
    raise LookupError(
        "No pycountry match for: "
        + ", ".join(repr(name) for name in unresolved_countries)
        + " - add the missing names to changed_names"
    )

LookupError: the)

In [None]:
def resolve_iso3166_1_row(row):
    """Fill in the ISO code and display name of one row.

    Looks up ``row["Country"]`` in the module-level ``country_dict``.
    For a truthy match, ``ISO3166-1`` is set to the match's ``alpha_2`` and
    ``Country/Region`` to its ``name``. Otherwise ``ISO3166-1`` is left blank
    and ``Country/Region`` repeats the original ``Country`` value.

    The row is modified in place and returned. Raises ``KeyError`` when the
    country is missing from ``country_dict``.
    """
    match = country_dict[row["Country"]]
    if match:
        iso_code, display_name = match.alpha_2, match.name
    else:
        iso_code, display_name = "", row["Country"]
    row["ISO3166-1"] = iso_code
    row['Country/Region'] = display_name
    return row
    

# Resolve every row; axis=1 hands each row (not each column) to the function.
data = country_data.apply(resolve_iso3166_1_row, axis=1)
        


In [None]:
# Spot-check the last five rows of the resolved table.
data.tail(5)

## Adding Metadata

Before saving the file locally, we add a `Last_Update_Date` column holding the time of this run in the `UTC` time zone.


In [None]:
# Stamp every row with the time of this run as a naive UTC timestamp.
# datetime.utcnow() is deprecated since Python 3.12; taking the aware UTC
# time and dropping tzinfo yields the same naive value, so the CSV format
# is unchanged.
data["Last_Update_Date"] = datetime.now(timezone.utc).replace(tzinfo=None)


In [None]:
# Write the result without the DataFrame index. os.path.join gives the same
# path as before for the default "../output/" and also works when the
# supplied output_folder has no trailing path separator.
data.to_csv(os.path.join(output_folder, "WHO_SITUATION_REPORTS.csv"), index=False)