## WHO Coronavirus disease (COVID-2019) situation reports

This notebook extracts the tables from the PDF reports published at https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports and converts them to a tabular (CSV) format.


In [None]:
import json
import os
import re
from datetime import datetime, timezone

import pandas as pd
import pycountry
import requests
import tabula

In [None]:
# papermill parameters
# Folder the final CSV is written into. Keep the trailing path separator:
# the value is used as a prefix for the output file name.
output_folder = "../output/"


In [None]:
# Relative link format of the situation-report PDFs. The capture group is the
# report name without its extension, e.g. "20200314-sitrep-54-covid-19".
# The dot before "pdf" is escaped so that it only matches a literal ".".
pattern = r'/docs/default-source/coronaviruse/situation-reports/(\d+-sitrep-\d+-covid-19)\.pdf'

# Download the index page. The timeout keeps the notebook from hanging forever
# on an unresponsive server, and raise_for_status() stops on an HTTP error
# instead of silently continuing with an empty report list.
r = requests.get(
    'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports',
    timeout=60,
)
r.raise_for_status()

# De-duplicate, in case a report is linked more than once on the page.
reports_to_fetch = list(set(re.findall(pattern, r.text)))



In [None]:
# Newest report first. Report names start with a YYYYMMDD stamp, so plain
# string ordering is also date ordering. Only names that sort after
# '20200229' are kept.
reports_to_fetch = [
    name
    for name in sorted(reports_to_fetch, reverse=True)
    if name > '20200229'
]

# In CI only a single report (the second entry of the list) is processed.
if os.environ.get("ENVIRONMENT") == "CI":
    reports_to_fetch = reports_to_fetch[1:2]

# Notebook display of the final selection.
reports_to_fetch

In [None]:
# Maps each report name to the tables tabula extracted from that report's PDF.
all_reports = {}

for report in reports_to_fetch:
    report_url = f"https://www.who.int/docs/default-source/coronaviruse/situation-reports/{report}.pdf"
    # header=None: the first row of each table is kept as data, so the columns
    # are numbered 0..n-1.
    all_reports[report] = tabula.read_pdf(
        report_url,
        pages='all',
        pandas_options={'header': None},
        silent=True,
    )


In [None]:
# Build one long table with a row per table line per report.
country_frames = []

for report, all_tables in all_reports.items():
    report_url = "https://www.who.int/docs/default-source/coronaviruse/situation-reports/" + report + ".pdf"

    for df in all_tables:
        # Keep only tables with exactly 7 columns; these are treated as
        # country listings and get their positional columns named.
        if len(df.columns) != 7:
            continue
        df = df.rename(columns={0:'Country',1:'Total_Cases',2:'Cases_New',3:'Deaths',4:'Deaths_New',5:'Transmission_Classification',6:'Days_Since_Last_Reported_Case'})
        # Placeholders, filled in once the country names have been resolved.
        df["ISO3166-1"] = ""
        df['Country/Region'] = ""
        # The first 8 characters of the report name are its YYYYMMDD date.
        df["Date"] = datetime.strptime(report[0:8], '%Y%m%d')
        df["Situation_Report_name"] = report
        df["Situation_Report_URL"] = report_url
        country_frames.append(df)

if country_frames:
    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and re-copied the whole accumulated frame on every call.
    country_data = pd.concat(country_frames, ignore_index=True)
else:
    # No usable table at all: keep the expected schema so the filters below
    # still work and simply yield an empty result.
    country_data = pd.DataFrame(columns=[
        'Country',
        'Total_Cases',
        'Cases_New',
        'Deaths',
        'Deaths_New',
        'Transmission_Classification',
        'Days_Since_Last_Reported_Case',
        'ISO3166-1',
        'Country/Region',
        'Date',
        'Situation_Report_name',
        'Situation_Report_URL',
    ])

# Remove rows with a null country or a null case count.
country_data = country_data[country_data['Country'].notnull()]
country_data = country_data[country_data['Total_Cases'].notnull()]
# Remove header rows, recognised by header text in the last column, and rows
# where that column is empty.
country_data = country_data[~country_data.Days_Since_Last_Reported_Case.isin(['reported case','last reported'])]
country_data = country_data[country_data['Days_Since_Last_Reported_Case'].notnull()]



In [None]:
# Ad-hoc inspection cell; nothing here modifies country_data.

# All distinct raw country labels (the value is discarded unless this line is
# run by itself).
country_data["Country"].unique()

# Rows labelled 'the United Kingdom' (seen in 20200314-sitrep-54-covid-19).
country_data.loc[country_data["Country"] == 'the United Kingdom']

# To look at a slice of a single report instead:
#country_data[country_data["Situation_Report_name"]=='20200314-sitrep-54-covid-19'][100:110]


In [None]:
# Raw labels as extracted from the PDFs -> names that are looked up in pycountry.
# Besides spelling variants this also covers labels carrying footnote markers
# and, apparently, fragments of names that were split across table rows
# ("State of)", "Republic of)").
# NOTE(review): "Republic of)" is mapped to Venezuela, but the same fragment
# would result from any other "... Republic of)" name that gets split the
# same way - confirm against the affected reports.
changed_names = {
    "The United Kingdom": "United Kingdom",
    "Serbia††": "Serbia",
    "Iran (Islamic Republic of)": "Iran",
    "occupied Palestinian territory": "Palestine",
    "occupied Palestinian Territory": "Palestine",  
    "Occupied Palestinian Territory": "Palestine",        
    "Venezuela (Bolivarian Republic of)": "Venezuela",
    "Bolivia (Plurinational State of)": "Bolivia",
    "State of)": "Bolivia", # 20200314-sitrep-54-covid-19 
    "Republic of)": "Venezuela", # 20200314-sitrep-54-covid-19
    "Côte d’Ivoire": "Côte d'Ivoire",
    "Cote d’Ivoire": "Côte d'Ivoire",
    "Cote d Ivoire": "Côte d'Ivoire",
    "conveyance": "International conveyance (Diamond Princess)",
    "Kosovo[1]": "Kosovo",
    "United States Virgin Islands": "Virgin Islands",
    "Democratic Republic of the Congo": "Congo, The Democratic Republic of the",
    "Kingdom¶": "United Kingdom",
    "the United Kingdom": "United Kingdom"

}

# Both replacements are literal. regex=False states that explicitly, so '^' is
# removed as a character (never read as a regex anchor) regardless of the
# pandas version's default for `regex`.
country_data["Country"] = country_data["Country"].str.replace('\r', ' ', regex=False)
country_data["Country"] = country_data["Country"].str.replace('^', '', regex=False)
# Whole-value replacement of the known problematic labels.
country_data["Country"] = country_data["Country"].replace(changed_names)

countries = country_data["Country"].unique()

# Cleaned label -> pycountry record, or None for the "conveyance" entry,
# which is not looked up in pycountry.
country_dict = {}

for country in countries:
    if "conveyance" in country:
        country_dict[country] = None
        continue
    try:
        # search_fuzzy returns candidates best match first; take the top one.
        country_dict[country] = pycountry.countries.search_fuzzy( country )[0]
    except LookupError as err:
        # Still fails (same exception type), but now names the label so it
        # can be added to changed_names.
        raise LookupError(
            f"pycountry could not resolve country name {country!r}; "
            "add a mapping for it to changed_names"
        ) from err

        
#country_dict

In [None]:
def resolve_iso3166_1_row(row):
    """Fill the ISO code and display name of one row from ``country_dict``.

    ``row["Country"]`` is looked up in the module-level ``country_dict``.
    When the entry is a country record, its ``alpha_2`` code and ``name`` are
    written to ``"ISO3166-1"`` and ``"Country/Region"``. When the entry is
    falsy (e.g. ``None``), the code is left empty and the raw ``"Country"``
    label is used as the display name.

    The row is modified in place and also returned, which is the form
    ``DataFrame.apply`` expects. Raises ``KeyError`` if the label is not in
    ``country_dict``.
    """
    match = country_dict[row["Country"]]

    if not match:
        # No country record for this label: keep the label itself.
        row["ISO3166-1"] = ""
        row['Country/Region'] = row["Country"]
        return row

    row["ISO3166-1"] = match.alpha_2
    row['Country/Region'] = match.name
    return row
    

# Resolve every row; axis=1 hands each row (not each column) to the function.
data = country_data.apply(resolve_iso3166_1_row, axis=1)
        


In [None]:
# Show the last rows as a quick visual check of the resolved data.
data.tail()

## Adding Metadata

Before saving the file locally, we add a `Last_Update_Date` column holding the time of this run in UTC.


In [None]:
# Stamp every row with the time of this run as a naive datetime in UTC.
# datetime.utcnow() is deprecated since Python 3.12; taking the aware UTC time
# and dropping tzinfo yields the same naive value, so the CSV format is unchanged.
data["Last_Update_Date"] = datetime.now(timezone.utc).replace(tzinfo=None)


In [None]:
# Write the final table without the DataFrame index. os.path.join gives the
# same path as before when output_folder ends with a separator, and also
# works when an overridden value omits it.
data.to_csv(os.path.join(output_folder, "WHO_SITUATION_REPORTS.csv"), index=False)