# Fetching WHO's situation reports on COVID-19 as DataFrames

## Scrape the data - functions

In [1]:
from urllib.request import urlopen
from shutil import copyfileobj
from requests import get
from requests.utils import default_headers
from bs4 import BeautifulSoup
from datetime import datetime
from pandas import DataFrame
from dateparser import parse as dateparse
from os.path import exists

def get_filename_from_link(link):
    """Return the file name (last path segment) of a URL.

    The fragment and query string are removed *before* the path is split, so
    a '/' or a second '?' inside the query (e.g. ``report.pdf?next=/a/b``)
    cannot be mistaken for part of the path.

    Args:
        link: URL or path as a string.

    Returns:
        The text after the last '/' of the path, without query or fragment.
    """
    # A fragment always follows the query, so cut it off first.
    path = link.split('#', 1)[0].split('?', 1)[0]
    return path.rsplit('/', 1)[-1]

def get_data(url='https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports',
             timeout=30):
    """Scrape the situation-report index page into a DataFrame.

    Every ``<a href>`` on the page whose target contains '.pdf' is treated as
    a report. The report date and id are read from the file name, which is
    expected to look like ``YYYYMMDD-sitrep-<id>-...``.

    Args:
        url: Address of the index page to scrape.
        timeout: Seconds to wait for the HTTP response before requests raises,
            so an unresponsive server cannot hang the notebook.

    Returns:
        DataFrame with the columns 'Report_ID' (str), 'Date' (datetime) and
        'Link' (absolute URL), one row per PDF link found, in page order.
        PDF links whose file name does not follow the expected pattern are
        skipped.
    """
    from urllib.parse import urljoin

    headers = default_headers()
    page = get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    rows = []
    for anchor in soup.find_all('a', href=True):
        href = anchor.get('href')
        if '.pdf' not in href:
            continue
        # urljoin copes with root-relative, relative and already-absolute hrefs.
        link = urljoin(url, href)
        parts = get_filename_from_link(link).split('-')
        try:
            rows.append((parts[2], datetime.strptime(parts[0], '%Y%m%d'), link))
        except (IndexError, ValueError):
            # Not a "<date>-sitrep-<id>-..." file name: ignore this PDF
            # instead of aborting the whole scrape.
            continue
    return DataFrame(rows, columns=['Report_ID', 'Date', 'Link'])

def download_for_date(datearg,
                      url='https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports'):
    """Download the situation report published on a given date.

    Args:
        datearg: Free-form date string, parsed with dateparser
            (e.g. '21st of March').
        url: Index page listing the reports; passed on to get_data().

    Returns:
        ``[parsed_datetime, filename]`` when the report was downloaded or was
        already present in the working directory. An empty list when the date
        could not be parsed or no report is listed for that day.
    """
    from os import remove, replace

    dt = dateparse(datearg)
    if dt is None:
        # Checked before scraping so a bad date costs no network request.
        print('the date {} couldnt be parsed'.format(datearg))
        return []

    # The scraped dates are naive midnights, so drop any time-of-day and
    # timezone the parser attached (e.g. for 'yesterday') before comparing.
    day = dt.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)

    df = get_data(url)
    link = list(df.loc[df['Date'] == day]['Link'])
    if len(link) == 0:
        print('no record for the date {} found'.format(dt.strftime("%Y/%m/%d")))
        return []

    link = link[0]
    filename = get_filename_from_link(link)
    if exists(filename):
        print("file for the date {} already exists at {}. didn't re-download".format(dt.strftime("%Y/%m/%d"), filename))
        return [dt, filename]

    # Download to a temporary name and rename once complete, so an interrupted
    # transfer never leaves a truncated file that later looks "already there".
    partial = filename + '.part'
    try:
        with urlopen(link) as response, open(partial, 'wb') as out_file:
            copyfileobj(response, out_file)
        replace(partial, filename)
    except BaseException:
        if exists(partial):
            remove(partial)
        raise
    print('file for date {} downloaded at {}'.format(dt.strftime("%Y/%m/%d"), filename))
    return [dt, filename]

## Visualise the scraped data

In [2]:
# Scrape the index page and display the resulting table (one row per PDF link found).
get_data()

Unnamed: 0,Report_ID,Date,Link
0,70,2020-03-30,https://www.who.int/docs/default-source/corona...
1,69,2020-03-29,https://www.who.int/docs/default-source/corona...
2,69,2020-03-29,https://www.who.int/docs/default-source/corona...
3,68,2020-03-28,https://www.who.int/docs/default-source/corona...
4,67,2020-03-27,https://www.who.int/docs/default-source/corona...
...,...,...,...
69,5,2020-01-25,https://www.who.int/docs/default-source/corona...
70,4,2020-01-24,https://www.who.int/docs/default-source/corona...
71,3,2020-01-23,https://www.who.int/docs/default-source/corona...
72,2,2020-01-22,https://www.who.int/docs/default-source/corona...


## Download for a given date

In [3]:
# download_for_date returns [parsed_date, filename] on success and an empty list
# when the date cannot be parsed or no report exists for that day.
downloaded_file = download_for_date('21st of March')

file for date 2020/03/21 downloaded at 20200321-sitrep-61-covid-19.pdf


## Send to Parsr for extraction

In [4]:
from parsr_client import ParserClient

# download_for_date() returns an empty list on failure; stop here with a clear
# message rather than hitting an IndexError on downloaded_file[1] below.
if not downloaded_file:
    raise RuntimeError('no situation report was downloaded; re-run the download cell with a valid date')

# Submit the downloaded PDF (second element of the result) to the Parsr server.
parsr = ParserClient('localhost:3001')
job = parsr.send_document(
    file=downloaded_file[1],
    config='defaultConfig.json',
    wait_till_finished=True,
    save_request_id=True,
)

> Polling server for the job a479e78da7d799664eb4deb0d1a06a...
>> Progress percentage: 0
>> Progress percentage: 0
>> Progress percentage: 0
>> Progress percentage: 0
>> Progress percentage: 0
>> Progress percentage: 0
>> Progress percentage: 0
>> Progress percentage: 0
>> Progress percentage: 0
>> Progress percentage: 0
>> Progress percentage: 0
>> Progress percentage: 0
>> Progress percentage: 0
>> Progress percentage: 0
>> Progress percentage: 0
>> Job done!


## Fetch all tabular data and assemble the data together

In [5]:
from pandas import concat as pdconcat

# Each entry is indexed as (page, table) in the get_table() calls below.
table_info = parsr.get_tables_info()
if not table_info:
    # Without this guard the failure is an opaque IndexError on table_info[0].
    raise ValueError('Parsr reported no tables for the submitted document')

# The first table supplies the column names; every following table is read
# with those same names so all pieces line up when concatenated.
dfs = [parsr.get_table(page=table_info[0][0], table=table_info[0][1])]
columns = dfs[0].columns
dfs += [
    parsr.get_table(page=entry[0], table=entry[1], column_names=columns)
    for entry in table_info[1:]
]

data = pdconcat(dfs, ignore_index=True)

## Final output for the given date

In [6]:
# Display the concatenated table assembled from all extracted tables.
data

Unnamed: 0,Reporting Country/ Territory/Area†,Total confirmed ‡ cases,Total confirmed new cases,Total deaths,Total new deaths,Transmission classification§,Days since last reported case
0,Western Pacific Region,,,,,,
1,China,81416,116,3261,8,Local transmission,0
2,Republic of Korea,8799,147,102,8,Local transmission,0
3,Malaysia,1030,130,3,1,Local transmission,0
4,Japan,996,46,35,2,Local transmission,0
...,...,...,...,...,...,...,...
192,Mayotte,4,0,0,0,Imported cases only,1
193,Subtotal for all regions,265361,32000,11176,1343,,
194,International conveyance,712,0,7,0,Local transmission,5
195,(Diamond Princess),,,,,,
