<a href="https://colab.research.google.com/github/slhoefel/notebooks/blob/main/scrape_WHO_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import os
import pandas as pd
from datetime import datetime
from time import sleep
from google.colab import drive
# Mount Google Drive at /content/drive (this call only works inside Google Colab).
drive.mount('/content/drive')
# Base folder on the mounted drive; the cells below build every file path from it.
path = ('/content/drive/MyDrive/WHO')

Mounted at /content/drive


This script retrieves reports from WHO's Disease Outbreaks Newsfeed. Links from each page are extracted and stored as a text file by the first function "get_urls."

To retrieve multiple pages at once, loop the "get_pages" function over the range of pages you want to download.

In [3]:
#p=page of WHO DON feed to search
def get_urls(p):
    """Save the article links found on one page of the WHO DON feed.

    Requests page ``p`` of the Disease Outbreak News listing, collects the
    ``href`` of every ``<a class="sf-list-vertical__item">`` element and
    writes them, one per line, to ``{path}/pages/WHO_page_{p}.txt``
    (the file is overwritten if it already exists).

    Args:
        p: page number of the feed; it is interpolated into the request URL.

    Returns:
        None. The result is the text file written under ``path``.
    """
    url = f'https://www.who.int/en/emergencies/disease-outbreak-news/{str(p)}'
    grab = requests.get(url)
    soup = BeautifulSoup(grab.text, 'html.parser')

    # collect the link targets first so the file is only open while writing
    hrefs = [link.get('href') for link in soup.find_all('a', class_='sf-list-vertical__item')]

    # the context manager closes the file even if a write fails
    with open(f"{path}/pages/WHO_page_{p}.txt", "w") as f:
        # anchors without an href return None (or ''), which cannot be
        # written or fetched later, so they are skipped
        f.writelines(f"{href}\n" for href in hrefs if href)

In [None]:
# Create the output folders used by the other cells.
# os.makedirs(..., exist_ok=True) replaces the `!mkdir` shell magics so the
# cell is plain Python and can be re-run without "File exists" errors.
for folder in ('pages', 'urls', 'downloaded_articles'):
    os.makedirs(f'{path}/{folder}', exist_ok=True)

# One sub-folder per report year. The range always covers 1996-2023 and is
# extended through the current year so newer reports also have a folder.
for n in range(1996, max(2024, datetime.now().year + 1)):
    os.makedirs(f'{path}/downloaded_articles/{n}', exist_ok=True)

In [19]:
#use get_urls() for single page, use get_pages() for multiple pages

#retrieve by page
#urls from every page processed by get_pages() are accumulated here
all_pages = []
def get_pages(p):
    """Download page ``p`` of the feed and add its article urls to ``all_pages``.

    Calls ``get_urls(p)`` to (re)write ``{path}/pages/WHO_page_{p}.txt``,
    then reads that file back and extends the module-level ``all_pages``
    list with its lines.

    Args:
        p: page number of the feed.

    Returns:
        None. ``all_pages`` is modified in place.
    """
    get_urls(p)
    # the context manager closes the page file once its lines are read
    with open(f'{path}/pages/WHO_page_{p}.txt') as f:
        urls = [x.strip('\n') for x in f.readlines()]
    all_pages.extend(urls)
#there are currently 142 pages of DON newsfeed results
for n in range(1,143):
    get_pages(n)

#save all downloaded urls from all pages in a time-stamped text file for documentation purposes
now = datetime.now().strftime("%d:%B:%Y_%H:%M:%S")
with open(f'{path}/urls/all_{now}.txt', 'w') as f:
    #one url per line with no newline after the last one;
    #join() also copes with an empty list (writes an empty file instead of raising IndexError)
    f.write('\n'.join(all_pages))

In [4]:
#import and simplify country names to match article locations with ISO codes in ISO key
def arrange(x):
    """Reorder a 'Name, Qualifier' country name into 'Qualifier Name'.

    The text is split on ', ' and the first two pieces are swapped and
    joined with a space; any further pieces are ignored. The phrase
    'Province of China ' is then removed from the result.

    Raises:
        IndexError: if ``x`` does not contain ', '.
    """
    pieces = x.split(', ')
    leading, trailing = pieces[0], pieces[1]
    reordered = f'{trailing} {leading}'
    return reordered.replace('Province of China ', '')

# Load the ISO table and keep only the country name and three-letter code columns.
ISO = pd.read_csv(f'{path}/ISO.csv')[['name','alpha-3']]
# Names written as 'Name, Qualifier' (and not containing ' and ') are reordered
# with arrange(); every other name just loses its parenthesised suffix.
ISO['name'] = ISO['name'].apply(
    lambda raw: arrange(raw.split('(')[0])
    if (', ' in raw) and (' and ' not in raw)
    else raw.split(' (')[0]
)
# Cut a few long official names down at fixed markers.
ISO['name'] = ISO['name'].apply(
    lambda short: short.split(' of America')[0].split(' of Great')[0].split('n Federation')[0]
)
# Lookup from simplified country name to alpha-3 code.
iso_dict = dict(zip(ISO['name'], ISO['alpha-3']))

Once urls are downloaded and stored, they can be passed to the "get_article" function. This function uses bs4 (BeautifulSoup) to extract data from the linked page by HTML tags.

In [5]:
def next_key(key,temp):
    """Return the element that follows the first occurrence of ``key`` in ``temp``.

    Used to find the heading that ends a section. Returns None when ``key``
    is not in ``temp`` or when it is the last element (no further headings).
    """
    try:
        return temp[temp.index(key) + 1]
    except (ValueError, IndexError):
        return None

In [66]:
#retrieve text from DON report with the article url
def get_article(url):
    """Download one Disease Outbreak News report and save it as a dict-literal text file.

    The page at ``url`` is fetched and parsed with BeautifulSoup. The date is
    read from ``<span class="timestamp">``, the title from the ``<h1>`` inside
    ``<div class="sf-item-header-wrapper">`` and the text from
    ``<article class="sf-detail-body-wrapper">``. The title is split at its
    last hyphen into a disease part and a location part, the body is cut into
    summary / background / response / epidemiology sections using the
    headings found in it, and the location is mapped to ISO codes with the
    module-level ``iso_dict``.

    Side effects:
        * prints ``url`` and the generated id (troubleshooting output);
        * overwrites ``{path}/WHO_report_text.txt`` with the raw body text;
        * writes ``{path}/downloaded_articles/<year>/<id>.txt``.

    Args:
        url: address of the report; the part after '/item/' is stored as "DON".

    Returns:
        None.

    Raises:
        AttributeError: if one of the expected HTML elements is missing.
        ValueError: if the date text does not match '%d %B %Y'.
    """
    #print for troubleshooting
    print(url)
    #retrieve article and read text as html
    grab = requests.get(url)
    soup = BeautifulSoup(grab.text, 'html.parser')

    #get date from <span class='timestamp'> html tag
    date = soup.find('span',class_= 'timestamp').get_text()

    #get report title from <h1> tag within <div class='sf-item-header-wrapper'> division
    header_div = soup.find('div', class_="sf-item-header-wrapper")
    title = header_div.h1.text.replace('Saint-','Saint ')
    title = title.replace('–','-').replace('\n','')

    #split title into disease and country names at the last hyphen;
    #earlier hyphens are kept so that names like "COVID-19" stay intact
    *what_parts, where = title.split('-')
    what = '-'.join(what_parts)
    where = where.strip()

    #occasionally disease is listed second instead of first:
    #a long last part with no place name, while the first part has one
    if where.count(' ') > 5:
        what_lower, where_lower = what.lower(), where.lower()
        region_names = ['multi-country','Europe','America','Asia','Africa','Pacific']
        place_in_what = (any(x.lower() in what_lower for x in iso_dict.keys())
                         or 'multi-country' in what_lower)
        place_in_where = (any(x.lower() in where_lower for x in iso_dict.keys())
                          or any(x.lower() in where_lower for x in region_names))
        if place_in_what and not place_in_where:
            #tuple swap: assigning one after the other would leave both
            #names holding the same value and lose the location
            what, where = where, what

    #get main text with <article class='sf-detail-body-wrapper'> tag
    body = soup.find('article',class_='sf-detail-body-wrapper')
    #use multiple searches for headers due to inconsistent sectioning
    #sometimes <h3> headers, sometimes single-line paragraphs with <strong> tag
    #<h5 class="section_head3">Situation update: </h5> in earlier posts
    raw_headers = ([x.text for x in body.findChildren('strong')]
                   + [x.text for x in body.findChildren('h3')]
                   + [x.text for x in body.findChildren('h5',{'class':'section_head3'})])
    headers = [x.replace('*','').split(':')[0].split('(')[0] for x in raw_headers if len(x)>5]
    #trimming can leave empty or blank headers; as regex alternatives they
    #would match everywhere and flood the header list, so drop them
    headers = [h for h in headers if h.strip()]
    body = body.text
    #sort headers by order of appearance by searching text
    #headers are escaped so characters such as '?' or '+' are matched literally
    if headers:
        headers = re.findall('|'.join(re.escape(h) for h in headers), body)

    #write the raw text to a scratch file (overwritten on every call)
    with open(f"{path}/WHO_report_text.txt", "w") as f:
        f.write(body)

    #break article into sections
    #section names and order are inconsistent, so several markers are mapped to each section name
    header_markers = (('at a glance','summary'),
                      ('Situation','summary'),
                      ('Description of','background'),
                      ('Public health','response'),
                      ('Epidemiology','epidemiology'))
    sections = {}
    for header in headers:
        for marker, section_name in header_markers:
            if marker in header:
                sections[section_name] = header

    #read the text back from the scratch file, as the section search works on the saved text
    with open(f'{path}/WHO_report_text.txt') as f:
        text_body = f.read()

    if any(v != 'None' for v in sections.values()):

        #iterate over a copy because the values are replaced inside the loop
        for k,v in list(sections.items()):
            if v == 'None':
                continue
            section_break = next_key(v,headers)
            if section_break is None:
                #last heading: section runs to the end of the text
                try:
                    section_text = re.search(f"(?s){re.escape(v)}(.*$)",text_body)[0].replace(v,f'{v}\n')
                except (TypeError, re.error):
                    #no match (search returned None): fall back to plain splitting
                    section_text = text_body.split(v)[-1]
            else:
                #section runs from its heading up to the next heading
                try:
                    section_text = re.search(f"(?s){re.escape(v)}(.*?){re.escape(section_break)}",text_body)[0].replace(v,f'{v}\n').replace(section_break,'')
                except (TypeError, re.error):
                    section_text = text_body.split(v)[-1].split(section_break)[0]
            sections[k] = section_text

        if 'summary' not in sections:
            #no summary heading: use everything before the first line of the first identified section
            section_break = [x for x in sections.values() if x != 'None'][0].split('\n')[0]
            try:
                sections['summary'] = text_body.split(section_break)[0]
            except ValueError:
                #split() rejects an empty separator
                sections['summary'] = text_body

    #if sections missing, unidentified article text is input to the 'summary' heading
    else:
        sections['summary'] = text_body

    for s in ['background','response','epidemiology']:
        sections.setdefault(s,'None')

    #extract DON ID from url
    DON = url.split('/item/')[-1]
    #format date as day month year
    dt = datetime.strptime(date,'%d %B %Y')
    locate = where.strip()
    #words removed or shortened when no iso code is found in the location
    region_words = {'the':'','of':'','Region':'','European':'Europe','African':'Africa','situation':''}
    #get iso codes from countries in title
    #if no country iso found, the cleaned-up location text is used instead
    #usually means area specified as region instead of country, like 'Europe' or 'Latin America'
    try:
        #separate multi-country titles and get iso codes as a list
        if ' and ' in locate:
            iso = []
            for i in locate.split(' and '):
                for country, code in iso_dict.items():
                    i = i.replace(country, code)
                found = re.search('[A-Z]{3}',i.strip())
                if found:
                    iso.append(found[0])
                else:
                    #titles with 'multi-country' shortened to 'country' during earlier disease-country splitting
                    for word, initial in region_words.items():
                        i = i.replace(word.lower(), initial.lower()).capitalize()
                    iso.append(i)

        #get single-country title iso codes
        else:
            for country, code in iso_dict.items():
                locate = locate.replace(country, code)
            found = re.search('[A-Z]{3}',locate.strip())
            if found:
                iso = found[0]
            else:
                for word, initial in region_words.items():
                    locate = locate.replace(word.lower(), initial.lower()).capitalize()
                iso = locate
            for i in ["[","]","'"]:
                iso = iso.replace(i,'')
    except TypeError:
        #e.g. a non-string entry in iso_dict makes str.replace fail
        iso = 'None'


    #create unique ID using date of report and country iso code
    new_id = f'{dt.year}_{dt.month}_{dt.day}_{iso}'

    #build the finished file that can be read by python as a dictionary with eval(file_text)
    #NOTE: eval executes whatever the file contains - only use it on files produced by this notebook
    lines = ['"id": ',f'"{new_id}"',',\n','"date": ',f'"{date}"',',\n', '"disease": ',f'"{what}"',',\n',
             '"location": ',f'"{where}"',',\n','"iso": ',f'"{iso}"',',\n','"DON": ',f'"{DON}"',',\n',
             '"summary": ',f'"""\n{sections["summary"].strip()}\n"""',',\n','"background": ',f'"""\n{sections["background"].strip()}\n"""',',\n',
             '"response": ',f'"""\n{sections["response"].strip()}\n"""',',\n','"epidemiology": ',f'"""\n{sections["epidemiology"].strip()}\n"""']
    lines = ['{']+lines+['}']
    #the context manager closes the file even if writing fails
    with open(f'{path}/downloaded_articles/{dt.year}/{new_id}.txt','w') as f:
        f.writelines(lines)
    #print ID for troubleshooting
    print(new_id)

In [None]:
#example code to get all urls for range(of pages) from saved text files
pages = {}
for p in range(1,143):
    #the context manager closes each page file as soon as it has been read
    with open(f'{path}/pages/WHO_page_{p}.txt') as f:
        pages[f'page_{p}'] = [x.strip('\n') for x in f.readlines()]

#example code to get article text for range(of pages)
for v in pages.values():
    for url in v:
        get_article(url)

In [24]:
#example code to get all article urls from most recent url list
with open(f'{path}/urls/all_{now}.txt', 'r') as f:
    urls = f.read()

#example code to get article text for all articles
for u in urls.split('\n'):
    #skip blank lines (empty file or stray newline); they are not valid urls
    if u.strip():
        get_article(u)