**<center style="font-size: 16pt;">BCG per country schedule from WHO</center>**

# Introduction

In this kernel we will extend the BCG World Atlas data with the [WHO BCG policies table](https://apps.who.int/immunization_monitoring/globalsummary/schedules?sc%5Br%5D%5B%5D=AFRO&sc%5Br%5D%5B%5D=AMRO&sc%5Br%5D%5B%5D=EMRO&sc%5Br%5D%5B%5D=EURO&sc%5Br%5D%5B%5D=SEARO&sc%5Br%5D%5B%5D=WPRO&sc%5Bd%5D=&sc%5Bv%5D%5B%5D=BCG&sc%5BOK%5D=OK) (last updated 15-July-2020).

# Libraries

In [None]:
# The unavoidable
import pandas as pd
import numpy as np

# Fuzzy string matching
from fuzzywuzzy import process
# Regular expression operations (RegEx) 
import re
# Download
import urllib.request, urllib.error, urllib.parse

# Data loading

## BCG World Atlas

Load BCG World Atlas data to be extended.

In [None]:
# Read the BCG World Atlas table that is going to be extended
atlas_df = pd.read_csv('/kaggle/input/hackathon/BCG_world_atlas_data-2020.csv')

# Trim leading and trailing whitespace in the country-name and timing columns
for text_col in ('Contry Name (Mandatory field)', 'Vaccination Timing (age)'):
    atlas_df.loc[:, text_col] = atlas_df.loc[:, text_col].str.strip()

In [None]:
# Preview the first rows of the loaded Atlas table
atlas_df.head()

Fix a typo in the Atlas data, where Uzbekistan is named 'mexico' (lowercase).

In [None]:
field = 'Contry Name (Mandatory field)'
# Rows whose country name is the lowercase 'mexico' are relabelled as Uzbekistan
mislabelled = atlas_df[field] == 'mexico'
atlas_df.loc[mislabelled, field] = 'Uzbekistan'

In [None]:
field = 'Vaccination Timing (age)'

# Most frequent Atlas vaccination timings and how often each one occurs
timing_counts = atlas_df[field].value_counts()
timing_counts = timing_counts.rename_axis(field).reset_index(name='Counts')
timing_counts.head()

## WHO BCG table

WHO immunization schedule table for BCG, covering 171 countries. We download the table and then convert it to a DataFrame.

In [None]:
# WHO schedule page filtered on the six WHO regions and the BCG vaccine.
# Adjacent string literals are joined by the parser, so no separators are added.
bcg_url = (
    'https://apps.who.int/immunization_monitoring/globalsummary/schedules?'
    'sc%5Br%5D%5B%5D=AFRO&'
    'sc%5Br%5D%5B%5D=AMRO&'
    'sc%5Br%5D%5B%5D=EMRO&'
    'sc%5Br%5D%5B%5D=EURO&'
    'sc%5Br%5D%5B%5D=SEARO&'
    'sc%5Br%5D%5B%5D=WPRO&'
    'sc%5Bd%5D=&'
    'sc%5Bv%5D%5B%5D=BCG&'
    'sc%5BOK%5D=OK'
)
    
def get_who_table(url):
    """Download the WHO immunization schedule page and return its BCG rows.

    Note:
    pd.read_html can read tables directly from the internet, but the page is
    downloaded manually here so that the ``colspan="100%"`` attributes can be
    removed first; they make the parser fail with
    ``invalid literal for int() with base 10: '100%'``.

    Parameters
    ----------
    url : str
        Address of the WHO schedule page.

    Returns
    -------
    pandas.DataFrame
        Copy of the rows of the first matched table whose ``Antigens``
        column equals ``"BCG"``.
    """
    # Local import: only needed to hand the markup to pandas as a file-like
    # object (passing literal HTML strings to read_html is deprecated).
    from io import StringIO

    # The context manager closes the HTTP response as soon as the body is read.
    with urllib.request.urlopen(url) as response:
        content = response.read().decode('utf-8')

    # Fix the colspan="100%" error by removing the attribute.
    content = content.replace('colspan="100%"', '')

    # flavor & match are optional
    tables = pd.read_html(
        StringIO(content),
        flavor='bs4',
        match='Africa|Americas|Eastern Mediterranean|Europe|South-East Asia|Western Pacific',
    )

    # 1. Keep only rows where the Antigens column is BCG (other rows hold a
    #    continent name or NaN);
    # 2. Copy the result to avoid SettingWithCopyWarning on later updates.
    who_df = tables[0].query('Antigens=="BCG"').copy()

    return who_df


In [None]:
# Fetch the WHO page at bcg_url and keep the BCG rows it returns
who_df = get_who_table(bcg_url)

In [None]:
# Most frequent WHO schedule strings and how often each one occurs
schedule_counts = who_df['Schedules'].value_counts()
schedule_counts = schedule_counts.rename_axis('Schedules').reset_index(name='Counts')
schedule_counts.head()

# Unify country names

Make the WHO country names match the ones used in the Atlas data.

In [None]:
# Atlas country names (first column) that do not appear in the WHO table,
# with duplicates removed
who_names = who_df.iloc[:, 0].values
atlas_countries = list(
    {name for name in atlas_df.iloc[:, 0].values if name not in who_names}
)

In [None]:
# WHO country names (first column) that do not appear in the Atlas table,
# with duplicates removed
atlas_names = atlas_df.iloc[:, 0].values
who_countries = list(
    {name for name in who_df.iloc[:, 0].values if name not in atlas_names}
)

Automatic mapping, using fuzzy match.

In [None]:
# List of (WHO, Atlas) country-name tuples
map_countries = []
for c in who_countries:
    # Best fuzzy match of the WHO name among the unmatched Atlas names
    r = process.extractOne(c, atlas_countries)
    # extractOne gives None when there is nothing to match against;
    # otherwise keep the pair when the confidence is at least 90
    if r is not None and r[1] >= 90:
        map_countries.append((c, r[0]))

Manual mapping.

In [None]:
# Hand-written WHO name -> Atlas name pairs, added to the fuzzy matches
_manual_names = {
    "North Macedonia": "Macedonia, FYR",
    "Eswatini": "Swaziland",
    "Democratic People's Republic of Korea": "Korea, Dem. Rep.",
    "Cabo Verde": "Cape Verde",
    "Saint Lucia": "St. Lucia",
    "Republic of Korea": "Korea, Rep.",
    "Iran (Islamic Republic of)": "Iran, Islamic Rep.",
    "Micronesia (Federated States of)": "Micronesia, Fed. Sts.",
    "Lao People's Democratic Republic": "Lao PDR",
    "Democratic Republic of the Congo": "Congo, Dem. Rep.",
    "Kyrgyzstan": "Kyrgyz Republic",
}
# Same (WHO, Atlas) tuple layout as the automatically matched pairs
manual_map = list(_manual_names.items())

map_countries += manual_map

In [None]:
# Show the (WHO, Atlas) name pairs collected in map_countries as a table
pd.DataFrame(map_countries, columns=['WHO', 'Atlas'])

Update WHO country names.

In [None]:
# Rename each mapped WHO country to its Atlas spelling, one pair at a time
for who_name, atlas_name in map_countries:
    who_df.loc[who_df['Country'].eq(who_name), 'Country'] = atlas_name

In [None]:
# Display the WHO table after the country names have been updated
who_df

# Atlas extension

We add or update the Atlas BCG country policy whenever the WHO BCG table contains a different vaccination timing or high-risk group definition.

In [None]:
# Atlas columns that are compared against the WHO table
name_col = 'Contry Name (Mandatory field)'
timing_col = 'Vaccination Timing (age)'
risk_col = 'Definition of High-risk groups (if applicable) which receive BCG?'
# A WHO timing whose best fuzzy score against the Atlas timings is below
# this value is treated as a new/different schedule
new_timing_cutoff = 50

# Tuples of (country, Atlas timings, WHO schedule, WHO comments)
extend = []

for _, who_row in who_df.iterrows():
    # Nothing to compare when the WHO BCG schedule is NA
    if pd.isna(who_row.Schedules):
        continue

    # Split the WHO schedule into individual timings; blank fragments
    # (e.g. after a trailing ';') are dropped instead of being compared
    vacc_timing = [t.strip() for t in who_row.Schedules.split(';') if t.strip()]

    # Boolean mask instead of a query string, so a country name containing
    # a quote character cannot break the lookup
    y = atlas_df.loc[atlas_df[name_col] == who_row.Country, [timing_col, risk_col]]

    # Every WHO country is expected to be present in the Atlas by now
    if y.empty:
        raise KeyError(f'Country {who_row.Country}, not found!')

    # True only when all Atlas high-risk group rows for the country are NA
    not_atlas_risk = bool(y[risk_col].isna().all())

    # Individual Atlas timings (comma separated), skipping NA entries
    choices = []
    for atlas_vacc_timing in y[timing_col].dropna():
        choices.extend(atlas_vacc_timing.split(','))

    # Do we have a new/different BCG schedule? A timing counts as new when
    # there is nothing to match it against or the best match is too weak.
    is_new = False
    for t in vacc_timing:
        r = process.extractOne(t, choices)
        is_new |= (r is None) or (r[1] < new_timing_cutoff)

    # Also extend when the Atlas has no risk-group text but WHO has comments
    if is_new or (not_atlas_risk and pd.notna(who_row.Comments)):
        extend.append((who_row.Country,
                       ','.join(choices),
                       who_row.Schedules,
                       who_row.Comments))
        

In [None]:
# Tabulate the collected extensions: country, Atlas timings, WHO schedule and WHO comments
pd.DataFrame(extend, columns=['Country', 'Old timing', 'New timing', 'risk group'])

In [None]:
columns = atlas_df.columns.values

# One Atlas-shaped row per collected extension
extend_lst = []
for country, _, t, r in extend:
    # object dtype from the start, so the string fields below are stored
    # without relying on an implicit float -> object upcast
    s = pd.Series(np.nan, index=columns, dtype=object)

    # Boolean mask instead of a query string, so a country name containing
    # a quote character cannot break the lookup
    y = atlas_df.loc[atlas_df['Contry Name (Mandatory field)'] == country]
    if y.empty:
        raise KeyError(f'Country {country}, not found!')

    # Name and code are copied from the first Atlas row of the country
    s['Contry Name (Mandatory field)'] = y['Contry Name (Mandatory field)'].values[0]
    s['Country Code (Mandatory field)'] = y['Country Code (Mandatory field)'].values[0]
    s['BCG Policy Link (Mandatory field)'] = bcg_url
    s['Is it from bcgatlas.org (Mandatory field)'] = 'no'
    # Timing and risk group come from the WHO schedule and comments
    s['Vaccination Timing (age)'] = t
    s['Definition of High-risk groups (if applicable) which receive BCG?'] = r

    extend_lst.append(s)

# pd.concat raises on an empty list, so fall back to an empty Atlas-shaped frame
if extend_lst:
    extend_df = pd.concat(extend_lst, axis=1).T
else:
    extend_df = pd.DataFrame(columns=columns)
extend_df

In [None]:
# Save the extension rows to a CSV file, leaving out the DataFrame index
extend_df.to_csv('atlas_extension.csv', index=False)