**<center style="font-size: 16pt;"><a href="https://www.kaggle.com/atmarouane/cord-19-step3-enm/">Ensemble Model (EnM) for document retrieval results</a></center>**

<h1><span class="tocSkip"></span>Table of Contents</h1>
<div id="toc-wrapper"></div>
<div id="toc"></div>

# Prerequisites

## Configuration class

We define where the inputs are loaded from, along with a few tuning parameters and the query text.

In [None]:
class config:
    """Notebook-wide settings: input file locations, scoring parameters and the query."""

    # Input files read by this notebook
    CORPUS_FN = '/kaggle/input/cord-19-step2-corpus/corpus.pkl'
    ENM_FN = '/kaggle/input/cord-19-step3-enm/ranker_enm.pickle'
    TOC2_FN = '/kaggle/input/toc2js/toc2.js'

    # Passed to lm.get_relevant when selecting the papers to inspect
    n_relevant = 200

    # Threshold: keep a sentence only if its score is higher than 'th'
    th = 0.52
    # Sentence/snippet score = alpha*NMF_cosine_similarity + (1-alpha)*jaccard_similarity
    alpha = 0.5

    # Free-text question the search is run for
    query_txt = 'Is BCG vaccination causally related to reduced COVID‐19 mortality?'

## Libraries

### SE libraries

All our libraries are publicly available as open source.

In [None]:
import cord_19_container as container
import cord_19_rankers as rankers
import cord_19_lm as lm
import cord_19_vis as vis

from cord_19_container import Sentence, Document, Paper, Corpus

from cord_19_metrics import compute_queries_perf

from cord_19_helpers import load, save
from cord_19_text_cleaner import Cleaner
from cord_19_wn_phrases import wn_phrases

### Common libraries

In [None]:
from gensim import matutils
from sklearn.metrics.pairwise import cosine_similarity

import copy
from collections import defaultdict
import re
from textwrap import wrap

import numpy as np
import pandas as pd

### Visualization libraries

In [None]:
%matplotlib inline

from IPython.display import display, HTML, Markdown, Latex

import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide CSS: centre PNG outputs and give rendered tables `display: table`.
HTML("""
<style>
.output_png {
    text-align: center;
    vertical-align: middle;
}

.rendered_html table{
    display: table;
}
</style>
""")

## Load data

Load the corpus of papers about COVID-19/SARS-CoV-2 that was built in our previous kernel.

In [None]:
# Load the corpus and take the dictionary it carries
corpus = load(config.CORPUS_FN)
dictionary = corpus.dictionary

# Only token2id is saved, so the reverse mapping (id -> token) is rebuilt here
for token, token_id in dictionary.token2id.items():
    dictionary.id2token[token_id] = token

# Expose the dictionary as a module-level global of each helper module
# (a better mechanism is still to be found)
for helper_module in (container, rankers, vis):
    helper_module.dictionary = dictionary

print(f'#Papers {len(corpus)}, #Tokens {len(dictionary)}')

## Load model

Loading our model.

In [None]:
# Load the ensemble ranker, then pull out its 'NMF' member model;
# display_results uses ranker_nmf to project sentences and the query.
ranker_enm = load(config.ENM_FN)
ranker_nmf = ranker_enm.models['NMF']

# Introduction

BCG vaccination has been reported to offer broad protection against respiratory infections [[1]](https://www.medrxiv.org/content/10.1101/2020.03.24.20042937v1.full.pdf).

The goal of this kernel is to search the [CORD-19 database](https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge) for papers addressing the question **"Is BCG vaccination causally related to reduced COVID‐19 mortality?"**.

Figure 1 illustrates one hypothesis that links a country's BCG policy to its COVID-19 mortality.

In [None]:
def rename_field(df, field, from_to):
    """
    Replace values inside one DataFrame column, in place.

    df: DataFrame, modified in place (nothing is returned)
    field: string, name of the column whose values are updated
    from_to: list of tuples (from, to); pairs are applied one after the other,
             so a later pair also sees values written by an earlier pair
    """
    for pair in from_to:
        current, replacement = pair
        # Overwrite every cell of the column that currently equals `current`
        df.loc[df[field] == current, field] = replacement
    
def load_atlas(path='/kaggle/input/hackathon/BCG_world_atlas_data-2020.csv'):
    """
    Load the BCG world atlas and flag countries with a universal BCG policy.

    path: string, location of the atlas CSV (defaults to the Kaggle input file)

    Returns a DataFrame with the columns 'country_name' and 'universal_BCG',
    restricted to countries whose BCG policy is ongoing or that never had one.
    """
    # Default NaN parsing is switched off so that literal values such as 'N/A'
    # stay strings; the filters below compare against them.
    df = pd.read_csv(path,
                     dtype={'BCG Policy First Year': str},
                     na_values=None,
                     keep_default_na=False)

    # Rename some columns to simplest form.
    # NOTE(review): 'Contry' is kept exactly as written; presumably it matches
    # the header spelling in the CSV - confirm before "fixing" it.
    df = df.rename(columns={'Contry Name (Mandatory field)': 'country_name',
                            'BCG Policy First Year': 'BCG_First_Year',
                            'BCG Policy Last Year': 'BCG_Last_Year'})

    # Remove spaces at the beginning and at the end of the string
    df['country_name'] = df['country_name'].str.strip()

    # Fix a typo in Atlas where Uzbekistan named as 'mexico' (lowercase).
    df.loc[df['country_name'] == 'mexico', 'country_name'] = 'Uzbekistan'

    # We mark Italy as never had BCG Policy, please look at www.bcgatlas.org
    df.loc[df['country_name'] == 'Italy', 'BCG_First_Year'] = 'N/A'

    # Drop Germany (East), we will use Germany (West) as Germany
    df = df[df['country_name'] != 'Germany (East)'].copy()

    # Countries to rename (from, to)
    cntrs_map = [
        ('United States of America', 'United States'),
        ('Macedonia, FYR', 'Macedonia'),
        ('Germany (West)', 'Germany'),
    ]
    rename_field(df, 'country_name', cntrs_map)

    # Filter BCG Atlas:
    #   BCG_Last_Year == "ongoing": Ongoing BCG policy
    #   BCG_First_Year == "N/A": Never had BCG policy
    ongoing_policy = df['BCG_Last_Year'] == 'ongoing'
    never_had_policy = df['BCG_First_Year'] == 'N/A'
    df = df[ongoing_policy | never_had_policy].copy()

    df['universal_BCG'] = df['BCG_First_Year'] != 'N/A'

    return df[['country_name', 'universal_BCG']]

def load_wwb(url='http://databank.worldbank.org/data/download/site-content/CLASS.xls'):
    """
    Load World Bank economies classification;
    Economy groups: Low income, Lower middle income,
                    High income, Upper middle income

    url: string, location of the CLASS.xls workbook, handed to pd.read_excel
         (defaults to the World Bank download URL; a local copy can be given instead)

    Returns a DataFrame with the columns 'country_name' and 'Income_group'.
    """
    # Rows 0-3 and row 5 are skipped, so row 4 becomes the header;
    # at most 218 data rows are read.
    df = pd.read_excel(url,
                       skiprows=[0, 1, 2, 3, 5],
                       nrows=218)

    # Rename some columns to simplest form.
    df = df.rename(columns={'Economy': 'country_name',
                            'Income group': 'Income_group'})

    # Countries to rename (from, to)
    cntrs_map = [
        ('Hong Kong SAR, China', 'Hong Kong'),
        ('North Macedonia', 'Macedonia'),  # Better to keep 'North Macedonia'
        ('Taiwan, China', 'Taiwan'),
    ]
    rename_field(df, 'country_name', cntrs_map)

    return df[['country_name', 'Income_group']]

def load_dpm(path='/kaggle/input/hackathon/task_2-COVID-19-death_cases_per_country_after_frist_death-till_26_June.csv'):
    """
    Load COVID-19 Deaths/million database

    path: string, location of the deaths-per-million CSV
          (defaults to the Kaggle input file)

    Returns a DataFrame with the columns 'country_name' and 'deaths_per_million';
    rows without any deaths-per-million value are dropped.
    """
    df = pd.read_csv(path)

    # Row-wise maximum over the columns from the 4th onward. Per the original
    # note these run from '10 days after first death' until
    # '100 days after first death' - TODO confirm against the CSV header.
    df['deaths_per_million'] = df.iloc[:, 3:].max(axis=1)

    # Countries to rename (from, to)
    cntrs_map = [
        ('Iran', 'Iran, Islamic Rep.'),
        ('South Korea', 'Korea, Rep.'),
        ('Russia', 'Russian Federation'),
        ('Slovakia', 'Slovak Republic'),
    ]
    rename_field(df, 'country_name', cntrs_map)

    df = df.dropna(subset=['deaths_per_million'])

    return df[['country_name', 'deaths_per_million']]

In [None]:
# Load the three sources: BCG world atlas, COVID-19 Deaths/million,
# World Bank economies classification
atlas_df = load_atlas()
dpm_df = load_dpm()
wwb_df = load_wwb()

# Join the three tables on the country name
data = atlas_df.merge(wwb_df, on='country_name')
data = data.merge(dpm_df, on='country_name')

# Merge Upper middle, High income together
upper_or_high = data['Income_group'].isin(['Upper middle income', 'High income'])
data.loc[upper_or_high, 'Income_group'] = 'Upper middle & High income'

# Set country category as done in [1]
# Categories:
#     Lower middle income countries with universal BCG policy;
#     Upper middle & High income countries with universal BCG policy;
#     Upper middle & High income countries that never had universal BCG policy.
data['category'] = data['Income_group'] + ' countries'
with_policy = data['universal_BCG'].eq(True)
without_policy = data['universal_BCG'].eq(False)
data.loc[with_policy, 'category'] += ' with universal BCG policy'
data.loc[without_policy, 'category'] += ' that never had universal BCG policy'

# The countries with low-income levels (18) reported few number of cases of COVID-19 per million
# inhabitants: 0.32 ± 0.09. However, the issue of underreporting might be more critical for
# estimating the number of cases and we have excluded the low income countries from further
# analysis. [1]
data = data[data['Income_group'] != 'Low income']

# Summarise what is left after the filter
never_bcg_names = data.loc[data['universal_BCG'].eq(False), 'country_name'].tolist()
universal_bcg_names = data.loc[data['universal_BCG'].eq(True), 'country_name'].tolist()

display(HTML(
    f"Countries that never had universal BCG policy: <b>{', '.join(never_bcg_names)}</b>.<BR>"
    f"Number of countries with universal BCG policy: <b>{len(universal_bcg_names)}</b>."
))

In [None]:
# Left-to-right order of the boxes: the three categories compared in Figure 1
columns_order = [
    'Lower middle income countries with universal BCG policy',
    'Upper middle & High income countries with universal BCG policy',
    'Upper middle & High income countries that never had universal BCG policy',
]

fig, ax = plt.subplots(figsize=(14, 7))

# Collapse to one row per country via groupby(...).last()
per_country = data.groupby(by='country_name').last()

chart = sns.boxplot(x='category', y='deaths_per_million',
                    data=per_country,
                    order=columns_order)

# Wrap the long category names at 30 characters for the tick labels
wrapped_labels = ["\n".join(wrap(label, width=30)) for label in columns_order]
chart.set_xticklabels(wrapped_labels)

plt.suptitle('')
plt.title('COVID-19 deaths per million per country till 26 June 2020')
plt.xlabel('')
plt.ylabel('Deaths per million')
plt.show()

caption = ('<center><b>Figure 1:</b> Higher death rates were presented in countries that never '
           'implemented a universal BCG vaccination policy.</center>')
display(HTML(caption))

# Query

In [None]:
# Build the query document: clean the raw question, tokenize it, then run wn_phrases on it
cleaner = Cleaner(True)
query = container.Document([cleaner.clean(config.query_txt)])
query.tokenize()
wn_phrases(query)

# Show the question as it was originally written
display(HTML('We are looking for:<br><br>'))
q_original_text = '<br>'.join(s.original_text for s in query.sentences)
display(HTML(f'<p style="font-size: 18pt;">{q_original_text}</p>'))

# When debugging, print query.text

# Results

In [None]:
def display_results(ranker, q, th=0.5, alpha=0.5):
    """
    Display, for each relevant paper, its best-matching snippets and return them.

    ranker: ranking model; ranker[q] yields the scores handed to lm.get_relevant
    q: query document, must expose 'bow'
    th: float, a sentence is kept only if its blended score is strictly above th
    alpha: float, weight of the NMF cosine similarity;
           (1-alpha) is the weight of the Jaccard similarity

    Returns a list of tuples (title, url, snippets), one per paper that has at
    least one kept sentence. snippet: sentences close to the query, joined by
    newlines (raw text, not HTML-escaped).

    Relies on the notebook globals corpus, config.n_relevant and ranker_nmf.
    """
    from html import escape

    r = []

    scores = ranker[q]
    _, R = lm.get_relevant(corpus, scores, config.n_relevant)

    # The query projection does not depend on the paper, so it is computed once.
    # NOTE(review): assumes ranker_nmf.project is deterministic - confirm.
    q_nmf = ranker_nmf.project(q)

    total = 0
    for paper in R:
        # Sentences with at least two distinct words
        sentences = [sent for sent in paper if len(sent.bow) > 1]
        if not sentences:
            # Nothing to score; skip before handing an empty batch to cosine_similarity
            continue

        s_nmf = ranker_nmf.project(sentences)
        sim = cosine_similarity(s_nmf, q_nmf)[:, 0]

        # Blend the NMF cosine similarity with the Jaccard similarity
        for j, sent in enumerate(sentences):
            jaccard_sim = 1 - matutils.jaccard(q.bow, sent.bow)
            sim[j] = alpha*sim[j] + (1.-alpha)*jaccard_sim

        # Five best sentences, best first, restricted to those above the threshold
        kept = [j for j in np.argsort(sim)[::-1][:5] if sim[j] > th]
        if not kept:
            continue

        # Corpus text is escaped before it goes into markup, so characters such
        # as '<' or '&' in a title or sentence cannot break the rendered table.
        # NOTE(review): assumes original_text/title are plain text, not HTML - confirm.
        rows = ''.join(
            f"<tr><td>{sim[j]:.3f}</td>"
            f"<td style='text-align:left;'>{escape(sentences[j].original_text)}</td></tr>"
            for j in kept
        )
        top5_tbl = ("<table>"
                    "<tr><th>Score</th><th style='text-align:left;'>Snippet</th></tr>"
                    + rows +
                    "</table>")

        link = ('<a href="https://doi.org/' + escape(paper.doi, quote=True)
                + '" target=blank>' + escape(paper.title) + '</a>')
        display(HTML(link))
        display(HTML(top5_tbl))

        sent_lst = [sentences[j].original_text for j in kept]
        r.append((paper.title,
                  'https://doi.org/' + paper.doi,
                  '\n'.join(sent_lst)))

        total += 1

    print('Total', total)

    return r
        


In [None]:
%%time

# Run the search with the threshold and blend weight from config;
# r receives the (title, url, snippets) tuples returned by display_results.
r = display_results(ranker_enm, query,
                    th=config.th, alpha=config.alpha)

In [None]:
# Save the retrieved papers (title, link, snippets) as CSV, without the index column
df = pd.DataFrame(r, columns=['title', 'link', 'snippets'])
df.to_csv('COVID-19_BCG.csv', index=False)

In [None]:
from IPython.display import HTML

# Read the toc2 script, then embed it in the notebook inside a <script> tag
with open(config.TOC2_FN, 'r') as toc_file:
    js = toc_file.read()

script_tag = '<script type="text/Javascript">' + js + '</script>'
display(HTML(script_tag))

# The script source is no longer needed once it has been displayed
del js

In [None]:
%%javascript

// Autonumbering & Table of Contents
// Using: https://github.com/ipython-contrib/jupyter_contrib_nbextensions/tree/master/src/jupyter_contrib_nbextensions/nbextensions/toc2
table_of_contents(default_cfg);