<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at 'In [3]'.</span>

# Downloads Publication Information for PANGO Lineages from the CORD-19 Data Set
**[Work in progress]**

This notebook text-mines [PANGO lineage](https://cov-lineages.org/) mentions in the titles and abstracts of publications and preprints from the CORD-19 data set. Note that the text-mined results may contain false positives!

Data sources: [PANGO Lineage Designations](https://github.com/cov-lineages/pango-designation), 
[CORD-19](https://allenai.org/data/cord-19)

References:

Rambaut A, et al., A dynamic nomenclature proposal for SARS-CoV-2 lineages to assist genomic epidemiology (2020) Nature Microbiology [doi:10.1038/s41564-020-0770-5](https://doi.org/10.1038/s41564-020-0770-5).

Lucy Lu Wang, et al., CORD-19: The COVID-19 Open Research Dataset (2020) [arXiv:2004.10706v4](https://arxiv.org/abs/2004.10706).

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
import io
import dateutil
import re
from pathlib import Path
import nltk
import json, requests
from urllib.request import urlopen
from xml.etree.ElementTree import parse
import urllib
import time
import numpy as np

In [2]:
# PANGO lineage names with one, two, or three numeric levels
# (e.g. "B.1", "B.1.1", "B.1.1.7"). Every pattern requires a single space on
# both sides of the name, so callers pad the text with spaces and strip() the
# matches. Raw strings keep `\d` a regex escape instead of an invalid Python
# string escape.
pattern1 = re.compile(r' [A-Z]{1,2}[.]\d+ ', re.IGNORECASE)
pattern2 = re.compile(r' [A-Z]{1,2}[.]\d+[.]\d+ ', re.IGNORECASE)
# Exactly one dot between levels, matching pattern1/pattern2 (was `[.]+`).
pattern3 = re.compile(r' [A-Z]{1,2}[.]\d+[.]\d+[.]\d+ ', re.IGNORECASE)

# add WHO lineage
# Each label carries exactly one leading and one trailing space; an extra
# trailing space (as ' Theta  ' had) prevents matches in normally spaced text.
# NOTE(review): "Delta" is not in this list - confirm whether that is intentional.
who_lineage = [' Alpha ', ' Beta ', ' Gamma ', ' Epsilon ', ' Zeta ', ' Eta ', ' Theta ',
               ' Iota ', ' Kappa ', ' Lambda ', ' Mu ']
pattern4 = re.compile("|".join(who_lineage), re.IGNORECASE)

In [3]:
# Load the lineage table from a file named 'lineages' in the working directory.
# NOTE(review): nothing in this notebook creates that file, and the recorded
# run failed here with FileNotFoundError - document or add the download step.
gg = pd.read_csv('lineages')

FileNotFoundError: [Errno 2] No such file or directory: 'lineages'

In [None]:
# First column of the lineage table as a plain list; the extraction functions
# test candidate names for membership in it.
lineages = gg.iloc[:,0].to_list()

In [None]:
def get_lineages(row):
    """Return the valid PANGO lineages mentioned in a title and abstract.

    Parameters
    ----------
    row : object with `title` and `abstract` attributes (e.g. a DataFrame row)
        Non-string values (None, or NaN for missing cells) count as empty text.

    Returns
    -------
    str
        The distinct names that occur in `lineages`, sorted and joined with
        ";" (an empty string when none are found).
    """
    # Missing cells are read by pandas as float NaN, which cannot be
    # concatenated with str.
    title = row.title if isinstance(row.title, str) else ''
    abstract = row.abstract if isinstance(row.abstract, str) else ''

    # The patterns require a space on both sides of a name, so pad the text.
    text = ' ' + title + ' ' + abstract + ' '
    matches = pattern1.findall(text) + pattern2.findall(text) + pattern3.findall(text)

    # The patterns are case-insensitive, so also try the upper-case spelling
    # of each match; the spelling as written is kept as a candidate too.
    candidates = set()
    for match in matches:
        name = match.strip()
        candidates.add(name)
        candidates.add(name.upper())

    # check if lineage is valid (e.g., not a withdrawn lineage or false positive)
    valid = candidates.intersection(lineages)

    # Sort so the joined string does not depend on set iteration order.
    return ";".join(sorted(valid))

In [None]:
# download articles in XML and return body paragraph
def download_article(article_id):
    """Download an article's full-text XML from Europe PMC and return its paragraph text.

    Parameters
    ----------
    article_id : str
        Identifier inserted into the Europe PMC REST URL.

    Returns
    -------
    str
        The text of every <p> element in the document, each followed by a
        period and a blank line.

    Raises
    ------
    Errors from `urlopen` (e.g. urllib.error.HTTPError) and from the XML
    parser propagate to the caller.
    """
    url = f'https://www.ebi.ac.uk/europepmc/webservices/rest/{article_id}/fullTextXML'

    # Close the HTTP response as soon as the document is parsed instead of
    # leaving the connection open.
    with urlopen(url) as response:
        xmldoc = parse(response)

    # './/p' selects <p> elements at any depth below the root.
    paragraphs = xmldoc.getroot().findall('.//p')

    # itertext() also yields the text of nested inline elements.
    return ''.join(''.join(p.itertext()) + '.\n' + '\n' for p in paragraphs)

In [None]:
# get lineage for full texts
def get_full_lineage(ptext):
    """Find sentences in a full text that mention a valid lineage.

    Parameters
    ----------
    ptext : str
        Article text; it is split into sentences with NLTK's sentence tokenizer.

    Returns
    -------
    list of [str, str]
        One `[lineage, sentence]` pair per distinct valid lineage in each
        sentence. The sentence is the original, unmodified text.
    """
    # tokenize texts into sentences
    sentences = nltk.tokenize.sent_tokenize(ptext)

    # record lineages
    pair = []
    for sentence in sentences:
        # Replace brackets, slashes and commas with spaces so names written
        # as "(B.1.1.7)" or "B.1, B.2" meet the patterns' space delimiters.
        cleaned = re.sub('[()/,]', ' ', sentence)
        matches = (pattern1.findall(cleaned) + pattern2.findall(cleaned)
                   + pattern3.findall(cleaned) + pattern4.findall(cleaned))

        seen = set()
        for match in matches:
            name = match.strip()
            # Dotted PANGO names are upper-cased; str.capitalize() would turn
            # "AY.4" into "Ay.4", which then fails the validity check.
            # Names without a dot (the WHO labels) are capitalised.
            name = name.upper() if '.' in name else name.capitalize()

            # Record each lineage at most once per sentence.
            if name in seen:
                continue
            seen.add(name)

            if name in lineages:
                pair.append([name, sentence])
    return pair

In [None]:
def pub_mentions_lin(article_id, real_id):
    """Build the lineage-mention table for one article.

    Downloads the article text for `article_id`, extracts the
    `[lineage, sentence]` pairs, and returns a DataFrame with the columns
    'ID', 'lineage', 'string', where 'ID' is `real_id` on every row. When no
    lineage is found, an empty DataFrame without columns is returned.
    """
    mentions = get_full_lineage(download_article(article_id))

    # Nothing found: empty frame, no column labels.
    if not mentions:
        return pd.DataFrame(mentions)

    # Put the article id first so the columns are already in output order.
    rows = [[real_id, lineage, sentence] for lineage, sentence in mentions]
    return pd.DataFrame(rows, columns=['ID', 'lineage', 'string'])

In [None]:
# Scratch check: the part of one row's pmcId after the ':'.
# NOTE(review): `pub` is only loaded in a later cell, so this fails on a fresh
# kernel when cells run top to bottom; the row index 1735 is hard-coded.
pub.iloc[1735].pmcId.split(":")[1]

In [None]:
def run_pipeline(N, pub):
    """Text-mine lineage mentions for the first N rows of `pub`.

    Parameters
    ----------
    N : int
        Number of leading rows of `pub` to process.
    pub : pandas.DataFrame
        Publication table with a `pmcId` column (the text after the first ':'
        is used as the download id) and an `id` column.

    Returns
    -------
    pandas.DataFrame
        Concatenated per-article results. When no mentions were collected, an
        empty frame with the columns 'ID', 'lineage', 'string' is returned
        instead of raising from `pd.concat`.
    """
    results = []
    for i in range(N):
        article = pub.iloc[i]
        print(f'start article {i}')
        if i%100 == 0:
            print(f'{i}/{N}')

        # A missing pmcId is read as float NaN, and a value without ':' has no
        # second part; skip such rows instead of aborting the whole run.
        pmc_id = article.pmcId
        if not isinstance(pmc_id, str) or ':' not in pmc_id:
            continue
        article_id = pmc_id.split(":")[1]
        real_id = article.id

        try:
            results.append(pub_mentions_lin(article_id, real_id))
        except urllib.error.HTTPError:
            # The failed article is skipped, not retried; wait 5 seconds
            # before requesting the next one.
            time.sleep(5)
            continue

    # Empty per-article frames add no rows; pd.concat raises ValueError on an
    # empty list, so handle that case explicitly.
    non_empty = [df for df in results if not df.empty]
    if not non_empty:
        return pd.DataFrame(columns=['ID', 'lineage', 'string'])
    return pd.concat(non_empty)

In [None]:
# Load the publication table; run_pipeline reads its `pmcId` and `id` columns.
pub = pd.read_csv("Publication.csv")
# Number of publications; passed to run_pipeline so that every row is processed.
N = len(pub)


In [None]:
# Display the number of publications that will be processed.
N

In [None]:
# Run the pipeline over all publications and export the mentions table.
ans = run_pipeline(N, pub)
# Rename (ID, lineage, string) for the export. A flat list assigns ordinary
# column labels; the nested list used before builds a MultiIndex.
ans.columns = ['from','to','evidence']
ans.to_csv('Publication-MENTIONS-Lineage.csv',index=False)

In [None]:
# Disabled scratch code kept as a bare string literal, so none of it executes.
# It repeats the loop body of run_pipeline starting at row 144 and collects
# the per-article frames in `results`.
"""results = []
for i in range(144, N):
    article = pub.iloc[i]
    article_id = article.pmcId.split(":")[1]
    real_id = article.id
    print(f'start article {i}')
    if i%100 == 0:
        print(f'{i}/{N}')

    try:
        results.append(pub_mentions_lin(article_id, real_id))
    except urllib.error.HTTPError as exc:
        time.sleep(5) # wait 5 seconds and then make http request again
        continue"""

In [None]:
# Disabled scratch expression kept as a string literal; it is not evaluated.
"""results_144_to_321[:3]"""

In [None]:
#results_144_to_321 = results

In [None]:
# partial results 

#results_0_to_144 = results

## Full-Text Regex
This part is removed when generating knowledge graph data

In [None]:
# Write the partial results for rows 144-321 to a CSV file.
# NOTE(review): the only assignment of `results_144_to_321` in this notebook is
# commented out, so this raises NameError on a fresh kernel.
pd.concat(results_144_to_321).to_csv('results_144_to_321')

In [None]:
# Write the partial results for rows 0-144 to a CSV file.
# NOTE(review): the only assignment of `results_0_to_144` in this notebook is
# commented out, so this raises NameError on a fresh kernel.
pd.concat(results_0_to_144).to_csv('results_0_to_144')