# Downloads Publication Information for PANGO Lineages from the CORD-19 Data Set
**[Work in progress]**

This notebook text-mines [PANGO lineage](https://cov-lineages.org/) mentions in the titles and abstracts of publications and preprints from the CORD-19 data set. Note that the text-mined results may contain false positives!

Data sources: [PANGO Lineage Designations](https://github.com/cov-lineages/pango-designation), 
[CORD-19](https://allenai.org/data/cord-19)

References:

Rambaut A, et al., A dynamic nomenclature proposal for SARS-CoV-2 lineages to assist genomic epidemiology (2020) Nature Microbiology [doi:10.1038/s41564-020-0770-5](https://doi.org/10.1038/s41564-020-0770-5).

Lucy Lu Wang, et al., CORD-19: The COVID-19 Open Research Dataset (2020) [arXiv:2004.10706v4](https://arxiv.org/abs/2004.10706).

Author: Peter Rose (pwrose@ucsd.edu)

In [171]:
import os
import pandas as pd
import io
import dateutil
import re
from pathlib import Path
import nltk
import json, requests
from urllib.request import urlopen
from xml.etree.ElementTree import parse
import urllib
import time
import numpy as np

In [172]:
# Show complete DataFrames in cell output instead of truncated previews.
pd.set_option("display.max_rows", None)  # display all rows
pd.set_option("display.max_columns", None)  # display all columns

In [174]:
# Directory that holds the Neo4j import files, taken from the NEO4J_IMPORT
# environment variable. Fail early with a clear message when it is missing
# (Path(None) would otherwise raise an opaque TypeError).
neo4j_import_dir = os.getenv('NEO4J_IMPORT')
if not neo4j_import_dir:
    raise RuntimeError("Environment variable NEO4J_IMPORT is not set; "
                       "point it to the Neo4j import directory before running this notebook.")
NEO4J_IMPORT = Path(neo4j_import_dir)
print(NEO4J_IMPORT)

/Users/lyt/Library/Application Support/Neo4j Desktop/Application/relate-data/dbmss/dbms-a1516f46-b63a-46dd-b67a-1fb59d6c5d05/import


## Get PANGO lineages

In [175]:
# Read the PANGO lineage table from the import directory. The path is joined
# with the `/` operator because a Path cannot be concatenated to a str with `+`;
# wrapping in Path() keeps this working whether NEO4J_IMPORT is a str or a Path.
pango = pd.read_csv(Path(NEO4J_IMPORT) / "00b-PANGOLineage.csv", dtype=str)

In [176]:
# Preview five randomly chosen rows of the lineage table.
pango.sample(n=5)

Unnamed: 0,lineage,description,alias,predecessor,l0,l1,l2,l3,levels
265,B.1.1.170,UK/Denmark lineage,,,B.1.1.170,B.1.1,B.1,B,4
673,BA.2.18,"Alias of B.1.1.529.2.18, UK lineage, from pang...",B.1.1.529.2.18,B.1.1.529.2,BA.2.18,BA.2,BA,,3
725,B.1.36.2,UK,,,B.1.36.2,B.1.36,B.1,B,4
1059,B.1.277,European lineage,,,B.1.277,B.1,B,,3
1054,B.1.268,USA,,,B.1.268,B.1,B,,3


In [177]:
# Distinct lineage names, used later to validate text-mined candidates.
lineages = pango.loc[:, 'lineage'].unique()

In [178]:
# Candidate PANGO lineage names with one, two, or three numeric levels
# (e.g. B.1, B.1.1, B.1.1.7). A candidate must be preceded and followed by a
# space. The trailing space is a lookahead, so it is not consumed and two
# candidates separated by a single space are both found. Matches therefore
# carry a leading space only; callers strip them.
pattern1 = re.compile(r' [A-Z]{1,2}[.]\d+(?= )', re.IGNORECASE)
pattern2 = re.compile(r' [A-Z]{1,2}[.]\d+[.]\d+(?= )', re.IGNORECASE)
pattern3 = re.compile(r' [A-Z]{1,2}[.]\d+[.]\d+[.]\d+(?= )', re.IGNORECASE)

# add WHO lineage
# NOTE(review): 'Delta' and 'Omicron' are not in this list - confirm whether that is intentional.
who_lineage = [' Alpha ', ' Beta ', ' Gamma ', ' Epsilon ', ' Zeta ', ' Eta ', ' Theta ',
               ' Iota ', ' Kappa ', ' Lambda ', ' Mu ']
# Same space-delimited matching as above, built from the stripped labels.
who_names = "|".join(re.escape(name.strip()) for name in who_lineage)
pattern4 = re.compile(rf' (?:{who_names})(?= )', re.IGNORECASE)

In [179]:
# add who to lineages
# The WHO labels are stored padded with spaces for regex building. Strip the
# padding here: text matches are stripped before they are looked up in
# `lineages`, so padded entries could never be found.
lineages = np.append(lineages, [name.strip() for name in who_lineage])

In [180]:
# remove A B
# Drop the two single-letter root lineages with one boolean mask.
keep_mask = (lineages != 'A') & (lineages != 'B')
lineages = lineages[keep_mask]

## Get CORD-19 Metadata

In [181]:
# Location of the cached CORD-19 metadata snapshot. Built with the `/` operator
# because a Path cannot be concatenated to a str with `+`.
CACHE = Path(NEO4J_IMPORT) / 'cache' / 'cord19' / '2022-03-31' / 'metadata.csv'

In [182]:
# Load the CORD-19 metadata with every column read as text.
metadata = pd.read_csv(CACHE, dtype=str)

In [183]:
# Replace missing values with empty strings so the string operations below are safe.
metadata = metadata.fillna('')
# Year = first four characters of publish_time. Use `>= 4` so that a value of
# exactly four characters (e.g. a bare year '2020') keeps its year instead of
# being dropped.
metadata['year'] = metadata['publish_time'].apply(lambda d: d[:4] if len(d) >= 4 else '')
#convert datetime column to just date
# NOTE(review): dateutil fills missing date parts from a default date, so a
# partial publish_time may not parse to the same value on every run - confirm.
metadata['date'] = metadata['publish_time'].apply(lambda d: dateutil.parser.parse(d) if len(d) > 0 else '')

In [155]:
# Report how many records the metadata table holds.
print("Total number of papers", len(metadata))

Total number of papers 992921


### Modify for knowledge graph

In [184]:
#metadata.head(2)

In [15]:
# Disabled code, kept only as an inert string literal: create_id (applied to
# `hits` further down) supersedes it, so this cell can be removed.
# NOTE(review): if it is ever revived, the final branch returns the bare name
# `cord_uid`, which is not defined inside kg_id.
"""def kg_id(x):
    if x['doi']: return x['doi']
    elif x['pubmed_id']: return x['pubmed_id']
    elif x['pmcid']: return x['pmcid']
    elif x['url']: return x['url']
    elif x['mag_id']: return x['mag_id']
    elif x['who_covidence_id']: return x['who_covidence_id']
    elif x['arxiv_id']: return x['arxiv_id']
    elif x['s2_id']: return x['s2_id']
    else: return cord_uid"""

In [16]:
# Disabled code (inert string literal): would have applied kg_id to every row of `metadata`.
"""metadata['id'] = metadata.apply(kg_id,axis=1)"""

In [29]:
#metadata['id'] = metadata['pmcid']

## Extract a list of PANGO lineages

Remove special characters to simplify parsing of lineages in parentheses, comma-separated lists, etc.

In [185]:
# Replace parentheses, slashes and commas with spaces in both text columns so
# that lineage names end up delimited by spaces.
for text_column in ('title', 'abstract'):
    metadata[text_column] = metadata[text_column].replace('[()/,]', ' ', regex=True)

Match PANGO patterns and check against the list of known lineages.

In [186]:
# Candidate PANGO lineage names with one, two, or three numeric levels
# (e.g. B.1, B.1.1, B.1.1.7). A candidate must be preceded and followed by a
# space. The trailing space is a lookahead, so it is not consumed and two
# candidates separated by a single space are both found. Matches therefore
# carry a leading space only; callers strip them.
pattern1 = re.compile(r' [A-Z]{1,2}[.]\d+(?= )', re.IGNORECASE)
pattern2 = re.compile(r' [A-Z]{1,2}[.]\d+[.]\d+(?= )', re.IGNORECASE)
pattern3 = re.compile(r' [A-Z]{1,2}[.]\d+[.]\d+[.]\d+(?= )', re.IGNORECASE)

# add WHO lineage
# NOTE(review): 'Delta' and 'Omicron' are not in this list - confirm whether that is intentional.
who_lineage = [' Alpha ', ' Beta ', ' Gamma ', ' Epsilon ', ' Zeta ', ' Eta ', ' Theta ',
               ' Iota ', ' Kappa ', ' Lambda ', ' Mu ']
# Same space-delimited matching as above, built from the stripped labels.
who_names = "|".join(re.escape(name.strip()) for name in who_lineage)
pattern4 = re.compile(rf' (?:{who_names})(?= )', re.IGNORECASE)

In [187]:
# add who to lineages
# Append the WHO labels without their regex padding (text matches are stripped
# before lookup), and only those not already present, so that running this
# cell again does not add duplicates.
who_labels = [name.strip() for name in who_lineage]
lineages = np.append(lineages, [label for label in who_labels if label not in lineages])

In [188]:
def get_lineages(row):
    """Return the known PANGO lineages mentioned in a paper's title and abstract.

    Parameters
    ----------
    row : object with `title` and `abstract` string attributes
        One record of the metadata table.

    Returns
    -------
    str
        The distinct valid lineage names joined with ';' in sorted order, or an
        empty string when none is found. Sorting keeps the output identical
        between runs; joining a set directly yields an arbitrary order.
    """
    # Pad with spaces so that names at the very start or end can match the
    # space-delimited patterns.
    text = ' ' + row.title + ' ' + row.abstract + ' '
    candidates = pattern1.findall(text) + pattern2.findall(text) + pattern3.findall(text)

    # Deduplicate before validating so each distinct name is looked up once.
    distinct = {candidate.strip() for candidate in candidates}

    # check if lineage is valid (e.g., not a withdrawn lineage or false positive)
    valid = sorted(name for name in distinct if name in lineages)
    return ";".join(valid)

### Run on whole dataset

In [189]:
metadata['lineages'] = metadata.apply(get_lineages, axis=1)

Keep only papers that map to PANGO lineages

In [190]:
# Keep an independent copy of the rows in which at least one lineage was found.
has_lineage = metadata['lineages'].str.len().gt(0)
hits = metadata.loc[has_lineage].copy()

### Assign CURIEs from [Identifiers.org](https://identifiers.org)

In [191]:
# Prefix each identifier with its Identifiers.org namespace to form a CURIE.
# Empty values stay empty, and values that already carry the prefix are left
# unchanged so that re-running this cell does not produce e.g. 'doi:doi:...'.
curie_prefixes = {'doi': 'doi:', 'pubmed_id': 'pubmed:', 'pmcid': 'pmc:', 'arxiv_id': 'arxiv:'}
for column, prefix in curie_prefixes.items():
    # Bind `prefix` as a default argument so each lambda keeps its own value.
    hits[column] = hits[column].apply(
        lambda x, prefix=prefix: prefix + x if len(x) > 0 and not x.startswith(prefix) else x
    )

In [75]:
#hits.sort_values(by=['publish_time'], ascending=False, inplace=True)

In [192]:
# Report how many papers mention at least one lineage.
print("Number of matches", len(hits))

Number of matches 4429


In [193]:
def create_id(row):
    """Return the first non-empty identifier of a publication record.

    The fields are checked in this priority order: doi, pubmed_id, pmcid,
    arxiv_id, url. An empty string is returned when all of them are empty.
    """
    for field in ('doi', 'pubmed_id', 'pmcid', 'arxiv_id', 'url'):
        value = getattr(row, field)
        if value != '':
            return value
    # TODO deal with WHO papers here?
    return ''

In [194]:
# Add the `id` column: the preferred identifier of each paper (see create_id).
hits = hits.assign(id=hits.apply(create_id, axis=1))

WHO documents seem to be copies of articles that are already present in the dataset and will be ignored for now.

In [195]:
# Discard the papers for which no identifier could be assigned.
hits = hits.loc[hits['id'] != '']

In [196]:
# Report how many papers remain after dropping those without an id.
print("Total number of matches", len(hits))

Total number of matches 3205


In [197]:
# Write the matched papers into the import directory. The path is joined with
# the `/` operator: `+` is not defined between a Path and a str, and plain
# string concatenation would also lack the path separator.
hits.to_csv(Path(NEO4J_IMPORT) / "01h-CORDLineages.csv", index=False)

In [None]:
# Knowledge graph "Publications.csv"

In [198]:
# Columns for the publication table. 'title' is selected twice on purpose: the
# first copy is renamed to 'name' in the next cell.
publication_columns = ['id', 'title', 'doi', 'url', 'pubmed_id', 'pmcid',
                       'journal', 'year', 'title', 'abstract']
df = hits[publication_columns]

In [199]:
# Rename the columns by position to the knowledge-graph property names.
publication_headers = ['id', 'name', 'doi', 'url', 'pubmedId', 'pmcId',
                       'journal', 'year', 'title', 'abstract']
df.columns = publication_headers

In [200]:
# Number of cells in the table (rows x columns), not the number of rows.
df.size

32050

In [201]:
# Keep only the papers that have a PMC id; the full text cannot be fetched
# without one.
df = df.loc[df['pmcId'] != '']

In [157]:
#df.to_csv('Publication_1113.csv',index=False)

## KG Generalization

In [208]:
# download articles in XML and return body paragraph
def download_article(article_id):
    """Download the Europe PMC full-text XML of an article and return its paragraph text.

    Parameters
    ----------
    article_id : str
        Identifier inserted into the Europe PMC REST URL (run_pipeline passes
        the part of the pmcId CURIE after the colon).

    Returns
    -------
    str
        The text of every <p> element in document order, each followed by a
        period and a blank line.

    Raises
    ------
    urllib.error.HTTPError
        Propagated from urlopen when the server answers with an error status.
    """
    url = f'https://www.ebi.ac.uk/europepmc/webservices/rest/{article_id}/fullTextXML'
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as response:
        xmldoc = parse(response)

    # get full text
    root = xmldoc.getroot()

    # put body paragraphs together
    return ''.join(''.join(p.itertext()) + '.\n' + '\n' for p in root.findall('.//p'))

In [89]:
# get lineage for full texts
def get_full_lineage(ptext):
    """Find the first sentence that mentions each known lineage in a full text.

    Parameters
    ----------
    ptext : str
        Article text, e.g. as returned by download_article.

    Returns
    -------
    list of [str, str]
        One [lineage, sentence] pair per distinct lineage, in order of first
        appearance; the sentence is the original, unmodified sentence.
    """
    # Known names with surrounding whitespace removed: the WHO labels may have
    # been appended to `lineages` with their regex padding.
    known = {str(name).strip() for name in lineages}

    linset = set()
    pair = []
    # tokenize texts into sentences
    for s in nltk.tokenize.sent_tokenize(ptext):
        s1 = re.sub('[()/,]', ' ', s)  # remove special chars
        candidates = (pattern1.findall(s1) + pattern2.findall(s1)
                      + pattern3.findall(s1) + pattern4.findall(s1))
        # Sorted so that several lineages in one sentence are recorded in a
        # reproducible order.
        for l in sorted({c.strip() for c in candidates}):
            # Normalize case: dotted PANGO names are upper-cased, WHO labels are
            # capitalized. str.capitalize() alone turns 'BA.2' into 'Ba.2',
            # which can never match a known lineage.
            l = l.upper() if '.' in l else l.capitalize()
            # valid lineage and not recorded
            if l in known and l not in linset:
                linset.add(l)
                pair.append([l, s])
    return pair

In [215]:
def pub_mentions_lin(article_id, real_id):
    """Build the lineage-mention table for one article.

    Parameters
    ----------
    article_id : str
        Identifier used to download the full text (see download_article).
    real_id : str
        Identifier stored in the 'ID' column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'ID', 'lineage', 'string': one row per lineage found, with the
        sentence that mentions it. The frame is empty, with the same columns,
        when the text mentions no lineage.
    """
    body_text = download_article(article_id) # get body text
    record = get_full_lineage(body_text) # extract lineages in text
    # Passing the column names to the constructor also works for an empty
    # record; assigning three names to an empty frame afterwards raises ValueError.
    rows = [(real_id, lineage, sentence) for lineage, sentence in record]
    return pd.DataFrame(rows, columns=['ID', 'lineage', 'string'])

In [221]:
def run_pipeline(N, pub, retries=1, wait=5):
    """Collect the lineage mentions of the first N articles in `pub`.

    Parameters
    ----------
    N : int
        Number of articles to process; capped at the number of rows in `pub`.
    pub : pandas.DataFrame
        Publications with an `id` column and a `pmcId` column holding a CURIE
        of the form 'prefix:identifier'.
    retries : int, optional
        How many times a download is repeated after an HTTP error before the
        article is skipped.
    wait : float, optional
        Seconds to wait before each retry.

    Returns
    -------
    pandas.DataFrame
        Concatenated per-article tables with columns 'ID', 'lineage', 'string';
        empty with the same columns when nothing was found.
    """
    results = []
    # Never index past the end of the table, even if N is too large.
    for i in range(min(N, len(pub))):
        article = pub.iloc[i]
        article_id = article.pmcId.split(":")[1]
        real_id = article.id
        print(f'start article {i}: id {real_id}')
        for attempt in range(retries + 1):
            try:
                results.append(pub_mentions_lin(article_id, real_id))
                break
            except urllib.error.HTTPError as exc:
                if attempt == retries:
                    # Give up on this article and say so instead of skipping silently.
                    print(f'skip article {i}: id {real_id} ({exc})')
                else:
                    time.sleep(wait) # wait and then make http request again

    # pd.concat raises on an empty list, so handle "no results" explicitly.
    results = [table for table in results if not table.empty]
    if not results:
        return pd.DataFrame(columns=['ID', 'lineage', 'string'])
    return pd.concat(results)

In [222]:
pub = df.copy()
# Number of articles to process = number of rows. DataFrame.size is
# rows x columns and would make the pipeline index past the end of the table.
N = len(pub)


In [223]:
# Run the full-text pipeline and save the publication -> lineage relationships.
ans = run_pipeline(N, pub)
# Flat list of names; a nested list would create MultiIndex columns.
ans.columns = ['from', 'to', 'evidence']
ans.to_csv('Publication-MENTIONS-Lineage.csv', index=False)

start article 0: id doi:10.1016/s0168-1702(97)00081-6
start article 1: id doi:10.1016/j.jinf.2021.04.005
start article 2: id doi:10.1016/s1473-3099(21)00262-0


KeyboardInterrupt: 

## Full-text Regex
This part is removed when generating knowledge graph data