# Read datafiles, merge, and lightly clean

In [1]:
import datetime
import json
import os
import re
from copy import deepcopy
from pprint import pprint

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

## Configuration

In [2]:
ROOT = '..'

In [3]:
datafile_date = '2020-04-10-v7'

In [4]:
PROCESS_SMALL_DOCS = False # True  # Small docs are the individual paragraphs in the text

In [5]:
# Sub-directory, inside each subset folder, that holds the parsed JSON files.
json_subdir = 'pdf_json/'  # may want pmc (xml) for small docs (set to '' for dataversion <= 5)

# Dataset label -> directory containing that dataset's parsed-paper JSON files.
SOURCE_FILES = {
    label: ROOT + f'/data/raw/{datafile_date}/{subset}/{json_subdir}'
    for label, subset in [
        ('COMM-USE', 'comm_use_subset'),
        ('BioRxiv', 'biorxiv_medrxiv'),
        ('NONCOMM', 'noncomm_use_subset'),
        ('PMC', 'custom_license'),
    ]
}

In [6]:
metadata_file = ROOT + f'/data/raw/{datafile_date}/metadata.csv'

In [7]:
# Interim output directory and the JSON-lines files written at the end of the notebook.
outdir = f'{ROOT}/data/interim/{datafile_date}/'
outfile = outdir + f'{datafile_date}-covid19-combined.jsonl'
outfile_small_docs = outdir + f'{datafile_date}-covid19-combined-smalldocs.jsonl'
outfile_only_abstracts = outdir + f'{datafile_date}-covid19-combined-only-abstracts.jsonl'
outfile_abstracts = outdir + f'{datafile_date}-covid19-combined-abstracts.jsonl'

# Keyword arguments for JSON-lines I/O (one record per line); outputs add ISO date formatting.
json_args = dict(orient='records', lines=True)
out_json_args = dict(date_format='iso', **json_args)

In [8]:
# Create the output directory (and any missing parents) before anything is written.
# `outdir` is the only output directory this notebook defines, so it is the only one
# created here; referencing any other name would raise NameError on a fresh kernel.
out_path_mode = 0o777  # requested permission bits for newly created directories
os.makedirs(outdir, mode=out_path_mode, exist_ok=True)

## Helper Functions

Some of these functions are adapted from https://www.kaggle.com/xhlulu/cord-19-eda-parse-json-and-generate-clean-csv

In [9]:
def format_name(author):
    """Return an author's display name as "first [middle ...] last".

    `author` is a mapping with 'first', 'middle' (an iterable of strings) and
    'last' entries. The middle names are space-joined and only included when
    there is at least one, so no double space appears in the result.
    """
    middle = " ".join(author['middle'])
    if author['middle']:
        pieces = (author['first'], middle, author['last'])
    else:
        pieces = (author['first'], author['last'])
    return " ".join(pieces)


def format_affiliation(affiliation):
    """Render an affiliation mapping as "institution, <location values...>".

    The institution (when present and non-empty) comes first, followed by the
    values of the 'location' mapping in their stored order. Returns an empty
    string when neither is available.
    """
    institution = affiliation.get('institution')
    location = affiliation.get('location')

    parts = [institution] if institution else []
    if location:
        parts += location.values()
    return ", ".join(parts)

def format_authors(authors, with_affiliation=False):
    """Join the formatted names of `authors` into one comma-separated string.

    When `with_affiliation` is true, each name is followed by its formatted
    affiliation in parentheses, unless that affiliation renders as empty.
    """
    def _render(author):
        name = format_name(author)
        if not with_affiliation:
            return name
        affiliation = format_affiliation(author['affiliation'])
        return f"{name} ({affiliation})" if affiliation else name

    return ", ".join(_render(author) for author in authors)

def format_body(body_text):
    """Concatenate paragraph dicts into one string grouped by section.

    Paragraph texts sharing a 'section' are concatenated (no separator) in
    input order; sections appear in order of first occurrence. Each section is
    emitted as its title, a blank line, its text, and another blank line.
    """
    grouped = {}
    for paragraph in body_text:
        grouped.setdefault(paragraph['section'], []).append(paragraph['text'])

    return "".join(
        section + "\n\n" + "".join(chunks) + "\n\n"
        for section, chunks in grouped.items()
    )

def format_bib(bibs):
    """Render bibliography entries as text, one entry per line.

    Parameters
    ----------
    bibs : dict or iterable of dict
        Bibliography entries; when a mapping is given its values are used.
        Each entry must provide 'title', 'authors', 'venue' and 'year', and may
        provide 'other_ids' with 'DOI' and/or 'PMID' identifiers.

    Returns
    -------
    str
        Entries joined with "\\n "; within an entry the fields are joined with
        ", " in the order title, authors, venue, year, DOIs, PMIDs (each PMID
        is prefixed with "PMID").

    Notes
    -----
    The input is not modified, so no defensive deep copy is needed.
    Identifier values are presumably lists of strings (TODO confirm against the
    data schema); a bare value is treated as a one-element list so that a
    string is not split into characters.
    """
    if isinstance(bibs, dict):
        bibs = bibs.values()

    def _as_list(value):
        # Wrap a scalar identifier so it is added as a single item.
        return list(value) if isinstance(value, (list, tuple)) else [value]

    formatted = []
    for bib in bibs:
        authors = format_authors(bib['authors'], with_affiliation=False)
        other_ids = bib.get('other_ids') or {}
        doi = other_ids.get('DOI')
        pmid = other_ids.get('PMID')

        parts = [str(bib['title']), authors, str(bib['venue']), str(bib['year'])]
        if doi:
            parts.extend(str(d) for d in _as_list(doi))
        if pmid:
            parts.extend('PMID' + str(p) for p in _as_list(pmid))
        formatted.append(", ".join(parts))

    return "\n ".join(formatted)

In [10]:
def bib_titles(bibs):
    """Map each bibliography key to the 'title' of its entry."""
    return {key: entry['title'] for key, entry in bibs.items()}

def extract_small_docs(main_doc_id, body_text, bib_titles_dict):
    """Turn each body paragraph into a "small document" tuple.

    Returns a list of
    (main_doc_id, paragraph index, paragraph text, section, cited titles),
    where the cited titles are the non-empty titles found in `bib_titles_dict`
    for the 'ref_id' of each of the paragraph's 'cite_spans'.
    """
    small_docs = []
    for position, paragraph in enumerate(body_text):
        looked_up = (
            bib_titles_dict.get(span['ref_id'])
            for span in paragraph['cite_spans']
        )
        cited_titles = [title for title in looked_up if title]
        small_docs.append(
            (main_doc_id, position, paragraph['text'], paragraph['section'], cited_titles)
        )
    return small_docs

In [11]:
def load_files(dirname):
    """Parse every file in `dirname` as JSON and return the parsed objects.

    Parameters
    ----------
    dirname : str
        Directory to read; every entry returned by `os.listdir` is opened and
        parsed, in listing order.

    Returns
    -------
    list
        One parsed JSON object per directory entry.
    """
    raw_files = []

    for filename in tqdm(os.listdir(dirname)):
        # os.path.join works whether or not `dirname` ends with a separator.
        path = os.path.join(dirname, filename)
        # The context manager closes each handle instead of leaking one per file.
        with open(path, 'rb') as handle:
            raw_files.append(json.load(handle))

    return raw_files

def generate_clean_df(all_files, prepare_small_docs=False):
    """Build a flat DataFrame (one row per paper) from parsed paper dicts.

    Parameters
    ----------
    all_files : iterable of dict
        Parsed papers providing 'paper_id', 'metadata' (with 'title' and
        'authors'), 'abstract', 'body_text' and 'bib_entries'.
    prepare_small_docs : bool, default False
        When true, also extract one "small document" per body paragraph.

    Returns
    -------
    pandas.DataFrame
        When `prepare_small_docs` is false: the per-paper frame.
    tuple of (pandas.DataFrame, pandas.DataFrame)
        When `prepare_small_docs` is true: the per-paper frame and the
        per-paragraph frame. Callers must unpack according to the flag.
    """
    col_names = ['paper_id', 'title', 'authors',
                 'affiliations', 'abstract', 'text',
                 'bibliography', 'raw_authors', 'raw_bibliography',
                 'num_small_docs']
    small_doc_col_names = ['paper_id', 'small_doc_num', 'text', 'section', 'ref_titles']

    cleaned_files = []
    small_docs = []

    for file in tqdm(all_files):
        if prepare_small_docs:
            bib_titles_dict = bib_titles(file['bib_entries'])
            docs = extract_small_docs(file['paper_id'], file['body_text'], bib_titles_dict)
        else:
            docs = []

        metadata = file['metadata']
        cleaned_files.append([
            file['paper_id'],
            metadata['title'],
            format_authors(metadata['authors']),
            format_authors(metadata['authors'], with_affiliation=True),
            format_body(file['abstract']),
            format_body(file['body_text']),
            format_bib(file['bib_entries']),
            metadata['authors'],      # raw author dicts, kept alongside the formatted string
            file['bib_entries'],      # raw bibliography, kept alongside the formatted string
            len(docs),                # 0 when small docs are not prepared
        ])
        small_docs.extend(docs)

    clean_df = pd.DataFrame(cleaned_files, columns=col_names)

    if not prepare_small_docs:
        return clean_df

    small_docs_df = pd.DataFrame(small_docs, columns=small_doc_col_names)
    return clean_df, small_docs_df

## Load Data

### Load Metadata

In [12]:
metadata_df = pd.read_csv(metadata_file)

In [13]:
metadata_df.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,xqhn0vbp,1e1286db212100993d03cc22374b624f7caee956,PMC,Airborne rhinovirus detection and effect of ul...,10.1186/1471-2458-3-5,PMC140314,12525263.0,no-cc,"BACKGROUND: Rhinovirus, the most common cause ...",2003-01-13,"Myatt, Theodore A; Johnston, Sebastian L; Rudn...",BMC Public Health,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
1,gi6uaa83,8ae137c8da1607b3a8e4c946c07ca8bda67f88ac,PMC,Discovering human history from stomach bacteria,10.1186/gb-2003-4-5-213,PMC156578,12734001.0,no-cc,Recent analyses of human pathogens have reveal...,2003-04-28,"Disotell, Todd R",Genome Biol,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
2,le0ogx1s,,PMC,A new recruit for the army of the men of death,10.1186/gb-2003-4-7-113,PMC193621,12844350.0,no-cc,"The army of the men of death, in John Bunyan's...",2003-06-27,"Petsko, Gregory A",Genome Biol,,,False,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,fy4w7xz8,0104f6ceccf92ae8567a0102f89cbb976969a774,PMC,Association of HLA class I with severe acute r...,10.1186/1471-2350-4-9,PMC212558,12969506.0,no-cc,BACKGROUND: The human leukocyte antigen (HLA) ...,2003-09-12,"Lin, Marie; Tseng, Hsiang-Kuang; Trejaut, Jean...",BMC Med Genet,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
4,0qaoam29,5b68a553a7cbbea13472721cd1ad617d42b40c26,PMC,A double epidemic model for the SARS propagation,10.1186/1471-2334-3-19,PMC222908,12964944.0,no-cc,BACKGROUND: An epidemic of a Severe Acute Resp...,2003-09-10,"Ng, Tuen Wai; Turinici, Gabriel; Danchin, Antoine",BMC Infect Dis,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...


### Clean Metadata

In [14]:
metadata_df[metadata_df['cord_uid'].duplicated(keep=False)].sort_values('cord_uid').head(10)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
26037,0klupmep,,Elsevier,Infectious disease surveillance update,10.1016/s1473-3099(19)30075-1,,30833065.0,els-covid,,2019-03-31,"Zwizwai, Ruth",The Lancet Infectious Diseases,,,False,False,custom_license,https://doi.org/10.1016/s1473-3099(19)30075-1
16086,0klupmep,,PMC,Infectious disease surveillance update,10.1016/s1473-3099(19)30075-1,PMC7129894,30833064.0,no-cc,,2019-02-27,"Zwizwai, Ruth",Lancet Infect Dis,,,False,False,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
25904,0z5wacxs,7e787fd2ae5b544add6281d3d40ad322de26aa17,Elsevier,Transportation capacity for patients with high...,10.1111/1469-0691.12290,,24750421.0,els-covid,Abstract Highly infectious diseases (HIDs) are...,2019-04-30,"Schilling, S.; Maltezou, H.C.; Fusco, F.M.; De...",Clinical Microbiology and Infection,,,True,False,custom_license,https://doi.org/10.1111/1469-0691.12290
16084,0z5wacxs,7e787fd2ae5b544add6281d3d40ad322de26aa17,PMC,Transportation capacity for patients with high...,10.1111/1469-0691.12290,PMC7128608,25636943.0,no-cc,Highly infectious diseases (HIDs) are defined ...,2015-06-22,"Schilling, S.; Maltezou, H.C.; Fusco, F.M.; De...",Clin Microbiol Infect,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
25633,21htepa1,a25e212b03cc65c44dcc336775b101934e30f041,Elsevier,Panspermia—true or false?,10.1016/s0140-6736(03)14040-8,,12907025.0,els-covid,,2003-08-02,"de Leon, Samuel Ponce; Lazcano, Antonio",The Lancet,,,True,False,custom_license,https://doi.org/10.1016/s0140-6736(03)14040-8
16101,21htepa1,,PMC,Panspermia—true or false?,10.1016/s0140-6736(03)14040-8,PMC7135165,12907026.0,no-cc,,2003-08-02,"de Leon, Samuel Ponce; Lazcano, Antonio",Lancet,,,False,False,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
25484,21qu87oh,68249d769e1926678af8d52d2484e36787e13525,Elsevier,Bystander CD8 T-Cell-Mediated Demyelination is...,10.1016/s0002-9440(10)63126-4,,14742242.0,els-covid,Mice infected with the coronavirus mouse hepat...,2004-02-29,"Dandekar, Ajai A.; Anghelina, Daniela; Perlman...",The American Journal of Pathology,,,True,False,custom_license,https://doi.org/10.1016/s0002-9440(10)63126-4
34616,21qu87oh,7b033b7db8eca607dc5cbc6ac0aa7e3d843b9223,PMC,Bystander CD8 T-Cell-Mediated Demyelination is...,,PMC1602263,14742242.0,unk,Mice infected with the coronavirus mouse hepat...,2004-02-10,"Dandekar, Ajai A.; Anghelina, Daniela; Perlman...",,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
16095,2maferew,6744bc52b1b29d2ab28cbaeef8942eebece5175b,PMC,Virologie : l’apport de la biologie moléculair...,10.1016/s0929-693x(07)78706-7,PMC7133300,17182229.0,no-cc,The conventionnal tools used for virological d...,2008-02-15,"Brouard, J.; Vabret, A.; Perrot, S.; Nimal, D....",Arch Pediatr,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
25841,2maferew,6744bc52b1b29d2ab28cbaeef8942eebece5175b,Elsevier,Virologie : l’apport de la biologie moléculair...,10.1016/s0929-693x(07)78706-7,,18280911.0,els-covid,Résumé Les outils traditionnels du diagnostic ...,2007-12-31,"Brouard, J.; Vabret, A.; Perrot, S.; Nimal, D....",Archives de Pédiatrie,,,True,False,custom_license,https://doi.org/10.1016/s0929-693x(07)78706-7


In [15]:
def _publish_year(publish_time):
    """Derive the year label from a stringified `publish_time` value.

    - values starting with '1' or '2' yield their first four characters;
    - the string 'nan' (a missing value after `astype(str)`) yields the
      placeholder '19xx';
    - anything else, including an empty string, yields ''.
    """
    # startswith() is safe on an empty string, unlike indexing its first character.
    if publish_time.startswith(('1', '2')):
        return publish_time[:4]
    if publish_time == 'nan':
        return '19xx'
    return ''


metadata_df['publish_year'] = metadata_df['publish_time'].astype(str).apply(_publish_year)

In [16]:
metadata_df['publish_year'].unique()

array(['2003', '2004', '2005', '2006', '2007', '2008', '2001', '2002',
       '2009', '2010', '2011', '2012', '1999', '2013', '2014', '2015',
       '2016', '2017', '2018', '2019', '2020', '1975', '1978', '1983',
       '1973', '1996', '1981', '1988', '1982', '1979', '1991', '1987',
       '1995', '1994', '2000', '1980', '1992', '1989', '1976', '1985',
       '1984', '1993', '1986', '1977', '1970', '1997', '1990', '1974',
       '1998', '1971', '1969', '1972', '1961', '1965', '1955', '1957',
       '1963', '1967', '1966', '1951', '1962', '1968', '1952', '1959',
       '1964', '19xx'], dtype=object)

### Load Data Files

In [17]:
# Per-source cleaned frames, keyed by dataset label; `small_docs` stays empty
# unless paragraph-level extraction is enabled.
dfd, small_docs = {}, {}
for name, indir in SOURCE_FILES.items():
    print(f'Loading {name} from {indir}')
    data_files = load_files(indir)
    print(f"Cleaning {name} {len(data_files)} files" )
    if not PROCESS_SMALL_DOCS:
        dfd[name] = generate_clean_df(data_files)
        continue
    # With small docs enabled the cleaner returns a (papers, paragraphs) pair.
    dfd[name], small_docs[name] = generate_clean_df(data_files, prepare_small_docs=True)

Loading COMM-USE from ../data/raw/2020-04-10-v7/comm_use_subset/pdf_json/


HBox(children=(FloatProgress(value=0.0, max=9524.0), HTML(value='')))


Cleaning COMM-USE 9524 files


HBox(children=(FloatProgress(value=0.0, max=9524.0), HTML(value='')))


Loading BioRxiv from ../data/raw/2020-04-10-v7/biorxiv_medrxiv/pdf_json/


HBox(children=(FloatProgress(value=0.0, max=1625.0), HTML(value='')))


Cleaning BioRxiv 1625 files


HBox(children=(FloatProgress(value=0.0, max=1625.0), HTML(value='')))


Loading NONCOMM from ../data/raw/2020-04-10-v7/noncomm_use_subset/pdf_json/


HBox(children=(FloatProgress(value=0.0, max=2490.0), HTML(value='')))


Cleaning NONCOMM 2490 files


HBox(children=(FloatProgress(value=0.0, max=2490.0), HTML(value='')))


Loading PMC from ../data/raw/2020-04-10-v7/custom_license/pdf_json/


HBox(children=(FloatProgress(value=0.0, max=26505.0), HTML(value='')))


Cleaning PMC 26505 files


HBox(children=(FloatProgress(value=0.0, max=26505.0), HTML(value='')))




In [18]:
dfd['COMM-USE'].head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography,num_small_docs
0,0263ef539a1344a642b1f4ff11aee6bc6ca84a34,st Workshop of the Canadian Society for Virology,"Craig Mccormick, Nathalie Grandvaux","Craig Mccormick (Dalhousie University, 5850 Co...",Abstract\n\nThe 1st Workshop of the Canadian S...,"Introduction\n\nCanadian virologists, includin...",Technical knockout: Understanding poxvirus pat...,"[{'first': 'Craig', 'middle': [], 'last': 'Mcc...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Technic...",0
1,75882d6856d4243248aa32fe119153efeb0dbe12,Lipopolysaccharide and Tumor Necrosis Factor A...,"Sonya A Macparland, Xue-Zhong Ma, Limin Chen, ...",Sonya A Macparland (Chinese Academy of Medical...,"Abstract\n\nCitation MacParland SA, Ma X-Z, Ch...",Inflammation may prevent the control of viral ...,Mechanism of action of interferon and ribaviri...,"[{'first': 'Sonya', 'middle': ['A'], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Mechani...",0
2,bdb2855fba379ffefbd1e4b49d0d5db65ee93852,Identification and Validation of Reference Gen...,"Ke Li, Na Xu, Yu Jing Yang, Jin Hui Zhang, Hua...","Ke Li (Shanxi Normal University, 041004, Linfe...",Abstract\n\nMythimna separata is a major agric...,"Introduction\n\nThe oriental armyworm, Mythimn...",The role of nectar plants in severe outbreaks ...,"[{'first': 'Ke', 'middle': [], 'last': 'Li', '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The rol...",0
3,b0718d5c8888216c95fa19d7a79fd709da2c3ff4,Severe varicella-zoster virus pneumonia: a mul...,"Adrien Mirouse, Philippe Vignon, Prescillia Pi...","Adrien Mirouse, Philippe Vignon, Prescillia Pi...",Abstract\n\nBackground: Pneumonia is a dreaded...,Background\n\nPneumonia is associated with sig...,"Limoges, France. 4 INSERM U1092, Cic1435 Chu ...","[{'first': 'Adrien', 'middle': [], 'last': 'Mi...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Limoges...",0
4,7b4b3759cdde4218ea23a081b386bcffd7e6afc6,A Family-Wide RT-PCR Assay for Detection of Pa...,"Sander Van Boheemen, Theo M Bestebroer, Josann...","Sander Van Boheemen, Theo M Bestebroer, Josann...",Abstract\n\nFamily-wide molecular diagnostic a...,Introduction\n\nThe Paramyxoviridae family wit...,Respiratory viral infections in infants: cause...,"[{'first': 'Sander', 'middle': [], 'last': 'Va...","{'BIBREF1': {'ref_id': 'b1', 'title': 'Respira...",0


### Combine data from text files

In [19]:
# Tag every per-source frame (in place) with the label it was loaded under.
for name, df in dfd.items():
    df['dataset'] = name

In [20]:
df_combined = pd.concat(dfd.values(), ignore_index=True, sort=False)

In [21]:
df_combined.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography,num_small_docs,dataset
0,0263ef539a1344a642b1f4ff11aee6bc6ca84a34,st Workshop of the Canadian Society for Virology,"Craig Mccormick, Nathalie Grandvaux","Craig Mccormick (Dalhousie University, 5850 Co...",Abstract\n\nThe 1st Workshop of the Canadian S...,"Introduction\n\nCanadian virologists, includin...",Technical knockout: Understanding poxvirus pat...,"[{'first': 'Craig', 'middle': [], 'last': 'Mcc...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Technic...",0,COMM-USE
1,75882d6856d4243248aa32fe119153efeb0dbe12,Lipopolysaccharide and Tumor Necrosis Factor A...,"Sonya A Macparland, Xue-Zhong Ma, Limin Chen, ...",Sonya A Macparland (Chinese Academy of Medical...,"Abstract\n\nCitation MacParland SA, Ma X-Z, Ch...",Inflammation may prevent the control of viral ...,Mechanism of action of interferon and ribaviri...,"[{'first': 'Sonya', 'middle': ['A'], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Mechani...",0,COMM-USE
2,bdb2855fba379ffefbd1e4b49d0d5db65ee93852,Identification and Validation of Reference Gen...,"Ke Li, Na Xu, Yu Jing Yang, Jin Hui Zhang, Hua...","Ke Li (Shanxi Normal University, 041004, Linfe...",Abstract\n\nMythimna separata is a major agric...,"Introduction\n\nThe oriental armyworm, Mythimn...",The role of nectar plants in severe outbreaks ...,"[{'first': 'Ke', 'middle': [], 'last': 'Li', '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The rol...",0,COMM-USE
3,b0718d5c8888216c95fa19d7a79fd709da2c3ff4,Severe varicella-zoster virus pneumonia: a mul...,"Adrien Mirouse, Philippe Vignon, Prescillia Pi...","Adrien Mirouse, Philippe Vignon, Prescillia Pi...",Abstract\n\nBackground: Pneumonia is a dreaded...,Background\n\nPneumonia is associated with sig...,"Limoges, France. 4 INSERM U1092, Cic1435 Chu ...","[{'first': 'Adrien', 'middle': [], 'last': 'Mi...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Limoges...",0,COMM-USE
4,7b4b3759cdde4218ea23a081b386bcffd7e6afc6,A Family-Wide RT-PCR Assay for Detection of Pa...,"Sander Van Boheemen, Theo M Bestebroer, Josann...","Sander Van Boheemen, Theo M Bestebroer, Josann...",Abstract\n\nFamily-wide molecular diagnostic a...,Introduction\n\nThe Paramyxoviridae family wit...,Respiratory viral infections in infants: cause...,"[{'first': 'Sander', 'middle': [], 'last': 'Va...","{'BIBREF1': {'ref_id': 'b1', 'title': 'Respira...",0,COMM-USE


In [22]:
# Only when paragraph-level extraction is enabled: tag each per-source paragraph
# frame with its label, stack them into one frame, and report its shape.
if PROCESS_SMALL_DOCS:
    for name, df in small_docs.items():
        df['dataset'] = name
    df_combined_small_docs = pd.concat(small_docs.values(), ignore_index=True, sort=False)
    print(df_combined_small_docs.shape)

In [23]:
if PROCESS_SMALL_DOCS:
    print(df_combined_small_docs.columns)

### Join Metadata and Data Files

In [24]:
df = metadata_df.copy()

In [25]:
# Left-join the parsed full-text columns onto the metadata: the metadata `sha` column is
# matched against the full-text `paper_id` index, and full-text columns whose names clash
# with metadata columns get the `_ft` suffix.
# NOTE(review): a metadata row whose `sha` has no exact `paper_id` match keeps NaN in the
# full-text columns — confirm `sha` always holds exactly one id.
df_joined = df.join(df_combined.set_index('paper_id'), how='left', on='sha', rsuffix='_ft')

In [26]:
df_joined.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,...,title_ft,authors_ft,affiliations,abstract_ft,text,bibliography,raw_authors,raw_bibliography,num_small_docs,dataset
0,xqhn0vbp,1e1286db212100993d03cc22374b624f7caee956,PMC,Airborne rhinovirus detection and effect of ul...,10.1186/1471-2458-3-5,PMC140314,12525263.0,no-cc,"BACKGROUND: Rhinovirus, the most common cause ...",2003-01-13,...,BMC Public Health Airborne rhinovirus detectio...,"Theodore A Myatt, Sebastian L Johnston, Stephe...",Theodore A Myatt (Harvard School of Public Hea...,"Abstract\n\nBackground: Rhinovirus, the most c...",Background\n\nRhinoviruses have been associate...,Intranasal recombinant alfa-2b interferon trea...,"[{'first': 'Theodore', 'middle': ['A'], 'last'...","{'BIBREF1': {'ref_id': 'b1', 'title': 'Intrana...",0.0,PMC
1,gi6uaa83,8ae137c8da1607b3a8e4c946c07ca8bda67f88ac,PMC,Discovering human history from stomach bacteria,10.1186/gb-2003-4-5-213,PMC156578,12734001.0,no-cc,Recent analyses of human pathogens have reveal...,2003-04-28,...,Discovering human history from stomach bacteria,Todd R Disotell,"Todd R Disotell (New York University, 25 Waver...",Abstract\n\ncomment reviews reports deposited ...,\n\nCharles Darwin recognized that the distrib...,The Descent of Man and Selection in Relation t...,"[{'first': 'Todd', 'middle': ['R'], 'last': 'D...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The Des...",0.0,PMC
2,le0ogx1s,,PMC,A new recruit for the army of the men of death,10.1186/gb-2003-4-7-113,PMC193621,12844350.0,no-cc,"The army of the men of death, in John Bunyan's...",2003-06-27,...,,,,,,,,,,
3,fy4w7xz8,0104f6ceccf92ae8567a0102f89cbb976969a774,PMC,Association of HLA class I with severe acute r...,10.1186/1471-2350-4-9,PMC212558,12969506.0,no-cc,BACKGROUND: The human leukocyte antigen (HLA) ...,2003-09-12,...,BMC Medical Genetics Association of HLA class ...,"Marie Lin, Hsiang-Kuang Tseng, Jean A Trejaut,...","Marie Lin (Mackay Memorial Hospital, Taipei, T...",Abstract\n\nThe human leukocyte antigen (HLA) ...,"\n\npatient group, a further significant incre...",for surveillance of severe acute respiratory s...,"[{'first': 'Marie', 'middle': [], 'last': 'Lin...","{'BIBREF0': {'ref_id': 'b0', 'title': 'for sur...",0.0,PMC
4,0qaoam29,5b68a553a7cbbea13472721cd1ad617d42b40c26,PMC,A double epidemic model for the SARS propagation,10.1186/1471-2334-3-19,PMC222908,12964944.0,no-cc,BACKGROUND: An epidemic of a Severe Acute Resp...,2003-09-10,...,BMC Infectious Diseases A double epidemic mode...,"Wai Tuen, Ng, Gabriel Turinici, Antoine Danchin","Wai Tuen, Ng (The University of Hong Kong, Ho...",Abstract\n\nBackground: An epidemic of a Sever...,Background\n\nSince November 2002 (and perhaps...,Porcine respiratory coronavirus differs from t...,"[{'first': 'Wai', 'middle': [], 'last': 'Tuen'...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Porcine...",0.0,PMC


In [27]:
# Keep only rows with a non-null `sha`; .copy() makes the result an independent frame so the
# column assignments below do not write into a slice of `df_joined`.
df_joined_ft = df_joined[~ df_joined['sha'].isnull()].copy()

In [28]:
df_joined_ft.shape

(38022, 29)

### Clean abstract

In [29]:
df_joined_ft['abstract_clean'] = df_joined_ft['abstract'].fillna('')

In [30]:
# Drop a leading "abstract" label (case-insensitive) by cutting the first 9 characters:
# the 8-letter word plus the one character that follows it.
# NOTE(review): the 9th character is removed whatever it is, so "AbstractThe ..." loses its
# "T" and "Abstract: ..." keeps a leading space — confirm this is acceptable.
df_joined_ft['abstract_clean'] = df_joined_ft['abstract_clean'].apply(lambda x: x[9:] if x.lower().startswith('abstract') else x)

In [31]:
# Case-insensitive pattern matching any of the listed spellings. `re` is imported
# with the other imports in the first cell.
mentions_covid = re.compile(
    r'COVID-19|SARS-CoV-2|2019-nCov|SARS Coronavirus 2|2019 Novel Coronavirus',
    re.IGNORECASE,
)

In [32]:
# Flag the rows whose cleaned abstract matches the `mentions_covid` pattern;
# summing the boolean column displays how many rows were flagged.
df_joined_ft['abstract_mentions_covid'] = df_joined_ft['abstract_clean'].str.contains(mentions_covid)
df_joined_ft['abstract_mentions_covid'].sum()

1800

### Create citation ref

In [33]:
def first_author_lastname(metadata_author_list):
    """Derive a short citation label from the first author of an author string.

    Parameters
    ----------
    metadata_author_list : str or null
        Author list, normally ';'-separated with each author written either as
        "Last, First" or as "First Last". A single entry containing more than
        one comma is treated as a comma-separated list instead.

    Returns
    -------
    str
        'UNKNOWN' for a null input. Otherwise the first author's last name,
        with spaces replaced by '_'. Last names of three characters or fewer
        get '_' plus the first letter of the given name appended (the suffix
        is just '_' when no given name is available). Entries that look like a
        group name are reduced to their first word.
    """
    if pd.isnull(metadata_author_list):
        return 'UNKNOWN'

    alist = metadata_author_list.split(';')
    if len(alist) == 1 and alist[0].count(',') > 1:
        # check if commas were used as delimiters
        alist = alist[0].split(',')
    first_author = alist[0]

    # "Last, First" when a comma is present, otherwise "First ... Last".
    split_char = ',' if ',' in first_author else ' '
    first_author_split = first_author.split(split_char)

    if split_char == ',':
        if len(first_author_split[0]) <= 3:
            # short last name, use last name and first letter of first name;
            # [:1] (not [0]) so an empty given-name part cannot raise IndexError
            lastname = first_author_split[0].strip() + '_' + first_author_split[1].strip()[:1]
        else:
            lastname = first_author_split[0].strip()
        first_author_split = lastname.split(' ')
        if len(first_author_split) > 3 and len([x for x in first_author_split if len(x) > 3]) > 4:
            # probably a group name instead of a person's name
            lastname = first_author_split[0].strip()
    else:
        if len(first_author_split) > 3 and len([x for x in first_author_split if len(x) > 2]) > 3:
            # probably a group name instead of a person's name
            lastname = first_author_split[0].strip()
        elif len(first_author_split[-1]) <= 3:
            # short last name, use last name and first letter of first name;
            # [:1] (not [0]) so an empty leading token cannot raise IndexError
            lastname = first_author_split[-1].strip() + '_' + first_author_split[0].strip()[:1]
        else:
            lastname = first_author_split[-1].strip()

    if ' ' in lastname:
        lastname_split = lastname.split(' ')
        if '.' in lastname_split[0] or '(' in lastname_split[0]:
            # previously missed I. Last
            lastname_split = lastname_split[1:]
        elif '.' in lastname_split[1]:
            # somehow missed first i. last
            lastname_split = lastname_split[2:]
        lastname = '_'.join(lastname_split)
    return lastname

In [34]:
# Citation label: first-author label derived from the metadata `authors` string,
# an underscore, then the `publish_year` string (e.g. "Myatt_2003").
df_joined_ft['cite_ad'] = df_joined_ft['authors'].apply(first_author_lastname) + '_' + df_joined_ft['publish_year']

### Write data

In [35]:
df_joined_ft.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal',
       'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_pdf_parse',
       'has_pmc_xml_parse', 'full_text_file', 'url', 'publish_year',
       'title_ft', 'authors_ft', 'affiliations', 'abstract_ft', 'text',
       'bibliography', 'raw_authors', 'raw_bibliography', 'num_small_docs',
       'dataset', 'abstract_clean', 'abstract_mentions_covid', 'cite_ad'],
      dtype='object')

In [36]:
# Write the full joined frame as JSON lines and echo the output path.
df_joined_ft.to_json(outfile, **out_json_args)
print(outfile)

../data/interim/2020-04-10-v7/2020-04-10-v7-covid19-combined.jsonl


In [37]:
# Write the paragraph-level frame only when it was built (PROCESS_SMALL_DOCS enabled).
if PROCESS_SMALL_DOCS:
    df_combined_small_docs.to_json(outfile_small_docs, **out_json_args)
    print(outfile_small_docs)

In [38]:
df_joined_ft.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,...,abstract_ft,text,bibliography,raw_authors,raw_bibliography,num_small_docs,dataset,abstract_clean,abstract_mentions_covid,cite_ad
0,xqhn0vbp,1e1286db212100993d03cc22374b624f7caee956,PMC,Airborne rhinovirus detection and effect of ul...,10.1186/1471-2458-3-5,PMC140314,12525263.0,no-cc,"BACKGROUND: Rhinovirus, the most common cause ...",2003-01-13,...,"Abstract\n\nBackground: Rhinovirus, the most c...",Background\n\nRhinoviruses have been associate...,Intranasal recombinant alfa-2b interferon trea...,"[{'first': 'Theodore', 'middle': ['A'], 'last'...","{'BIBREF1': {'ref_id': 'b1', 'title': 'Intrana...",0.0,PMC,"BACKGROUND: Rhinovirus, the most common cause ...",False,Myatt_2003
1,gi6uaa83,8ae137c8da1607b3a8e4c946c07ca8bda67f88ac,PMC,Discovering human history from stomach bacteria,10.1186/gb-2003-4-5-213,PMC156578,12734001.0,no-cc,Recent analyses of human pathogens have reveal...,2003-04-28,...,Abstract\n\ncomment reviews reports deposited ...,\n\nCharles Darwin recognized that the distrib...,The Descent of Man and Selection in Relation t...,"[{'first': 'Todd', 'middle': ['R'], 'last': 'D...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The Des...",0.0,PMC,Recent analyses of human pathogens have reveal...,False,Disotell_2003
3,fy4w7xz8,0104f6ceccf92ae8567a0102f89cbb976969a774,PMC,Association of HLA class I with severe acute r...,10.1186/1471-2350-4-9,PMC212558,12969506.0,no-cc,BACKGROUND: The human leukocyte antigen (HLA) ...,2003-09-12,...,Abstract\n\nThe human leukocyte antigen (HLA) ...,"\n\npatient group, a further significant incre...",for surveillance of severe acute respiratory s...,"[{'first': 'Marie', 'middle': [], 'last': 'Lin...","{'BIBREF0': {'ref_id': 'b0', 'title': 'for sur...",0.0,PMC,BACKGROUND: The human leukocyte antigen (HLA) ...,False,Lin_M_2003
4,0qaoam29,5b68a553a7cbbea13472721cd1ad617d42b40c26,PMC,A double epidemic model for the SARS propagation,10.1186/1471-2334-3-19,PMC222908,12964944.0,no-cc,BACKGROUND: An epidemic of a Severe Acute Resp...,2003-09-10,...,Abstract\n\nBackground: An epidemic of a Sever...,Background\n\nSince November 2002 (and perhaps...,Porcine respiratory coronavirus differs from t...,"[{'first': 'Wai', 'middle': [], 'last': 'Tuen'...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Porcine...",0.0,PMC,BACKGROUND: An epidemic of a Severe Acute Resp...,False,Ng_T_2003
5,qj4dh6rg,3ed670f60a7be2e3e2a991ea8af1fdd5fa5e2b2c,PMC,Cloaked similarity between HIV-1 and SARS-CoV ...,10.1186/1471-2180-3-20,PMC222911,14499001.0,no-cc,BACKGROUND: Severe acute respiratory syndrome ...,2003-09-21,...,Abstract\n\nBackground: Severe acute respirato...,Background\n\nInfection by many enveloped viru...,Core structure of gp41 from the HIV envelope g...,"[{'first': 'Yossef', 'middle': [], 'last': 'Kl...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Core st...",0.0,PMC,BACKGROUND: Severe acute respiratory syndrome ...,False,Kliger_2003


In [39]:
# Write a lighter file with the cleaned abstract plus identifying/citation columns only
# (no full text or bibliography columns).
df_joined_ft.loc[:, ['cord_uid', 'sha', 'abstract_clean', 'abstract_mentions_covid',
                     'cite_ad', 'title', 'authors', 'publish_year', 'publish_time', 'dataset',
                                 'pmcid', 'pubmed_id', 'doi'
                    ]].to_json(outfile_abstracts, **out_json_args)
print(outfile_abstracts)

../data/interim/2020-04-10-v7/2020-04-10-v7-covid19-combined-abstracts.jsonl


In [40]:
# Write the minimal file: the two id columns and the cleaned abstract.
df_joined_ft.loc[:, ['cord_uid', 'sha', 'abstract_clean']].to_json(outfile_only_abstracts, **out_json_args)
print(outfile_only_abstracts)

../data/interim/2020-04-10-v7/2020-04-10-v7-covid19-combined-only-abstracts.jsonl
