# Read datafiles, merge, and lightly clean

In [1]:
import os
import json
import datetime
from pprint import pprint
from copy import deepcopy

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

#### Configuration

In [2]:
ROOT = '..'

In [3]:
datafile_date = '2020-03-27-v5'

In [4]:
PROCESS_SMALL_DOCS = False  # Small docs are the individual paragraphs in the text

In [5]:
SOURCE_FILES = {
    'COMM-USE': ROOT + f'/data/raw/{datafile_date}/comm_use_subset/',
    'BioRxiv': ROOT + f'/data/raw/{datafile_date}/biorxiv_medrxiv/',
    'NONCOMM': ROOT + f'/data/raw/{datafile_date}/noncomm_use_subset/',
    'PMC': ROOT + f'/data/raw/{datafile_date}/custom_license/',
}

In [6]:
metadata_file = ROOT + f'/data/raw/{datafile_date}/metadata.csv'

In [7]:
outdir = ROOT + f'/data/interim/{datafile_date}/'
outfile = f'{outdir}{datafile_date}-covid19-combined.jsonl'
outfile_small_docs = f'{outdir}{datafile_date}-covid19-combined-smalldocs.jsonl'
outfile_abstracts = f'{outdir}{datafile_date}-covid19-combined-abstracts.jsonl'
json_args = {'orient': 'records', 'lines': True}
out_json_args = {'date_format': 'iso', **json_args}

In [8]:
out_path_mode = 0o777
os.makedirs(outdir, mode = out_path_mode, exist_ok = True)

## Helper Functions

Some functions taken and modified from https://www.kaggle.com/xhlulu/cord-19-eda-parse-json-and-generate-clean-csv

In [9]:
def format_name(author):
    middle_name = " ".join(author['middle'])
    
    if author['middle']:
        return " ".join([author['first'], middle_name, author['last']])
    else:
        return " ".join([author['first'], author['last']])


def format_affiliation(affiliation):
    text = []
    location = affiliation.get('location')
    if location:
        text.extend(list(affiliation['location'].values()))
    
    institution = affiliation.get('institution')
    if institution:
        text = [institution] + text
    return ", ".join(text)

def format_authors(authors, with_affiliation=False):
    name_ls = []
    
    for author in authors:
        name = format_name(author)
        if with_affiliation:
            affiliation = format_affiliation(author['affiliation'])
            if affiliation:
                name_ls.append(f"{name} ({affiliation})")
            else:
                name_ls.append(name)
        else:
            name_ls.append(name)
    
    return ", ".join(name_ls)

def format_body(body_text):
    texts = [(di['section'], di['text']) for di in body_text]
    texts_di = {di['section']: "" for di in body_text}
    
    for section, text in texts:
        texts_di[section] += text

    body = ""

    for section, text in texts_di.items():
        body += section
        body += "\n\n"
        body += text
        body += "\n\n"
    
    return body

def format_bib(bibs):
    if type(bibs) == dict:
        bibs = list(bibs.values())
    bibs = deepcopy(bibs)
    formatted = []
    
    for bib in bibs:
        bib['authors'] = format_authors(
            bib['authors'], 
            with_affiliation=False
        )
        doi = None
        pmid = None
        other_ids = bib.get('other_ids')
        if other_ids:
            doi = other_ids.get('DOI')
            pmid = other_ids.get('PMID')
        formatted_ls = [str(bib[k]) for k in ['title', 'authors', 'venue', 'year']]
        if doi:
            formatted_ls.extend(doi)
        if pmid:
            formatted_ls.extend(['PMID' + p for p in pmid])
        formatted.append(", ".join(formatted_ls))

    return "\n ".join(formatted)

In [10]:
def bib_titles(bibs):
    result = {}
    for key, bib in bibs.items():
        result[key] = bib['title']
    return result

def extract_small_docs(main_doc_id, body_text, bib_titles_dict):
    result = []
    for i, di in enumerate(body_text):
        ref_titles = []
        for ref in di['cite_spans']:
            title = bib_titles_dict.get(ref['ref_id'])
            if title:
                ref_titles.append(title)
        result.append((main_doc_id, i, di['text'], di['section'], ref_titles))
    return result

In [11]:
def load_files(dirname):
    filenames = os.listdir(dirname)
    raw_files = []

    for filename in tqdm(filenames):
        filename = dirname + filename
        file = json.load(open(filename, 'rb'))
        raw_files.append(file)
    
    return raw_files

def generate_clean_df(all_files, prepare_small_docs=False):
    cleaned_files = []
    small_docs = []
    
    for file in tqdm(all_files):
        if prepare_small_docs:
            bib_titles_dict = bib_titles(file['bib_entries'])
            docs = extract_small_docs(file['paper_id'], file['body_text'], bib_titles_dict)
        else:
            docs = []

        features = [
            file['paper_id'],
            file['metadata']['title'],
            format_authors(file['metadata']['authors']),
            format_authors(file['metadata']['authors'], 
                           with_affiliation=True),
            format_body(file['abstract']),
            format_body(file['body_text']),
            format_bib(file['bib_entries']),
            file['metadata']['authors'],
            file['bib_entries'],
            len(docs)
        ]

        cleaned_files.append(features)
        if prepare_small_docs:
            small_docs.extend(docs)

    col_names = ['paper_id', 'title', 'authors',
                 'affiliations', 'abstract', 'text', 
                 'bibliography','raw_authors','raw_bibliography',
                'num_small_docs']

    clean_df = pd.DataFrame(cleaned_files, columns=col_names)
    clean_df.head()
    
    if prepare_small_docs:
        small_docs_df = pd.DataFrame(small_docs, 
                                     columns=['paper_id', 'small_doc_num', 'text', 'section', 'ref_titles'])
        return clean_df, small_docs_df
    else:
        return clean_df

## Load Data

### Load Metadata

In [12]:
metadata_df = pd.read_csv(metadata_file)

In [13]:
metadata_df.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file,url
0,vho70jcx,f056da9c64fbf00a4645ae326e8a4339d015d155,biorxiv,SIANN: Strain Identification by Alignment to N...,10.1101/001727,,,biorxiv,Next-generation sequencing is increasingly bei...,2014-01-10,Samuel Minot; Stephen D Turner; Krista L Ternu...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/001727
1,i9tbix2v,daf32e013d325a6feb80e83d15aabc64a48fae33,biorxiv,Spatial epidemiology of networked metapopulati...,10.1101/003889,,,biorxiv,An emerging disease is one infectious epidemic...,2014-06-04,Lin WANG; Xiang Li,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/003889
2,62gfisc6,f33c6d94b0efaa198f8f3f20e644625fa3fe10d2,biorxiv,Sequencing of the human IG light chain loci fr...,10.1101/006866,,,biorxiv,Germline variation at immunoglobulin gene (IG)...,2014-07-03,Corey T Watson; Karyn Meltz Steinberg; Tina A ...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/006866
3,058r9486,4da8a87e614373d56070ed272487451266dce919,biorxiv,Bayesian mixture analysis for metagenomic comm...,10.1101/007476,,,biorxiv,Deep sequencing of clinical samples is now an ...,2014-07-25,Sofia Morfopoulou; Vincent Plagnol,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/007476
4,wich35l7,eccef80cfbe078235df22398f195d5db462d8000,biorxiv,Mapping a viral phylogeny onto outbreak trees ...,10.1101/010389,,,biorxiv,Developing methods to reconstruct transmission...,2014-11-11,Stephen P Velsko; Jonathan E Allen,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/010389


### Clean Metadata

In [14]:
metadata_df['publish_year'] = metadata_df['publish_time'].astype(str).apply(lambda d: 
                                                                d[:4] if d[0] in ('1', '2') else
                                                                '19xx' if d == 'nan' else
                                                                # d[2:6] if d.startswith("'[") else
                                                                '')

In [15]:
metadata_df['publish_year'].unique()

array(['2014', '2015', '2016', '2017', '2019', '2020', '2018', '19xx',
       '2012', '2003', '2004', '2005', '2006', '1987', '1995', '1993',
       '1998', '1999', '1990', '1982', '2007', '1961', '1965', '1984',
       '1986', '1985', '1988', '1989', '1991', '1992', '1994', '1996',
       '1997', '1955', '1957', '1963', '1967', '1970', '2001', '2000',
       '1977', '1981', '1983', '1976', '2008', '2002', '1974', '1980',
       '2009', '2010', '2011', '2013', '1975', '1978', '1973', '1979',
       '1971', '1972', '1968', '1969', '1966', '1951', '1964', '1952',
       '1962', '1959'], dtype=object)

### Load Data Files

In [16]:
dfd = {}
small_docs = {}
for name, indir in SOURCE_FILES.items():
    print(f'Loading {name} from {indir}')
    data_files = load_files(indir)
    print(f"Cleaning {name} {len(data_files)} files" )
    if PROCESS_SMALL_DOCS:
        dfd[name], small_docs[name] = generate_clean_df(data_files, prepare_small_docs=True)
    else:
        dfd[name] = generate_clean_df(data_files)

Loading COMM-USE from ../data/raw/2020-03-27-v5/comm_use_subset/


HBox(children=(FloatProgress(value=0.0, max=9315.0), HTML(value='')))


Cleaning COMM-USE 9315 files


HBox(children=(FloatProgress(value=0.0, max=9315.0), HTML(value='')))


Loading BioRxiv from ../data/raw/2020-03-27-v5/biorxiv_medrxiv/


HBox(children=(FloatProgress(value=0.0, max=1053.0), HTML(value='')))


Cleaning BioRxiv 1053 files


HBox(children=(FloatProgress(value=0.0, max=1053.0), HTML(value='')))


Loading NONCOMM from ../data/raw/2020-03-27-v5/noncomm_use_subset/


HBox(children=(FloatProgress(value=0.0, max=2350.0), HTML(value='')))


Cleaning NONCOMM 2350 files


HBox(children=(FloatProgress(value=0.0, max=2350.0), HTML(value='')))


Loading PMC from ../data/raw/2020-03-27-v5/custom_license/


HBox(children=(FloatProgress(value=0.0, max=20657.0), HTML(value='')))


Cleaning PMC 20657 files


HBox(children=(FloatProgress(value=0.0, max=20657.0), HTML(value='')))




In [17]:
dfd['COMM-USE'].head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography,num_small_docs
0,0263ef539a1344a642b1f4ff11aee6bc6ca84a34,st Workshop of the Canadian Society for Virology,"Craig Mccormick, Nathalie Grandvaux","Craig Mccormick (Dalhousie University, 5850 Co...",Abstract\n\nThe 1st Workshop of the Canadian S...,"Introduction\n\nCanadian virologists, includin...",Technical knockout: Understanding poxvirus pat...,"[{'first': 'Craig', 'middle': [], 'last': 'Mcc...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Technic...",0
1,75882d6856d4243248aa32fe119153efeb0dbe12,Lipopolysaccharide and Tumor Necrosis Factor A...,"Sonya A Macparland, Xue-Zhong Ma, Limin Chen, ...",Sonya A Macparland (Chinese Academy of Medical...,"Abstract\n\nCitation MacParland SA, Ma X-Z, Ch...",Inflammation may prevent the control of viral ...,Mechanism of action of interferon and ribaviri...,"[{'first': 'Sonya', 'middle': ['A'], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Mechani...",0
2,bdb2855fba379ffefbd1e4b49d0d5db65ee93852,Identification and Validation of Reference Gen...,"Ke Li, Na Xu, Yu Jing Yang, Jin Hui Zhang, Hua...","Ke Li (Shanxi Normal University, 041004, Linfe...",Abstract\n\nMythimna separata is a major agric...,"Introduction\n\nThe oriental armyworm, Mythimn...",The role of nectar plants in severe outbreaks ...,"[{'first': 'Ke', 'middle': [], 'last': 'Li', '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The rol...",0
3,b0718d5c8888216c95fa19d7a79fd709da2c3ff4,Severe varicella-zoster virus pneumonia: a mul...,"Adrien Mirouse, Philippe Vignon, Prescillia Pi...","Adrien Mirouse, Philippe Vignon, Prescillia Pi...",Abstract\n\nBackground: Pneumonia is a dreaded...,Background\n\nPneumonia is associated with sig...,"Limoges, France. 4 INSERM U1092, Cic1435 Chu ...","[{'first': 'Adrien', 'middle': [], 'last': 'Mi...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Limoges...",0
4,7b4b3759cdde4218ea23a081b386bcffd7e6afc6,A Family-Wide RT-PCR Assay for Detection of Pa...,"Sander Van Boheemen, Theo M Bestebroer, Josann...","Sander Van Boheemen, Theo M Bestebroer, Josann...",Abstract\n\nFamily-wide molecular diagnostic a...,Introduction\n\nThe Paramyxoviridae family wit...,Respiratory viral infections in infants: cause...,"[{'first': 'Sander', 'middle': [], 'last': 'Va...","{'BIBREF1': {'ref_id': 'b1', 'title': 'Respira...",0


### Combine data from text files

In [18]:
for name, df in dfd.items():
    df['dataset'] = name

In [19]:
df_combined = pd.concat(dfd.values(), ignore_index=True, sort=False)

In [20]:
df_combined.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography,num_small_docs,dataset
0,0263ef539a1344a642b1f4ff11aee6bc6ca84a34,st Workshop of the Canadian Society for Virology,"Craig Mccormick, Nathalie Grandvaux","Craig Mccormick (Dalhousie University, 5850 Co...",Abstract\n\nThe 1st Workshop of the Canadian S...,"Introduction\n\nCanadian virologists, includin...",Technical knockout: Understanding poxvirus pat...,"[{'first': 'Craig', 'middle': [], 'last': 'Mcc...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Technic...",0,COMM-USE
1,75882d6856d4243248aa32fe119153efeb0dbe12,Lipopolysaccharide and Tumor Necrosis Factor A...,"Sonya A Macparland, Xue-Zhong Ma, Limin Chen, ...",Sonya A Macparland (Chinese Academy of Medical...,"Abstract\n\nCitation MacParland SA, Ma X-Z, Ch...",Inflammation may prevent the control of viral ...,Mechanism of action of interferon and ribaviri...,"[{'first': 'Sonya', 'middle': ['A'], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Mechani...",0,COMM-USE
2,bdb2855fba379ffefbd1e4b49d0d5db65ee93852,Identification and Validation of Reference Gen...,"Ke Li, Na Xu, Yu Jing Yang, Jin Hui Zhang, Hua...","Ke Li (Shanxi Normal University, 041004, Linfe...",Abstract\n\nMythimna separata is a major agric...,"Introduction\n\nThe oriental armyworm, Mythimn...",The role of nectar plants in severe outbreaks ...,"[{'first': 'Ke', 'middle': [], 'last': 'Li', '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The rol...",0,COMM-USE
3,b0718d5c8888216c95fa19d7a79fd709da2c3ff4,Severe varicella-zoster virus pneumonia: a mul...,"Adrien Mirouse, Philippe Vignon, Prescillia Pi...","Adrien Mirouse, Philippe Vignon, Prescillia Pi...",Abstract\n\nBackground: Pneumonia is a dreaded...,Background\n\nPneumonia is associated with sig...,"Limoges, France. 4 INSERM U1092, Cic1435 Chu ...","[{'first': 'Adrien', 'middle': [], 'last': 'Mi...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Limoges...",0,COMM-USE
4,7b4b3759cdde4218ea23a081b386bcffd7e6afc6,A Family-Wide RT-PCR Assay for Detection of Pa...,"Sander Van Boheemen, Theo M Bestebroer, Josann...","Sander Van Boheemen, Theo M Bestebroer, Josann...",Abstract\n\nFamily-wide molecular diagnostic a...,Introduction\n\nThe Paramyxoviridae family wit...,Respiratory viral infections in infants: cause...,"[{'first': 'Sander', 'middle': [], 'last': 'Va...","{'BIBREF1': {'ref_id': 'b1', 'title': 'Respira...",0,COMM-USE


In [21]:
if PROCESS_SMALL_DOCS:
    for name, df in small_docs.items():
        df['dataset'] = name
    df_combined_small_docs = pd.concat(small_docs.values(), ignore_index=True, sort=False)
    print(df_combined_small_docs.shape)

In [22]:
if PROCESS_SMALL_DOCS:
    print(df_combined_small_docs.columns)

### Join Metadata and Data Files

In [23]:
df = metadata_df.copy()

In [24]:
df_joined = df.join(df_combined.set_index('paper_id'), how='left', on='sha', rsuffix='_ft')

In [25]:
df_joined.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,...,title_ft,authors_ft,affiliations,abstract_ft,text,bibliography,raw_authors,raw_bibliography,num_small_docs,dataset
0,vho70jcx,f056da9c64fbf00a4645ae326e8a4339d015d155,biorxiv,SIANN: Strain Identification by Alignment to N...,10.1101/001727,,,biorxiv,Next-generation sequencing is increasingly bei...,2014-01-10,...,SIANN: Strain Identification by Alignment to N...,"Samuel S Minot, Stephen D Turner, Krista L Ter...","Samuel S Minot, Stephen D Turner, Krista L Ter...",Abstract\n\nNext-generation sequencing is incr...,Introduction\n\nThere are many different metho...,Scalable metagenomic taxonomy classification u...,"[{'first': 'Samuel', 'middle': ['S'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Scalabl...",0.0,BioRxiv
1,i9tbix2v,daf32e013d325a6feb80e83d15aabc64a48fae33,biorxiv,Spatial epidemiology of networked metapopulati...,10.1101/003889,,,biorxiv,An emerging disease is one infectious epidemic...,2014-06-04,...,Spatial epidemiology of networked metapopulati...,"Lin Wang, · Xiang Li, L Wang, · X Li","Lin Wang (Fudan University, 200433, Shanghai, ...",Abstract\n\nAn emerging disease is one infecti...,Introduction\n\nThe term metapopulation was co...,Some demographic and genetic consequences of e...,"[{'first': 'Lin', 'middle': [], 'last': 'Wang'...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Some de...",0.0,BioRxiv
2,62gfisc6,f33c6d94b0efaa198f8f3f20e644625fa3fe10d2,biorxiv,Sequencing of the human IG light chain loci fr...,10.1101/006866,,,biorxiv,Germline variation at immunoglobulin gene (IG)...,2014-07-03,...,Sequencing of the human IG light chain loci fr...,"Corey T Watson, Karyn Meltz Steinberg, Tina A ...","Corey T Watson (Simon Fraser University, V5A 1...",Abstract\n\nGermline variation at immunoglobul...,Introduction\n\nAntibodies are essential compo...,"Janeway's immunobiology. 7 edn, K Murphy, P Tr...","[{'first': 'Corey', 'middle': ['T'], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Janeway...",0.0,BioRxiv
3,058r9486,4da8a87e614373d56070ed272487451266dce919,biorxiv,Bayesian mixture analysis for metagenomic comm...,10.1101/007476,,,biorxiv,Deep sequencing of clinical samples is now an ...,2014-07-25,...,Bayesian mixture analysis for metagenomic comm...,"Sofia Morfopoulou, Vincent Plagnol","Sofia Morfopoulou (University College London, ...",Abstract\n\nDeep sequencing of clinical sample...,Introduction\n\nMetagenomics can be defined as...,"Basic local alignment search tool, S F Altschu...","[{'first': 'Sofia', 'middle': [], 'last': 'Mor...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Basic l...",0.0,BioRxiv
4,wich35l7,eccef80cfbe078235df22398f195d5db462d8000,biorxiv,Mapping a viral phylogeny onto outbreak trees ...,10.1101/010389,,,biorxiv,Developing methods to reconstruct transmission...,2014-11-11,...,Mapping a viral phylogeny onto outbreak trees ...,"Jonathan E Allen, Stephan P Velsko, Stephan P ...",Jonathan E Allen (Lawrence Livermore National ...,Abstract\n\nBackground: Developing methods to ...,\n\n: Schematic of transmission links overlaid...,Molecular Evolution of the SARS Coronavirus Du...,"[{'first': 'Jonathan', 'middle': ['E'], 'last'...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Molecul...",0.0,BioRxiv


In [26]:
df_joined_ft = df_joined[~ df_joined['sha'].isnull()].copy()

In [27]:
df_joined_ft.shape

(31753, 28)

### Clean abstract

In [28]:
df_joined_ft['abstract_clean'] = df_joined_ft['abstract'].fillna('')

In [29]:
df_joined_ft['abstract_clean'] = df_joined_ft['abstract_clean'].apply(lambda x: x[9:] if x.lower().startswith('abstract') else x)

### Create citation ref

In [30]:
df_joined_ft['cite_ad'] = df_joined_ft['authors'].fillna('').str.split(',').str[0].str.split(' ').str.join('_') + '_' + df_joined_ft['publish_year']

### Write data

In [31]:
df_joined_ft.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal',
       'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_full_text',
       'full_text_file', 'url', 'publish_year', 'title_ft', 'authors_ft',
       'affiliations', 'abstract_ft', 'text', 'bibliography', 'raw_authors',
       'raw_bibliography', 'num_small_docs', 'dataset', 'abstract_clean',
       'cite_ad'],
      dtype='object')

In [32]:
# Warning: This file is over 2GB
df_joined_ft.to_json(outfile, **out_json_args)
outfile

'../data/interim/2020-03-27-v5/2020-03-27-v5-covid19-combined.jsonl'

In [33]:
if PROCESS_SMALL_DOCS:
    df_combined_small_docs.to_json(outfile_small_docs, **out_json_args)

In [34]:
df_joined_ft.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,...,affiliations,abstract_ft,text,bibliography,raw_authors,raw_bibliography,num_small_docs,dataset,abstract_clean,cite_ad
0,vho70jcx,f056da9c64fbf00a4645ae326e8a4339d015d155,biorxiv,SIANN: Strain Identification by Alignment to N...,10.1101/001727,,,biorxiv,Next-generation sequencing is increasingly bei...,2014-01-10,...,"Samuel S Minot, Stephen D Turner, Krista L Ter...",Abstract\n\nNext-generation sequencing is incr...,Introduction\n\nThere are many different metho...,Scalable metagenomic taxonomy classification u...,"[{'first': 'Samuel', 'middle': ['S'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Scalabl...",0.0,BioRxiv,Next-generation sequencing is increasingly bei...,Samuel_Minot;_Stephen_D_Turner;_Krista_L_Ternu...
1,i9tbix2v,daf32e013d325a6feb80e83d15aabc64a48fae33,biorxiv,Spatial epidemiology of networked metapopulati...,10.1101/003889,,,biorxiv,An emerging disease is one infectious epidemic...,2014-06-04,...,"Lin Wang (Fudan University, 200433, Shanghai, ...",Abstract\n\nAn emerging disease is one infecti...,Introduction\n\nThe term metapopulation was co...,Some demographic and genetic consequences of e...,"[{'first': 'Lin', 'middle': [], 'last': 'Wang'...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Some de...",0.0,BioRxiv,An emerging disease is one infectious epidemic...,Lin_WANG;_Xiang_Li_2014
2,62gfisc6,f33c6d94b0efaa198f8f3f20e644625fa3fe10d2,biorxiv,Sequencing of the human IG light chain loci fr...,10.1101/006866,,,biorxiv,Germline variation at immunoglobulin gene (IG)...,2014-07-03,...,"Corey T Watson (Simon Fraser University, V5A 1...",Abstract\n\nGermline variation at immunoglobul...,Introduction\n\nAntibodies are essential compo...,"Janeway's immunobiology. 7 edn, K Murphy, P Tr...","[{'first': 'Corey', 'middle': ['T'], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Janeway...",0.0,BioRxiv,Germline variation at immunoglobulin gene (IG)...,Corey_T_Watson;_Karyn_Meltz_Steinberg;_Tina_A_...
3,058r9486,4da8a87e614373d56070ed272487451266dce919,biorxiv,Bayesian mixture analysis for metagenomic comm...,10.1101/007476,,,biorxiv,Deep sequencing of clinical samples is now an ...,2014-07-25,...,"Sofia Morfopoulou (University College London, ...",Abstract\n\nDeep sequencing of clinical sample...,Introduction\n\nMetagenomics can be defined as...,"Basic local alignment search tool, S F Altschu...","[{'first': 'Sofia', 'middle': [], 'last': 'Mor...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Basic l...",0.0,BioRxiv,Deep sequencing of clinical samples is now an ...,Sofia_Morfopoulou;_Vincent_Plagnol_2014
4,wich35l7,eccef80cfbe078235df22398f195d5db462d8000,biorxiv,Mapping a viral phylogeny onto outbreak trees ...,10.1101/010389,,,biorxiv,Developing methods to reconstruct transmission...,2014-11-11,...,Jonathan E Allen (Lawrence Livermore National ...,Abstract\n\nBackground: Developing methods to ...,\n\n: Schematic of transmission links overlaid...,Molecular Evolution of the SARS Coronavirus Du...,"[{'first': 'Jonathan', 'middle': ['E'], 'last'...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Molecul...",0.0,BioRxiv,Developing methods to reconstruct transmission...,Stephen_P_Velsko;_Jonathan_E_Allen_2014


In [35]:
df_joined_ft.loc[:, ['cord_uid', 'sha', 'abstract_clean',
                     'cite_ad', 'title', 'authors', 'publish_year', 'publish_time', 'dataset',
                                 'pmcid', 'pubmed_id', 'doi'
                    ]].to_json(outfile_abstracts, **out_json_args)