In [35]:
import os
import json
from pprint import pprint
from copy import deepcopy

import numpy as np
import pandas as pd
from tqdm import tqdm

# dir with files
biorxiv_dir = 'corona_research_challenge_data/2020-03-13/biorxiv_medrxiv/biorxiv_medrxiv/'
pmc_dir = 'corona_research_challenge_data/2020-03-13/pmc_custom_license/pmc_custom_license/'
comm_dir = 'corona_research_challenge_data/2020-03-13/comm_use_subset/comm_use_subset/'
noncomm_dir = 'corona_research_challenge_data/2020-03-13/noncomm_use_subset/noncomm_use_subset/'

biorxiv_files = os.listdir(biorxiv_dir)

In [3]:
def format_name(author):
    middle_name = " ".join(author['middle'])
    
    if author['middle']:
        return " ".join([author['first'], middle_name, author['last']])
    else:
        return " ".join([author['first'], author['last']])


def format_affiliation(affiliation):
    text = []
    location = affiliation.get('location')
    if location:
        text.extend(list(affiliation['location'].values()))
    
    institution = affiliation.get('institution')
    if institution:
        text = [institution] + text
    return ", ".join(text)

def format_authors(authors, with_affiliation=False):
    name_ls = []
    
    for author in authors:
        name = format_name(author)
        if with_affiliation:
            affiliation = format_affiliation(author['affiliation'])
            if affiliation:
                name_ls.append(f"{name} ({affiliation})")
            else:
                name_ls.append(name)
        else:
            name_ls.append(name)
    
    return ", ".join(name_ls)

def format_body(body_text):
    texts = [(di['section'], di['text']) for di in body_text]
    texts_di = {di['section']: "" for di in body_text}
    
    for section, text in texts:
        texts_di[section] += text

    body = ""

    for section, text in texts_di.items():
        body += section
        body += "\n\n"
        body += text
        body += "\n\n"
    
    return body

def format_bib(bibs):
    if type(bibs) == dict:
        bibs = list(bibs.values())
    bibs = deepcopy(bibs)
    formatted = []
    
    for bib in bibs:
        bib['authors'] = format_authors(
            bib['authors'], 
            with_affiliation=False
        )
        formatted_ls = [str(bib[k]) for k in ['title', 'authors', 'venue', 'year']]
        formatted.append(", ".join(formatted_ls))

    return "; ".join(formatted)

In [4]:
def load_files(dirname):
    filenames = os.listdir(dirname)
    raw_files = []

    for filename in tqdm(filenames):
        filename = dirname + filename
        file = json.load(open(filename, 'rb'))
        raw_files.append(file)
    
    return raw_files

def generate_clean_df(all_files):
    
    cleaned_files = []
    
    for file in tqdm(all_files):
        features = [
            file['paper_id'],
            file['metadata']['title'],
            format_authors(file['metadata']['authors']),
            format_authors(file['metadata']['authors'], 
                           with_affiliation=True),
            format_body(file['abstract']),
            format_body(file['body_text']),
            format_bib(file['bib_entries']),
            file['metadata']['authors'],
            file['bib_entries']
        ]

        cleaned_files.append(features)

    col_names = ['paper_id', 'title', 'authors',
                 'affiliations', 'abstract', 'text', 
                 'bibliography','raw_authors','raw_bibliography']

    clean_df = pd.DataFrame(cleaned_files, columns=col_names)
    clean_df.head()
    
    return clean_df

In [11]:
all_files = []

for filename in biorxiv_files:
    if filename != '.ipynb_checkpoints':
        filename = biorxiv_dir + filename
        file = json.load(open(filename, 'rb'))
        all_files.append(file)
    else:
        continue

In [25]:
from tqdm import tqdm

In [26]:
cleaned_files = []

for file in tqdm(all_files):
    features = [
        file['paper_id'],
        file['metadata']['title'],
        format_authors(file['metadata']['authors']),
        format_authors(file['metadata']['authors'], 
                       with_affiliation=True),
        format_body(file['abstract']),
        format_body(file['body_text']),
        format_bib(file['bib_entries']),
        file['metadata']['authors'],
        file['bib_entries']
    ]
    
    cleaned_files.append(features)

100%|██████████| 803/803 [00:00<00:00, 1075.73it/s]


In [28]:
col_names = [
    'paper_id', 
    'title', 
    'authors',
    'affiliations', 
    'abstract', 
    'text', 
    'bibliography',
    'raw_authors',
    'raw_bibliography'
]

clean_df = pd.DataFrame(cleaned_files, columns=col_names)
clean_df.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,2af91bbb625d983d5ad3aadb9e5f1d82268a0f1f,Highly ACE2 Expression in Pancreas May Cause P...,"Furong Liu, Xin Long, Wenbin Zou, Minghao Fang...",Furong Liu (Huazhong University of Science and...,Abstract\n\nThe ongoing outbreak of coronaviru...,Introduction\n\nSevere acute respiratory syndr...,Transmission of 2019-nCoV Infection from an As...,"[{'first': 'Furong', 'middle': [], 'last': 'Li...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Transmi..."
1,93f67ffe7803061de9b19c4dfa346b3aa97aa4eb,Non-uniform refinement: Adaptive regularizatio...,"Ali Punjani, Haowei Zhang, David J Fleet","Ali Punjani (University of Toronto), Haowei Zh...",Abstract\n\nSingle particle cryo-EM is a power...,Introduction\n\nSingle particle cryogenic elec...,High resolution single particle refinement in ...,"[{'first': 'Ali', 'middle': [], 'last': 'Punja...","{'BIBREF0': {'ref_id': 'b0', 'title': 'High re..."
2,e60640bfc445db0f53b5d92cb9f69cd943f1b73a,Expanding the size limit of RNA viruses: Evide...,Humberto J Debat,Humberto J Debat (Instituto Nacional de Tecnol...,Abstract\n\nWhile RNA viruses thrive with mass...,\n\nstudies are exploring and elucidating the ...,Changes to taxonomy and the international code...,"[{'first': 'Humberto', 'middle': ['J'], 'last'...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Changes..."
3,f33c6d94b0efaa198f8f3f20e644625fa3fe10d2,Sequencing of the human IG light chain loci fr...,"Corey T Watson, Karyn Meltz Steinberg, Tina A ...","Corey T Watson (Simon Fraser University, V5A 1...",Abstract\n\nGermline variation at immunoglobul...,Introduction\n\nAntibodies are essential compo...,"Janeway's immunobiology. 7 edn, K Murphy, P Tr...","[{'first': 'Corey', 'middle': ['T'], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Janeway..."
4,05082393ba4c7ec530190dd887d99c74fd72f6d6,Self-assembly of the RZZ complex into filament...,"Cláudia Pereira, Rita M Reis, José B Gama, Dha...","Cláudia Pereira (Universidade do Porto, 4200-1...",,Kinetochore expansion requires the RZZ complex...,Mechanisms of chromosome congression during mi...,"[{'first': 'Cláudia', 'middle': [], 'last': 'P...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Mechani..."


In [30]:
clean_df.to_csv('data_cleaned/biorxiv_clean.csv', index=False)

### PMC Generate csv's

In [36]:
pmc_files = load_files(pmc_dir)
pmc_df = generate_clean_df(pmc_files)
pmc_df.head()

100%|██████████| 1426/1426 [00:00<00:00, 1434.44it/s]
100%|██████████| 1426/1426 [00:01<00:00, 1013.78it/s]


Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,27c85eb0542256f6599ba43e90986abfdb1ddc5b,Influenza-associated Deaths in Tropical Singap...,"Angela Chow, Stefan Ma, Ai Ee Ling, Suok Kai Chew","Angela Chow (Angela Chow, 16 College Rd, 16985...",Abstract\n\nWe used a regression model to exam...,\n\nI nfluenza virus infections cause excess i...,"Individual and community impact of influenza, ...","[{'first': 'Angela', 'middle': [], 'last': 'Ch...","{'BIBREF1': {'ref_id': 'b1', 'title': 'Individ..."
1,da6cb24186f59847fb1dcdb6614ab15b2bd3377c,Veterinary Science Detection and molecular cha...,"Tawatchai Pohuang, Niwat Chansiripornchai, Ach...","Tawatchai Pohuang (Chulalongkorn University, 1...",Abstract\n\nJanuary and June 2008. The 878-bp ...,"Introduction\n\nInfectious bronchitis (IB), ca...",Avian infectious bronchitis in the Southern pa...,"[{'first': 'Tawatchai', 'middle': [], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Avian i..."
2,916602c93360fa550e272dcc72f00bd36be64817,,,,,\n\nT he risk for deadly infectious diseases w...,Public health assessment of potential biologic...,[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'Public ..."
3,57a67fd5273fb4aac8505d1ae7b32c2582d12df3,Physicochemical Properties of Cells and Their ...,"Francois-Xavier Theillet, Andres Binolfi, Tama...","Francois-Xavier Theillet, Andres Binolfi, Tama...",,INTRODUCTION\n\nIt has long been axiomatic tha...,"D293. (31) Record, S Sundararaj, A Guo, B Habi...","[{'first': 'Francois-Xavier', 'middle': [], 'l...","{'BIBREF21': {'ref_id': 'b21', 'title': 'D293...."
4,ed2de3694f5580ea38f4adf24bd1c8b46862df1f,Mannose-binding lectin deficiency and acute ex...,"Richard K Albert, John Connett, Jeffrey L Curt...",Richard K Albert (University of Colorado Denve...,Abstract\n\nBackground: Mannose-binding lectin...,Introduction\n\nMannose-binding lectin (MBL) i...,Serum lectin with known structure activates co...,"[{'first': 'Richard', 'middle': ['K'], 'last':...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Serum l..."


In [38]:
pmc_df.to_csv('data_cleaned/clean_pmc.csv', index=False)

### Commercial Use: Generate CSV

In [39]:
comm_files = load_files(comm_dir)
comm_df = generate_clean_df(comm_files)

100%|██████████| 9000/9000 [00:10<00:00, 863.71it/s]
100%|██████████| 9000/9000 [00:17<00:00, 527.94it/s]


In [40]:
comm_df.to_csv('data_cleaned/clean_comm_use.csv', index=False)

### Non-commercial Use: Generate CSV
    

In [41]:
noncomm_files = load_files(noncomm_dir)
noncomm_df = generate_clean_df(noncomm_files)

100%|██████████| 1973/1973 [00:01<00:00, 1865.86it/s]
100%|██████████| 1973/1973 [00:02<00:00, 708.07it/s]


In [42]:
noncomm_df.to_csv('data_cleaned/clean_noncomm_use.csv', index=False)