In [2]:
import os
import json
from pprint import pprint
from copy import deepcopy

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

#### Extraction helper functions for the data

In [3]:
def format_name(author):
    middle_name = " ".join(author['middle'])
    
    if author['middle']:
        return " ".join([author['first'], middle_name, author['last']])
    else:
        return " ".join([author['first'], author['last']])


def format_affiliation(affiliation):
    text = []
    location = affiliation.get('location')
    if location:
        text.extend(list(affiliation['location'].values()))
    
    institution = affiliation.get('institution')
    if institution:
        text = [institution] + text
    return ", ".join(text)

def format_authors(authors, with_affiliation=False):
    name_ls = []
    
    for author in authors:
        name = format_name(author)
        if with_affiliation:
            affiliation = format_affiliation(author['affiliation'])
            if affiliation:
                name_ls.append(f"{name} ({affiliation})")
            else:
                name_ls.append(name)
        else:
            name_ls.append(name)
    
    return ", ".join(name_ls)

def format_body(body_text):
    texts = [(di['section'], di['text']) for di in body_text]
    texts_di = {di['section']: "" for di in body_text}
    
    for section, text in texts:
        texts_di[section] += text

    body = ""

    for section, text in texts_di.items():
        body += section
        body += "\n\n"
        body += text
        body += "\n\n"
    
    return body

def format_bib(bibs):
    if type(bibs) == dict:
        bibs = list(bibs.values())
    bibs = deepcopy(bibs)
    formatted = []
    
    for bib in bibs:
        bib['authors'] = format_authors(
            bib['authors'], 
            with_affiliation=False
        )
        formatted_ls = [str(bib[k]) for k in ['title', 'authors', 'venue', 'year']]
        formatted.append(", ".join(formatted_ls))

    return "; ".join(formatted)

Unhide the cell below to find the definition of the following functions:
* `load_files(dirname)`
* `generate_clean_df(all_files)`

In [4]:
def load_files(dirname):
    filenames = os.listdir(dirname)
    raw_files = []

    for filename in tqdm(filenames):
        filename = dirname + filename
        file = json.load(open(filename, 'rb'))
        raw_files.append(file)
    
    return raw_files

def generate_clean_df(all_files):
    cleaned_files = []
    
    for file in tqdm(all_files):
        features = [
            file['paper_id'],
            file['metadata']['title'],
            format_authors(file['metadata']['authors']),
            format_authors(file['metadata']['authors'], 
                           with_affiliation=True),
            format_body(file['abstract']),
            format_body(file['body_text']),
            format_bib(file['bib_entries']),
            file['metadata']['authors'],
            file['bib_entries']
        ]

        cleaned_files.append(features)

    col_names = ['paper_id', 'title', 'authors',
                 'affiliations', 'abstract', 'text', 
                 'bibliography','raw_authors','raw_bibliography']

    clean_df = pd.DataFrame(cleaned_files, columns=col_names)
    clean_df.head()
    
    return clean_df

Let's first take a quick glance at the `biorxiv` subset of the data. We will also use this opportunity to load all of the json files into a list of **nested** dictionaries (each `dict` is an article).

In [5]:
biorxiv_dir = 'CORD-19-research-challenge/2020-03-13/biorxiv_medrxiv/biorxiv_medrxiv/'
filenames = os.listdir(biorxiv_dir)
print("Number of articles retrieved from biorxiv:", len(filenames))

Number of articles retrieved from biorxiv: 803


In [6]:
all_files = []

for filename in filenames:
    filename = biorxiv_dir + filename
    file = json.load(open(filename, 'rb'))
    all_files.append(file)

In [7]:
file = all_files[0]
print("Dictionary keys:", file.keys())

Dictionary keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


#### Biorxiv: Abstract
The abstract dictionary is fairly simple:

In [35]:
pprint(file['abstract'])

[]


#### Biorxiv: body text
Let's first probe what the `body_text` dictionary looks like:

In [9]:
print("body_text type:", type(file['body_text']))
print("body_text length:", len(file['body_text']))
print("body_text keys:", file['body_text'][0].keys())

body_text type: <class 'list'>
body_text length: 15
body_text keys: dict_keys(['text', 'cite_spans', 'ref_spans', 'section'])


We take a look at the first part of the `body_text` content. As you will notice, the body text is separated into a list of small subsections, each containing a `section` and a `text` key. Since multiple subsection can have the same section, we need to first group each subsection before concatenating everything.

In [10]:
print("body_text content:")
pprint(file['body_text'][:2], depth=3)

body_text content:
[{'cite_spans': [],
  'ref_spans': [],
  'section': 'Objective',
  'text': 'In this study, we used the imported cases and air travel data from '
          'Iran to other Middle East countries to estimate the number of '
          'COVID-19 cases in Iran. Then we compared our estimates with the '
          'number of reported cases in Iran to evaluate the extent of '
          'under-ascertainment.'},
 {'cite_spans': [],
  'ref_spans': [],
  'section': 'Data',
  'text': 'We obtained transport capability of international airlines from 30 '
          'major airports in Iran from 1 February to 24 February 2020 '
          '(WorldData, 2020), from the Variflight platform '
          '(https://data.variflight.com/). We collected the number of exported '
          'cases from Iran to other countries in the Middle East (World Health '
          'Organization, 2020).'}]


Let's see what the grouped section titles are for the example above:

In [11]:
texts = [(di['section'], di['text']) for di in file['body_text']]
texts_di = {di['section']: "" for di in file['body_text']}
for section, text in texts:
    texts_di[section] += text

pprint(list(texts_di.keys()))

['Objective',
 'Data',
 'Methods',
 'Results and discussion',
 'Ethics approval and consent to participate',
 'Availability of data and materials',
 'Consent for publication',
 'Funding']


The following example shows what the final result looks like, after we format each section title with its content:

In [12]:
body = ""

for section, text in texts_di.items():
    body += section
    body += "\n\n"
    body += text
    body += "\n\n"

print(body[:3000])

Objective

In this study, we used the imported cases and air travel data from Iran to other Middle East countries to estimate the number of COVID-19 cases in Iran. Then we compared our estimates with the number of reported cases in Iran to evaluate the extent of under-ascertainment.

Data

We obtained transport capability of international airlines from 30 major airports in Iran from 1 February to 24 February 2020 (WorldData, 2020), from the Variflight platform (https://data.variflight.com/). We collected the number of exported cases from Iran to other countries in the Middle East (World Health Organization, 2020).Population size of Iran (81,800,269 in 2018) was obtained from the World Bank (https://data.worldbank.org/). Table 1 shows the number of daily population flow and total reported cases of countries used in our calculation.

Methods

Following Imai et al.'s (2020), we assumed number of cases (n) exported from Iran follows a Binomial distribution (Bin) with size N and probability

The function below lets you display the body text in one line (unhide to see exactly the same as above):

In [13]:
print(format_body(file['body_text'])[:3000])

Objective

In this study, we used the imported cases and air travel data from Iran to other Middle East countries to estimate the number of COVID-19 cases in Iran. Then we compared our estimates with the number of reported cases in Iran to evaluate the extent of under-ascertainment.

Data

We obtained transport capability of international airlines from 30 major airports in Iran from 1 February to 24 February 2020 (WorldData, 2020), from the Variflight platform (https://data.variflight.com/). We collected the number of exported cases from Iran to other countries in the Middle East (World Health Organization, 2020).Population size of Iran (81,800,269 in 2018) was obtained from the World Bank (https://data.worldbank.org/). Table 1 shows the number of daily population flow and total reported cases of countries used in our calculation.

Methods

Following Imai et al.'s (2020), we assumed number of cases (n) exported from Iran follows a Binomial distribution (Bin) with size N and probability

#### Biorxiv: Metadata
Let's first see what keys are contained in the `metadata` dictionary:

In [14]:
print(all_files[0]['metadata'].keys())

dict_keys(['title', 'authors'])


Let's take a look at each of the correspond values:

In [15]:
print(all_files[0]['metadata']['title'])

Preliminary estimation of the novel coronavirus disease (COVID-19) cases in Iran: a modelling analysis based on overseas cases and air travel data


In [16]:
authors = all_files[0]['metadata']['authors']
pprint(authors[:3])

[{'affiliation': {'institution': 'Hong Kong Polytechnic University',
                  'laboratory': '',
                  'location': {'country': 'China',
                               'settlement': 'Hong Kong SAR'}},
  'email': '',
  'first': 'Zian',
  'last': 'Zhuang',
  'middle': [],
  'suffix': ''},
 {'affiliation': {'institution': 'Chinese University of Hong Kong',
                  'laboratory': '',
                  'location': {'country': 'China', 'settlement': 'Hong Kong'}},
  'email': '',
  'first': 'Shi',
  'last': 'Zhao',
  'middle': [],
  'suffix': ''},
 {'affiliation': {'institution': 'University of Michigan',
                  'laboratory': '',
                  'location': {'country': 'USA',
                               'region': 'Michigan',
                               'settlement': 'Ann Arbor'}},
  'email': '',
  'first': 'Qianying',
  'last': 'Lin',
  'middle': [],
  'suffix': ''}]


The `format_name` and `format_affiliation` functions:

In [17]:
for author in authors:
    print("Name:", format_name(author))
    print("Affiliation:", format_affiliation(author['affiliation']))
    print()

Name: Zian Zhuang
Affiliation: Hong Kong Polytechnic University, Hong Kong SAR, China

Name: Shi Zhao
Affiliation: Chinese University of Hong Kong, Hong Kong, China

Name: Qianying Lin
Affiliation: University of Michigan, Ann Arbor, Michigan, USA

Name: Peihua Cao
Affiliation: Southern Medical University, Guangzhou, Guangdong, China

Name: Yijun Lou
Affiliation: Hong Kong Polytechnic University, Hong Kong SAR, China

Name: Lin Yang
Affiliation: Hong Kong Polytechnic University, Hong Kong, China

Name: Daihai He
Affiliation: Hong Kong Polytechnic University, Hong Kong SAR, China



Now, let's take as an example a slightly longer list of authors:

In [18]:
pprint(all_files[4]['metadata'], depth=4)

{'authors': [{'affiliation': {'institution': '',
                              'laboratory': 'State Key Laboratory of '
                                            'Veterinary Etiological Biology '
                                            'and Key Laboratory of Veterinary '
                                            'Parasitology',
                              'location': {}},
              'email': '',
              'first': 'Xiaofeng',
              'last': 'Xu',
              'middle': [],
              'suffix': ''},
             {'affiliation': {},
              'email': '',
              'first': '$',
              'last': '',
              'middle': [],
              'suffix': ''},
             {'affiliation': {},
              'email': '',
              'first': 'Haishuo',
              'last': 'Ji 23$',
              'middle': [],
              'suffix': ''},
             {'affiliation': {'institution': 'Nankai University',
                              'laboratory': '',


Here, I provide the function `format_authors` that let you format a list of authors to get a final string, with the optional argument of showing the affiliation:

In [19]:
authors = all_files[4]['metadata']['authors']
print("Formatting without affiliation:")
print(format_authors(authors, with_affiliation=False))
print("\nFormatting with affiliation:")
print(format_authors(authors, with_affiliation=True))

Formatting without affiliation:
Xiaofeng Xu, $ , Haishuo Ji 23$, Zhi Cheng, Jin , Xue Yao, Yanqiang Liu, Qiang Zhao, Tao Zhang, Jishou Ruan, Wenjun Bu, Ze Chen, Shan Gao

Formatting with affiliation:
Xiaofeng Xu, $ , Haishuo Ji 23$, Zhi Cheng (Nankai University, 300071, Tianjin, Tianjin, P.R.China. 10), Jin , Xue Yao (Nankai University, 300071, Tianjin, Tianjin, P.R.China. 10), Yanqiang Liu (Nankai University, 300071, Tianjin, Tianjin, P.R.China. 10), Qiang Zhao (Nankai University, 300071, Tianjin, Tianjin, P.R.China. 10), Tao Zhang (Nankai University, 300071, Tianjin, Tianjin, P.R.China. 10), Jishou Ruan (Nankai University, 300071, Tianjin, P.R.China), Wenjun Bu (Nankai University, 300071, Tianjin, Tianjin, P.R.China. 10), Ze Chen, Shan Gao (Nankai University, 300071, Tianjin, P.R.China)


#### Biorxiv: bibliography
Let's take a look at the bibliography section. 

In [20]:
bibs = list(file['bib_entries'].values())
pprint(bibs[:2], depth=4)

[{'authors': [],
  'issn': '',
  'other_ids': {},
  'pages': '',
  'ref_id': 'b0',
  'title': "Iran says 'tens of thousands' may get tested for coronavirus",
  'venue': 'abcNEWS',
  'volume': '',
  'year': None},
 {'authors': [{'first': 'I', 'last': 'Bogoch', 'middle': [...], 'suffix': ''},
              {'first': 'A', 'last': 'Watts', 'middle': [], 'suffix': ''},
              {'first': 'A',
               'last': 'Thomas-Bachli',
               'middle': [],
               'suffix': ''},
              {'first': 'C', 'last': 'Huber', 'middle': [], 'suffix': ''},
              {'first': 'M', 'last': 'Kraemer', 'middle': [...], 'suffix': ''},
              {'first': 'K', 'last': 'Khan', 'middle': [], 'suffix': ''}],
  'issn': '',
  'other_ids': {'DOI': ['10.1093/jtm/taaa011']},
  'pages': '',
  'ref_id': 'b1',
  'title': 'Potential for global spread of a novel coronavirus from China. '
           'Journal of travel medicine',
  'venue': '',
  'volume': '',
  'year': 2020}]


You can reused the `format_authors` function here:

In [21]:
format_authors(bibs[1]['authors'], with_affiliation=False)

'I I Bogoch, A Watts, A Thomas-Bachli, C Huber, M U Kraemer, K Khan'

The following function let you format the bibliography all at once. It only extracts the title, authors, venue, year, and separate each entry of the bibliography with a `;`.

In [22]:
bib_formatted = format_bib(bibs[:5])
print(bib_formatted)

Iran says 'tens of thousands' may get tested for coronavirus, , abcNEWS, None; Potential for global spread of a novel coronavirus from China. Journal of travel medicine, I I Bogoch, A Watts, A Thomas-Bachli, C Huber, M U Kraemer, K Khan, , 2020; Estimating the potential total, N Imai, I Dorigatti, Cori A Riley, S Ferguson, N M , , None


#### Biorxiv: Generate CSV

In this section, I show you how to manually generate the CSV files. As you can see, it's now super simple because of the `format_` helper functions. In the next sections, I show you have to generate them in 3 lines using the `load_files` and `generate_clean_dr` helper functions.

In [23]:
cleaned_files = []

for file in tqdm(all_files):
    features = [
        file['paper_id'],
        file['metadata']['title'],
        format_authors(file['metadata']['authors']),
        format_authors(file['metadata']['authors'], 
                       with_affiliation=True),
        format_body(file['abstract']),
        format_body(file['body_text']),
        format_bib(file['bib_entries']),
        file['metadata']['authors'],
        file['bib_entries']
    ]
    
    cleaned_files.append(features)

HBox(children=(FloatProgress(value=0.0, max=803.0), HTML(value='')))




In [24]:
col_names = [
    'paper_id', 
    'title', 
    'authors',
    'affiliations', 
    'abstract', 
    'text', 
    'bibliography',
    'raw_authors',
    'raw_bibliography'
]

clean_df = pd.DataFrame(cleaned_files, columns=col_names)
clean_df.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,e5163021a1b88e2c2335cca27fbfcf883f870830,Preliminary estimation of the novel coronaviru...,"Zian Zhuang, Shi Zhao, Qianying Lin, Peihua Ca...","Zian Zhuang (Hong Kong Polytechnic University,...",,"Objective\n\nIn this study, we used the import...",Iran says 'tens of thousands' may get tested f...,"[{'first': 'Zian', 'middle': [], 'last': 'Zhua...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Iran sa..."
1,6d1b40bcdde63e22931509db389c6bf1949901dd,Title page • Title A combinatorial biomolecula...,"Jasmim Leal, Xinquan Liu, Xiujuan Peng, Rashmi...",Jasmim Leal (The University of Texas at Austin...,Abstract\n\nDrugs and drug delivery systems ha...,Introduction\n\nFor successful treatment of mu...,G) void area (i.e. porosity) measurements. Dat...,"[{'first': 'Jasmim', 'middle': [], 'last': 'Le...","{'BIBREF0': {'ref_id': 'b0', 'title': 'G) void..."
2,2d02192c3d251c2b561d37274350660277658a74,Enabling large-scale genome editing by reducin...,"Cory J Smith, Oscar Castanon, Khaled Said, Ver...","Cory J Smith (Harvard Medical School, Boston, ...",Abstract\n\nTo extend the frontier of genome e...,table S2.\n\nTo achieve similar efficiencies t...,"Sequence organization of the human genome, C W...","[{'first': 'Cory', 'middle': ['J'], 'last': 'S...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Sequenc..."
3,970a28c4a3c1fbcb5322772956955b8c4a3bb257,Clinical Characteristics of 2019 Coronavirus P...,"Zhangfu Fang, Yi 3#, Kang Wu, Kefang Lai, Xizh...",Zhangfu Fang (Third Affiliated Hospital of She...,Abstract\n\nClinical characteristics of novel ...,Introduction\n\nCOVID-19 among different studi...,Report of clustering pneumonia of unknown etio...,"[{'first': 'Zhangfu', 'middle': [], 'last': 'F...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Report ..."
4,aad76905ce54679c80b75e4ee35717c30e7e1099,Using pan RNA-seq analysis to reveal the ubiqu...,"Xiaofeng Xu, $ , Haishuo Ji 23$, Zhi Cheng, Ji...","Xiaofeng Xu, $ , Haishuo Ji 23$, Zhi Cheng (Na...","Abstract\n\nIn this study, we used pan RNA-seq...","53\n\nRNA sequencing (RNA-seq) , usually based...",R language and Bioconductor in bioinformatics ...,"[{'first': 'Xiaofeng', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'R langu..."


In [26]:
clean_df.to_csv('cleaned/biorxiv_clean.csv', index=False)

## PMC: Generate CSV

In [27]:
pmc_dir = 'CORD-19-research-challenge/2020-03-13/pmc_custom_license/pmc_custom_license/'
pmc_files = load_files(pmc_dir)
pmc_df = generate_clean_df(pmc_files)
pmc_df.head()

HBox(children=(FloatProgress(value=0.0, max=1426.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1426.0), HTML(value='')))




Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,c723a1d259891519fd489b94a7ff128f9a5555aa,Partitioning of Viruses in Wastewater Systems ...,"Mari Titcombe Lee, Amy Pruden, Linsey C Marr","Mari Titcombe Lee, Amy Pruden, Linsey C Marr",Abstract\n\nTo gain insight into the potential...,■ INTRODUCTION\n\nThe Ebola outbreak in 2014 r...,Transmission of Ebola Viruses: What We Know an...,"[{'first': 'Mari', 'middle': ['Titcombe'], 'la...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Transmi..."
1,f5a95ba3b592c46903d5f4a4c00bd15a2cbb74f9,Influenza vaccines and vaccinations in Poland ...,"Lidia B Brydak, Agnieszka Woźniak Kosek, Aneta...",Lidia B Brydak (National Institute of Public H...,,Background\n\nInfluenza causes seasonal infect...,"Influenza, pandemic flu myth or a real threat?...","[{'first': 'Lidia', 'middle': ['B'], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Influen..."
2,6397e80ffb931ee6f6d03c1fe85ef82aa36e1b9a,,,,,"\n\nOn March 7, 2013 , this report was posted ...",Novel coronavirus associated with severe respi...,[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'Novel c..."
3,724be3f77bbbdcc7e73de43f4cfe22335ffb9ef9,"Influenza D Virus of New Phylogenetic Lineage,...",,,,\n\nAppendix Table 1 \n\n,"IDV, influenza D virus; mAb, monoclonal antibo...",[],"{'BIBREF4': {'ref_id': 'b4', 'title': 'IDV, in..."
4,0ae70ae84cc9052dedf214ca44a842f0f9edc161,Capacity of Thailand to Contain an Emerging In...,"Weerasak Putthasri, Jongkol Lertiendumrong, Po...","Weerasak Putthasri (Mahidol University, 420/6 ...",Abstract\n\nSoutheast Asia will likely be the ...,\n\nT he World Health Organization (WHO) has h...,Cumulative number of confi rmed human cases of...,"[{'first': 'Weerasak', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Cumulat..."


In [29]:
pmc_df.to_csv('cleaned/clean_pmc.csv', index=False)

## Commercial Use: Generate CSV

In [30]:
comm_dir = 'CORD-19-research-challenge/2020-03-13/comm_use_subset/comm_use_subset/'
comm_files = load_files(comm_dir)
comm_df = generate_clean_df(comm_files)
comm_df.head()

HBox(children=(FloatProgress(value=0.0, max=9000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9000.0), HTML(value='')))




Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,589045646462acd78b115142b5464cca998f26d7,Src inhibitor reduces permeability without dis...,"Yi-Xin He, Jin Liu, Baosheng Guo, Yi-Xiang Wan...","Yi-Xin He (Hong Kong Baptist University, Hong ...",,\n\nThe subchondral collapse is directly attri...,Pulsed methylprednisolone therapy compared to ...,"[{'first': 'Yi-Xin', 'middle': [], 'last': 'He...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Pulsed ..."
1,ba2d3601f5ae3bb5964c8515431abe352bad9437,"The association between temperature, rainfall ...","Fazle Rabbi Chowdhury, Quazi Shihab, Uddin Ibr...",Fazle Rabbi Chowdhury (Bangabandhu Sheikh Muji...,Abstract\n\nBangladesh is one of the world's m...,\n\na1111111111 a1111111111 a1111111111 a11111...,Climate change 2014 synthesis report summary f...,"[{'first': 'Fazle', 'middle': ['Rabbi'], 'last...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Climate..."
2,d38f954f1b3937ead02257e75454dd9ad2ec0ce6,An evaluation of psychological distress and so...,"Abdulaziz Mohammed, Taiwo Lateef Sheikh, Sahee...","Abdulaziz Mohammed, Taiwo Lateef Sheikh, Sahee...","Abstract\n\nBackground: By September 2014, an ...",Background\n\nThe West African outbreak of Ebo...,Emergence of Zaire Ebola Virus Disease in Guin...,"[{'first': 'Abdulaziz', 'middle': [], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Emergen..."
3,77dc09841a62d92ba5a40d4f848f34e3c4e27713,Social contact patterns relevant to the spread...,"Kathy Kong, Leung, Mark Jit, Eric H Y Lau, Jo...","Kathy Kong, Leung (The University of Hong Kon...",,The impact of different social contact data on...,Inferring Influenza Infection Attack Rate from...,"[{'first': 'Kathy', 'middle': [], 'last': 'Kon...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Inferri..."
4,5be812ecd3a63e7b5e718f17d9ca3a0d0fdb4a70,Spatiotemporal dynamics of HSV genome nuclear ...,"Eiki Sekine, Nora Schmidt, David Gaboriau, Pet...","Eiki Sekine (St Mary's Medical School, London,...",Abstract\n\nWe investigated the spatiotemporal...,Introduction\n\nVirtually all DNA virus classe...,"Viral entry into the nucleus, G R Whittaker, M...","[{'first': 'Eiki', 'middle': [], 'last': 'Seki...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Viral e..."


In [31]:
comm_df.to_csv('cleaned/clean_comm_use.csv', index=False)

## Non-commercial Use: Generate CSV

In [32]:
noncomm_dir = 'CORD-19-research-challenge/2020-03-13/noncomm_use_subset/noncomm_use_subset/'
noncomm_files = load_files(noncomm_dir)
noncomm_df = generate_clean_df(noncomm_files)
noncomm_df.head()

HBox(children=(FloatProgress(value=0.0, max=1973.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1973.0), HTML(value='')))




Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,3f95cd9324e4e68dfac149200521e09981091feb,Understanding the canine intestinal microbiota...,"Silke Schmitz, Jan Suchodolski","Silke Schmitz (Justus-Liebig University, Giess...",Abstract\n\nInterest in the composition of the...,Introduction\n\nMicroorganisms are found abund...,Efficacy of Saccharomyces boulardii as a probi...,"[{'first': 'Silke', 'middle': [], 'last': 'Sch...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Efficac..."
1,65d508add328dceec3427a9bc54cff552b6a2271,Evidence based risk assessment tool for tracin...,"Oliver ; Mohr, Oliver Mohr, Julia Hermes, Susa...","Oliver ; Mohr, Oliver Mohr, Julia Hermes, Susa...",,\n\nTracing persons who have been in contact w...,Australasian Society for HIV Medicine. Austral...,"[{'first': 'Oliver', 'middle': [';'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Austral..."
2,2176706829fb93fa471a706137b138e8766b4f32,GRAFT-VERSUS-HOST DISEASE IN CYCLOSPORIN A-TRE...,"Arnold Glazier, Peter J Tutschka, Evan R Farme...",Arnold Glazier (The Johns Hopkins University S...,Abstract\n\nCyclosporin A (CsA) ~ is a potent ...,\n\nused for the histological documentation of...,Biological effects of cyclosporin A: a new ant...,"[{'first': 'Arnold', 'middle': [], 'last': 'Gl...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Biologi..."
3,35ec885b13ad96752433a085e543db81dd1d80df,"Emerging Microbes and Infections (2012) 1, e35","Patrick Cy Woo, Susanna Kp Lau, Kenneth Sm Li,...","Patrick Cy Woo (The University of Hong Kong, H...",,\n\nhe recent outbreak of severe respiratory i...,Isolation of a novel coronavirus from a man wi...,"[{'first': 'Patrick', 'middle': ['Cy'], 'last'...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Isolati..."
4,066178ded1d6d540be366e6468bc01e78e605a81,NOTE Repeated avian infectious bronchitis viru...,"Atsushi Kato, Shiori Oguro, Yukino Kurihara, H...",Atsushi Kato (Nippon Institute for Biological ...,Abstract\n\nGenotyping of avian infectious bro...,\n\nAvian infectious bronchitis virus (IBV) is...,Pathogenesis and Diagnostic Approaches of Avia...,"[{'first': 'Atsushi', 'middle': [], 'last': 'K...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Pathoge..."


In [34]:
noncomm_df.to_csv('cleaned/clean_noncomm_use.csv', index=False)