# Creating Clean CSV

explore the biorxiv subset of the papers. Since it is stored in JSON format, the structure is likely too complex to directly perform analysis. Thus, I not only explore the structure of those files, but I also provide the following helper functions for you to easily format inner dictionaries from each file:

format_name(author)
format_affiliation(affiliation)
format_authors(authors, with_affiliation=False)
format_body(body_text)
format_bib(bibs)

In [1]:
import os
import json
from pprint import pprint
from copy import deepcopy
import ipywidgets
import numpy as np 
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
def format_name(author):
    middle_name = " ".join(author['middle'])
    
    if author['middle']:
        return " ".join([author['first'], middle_name, author['last']])
    else:
        return " ".join([author['first'], author['last']])


def format_affiliation(affiliation):
    text = []
    location = affiliation.get('location')
    if location:
        text.extend(list(affiliation['location'].values()))
    
    institution = affiliation.get('institution')
    if institution:
        text = [institution] + text
    return ", ".join(text)

def format_authors(authors, with_affiliation=False):
    name_ls = []
    
    for author in authors:
        name = format_name(author)
        if with_affiliation:
            affiliation = format_affiliation(author['affiliation'])
            if affiliation:
                name_ls.append(f"{name} ({affiliation})")
            else:
                name_ls.append(name)
        else:
            name_ls.append(name)
    
    return ", ".join(name_ls)

def format_body(body_text):
    texts = [(di['section'], di['text']) for di in body_text]
    texts_di = {di['section']: "" for di in body_text}
    
    for section, text in texts:
        texts_di[section] += text

    body = ""

    for section, text in texts_di.items():
        body += section
        body += "\n\n"
        body += text
        body += "\n\n"
    
    return body

def format_bib(bibs):
    if type(bibs) == dict:
        bibs = list(bibs.values())
    bibs = deepcopy(bibs)
    formatted = []
    
    for bib in bibs:
        bib['authors'] = format_authors(
            bib['authors'], 
            with_affiliation=False
        )
        formatted_ls = [str(bib[k]) for k in ['title', 'authors', 'venue', 'year']]
        formatted.append(", ".join(formatted_ls))

    return "; ".join(formatted)

In [3]:
def load_files(dirname):
    filenames = os.listdir(dirname)
    raw_files = []

    for filename in tqdm(filenames):
        filename = dirname + filename
        file = json.load(open(filename, 'rb'))
        raw_files.append(file)
    
    return raw_files

def generate_clean_df(all_files):
    cleaned_files = []
    
    for file in tqdm(all_files):
        features = [
            file['paper_id'],
            file['metadata']['title'],
            format_authors(file['metadata']['authors']),
            format_authors(file['metadata']['authors'],  with_affiliation=True),
            format_body(file['abstract']),
            format_body(file['body_text']),
            format_bib(file['bib_entries']),
            file['metadata']['authors'],
            file['bib_entries']
        ]

        cleaned_files.append(features)

    col_names = ['paper_id', 'title', 'authors',
                 'affiliations', 'abstract', 'text', 
                 'bibliography','raw_authors','raw_bibliography']

    clean_df = pd.DataFrame(cleaned_files, columns=col_names)
    clean_df.head()
    
    return clean_df


# Biorxiv Exploration

We will also use this opportunity to load all of the json files into a list of nested dictionaries (each dict is an article).

In [4]:
biorxiv_dir = '/home/kike/Documentos/data/CORONAVIRUS/CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/'
filenames = os.listdir(biorxiv_dir)
print("Number of articles retrieved from biorxiv:", len(filenames))

Number of articles retrieved from biorxiv: 885


In [5]:
all_files = []

for filename in filenames:
    filename = biorxiv_dir + filename
    file = json.load(open(filename, 'rb'))
    all_files.append(file)

In [6]:
file = all_files[1]
print("Dictionary keys:", file.keys())


Dictionary keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


In [7]:
pprint(file['abstract'])


[{'cite_spans': [],
  'ref_spans': [],
  'section': 'Abstract',
  'text': 'Odor memories are exceptionally robust and essential for animal '
          'survival. The olfactory (piriform) cortex has long been '
          'hypothesized to encode odor memories, yet the cellular substrates '
          'for olfactory learning and memory remain unknown. Here, using '
          'intersectional, cFos-based genetic manipulations ("Fos-tagging"), '
          'we show that olfactory fear conditioning activates sparse and '
          'distributed ensembles of neurons in mouse piriform cortex. We '
          'demonstrate that chemogenetic silencing of these Fostagged piriform '
          'ensembles selectively interferes with odor fear memory retrieval, '
          'but does not compromise basic odor detection and discrimination. '
          'Furthermore, chemogenetic reactivation of piriform neurons that '
          'were Fos-tagged during olfactory fear conditioning causes a '
          'decrease

In [8]:
print("body_text type:", type(file['body_text']))
print("body_text length:", len(file['body_text']))
print("body_text keys:", file['body_text'][0].keys())

body_text type: <class 'list'>
body_text length: 92
body_text keys: dict_keys(['text', 'cite_spans', 'ref_spans', 'section'])


In [9]:
print("body_text content:")
pprint(file['body_text'][:2], depth=3)

body_text content:
[{'cite_spans': [{...}],
  'ref_spans': [],
  'section': 'Introduction',
  'text': 'Odor perception, and emotional and behavioral responses to odors '
          'strongly depend on experience, and learned odor-context '
          'associations often last for the lifetime of an animal (Mouly and '
          'Sullivan, 2010) . The cellular and neural circuit mechanisms '
          'underlying olfactory learning and memory, however, remain poorly '
          'understood.'},
 {'cite_spans': [{...}, {...}, {...}, {...}, {...}, {...}],
  'ref_spans': [],
  'section': 'Introduction',
  'text': 'Recent studies on episodic and contextual learning in hippocampal '
          'neural networks have suggested that memories are encoded in the '
          'activity of distributed ensembles of neurons, often referred to as '
          "a 'memory trace' (Mayford and Reijmers, 2015; Poo et al., 2016; "
          'Tonegawa et al., 2015) . The neurons constituting such a memory '
       

In [10]:
texts = [(di['section'], di['text']) for di in file['body_text']]
texts_di = {di['section']: "" for di in file['body_text']}
for section, text in texts:
    texts_di[section] += text

pprint(list(texts_di.keys()))

['Introduction',
 'Fos-tagging and functional manipulation of piriform neurons',
 'Fos-tagged neurons displayed diverse morphologies and electrophysiological '
 'properties and included both excitatory and inhibitory neurons '
 '(Supplementary',
 'Fos-tagged piriform ensembles are necessary for odor fear memory recall',
 'Silencing Fos-tagged piriform ensembles does not alter odor detection and '
 'discrimination',
 'Fos-tagged piriform ensembles are odor-specific',
 'Reactivation of Fos-tagged piriform ensembles is sufficient to retrieve an '
 'odor fear memory',
 'Discussion',
 'Limitations of Fos-tagging for the study of olfactory memory traces',
 'Memory traces in hippocampus and piriform cortex',
 'Mice',
 'Constructs and viruses',
 'Stereotaxic injection',
 'Electrophysiology',
 'Behavioral apparatus',
 'Behavioral procedures',
 'Drugs',
 'Data analysis',
 'Statistics',
 'Stereotaxic injection Habituation',
 'CS+']


In [11]:
body = ""

for section, text in texts_di.items():
    body += section
    body += "\n\n"
    body += text
    body += "\n\n"

print(body[:3000])

Introduction

Odor perception, and emotional and behavioral responses to odors strongly depend on experience, and learned odor-context associations often last for the lifetime of an animal (Mouly and Sullivan, 2010) . The cellular and neural circuit mechanisms underlying olfactory learning and memory, however, remain poorly understood.Recent studies on episodic and contextual learning in hippocampal neural networks have suggested that memories are encoded in the activity of distributed ensembles of neurons, often referred to as a 'memory trace' (Mayford and Reijmers, 2015; Poo et al., 2016; Tonegawa et al., 2015) . The neurons constituting such a memory trace are thought to encode information about the environmental context and associated emotions of past experiences, and their activity is necessary and sufficient for memory retrieval (Liu et al., 2012; Reijmers et al., 2007; Tanaka et al., 2014) .Here, we investigate the organization of odor memory traces in the olfactory (piriform) c

In [12]:
body = ""

for section, text in texts_di.items():
    body += section
    body += "\n\n"
    body += text
    body += "\n\n"

print(body[:3000])

Introduction

Odor perception, and emotional and behavioral responses to odors strongly depend on experience, and learned odor-context associations often last for the lifetime of an animal (Mouly and Sullivan, 2010) . The cellular and neural circuit mechanisms underlying olfactory learning and memory, however, remain poorly understood.Recent studies on episodic and contextual learning in hippocampal neural networks have suggested that memories are encoded in the activity of distributed ensembles of neurons, often referred to as a 'memory trace' (Mayford and Reijmers, 2015; Poo et al., 2016; Tonegawa et al., 2015) . The neurons constituting such a memory trace are thought to encode information about the environmental context and associated emotions of past experiences, and their activity is necessary and sufficient for memory retrieval (Liu et al., 2012; Reijmers et al., 2007; Tanaka et al., 2014) .Here, we investigate the organization of odor memory traces in the olfactory (piriform) c

In [13]:
print(format_body(file['body_text'])[:3000])

Introduction

Odor perception, and emotional and behavioral responses to odors strongly depend on experience, and learned odor-context associations often last for the lifetime of an animal (Mouly and Sullivan, 2010) . The cellular and neural circuit mechanisms underlying olfactory learning and memory, however, remain poorly understood.Recent studies on episodic and contextual learning in hippocampal neural networks have suggested that memories are encoded in the activity of distributed ensembles of neurons, often referred to as a 'memory trace' (Mayford and Reijmers, 2015; Poo et al., 2016; Tonegawa et al., 2015) . The neurons constituting such a memory trace are thought to encode information about the environmental context and associated emotions of past experiences, and their activity is necessary and sufficient for memory retrieval (Liu et al., 2012; Reijmers et al., 2007; Tanaka et al., 2014) .Here, we investigate the organization of odor memory traces in the olfactory (piriform) c

In [14]:
print(all_files[0]['metadata'].keys())


dict_keys(['title', 'authors'])


In [15]:
print(all_files[0]['metadata']['title'])


Epidemic doubling time of the COVID-19 epidemic by Chinese province 1


In [16]:
authors = all_files[0]['metadata']['authors']
pprint(authors[:3])

[{'affiliation': {},
  'email': '',
  'first': 'Kamalich',
  'last': 'Muniz-Rodriguez',
  'middle': [],
  'suffix': ''},
 {'affiliation': {},
  'email': 'gchowell@gsu.edug.10chowellwesterpants@gmail.comc.-h.cheung',
  'first': 'Gerardo',
  'last': 'Chowell',
  'middle': [],
  'suffix': ''},
 {'affiliation': {},
  'email': '',
  'first': 'Chi-Hin',
  'last': 'Cheung',
  'middle': [],
  'suffix': ''}]


In [17]:
for author in authors:
    print("Name:", format_name(author))
    print("Affiliation:", format_affiliation(author['affiliation']))
    print()

Name: Kamalich Muniz-Rodriguez
Affiliation: 

Name: Gerardo Chowell
Affiliation: 

Name: Chi-Hin Cheung
Affiliation: 

Name: Dongyu Jia
Affiliation: 

Name: ; 
Affiliation: 

Name: Po-Ying Lai
Affiliation: 

Name: Yiseul Lee
Affiliation: 

Name: Manyun Liu
Affiliation: 

Name: Sylvia K Ofori
Affiliation: 

Name: Kimberlyn M Roosa
Affiliation: 

Name: Mph ; Lone Simonsen
Affiliation: 

Name: Cecile Viboud
Affiliation: 

Name: ; Isaac
Affiliation: 

Name: Chun-Hai Fung
Affiliation: 



In [18]:
pprint(all_files[4]['metadata'], depth=4)


{'authors': [{'affiliation': {},
              'email': '',
              'first': 'Justina',
              'last': 'Jankauskaitė',
              'middle': [],
              'suffix': ''},
             {'affiliation': {},
              'email': '',
              'first': 'Brian',
              'last': 'Jiménez-García',
              'middle': [],
              'suffix': ''},
             {'affiliation': {},
              'email': '',
              'first': 'Justas',
              'last': 'Dapkūnas',
              'middle': [],
              'suffix': ''},
             {'affiliation': {},
              'email': '',
              'first': 'Juan',
              'last': 'Fernández-Recio',
              'middle': [],
              'suffix': ''},
             {'affiliation': {},
              'email': '',
              'first': 'Iain',
              'last': 'Moal',
              'middle': ['H'],
              'suffix': ''}],
 'title': 'SKEMPI 2.0: An updated benchmark of changes in protein-p

In [19]:
authors = all_files[4]['metadata']['authors']
print("Formatting without affiliation:")
print(format_authors(authors, with_affiliation=False))
print("\nFormatting with affiliation:")
print(format_authors(authors, with_affiliation=True))

Formatting without affiliation:
Justina Jankauskaitė, Brian Jiménez-García, Justas Dapkūnas, Juan Fernández-Recio, Iain H Moal

Formatting with affiliation:
Justina Jankauskaitė, Brian Jiménez-García, Justas Dapkūnas, Juan Fernández-Recio, Iain H Moal


# Biorxiv: Bibliography

In [20]:
bibs = list(file['bib_entries'].values())
pprint(bibs[:2], depth=4)

[{'authors': [{'first': 'G',
               'last': 'Alexander',
               'middle': [...],
               'suffix': ''},
              {'first': 'S', 'last': 'Rogan', 'middle': [...], 'suffix': ''},
              {'first': 'A', 'last': 'Abbas', 'middle': [...], 'suffix': ''},
              {'first': 'B',
               'last': 'Armbruster',
               'middle': [...],
               'suffix': ''},
              {'first': 'Y', 'last': 'Pei', 'middle': [], 'suffix': ''},
              {'first': 'J', 'last': 'Allen', 'middle': [...], 'suffix': ''},
              {'first': 'R', 'last': 'Nonneman', 'middle': [...], 'suffix': ''},
              {'first': 'J', 'last': 'Hartmann', 'middle': [], 'suffix': ''},
              {'first': 'S', 'last': 'Moy', 'middle': [...], 'suffix': ''},
              {'first': 'M',
               'last': 'Nicolelis',
               'middle': [...],
               'suffix': ''}],
  'issn': '',
  'other_ids': {},
  'pages': '27--39',
  'ref_id': 'b0',
  '

In [21]:
format_authors(bibs[1]['authors'], with_affiliation=False)


'A Apicella, Q Yuan, M Scanziani, J S Isaacson'

The following function let you format the bibliography all at once. It only extracts the title, authors, venue, year, and separate each entry of the bibliography with a ;.



In [22]:
bib_formatted = format_bib(bibs[:5])
print(bib_formatted)

Remote Control of Neuronal Activity in Transgenic Mice Expressing Evolved G Protein-Coupled Receptors, G M Alexander, S C Rogan, A I Abbas, B N Armbruster, Y Pei, J A Allen, R J Nonneman, J Hartmann, S S Moy, M A Nicolelis, Neuron, 2009; Pyramidal cells in piriform cortex receive convergent input from distinct olfactory bulb glomeruli, A Apicella, Q Yuan, M Scanziani, J S Isaacson, J Neurosci, 2010; An Interglomerular Circuit Gates Glomerular Output and Implements Gain Control in the Mouse Olfactory Bulb, A Banerjee, F Marbach, F Anselmi, M S Koh, M B Davis, P Garcia Da Silva, K Delevich, H K Oyibo, P Gupta, B Li, Neuron, 2015; The Corticohippocampal Circuit, Synaptic Plasticity, and Memory, J Basu, S A Siegelbaum, Cold Spring Harb. Perspect. Biol, 2015; Complementary codes for odor identity and intensity in olfactory cortex, K A Bolding, K M Franks, , 2017


# Biorxiv: Generate CSV

In this section, I show you how to manually generate the CSV files. As you can see, it's now super simple because of the format_ helper functions. In the next sections, I show you have to generate them in 3 lines using the load_files and generate_clean_dr helper functions.

In [23]:
cleaned_files = []

for file in tqdm(all_files):
    features = [
        file['paper_id'],
        file['metadata']['title'],
        format_authors(file['metadata']['authors']),
        format_authors(file['metadata']['authors'], 
                       with_affiliation=True),
        format_body(file['abstract']),
        format_body(file['body_text']),
        format_bib(file['bib_entries']),
        file['metadata']['authors'],
        file['bib_entries']
    ]
    
    cleaned_files.append(features)

HBox(children=(FloatProgress(value=0.0, max=885.0), HTML(value='')))




In [24]:
col_names = [
    'paper_id', 
    'title', 
    'authors',
    'affiliations', 
    'abstract', 
    'text', 
    'bibliography',
    'raw_authors',
    'raw_bibliography'
]

clean_df = pd.DataFrame(cleaned_files, columns=col_names)
clean_df.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,5f6a410a4ba086b296100a8a5a10df59eb337c90,Epidemic doubling time of the COVID-19 epidemi...,"Kamalich Muniz-Rodriguez, Gerardo Chowell, Chi...","Kamalich Muniz-Rodriguez, Gerardo Chowell, Chi...",,To the editor: 27\n\nOur ability to estimate t...,"Infectious diseases of humans, R M Anderson, R...","[{'first': 'Kamalich', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Infecti..."
1,4eb8b7fd0032816e4a29d65b06939266d6446624,Encoding of odor fear memories in the mouse ol...,"Claire Meissner-Bernard, Yulia Dembitskaya, La...","Claire Meissner-Bernard (CNRS, Paris, France),...",Abstract\n\nOdor memories are exceptionally ro...,"Introduction\n\nOdor perception, and emotional...",Remote Control of Neuronal Activity in Transge...,"[{'first': 'Claire', 'middle': [], 'last': 'Me...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Remote ..."
2,fd847a97e6fce134345b25ae9240c4ebb680cca5,Clinical Characteristics on 25 Discharged Pati...,"Jing Yuan, Shanglong Kou, Yanhua Liang, Jianfe...","Jing Yuan (Shenzhen Third People's Hospital, 5...",Abstract\n\nHere we report the clinical featur...,"Introduction\n\nSince Dec 8 th 2019, many case...",Clinical Characteristics of 138 Hospitalized P...,"[{'first': 'Jing', 'middle': [], 'last': 'Yuan...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Clinica..."
3,8c900409a679dc1f16ef13c73500cbb53605b683,Lung epithelial cells have virus-specific and ...,"James T Vanleuven, Benjamin J Ridenhour, Craig...","James T Vanleuven (University of Idaho, Moscow...",Abstract\n\nThe severity and outcome of respir...,31\n\nMurine models of respiratory viral infec...,Mouse models of rhinovirus-induced disease and...,"[{'first': 'James', 'middle': ['T'], 'last': '...","{'BIBREF3': {'ref_id': 'b3', 'title': 'Mouse m..."
4,0acc1f9a1c333a9a6b2dbba4a252d7576f024783,SKEMPI 2.0: An updated benchmark of changes in...,"Justina Jankauskaitė, Brian Jiménez-García, Ju...","Justina Jankauskaitė, Brian Jiménez-García, Ju...",Abstract\n\nMotivation: Understanding the rela...,I. INTRODUCTION\n\nProtein-protein interaction...,Characterizing changes in the rate of protein-...,"[{'first': 'Justina', 'middle': [], 'last': 'J...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Charact..."


In [25]:
path = r'/home/kike/Documentos/data/CORONAVIRUS/CORD-19-research-challenge/clean_csv/'


In [26]:
clean_df.to_csv(path + 'biorxiv_clean.csv', index=False)


# Generate CSV: Custom PMC, Commercial, Non-commercial licenses

In [27]:
pmc_dir = '/home/kike/Documentos/data/CORONAVIRUS/CORD-19-research-challenge/custom_license/custom_license/'
pmc_files = load_files(pmc_dir)
pmc_df = generate_clean_df(pmc_files)
pmc_df.to_csv(path +'clean_pmc.csv', index=False)
pmc_df.head()

HBox(children=(FloatProgress(value=0.0, max=16959.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16959.0), HTML(value='')))




Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,587236e2df1d792802bcae5d78d7e9db5766bb4b,Multiple Sclerosis: Basic Concepts and Hypothesis,Moses Rodriguez,Moses Rodriguez,"Abstract\n\nMultiple sclerosis, an inflammator...",\n\nIndividual reprints of this article are no...,"The neuropathology of multiple sclerosis, Jw ;...","[{'first': 'Moses', 'middle': [], 'last': 'Rod...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The neu..."
1,466db44b1f77af3237d04ac0313bd03c2b2e94bc,,,,,\n\nA rctic populations have historically endu...,Indigenous health in the Arctic: an overview o...,[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'Indigen..."
2,cac6ae818cd3f9622a6bf132381d5b22f1c667d7,Hapiosamates A and B: New Steroidal Sulfamate ...,"Asfia Qnreshi, D John Faullmer",Asfia Qnreshi (University of California at San...,Abstract\n\nAktract: Two sponges from the Phih...,"T H T ""OH N o sa\n\nThe presence of the snifam...","H-14), 1.41 (m, 1 H, H-24), 1.37 (m, 2 H, H-l)...","[{'first': 'Asfia', 'middle': [], 'last': 'Qnr...","{'BIBREF1': {'ref_id': 'b1', 'title': 'H-14), ..."
3,f42306a841ad7558e5d19de5593885c6444cfb78,Rapid diagnostic thin section electron microsc...,"Michael Laue, Bärbel Niederwöhrmeier, Norbert ...","Michael Laue (Robert Koch Institute, Nordufer ...",Abstract\n\nEmerging infectious diseases such ...,Introduction\n\nDiagnostic electron microscopy...,"Electron microscopy of viruses, S S Biel, H R ...","[{'first': 'Michael', 'middle': [], 'last': 'L...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Electro..."
4,3cdfc63c3790d0bc50c45fe33b2b7ca304641a5d,Interfering Waves of Adaptation Promote Spatia...,"Erik A Martens, Oskar Hallatschek","Erik A Martens, Oskar Hallatschek",Abstract\n\nA fundamental problem of asexual a...,\n\nO NE of the most basic questions of evolut...,"Periodic selection in Escherichia coli, K C At...","[{'first': 'Erik', 'middle': ['A'], 'last': 'M...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Periodi..."


In [28]:
comm_dir = '/home/kike/Documentos/data/CORONAVIRUS/CORD-19-research-challenge/comm_use_subset/comm_use_subset/'
comm_files = load_files(comm_dir)
comm_df = generate_clean_df(comm_files)
comm_df.to_csv(path + 'clean_comm_use.csv', index=False)
comm_df.head()


HBox(children=(FloatProgress(value=0.0, max=9118.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9118.0), HTML(value='')))




Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,7aa3c6793455d0748b58fc6ff75ab336b2863c6c,BMC Structural Biology Orientation determinati...,Ali Samir Saad,"Ali Samir Saad (King Saud University, P.O. Box...",Abstract\n\nIn order to perform a 3D reconstru...,Background\n\nThree-dimensional (3D) reconstru...,Procedures for three-dimensional reconstructio...,"[{'first': 'Ali', 'middle': ['Samir'], 'last':...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Procedu..."
1,cc146665ee73fa1830b7c42a24cd5844346c0f39,Detection of central nervous system viral infe...,"Arthur H P Mawuntu, Janno B B Bernadus, Rama D...","Arthur H P Mawuntu (Sam Ratulangi University, ...",Abstract\n\nCentral nervous system (CNS) viral...,Introduction\n\nCentral nervous system (CNS) i...,Beyond viruses: clinical profiles and etiologi...,"[{'first': 'Arthur', 'middle': ['H P'], 'last'...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Beyond ..."
2,8af0901c4f1252ca252a8618a346c88b3b65ae1b,Potential Geographic Distribution of the Novel...,"Gengping Zhu, A Townsend Peterson","Gengping Zhu (Tianjin Normal University, Tianj...","Abstract\n\nBackground: In late March 2013, a ...","Introduction\n\nIn March and early April 2013,...",Origins and evolutionary genomics of the novel...,"[{'first': 'Gengping', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Origins..."
3,446c633eb33c9168e597b53b0c93ead24aa9d43c,Progeny Varicella-Zoster Virus Capsids Exit th...,"James H Girsch, Katherine Walters, Wallen Jack...","James H Girsch (University of Iowa, Iowa City,...",Abstract\n\nis an alphaherpesvirus that lacks ...,\n\nM acroautophagy (here called autophagy) is...,HSV-1 ICP34.5 confers neurovirulence by target...,"[{'first': 'James', 'middle': ['H'], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'HSV-1 I..."
4,561bde3336a2cd2006251effb54e5428c4edf3b9,Systems Integration of Biodefense Omics Data f...,"P B Mcgarvey, H Huang, R Mazumder, J Zhang, Y ...","P B Mcgarvey, H Huang, R Mazumder, J Zhang, Y ...",Abstract\n\nThe NIAID (National Institute for ...,Introduction\n\nThe NIAID (National Institute ...,Building integrated approaches for the proteom...,"[{'first': 'P', 'middle': ['B'], 'last': 'Mcga...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Buildin..."


In [29]:
noncomm_dir = '/home/kike/Documentos/data/CORONAVIRUS/CORD-19-research-challenge/noncomm_use_subset/noncomm_use_subset/'
noncomm_files = load_files(noncomm_dir)
noncomm_df = generate_clean_df(noncomm_files)
noncomm_df.to_csv(path + 'clean_noncomm_use.csv', index=False)
noncomm_df.head()

HBox(children=(FloatProgress(value=0.0, max=2353.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2353.0), HTML(value='')))




Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,b1870c85dae2f3bd004c29206c9a0281f4a9a249,Effect of Olfactory Bulb Ablation on Spread of...,"Stanley Perlman, &quot; , Gregory Evans, Adel ...","Stanley Perlman (University of Iowa, 52242, Io...","Abstract\n\nPrevious results suggested that, a...","\n\nM ouse hepatitis virus (MHV),' a member of...",64 :761. cephalomyelitis with extensive destru...,"[{'first': 'Stanley', 'middle': [], 'last': 'P...","{'BIBREF0': {'ref_id': 'b0', 'title': '64 :761..."
1,f5b032ddcc2c69fa564eec3ac027060064d7b255,New Respiratory Viruses and the Elderly,"Laura Jartti, Henriikka Langen, Maria Söderlun...","Laura Jartti (Turku City Hospital, Turku, Finl...",Abstract\n\nThe diagnostics of respiratory vir...,INTRODUCTION\n\nLife expectancy has increased ...,"Ageing populations: the challenges ahead, K Ch...","[{'first': 'Laura', 'middle': [], 'last': 'Jar...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Ageing ..."
2,4001bc47c8c30f370e59b5d26a74187b1727cc5b,Round-up of GHSA Steering Group and Action Pac...,,,,Introduction\n\nThis inaugural meeting of the ...,Global Health Security Agenda: Getting Ahead o...,[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'Global ..."
3,459f95218942f624446533e21c9ab2775b8efce3,Supplemental Materials Molecular Biology of th...,Kolb,Kolb,,VPS38\n\nPart of a Vps34p phosphatidylinositol...,"Saccharomyces Genome Database, , , None; Cloni...","[{'first': '', 'middle': [], 'last': 'Kolb', '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Sacchar..."
4,76cbada4dbb0c067cbffac6a1f1cede25aec708b,The Impact of Middle East Respiratory Syndrome...,"So Hyun Paek, Do Kyun Kim, Jin Hee Lee, Young ...",So Hyun Paek (Seoul National University Hospit...,Abstract\n\nChanges occurred in the patterns o...,INTRODUCTION\n\nAn outbreak of Middle East res...,Middle East Respiratory Syndrome Coronavirus s...,"[{'first': 'So', 'middle': ['Hyun'], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Middle ..."
