In [1]:
import numpy as np
import pandas as pd
import os
import json
import glob
import sys

In [2]:
json_filenames = glob.glob(f'../data/noncomm_use_subset/*.json', recursive=True)

In [3]:
def return_corona_df(json_filenames, source):
    features = {"doc_id": [None], "source": [None], "title": [None], "authors": [None],
                  "abstract": [None], "text_body": [None], "bibliography": [None]}
    df = pd.DataFrame.from_dict(features)
    
    for file_name in json_filenames:

        row = {"doc_id": None, "source": None, "title": None, "authors": [None],
              "abstract": None, "text_body": None, "bibliography": None}

        with open(file_name) as json_data:
            data = json.load(json_data)

            row['doc_id'] = data['paper_id']
            row['title'] = data['metadata']['title']
            
            authors = ", ".join([author['first'] + " " + author['last'] \
                                 for author in data['metadata']['authors'] if data['metadata']['authors']])
            row['authors'] = authors

            abstract_list = [data['abstract'][x]['text'] for x in range(len(data['abstract']) - 1)]
            abstract = "\n ".join(abstract_list)

            row['abstract'] = abstract 
            
            body_list = [d['text'] for d in data['body_text']]
            body = "\n ".join(body_list)
            
            row['text_body'] = body
            
            bibliography = "\n ".join([bib['title'] + "," + bib['venue'] + "," + str(bib['year']) \
                                      for bib in data['bib_entries'].values()])
            row['bibliography'] = bibliography
            
            if source == 'b':
                row['source'] = "BIORXIV"
            elif source == "c":
                row['source'] = "COMMON_USE_SUB"
            elif source == "n":
                row['source'] = "NON_COMMON_USE"
            elif source == "p":
                row['source'] = "PMC_CUSTOM_LICENSE"
            
            df = df.append(row, ignore_index=True)
    
    return df


In [4]:
corona_df = return_corona_df(json_filenames, 'b')

In [5]:
type(corona_df)

pandas.core.frame.DataFrame

In [6]:
corona_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1974 entries, 0 to 1973
Data columns (total 7 columns):
doc_id          1973 non-null object
source          1973 non-null object
title           1973 non-null object
authors         1973 non-null object
abstract        1973 non-null object
text_body       1973 non-null object
bibliography    1973 non-null object
dtypes: object(7)
memory usage: 108.0+ KB


In [7]:
corona_df.head()

Unnamed: 0,doc_id,source,title,authors,abstract,text_body,bibliography
0,,,,,,,
1,ad98979eada6e333a276d39efdce21779d538625,BIORXIV,Xanthine-based acyclic nucleoside phosphonates...,"Ond Rej Baszczy Nski, Martin Kaiser, Michal Ce...",,The concentration and ratio of purine nucleoti...,Absolute metabolite concentrations and implied...
2,c436139975d97ef929b5d8452595de40bda0c11c,BIORXIV,"Phone: (1) 301-451-9881, jbeigel@niaid.nih.gov","John Beigel, Pablo Tebas, Marie-Carmelle Elie-...",on behalf of the IRC002 Study Team Summary Bac...,Pandemic influenza remains a global health thr...,Meta-analysis: convalescent blood products for...
3,634128ea7d7736750e1c3cd0a48bb37843d06dac,BIORXIV,A Strategy To Estimate Unknown Viral Diversity...,"Simon Anthony, Jonathan Epstein, Kris Murray, ...",,"A total of 12,793 consensus PCR assays were pe...",Factors in the emergence of infectious disease...
4,f3cb4102ee8c1aeb8e68595843292801a08effe3,BIORXIV,Global Catastrophic Biological Risks Pandemics...,"Lone Simonsen, Cecile Viboud",,a welcome opportunity to reflect on the most s...,World Health Organization. Influenza. Pandemic...
