This notebook creates dataframes of citation data for all of the articles.  The code is a slightly modified version of the EDA code created by user xhlulu here: https://www.kaggle.com/xhlulu/cord-19-eda-parse-json-and-generate-clean-csv


## About this notebook

In this notebook, I quickly explore the `biorxiv` subset of the papers. Since it is stored in JSON format, the structure is likely too complex to directly perform analysis. Thus, I not only explore the structure of those files, but I also provide the following helper functions for you to easily format inner dictionaries from each file:
* `format_name(author)`
* `format_affiliation(affiliation)`
* `format_authors(authors, with_affiliation=False)`
* `format_body(body_text)`
* `format_bib(bibs)`

Feel free to reuse those functions for your own purpose! If you do, please leave a link to this notebook.

Throughout the EDA, I show you how to use each of those files. At the end, I show you how to generate a clean version of the `biorxiv` as well as all the other datasets, which you can directly use by choosing this notebook as a data source ("File" -> "Add or upload data" -> "Kernel Output File" tab -> search the name of this notebook).

### Update Log

* V9: First release.
* V10: Updated paths to include the [14k new papers](https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge/discussion/137474).

In [None]:
import os
import json
from pprint import pprint
from copy import deepcopy

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import ast

import seaborn as sns

## Helper Functions

Unhide the cell below to find the definition of the following functions:
* `format_name(author)`
* `format_affiliation(affiliation)`
* `format_authors(authors, with_affiliation=False)`
* `format_body(body_text)`
* `format_bib(bibs)`

In [None]:
def format_name(author):
    """Return an author's full name as one space-separated string.

    Args:
        author: Mapping with the keys ``'first'`` and ``'last'`` (strings)
            and ``'middle'`` (a list of middle names or initials).  Parts
            that are missing or empty are skipped.

    Returns:
        The non-empty name parts joined by single spaces, for example
        ``"John Q Public"``; an empty string when every part is empty.
    """
    parts = [
        author.get('first'),
        *(author.get('middle') or []),
        author.get('last'),
    ]
    # Skip empty/None parts so an absent first or last name does not leave
    # leading, trailing or doubled spaces in the result.
    return " ".join(part for part in parts if part)


def format_affiliation(affiliation):
    """Render an affiliation mapping as a comma-separated string.

    Args:
        affiliation: Mapping that may contain ``'institution'`` (a string)
            and ``'location'`` (a mapping whose values are strings).

    Returns:
        The institution (when present and non-empty) followed by every
        location value, joined with ``", "``.  An empty string is returned
        when neither is available.
    """
    institution = affiliation.get('institution')
    location = affiliation.get('location')

    # The institution, when there is one, always leads the output.
    parts = [institution] if institution else []
    if location:
        parts.extend(location.values())
    return ", ".join(parts)

def format_authors(authors, with_affiliation=False):
    """Join the formatted names of several authors into one string.

    Args:
        authors: Iterable of author mappings accepted by ``format_name``.
            When ``with_affiliation`` is true each author must also carry
            an ``'affiliation'`` entry accepted by ``format_affiliation``.
        with_affiliation: When true, append ``" (<affiliation>)"`` to every
            name whose formatted affiliation is non-empty.

    Returns:
        The formatted authors joined with ``", "``.
    """
    def render(author):
        # Format a single author, optionally followed by the affiliation.
        name = format_name(author)
        if not with_affiliation:
            return name
        affiliation = format_affiliation(author['affiliation'])
        return f"{name} ({affiliation})" if affiliation else name

    return ", ".join([render(author) for author in authors])

def format_body(body_text):
    """Flatten a list of paragraph mappings into one text blob.

    Args:
        body_text: Iterable of mappings, each with a ``'section'`` name and
            a ``'text'`` string.

    Returns:
        For every distinct section, in order of first appearance: the
        section name, a blank line, the texts of that section concatenated
        without any separator, and another blank line.
    """
    by_section = {}
    for paragraph in body_text:
        by_section.setdefault(paragraph['section'], []).append(paragraph['text'])

    # dicts keep insertion order, so sections come out in first-seen order.
    return "".join(
        heading + "\n\n" + "".join(chunks) + "\n\n"
        for heading, chunks in by_section.items()
    )

def format_bib(bibs):
    """Render bibliography entries as one human-readable string.

    Args:
        bibs: Either a mapping whose values are bibliography entries or an
            iterable of entries.  Every entry is a mapping with the keys
            ``'title'``, ``'authors'``, ``'venue'`` and ``'year'``; the
            ``'authors'`` value is passed to ``format_authors``.

    Returns:
        One ``"title, authors, venue, year"`` string per entry, joined with
        ``"; "``.  The input is never modified.
    """
    # isinstance also accepts dict subclasses such as OrderedDict.
    if isinstance(bibs, dict):
        bibs = bibs.values()

    formatted = []
    for bib in bibs:
        # Keep the formatted authors in a local instead of writing them back
        # into the entry, so no defensive deep copy of the input is needed.
        authors = format_authors(bib['authors'], with_affiliation=False)
        fields = [bib['title'], authors, bib['venue'], bib['year']]
        formatted.append(", ".join(str(field) for field in fields))

    return "; ".join(formatted)


def bib_nested_lists(bibs):
    """Extract ``[title, year]`` pairs from a bibliography.

    The nested-list result makes it easy to build a pandas DataFrame later.

    Args:
        bibs: A mapping whose values are bibliography entries, an iterable
            of entries, or a string holding such a mapping.  A string may
            be real JSON or the ``str()`` form of a Python dict.  Every
            entry must provide the keys ``'title'`` and ``'year'``.

    Returns:
        A list with one ``[str(title), str(year)]`` list per entry, in
        input order.

    Raises:
        ValueError, SyntaxError: If a string argument is neither valid JSON
            nor a valid Python literal.
    """
    if isinstance(bibs, str):
        try:
            bibs = json.loads(bibs)
        except ValueError:
            # Not JSON, so treat it as a Python literal.  Swapping quote
            # characters and re-parsing as JSON breaks on None values and
            # on titles containing apostrophes; literal_eval handles both
            # and never executes code.
            bibs = ast.literal_eval(bibs)

    if isinstance(bibs, dict):
        bibs = bibs.values()

    # Entries are only read, so no defensive copy is required.
    return [[str(bib['title']), str(bib['year'])] for bib in bibs]

Unhide the cell below to find the definition of the following functions:
* `load_files(dirname)`
* `generate_clean_df(all_files)`

In [None]:
def load_files(dirname):
    """Parse every file in a directory as JSON.

    Args:
        dirname: Path of the directory to read.  A trailing separator is
            optional.

    Returns:
        A list with the parsed content of each file, in the order returned
        by ``os.listdir``.
    """
    raw_files = []

    for filename in tqdm(os.listdir(dirname)):
        # os.path.join works whether or not dirname ends with a separator.
        path = os.path.join(dirname, filename)
        # The context manager closes each handle as soon as it is parsed
        # instead of leaving it open until garbage collection.
        with open(path, 'rb') as handle:
            raw_files.append(json.load(handle))

    return raw_files

def generate_clean_df(all_files):
    """Build a flat DataFrame from parsed paper dictionaries.

    Args:
        all_files: Iterable of paper mappings.  Each must provide
            ``'paper_id'``, ``'metadata'`` (with ``'title'`` and
            ``'authors'``), ``'abstract'``, ``'body_text'`` and
            ``'bib_entries'``.

    Returns:
        A DataFrame with one row per paper and the columns ``paper_id``,
        ``title``, ``authors``, ``affiliations``, ``abstract``, ``text``,
        ``bibliography``, ``raw_authors`` and ``raw_bibliography``.
    """
    col_names = ['paper_id', 'title', 'authors',
                 'affiliations', 'abstract', 'text',
                 'bibliography', 'raw_authors', 'raw_bibliography']
    cleaned_files = []

    for file in tqdm(all_files):
        metadata = file['metadata']
        authors = metadata['authors']
        cleaned_files.append([
            file['paper_id'],
            metadata['title'],
            format_authors(authors),
            # Same authors again, this time with their affiliations.
            format_authors(authors, with_affiliation=True),
            format_body(file['abstract']),
            format_body(file['body_text']),
            bib_nested_lists(file['bib_entries']),
            authors,
            file['bib_entries'],
        ])

    # The former clean_df.head() call here discarded its result, so it
    # has been dropped.
    return pd.DataFrame(cleaned_files, columns=col_names)

In [None]:
# Directory holding the biorxiv/medrxiv JSON files; the trailing slash matters
# because the next cell builds paths by plain string concatenation.
biorxiv_dir = '/kaggle/input/CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/'
filenames = os.listdir(biorxiv_dir)
print("Number of articles retrieved from biorxiv:", len(filenames))

In [None]:
# Parse every biorxiv/medrxiv JSON file into a list of dictionaries.
all_files = []

for filename in filenames:
    # os.path.join tolerates a directory path with or without a trailing
    # separator; the context manager closes each handle once it is parsed.
    with open(os.path.join(biorxiv_dir, filename), 'rb') as handle:
        all_files.append(json.load(handle))

In [None]:
import os
import json
from pprint import pprint
from copy import deepcopy

import numpy as np
import pandas as pd

In [None]:
# Inspect the top-level keys of the first parsed paper.
file = all_files[0]
print("Dictionary keys:", file.keys())

## Biorxiv: Generate CSV

In this section, I show you how to manually generate the CSV files. As you can see, it's now super simple because of the `format_` helper functions. In the next sections, I show you how to generate them in 3 lines using the `load_files` and `generate_clean_df` helper functions.

In [None]:
# Build one row of cleaned features per biorxiv/medrxiv paper.
cleaned_files = []

for file in tqdm(all_files):
    features = [file['paper_id'], file['metadata']['title']]
    # Author names, first without and then with affiliations.
    features += [
        format_authors(file['metadata']['authors'], with_affiliation=flag)
        for flag in (False, True)
    ]
    # Abstract and body text, each flattened into a single string.
    features += [format_body(file[key]) for key in ('abstract', 'body_text')]
    # Parsed citations, followed by the raw author and bibliography data.
    features += [
        bib_nested_lists(file['bib_entries']),
        file['metadata']['authors'],
        file['bib_entries'],
    ]

    cleaned_files.append(features)

In [None]:
# Column labels, in the same order as the features built for each paper.
col_names = [
    'paper_id', 'title', 'authors',
    'affiliations', 'abstract', 'text',
    'bibliography', 'raw_authors', 'raw_bibliography',
]

clean_df = pd.DataFrame(data=cleaned_files, columns=col_names)
clean_df.head()

In [None]:
# Save the cleaned biorxiv frame without the row index.
clean_df.to_csv('biorxiv_nested_clean.csv', index=False)

In [None]:
# Show the [title, year] pairs extracted for the first paper.
clean_df['bibliography'][0]

## Generate CSV: Custom (PMC), Commercial, Non-commercial licenses

In [None]:
# Custom-license (PMC) subset: load the JSON files, flatten them, save as CSV.
# The trailing slash on the directory is kept as written.
pmc_dir = '/kaggle/input/CORD-19-research-challenge/custom_license/custom_license/'
pmc_files = load_files(pmc_dir)
pmc_df = generate_clean_df(pmc_files)
pmc_df.to_csv('clean_nested_pmc.csv', index=False)
pmc_df.head()

In [None]:
# Show the [title, year] pairs extracted for the first PMC paper.
pmc_df['bibliography'][0]

In [None]:
# Commercial-use subset: load the JSON files, flatten them, save as CSV.
comm_dir = '/kaggle/input/CORD-19-research-challenge/comm_use_subset/comm_use_subset/'
comm_files = load_files(comm_dir)
comm_df = generate_clean_df(comm_files)
comm_df.to_csv('clean_comm_use_nested.csv', index=False)
comm_df.head()

In [None]:
# Show the [title, year] pairs extracted for the first commercial-use paper.
comm_df['bibliography'][0]

In [None]:
# Non-commercial-use subset: load the JSON files, flatten them, save as CSV.
noncomm_dir = '/kaggle/input/CORD-19-research-challenge/noncomm_use_subset/noncomm_use_subset/'
noncomm_files = load_files(noncomm_dir)
noncomm_df = generate_clean_df(noncomm_files)
noncomm_df.to_csv('clean_noncomm_use_nested.csv', index=False)
noncomm_df.head()

In [None]:
# Show the [title, year] pairs extracted for the first non-commercial paper.
noncomm_df['bibliography'][0]

# Using the non_comm dataset, generate a DataFrame of citations between papers

In [None]:
# Reload the non-commercial frame from a saved CSV.  This rebinds noncomm_df;
# the 'bibliography' cells now hold strings (they are parsed with
# ast.literal_eval in the cells below).
noncomm_df = pd.read_csv('/kaggle/input/covid19-for-citation-networks/clean_noncomm_use_nested.csv')

In [None]:
# Tag every row with its source subset; create_whole_citation_df reads this column.
noncomm_df['dataset'] = 'non_comm'

In [None]:
# Display the frame as the cell output.
noncomm_df

In [None]:
# Show the first bibliography value as loaded from the CSV.
noncomm_df['bibliography'][0]

In [None]:
# Parse the stored text back into a Python object; literal_eval only accepts
# literals and does not execute code.
nest_list = ast.literal_eval(noncomm_df['bibliography'][0]) 
print(nest_list)

In [None]:
def bib_to_df(bib_data_rows):
    """Turn one paper's ``[title, year]`` pairs into a DataFrame.

    Args:
        bib_data_rows: Either a list of two-item rows, or a string holding
            the Python-literal form of such a list (what a list column
            looks like after being read back from CSV).

    Returns:
        A DataFrame with the columns ``cited_article`` and
        ``cited_article_year``, one row per pair.

    Raises:
        ValueError, SyntaxError: If a string argument is not a valid Python
            literal.
    """
    # Only strings need parsing; literal_eval raises on an actual list, so
    # already-parsed rows are passed through unchanged.
    if isinstance(bib_data_rows, str):
        bib_data_rows = ast.literal_eval(bib_data_rows)
    return pd.DataFrame(bib_data_rows, columns=['cited_article', 'cited_article_year'])

In [None]:
# Try the helper on the first paper's bibliography.
bib_to_df(noncomm_df['bibliography'][0])

In [None]:
def fill_df_rows(df, col_name, val):
    """Assign ``val`` to column ``col_name`` of ``df``, modifying ``df`` in place.

    The function has no return statement, so callers get ``None``.
    """
    df[col_name] = val

In [None]:
def create_whole_citation_df(original_df):
    """Build one citation table covering every paper in ``original_df``.

    Progress is printed for each paper: its position and the running total
    of citations collected so far.

    Args:
        original_df: DataFrame with the columns ``'bibliography'`` (input
            accepted by ``bib_to_df``), ``'title'`` and ``'dataset'``.

    Returns:
        A DataFrame with the columns ``source_article``, ``cited_article``,
        ``cited_article_year`` and ``source_article_dataset``, one row per
        citation.  Index labels restart at 0 for each source paper, so they
        are not unique.
    """
    columns = ['source_article', 'cited_article',
               'cited_article_year', 'source_article_dataset']
    per_article = []
    num_citations = 0

    # Iterating the columns directly works for any index, not only the
    # default 0..n-1 labels that positional .loc[i] lookups rely on.
    rows = zip(original_df['bibliography'],
               original_df['title'],
               original_df['dataset'])
    for i, (bibliography, title, dataset) in enumerate(rows):
        print('Article: ', i)

        # Use the helper functions to parse this paper's citations and
        # label them with the citing paper's title and dataset.
        citation_df_rows = bib_to_df(bibliography)
        fill_df_rows(citation_df_rows, 'source_article', title)
        fill_df_rows(citation_df_rows, 'source_article_dataset', dataset)

        per_article.append(citation_df_rows)
        num_citations += len(citation_df_rows)
        print('Num citations: ', num_citations)

    if not per_article:
        return pd.DataFrame(columns=columns)

    # DataFrame.append was removed in pandas 2.0, and appending inside the
    # loop re-copied the accumulated rows on every iteration.  A single
    # concat is linear; selecting `columns` restores the documented order.
    return pd.concat(per_article)[columns]

  


In [None]:
# Citation table for the non-commercial subset; later cells add the other subsets to it.
whole_df = create_whole_citation_df(noncomm_df)

In [None]:
# Display the citation table as the cell output.
whole_df

In [None]:
# NOTE(review): unlike the clean_* exports above, this writes the row index as
# an extra first column — confirm that is intended.
whole_df.to_csv('noncomm_network_data.csv')

# Repeat the process w/ the other 3 datasets and append them together

In [None]:
# Load the saved biorxiv CSV from the input dataset directory.
biorxiv_df = pd.read_csv('/kaggle/input/covid19-for-citation-networks/biorxiv_nested_clean.csv')

In [None]:
# Tag every row with its source subset; create_whole_citation_df reads this column.
biorxiv_df['dataset'] = 'biorxiv'

In [None]:
# Citation table for the biorxiv subset.
network_biorxiv = create_whole_citation_df(biorxiv_df)

In [None]:
# Preview the first rows of the biorxiv citation table.
network_biorxiv.head()

In [None]:
# NOTE(review): the row index is written as an extra first column — confirm that is intended.
network_biorxiv.to_csv('network_biorxiv.csv')

In [None]:
# Add the biorxiv citations to the combined table.  DataFrame.append was
# removed in pandas 2.0; pd.concat is the supported equivalent.
whole_df = pd.concat([whole_df, network_biorxiv])

In [None]:
# Load the saved PMC CSV from the input dataset directory (rebinds pmc_df).
pmc_df = pd.read_csv('/kaggle/input/covid19-for-citation-networks/clean_nested_pmc.csv')

In [None]:
# Tag every row with its source subset; create_whole_citation_df reads this column.
pmc_df['dataset'] = 'pmc'

In [None]:
# Citation table for the PMC subset.
network_pmc = create_whole_citation_df(pmc_df)

In [None]:
# Display the PMC citation table as the cell output.
network_pmc

In [None]:
# NOTE(review): the row index is written as an extra first column — confirm that is intended.
network_pmc.to_csv('network_pmc.csv')

In [None]:
# Add the PMC citations to the combined table.  DataFrame.append was
# removed in pandas 2.0; pd.concat is the supported equivalent.
whole_df = pd.concat([whole_df, network_pmc])

In [None]:
# Load the saved commercial-use CSV from the input dataset directory (rebinds comm_df).
comm_df = pd.read_csv('/kaggle/input/covid19-for-citation-networks/clean_comm_use_nested.csv')

In [None]:
# Display the frame as the cell output.
comm_df

In [None]:
# Tag every row with its source subset; create_whole_citation_df reads this column.
comm_df['dataset'] = 'comm'

In [None]:
# Citation table for the commercial-use subset.
network_comm = create_whole_citation_df(comm_df)

In [None]:
# NOTE(review): the row index is written as an extra first column — confirm that is intended.
network_comm.to_csv('network_comm.csv')

In [None]:
# Add the commercial-use citations to the combined table.  DataFrame.append
# was removed in pandas 2.0; pd.concat is the supported equivalent.
whole_df = pd.concat([whole_df, network_comm])

In [None]:
# Display the combined citation table as the cell output.
whole_df

In [None]:
# NOTE(review): the row index is written as an extra first column — confirm that is intended.
whole_df.to_csv('network_all_datasets.csv')

# Looking at citation data: Which Articles are cited the most?

In [None]:
# Load the saved combined citation table from the input dataset directory (rebinds whole_df).
whole_df = pd.read_csv('/kaggle/input/covid19-for-citation-networks/network_all_datasets.csv')

In [None]:
# Display the loaded citation table as the cell output.
whole_df

In [None]:
# The 100 most frequent 'cited_article' values across all subsets.
whole_df['cited_article'].value_counts().head(100)

In [None]:
# The 100 most frequent 'cited_article' values among citations from 'pmc' papers.
whole_df.loc[whole_df['source_article_dataset'] == 'pmc', 'cited_article'].value_counts().head(100)

In [None]:
# The 100 most frequent 'cited_article' values among citations from 'biorxiv' papers.
whole_df.loc[whole_df['source_article_dataset'] == 'biorxiv', 'cited_article'].value_counts().head(100)

In [None]:
# The 100 most frequent 'cited_article' values among citations from 'comm' papers.
whole_df.loc[whole_df['source_article_dataset'] == 'comm', 'cited_article'].value_counts().head(100)

In [None]:
# The 100 most frequent 'cited_article' values among citations from 'non_comm' papers.
whole_df.loc[whole_df['source_article_dataset'] == 'non_comm', 'cited_article'].value_counts().head(100)