# About the Dataset

The original CORD-19 is a resource of over 1,000,000 scholarly articles, including over 400,000 with full text, about COVID-19, SARS-CoV-2, and related coronaviruses.

In our project, we work with a sample of roughly 10,000 papers drawn from CORD-19 to reduce the computational burden.

In [1]:
import os
import subprocess


# shared link: https://drive.google.com/drive/folders/1Td_ZTUVrsKeftDE5Zll7252YLJdWiNTk?usp=share_link 
# you can download the data via the shared link, and skip Step 0 and Step 1 if you want to run the code in your local machine 


# Step 0: add the shared folder to your google drive. e.g., /content/drive/MyDrive/CORD_19

# Step 1: Mount Google Drive
from google.colab import drive
drive.mount("/content/drive")


!echo $PWD

!ls /content/drive/MyDrive/CORD_19/

# Step 2: unzip json files 
subset_dir = os.path.join(os.getcwd(),  "CORD_19_subset")


zip_file_path="/content/drive/MyDrive/CORD_19/subset.zip"

# Check if the destination directory exists
if not os.path.exists(subset_dir):
    # Unzip the file
    cmd = "unzip {} -d {}".format(zip_file_path, subset_dir)
    proc = subprocess.Popen(cmd, shell=True)
else:
    print(f"Directory {subset_dir} already exists. Skipping extraction.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content
__MACOSX  meta_10k.csv	subset	subset.zip
Directory /content/CORD_19_subset already exists. Skipping extraction.


In [2]:
# import packages


import os
import json
import glob
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt


# Data Load & Pre-processing

In [3]:
# Read the sampled metadata table (meta_10k.csv) from the shared Drive folder
data_root = '/content/drive/MyDrive/CORD_19/'
metadata_path = os.path.join(data_root, 'meta_10k.csv')

# These columns are read as str instead of letting pandas infer a dtype.
# NOTE(review): the table shown below has a `mag_id` column but no
# 'Microsoft Academic Paper ID' column - confirm that key is still needed.
meta_df = pd.read_csv(
    metadata_path,
    dtype=dict.fromkeys(('pubmed_id', 'Microsoft Academic Paper ID', 'doi'), str),
)

print(len(meta_df))
meta_df.head()

9022


Unnamed: 0.1,Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,44,m71xkuo9,c6bf372c094f035a514975c35a7f9c094abbe493,PMC,Sequence specific visual detection of LAMP rea...,10.1186/1472-6750-6-3,PMC1373654,16401354,cc-by,BACKGROUND: Development of a practical gene po...,2006-01-10,"Mori, Yasuyoshi; Hirano, Tsuyoshi; Notomi, Tsu...",BMC Biotechnol,,,,document_parses/pdf_json/c6bf372c094f035a51497...,document_parses/pmc_json/PMC1373654.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,
1,96,hqc7u9w3,c65f0939cf35a0f04bf93bd6e8f771b8521563a5,PMC,Transmission Parameters of the 2001 Foot and M...,10.1371/journal.pone.0000502,PMC1876810,17551582,cc-by,"Despite intensive ongoing research, key aspect...",2007-06-06,"Chis Ster, Irina; Ferguson, Neil M.",PLoS One,,,,document_parses/pdf_json/c65f0939cf35a0f04bf93...,document_parses/pmc_json/PMC1876810.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,
2,217,jzwcy7dr,a009c8efa4c5f13a5e604608d4e33e1dac078044,PMC,Results From a Hypothesis Generating Case-Cont...,10.1093/schbul/sbm139,PMC2632504,18156638,bronze-oa,Background: Herpes family viruses can cause ce...,2008-08-20,"Niebuhr, David W.; Millikan, Amy M.; Yolken, R...",Schizophrenia Bulletin,,,,document_parses/pdf_json/a009c8efa4c5f13a5e604...,document_parses/pmc_json/PMC2632504.xml.json,https://academic.oup.com/schizophreniabulletin...,
3,255,02opdk0m,b411e12b20d883ef2ee5ca19d48eff9fccedf05f,PMC,CVTree update: a newly designed phylogenetic s...,10.1093/nar/gkp278,PMC2703908,19398429,cc-by-nc,The CVTree web server (http://tlife.fudan.edu....,2009-07-01,"Xu, Zhao; Hao, Bailin",Nucleic Acids Res,,,,document_parses/pdf_json/b411e12b20d883ef2ee5c...,document_parses/pmc_json/PMC2703908.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,
4,342,094d0rn6,513d5ea4db4eb8e94c14c46b018c6041d78119cf,PMC,IPS-1 Is Essential for the Control of West Nil...,10.1371/journal.ppat.1000757,PMC2816698,20140199,cc-by,The innate immune response is essential for co...,2010-02-05,"Suthar, Mehul S.; Ma, Daphne Y.; Thomas, Sunil...",PLoS Pathog,,,,document_parses/pdf_json/513d5ea4db4eb8e94c14c...,document_parses/pmc_json/PMC2816698.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,


In [4]:
# Column overview of the metadata table: dtype and non-null count per column
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9022 entries, 0 to 9021
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        9022 non-null   int64  
 1   cord_uid          9022 non-null   object 
 2   sha               9022 non-null   object 
 3   source_x          9022 non-null   object 
 4   title             9022 non-null   object 
 5   doi               8599 non-null   object 
 6   pmcid             7942 non-null   object 
 7   pubmed_id         6986 non-null   object 
 8   license           9022 non-null   object 
 9   abstract          8289 non-null   object 
 10  publish_time      9022 non-null   object 
 11  authors           9009 non-null   object 
 12  journal           8095 non-null   object 
 13  mag_id            0 non-null      float64
 14  who_covidence_id  0 non-null      float64
 15  arxiv_id          480 non-null    object 
 16  pdf_json_files    9022 non-null   object 


In [5]:
def glob_files(path, f_type=".json"):
    """Recursively collect files below `path` whose name ends with `f_type`.

    Args:
        path: directory to walk (sub-directories included).
        f_type: file-name suffix to keep, matched with str.endswith.

    Returns:
        List of matching paths, each joined onto the directory it was found in,
        in the order os.walk yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(path)
        for name in names
        if name.endswith(f_type)
    ]

# Collect every .json file below the pdf_json folder of the extracted subset
# (glob_files walks sub-directories as well)
json_dir = os.path.join(subset_dir, "subset/document_parses/pdf_json/")
print(json_dir)
json_files = glob_files(json_dir, ".json")

print("total json files:", len(json_files))

/content/CORD_19_subset/subset/document_parses/pdf_json/
total json files: 12000


In [6]:
class FileReader:
    """Parsed view of a single paper JSON file.

    Attributes:
        paper_id: value of the top-level 'paper_id' key.
        title: value of content['metadata']['title'].
        abstract: the 'text' of every 'abstract' entry, joined with newlines.
        body_text: the 'text' of every 'body_text' entry, joined with newlines.
    """

    def __init__(self, file_path):
        """Load and flatten the JSON document at `file_path`.

        Raises:
            OSError: the file cannot be opened.
            ValueError: the content is not valid UTF-8 or not valid JSON.
            KeyError, TypeError: an expected key is missing or has the wrong shape.
        """
        # Read as UTF-8 explicitly: the default encoding of open() depends on
        # the machine's locale and can garble or reject non-ASCII text.
        with open(file_path, encoding='utf-8') as file:
            content = json.load(file)

        # The file is closed here; everything below works on the parsed dict.
        self.paper_id = content['paper_id']
        # Abstract
        self.abstract = '\n'.join(entry['text'] for entry in content['abstract'])
        # Body text
        self.body_text = '\n'.join(entry['text'] for entry in content['body_text'])

        self.title = content['metadata']['title']

        #dict_keys(['paper_id', 'metadata', 'abstract', 'body_text',
        #'bib_entries', 'ref_entries', 'back_matter'])

    def __repr__(self):
        """Short summary: id, title and the first 200 characters of abstract and body."""
        return f"{self.paper_id}: {self.title } : {self.abstract[:200]}... {self.body_text[:200]}..."


# Sanity check: parse the first discovered file and print its summary
# (print uses FileReader.__repr__)
first_row = FileReader(json_files[0])
print(first_row)

4abd0c9b745b665cd9677a3de0d7e8b9a38c0f23: Stress, physical activity, and screen-related sedentary behaviour within the first month of the COVID-19 pandemic : This study investigated how stress, physical activity and sedentary behaviours, of a small sample of Canadians, changed within the first month (i.e. March/April) of the COVID-19 pandemic and the reaso... ity trackers were recruited via social media. Participants (N = 121) completed fillable calendars (March/April 2020) with their step counts and answered an online survey. Separate paired-sample t-test...


In [7]:
from tqdm import tqdm

def get_breaks(content, length):
    """Insert "<br>" line breaks into `content` for display in plots.

    Words are taken from content.split(' '). A running count of word characters
    (the separating spaces are not counted) is kept; when adding a word pushes
    the count above `length`, that word is preceded by "<br>" instead of a
    space and the count restarts at 0.

    Args:
        content: text to wrap (str).
        length: character budget after which a break is inserted.

    Returns:
        The wrapped text. It never starts with a separator, so the first word
        is not preceded by a stray space or "<br>".
    """
    pieces = []
    run_chars = 0

    for word in content.split(' '):
        run_chars += len(word)
        if run_chars > length:
            separator = "<br>"
            run_chars = 0
        else:
            separator = " "
        # A separator only goes between words, never in front of the first one
        if pieces:
            pieces.append(separator)
        pieces.append(word)

    # A single join keeps this linear in the length of the text
    return "".join(pieces)


# Column-wise accumulator: one list per output column, all appended together
# at the end of each iteration so the lists always stay the same length.
dict_ = {'paper_id': [], 'doi': [], 'abstract': [], 'body_text': [],
         'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}

LINE_WIDTH = 40       # character budget passed to get_breaks for plot labels
SUMMARY_WORDS = 100   # longer abstracts are cut to this many words in the summary

# Build a sha -> metadata lookup once, so each file costs one dict access
# instead of a scan over the whole `sha` column. keep='first' resolves a sha
# that occurs on several rows to its first row.
meta_by_sha = (
    meta_df.dropna(subset=['sha'])
    .drop_duplicates(subset='sha', keep='first')
    .set_index('sha')[['authors', 'title', 'journal', 'doi']]
    .to_dict(orient='index')
)

# Reasons files were left out; printed after the loop so skips are visible.
skipped = {'unreadable': 0, 'no_metadata': 0, 'empty_body': 0}

for entry in tqdm(json_files):
    try:
        content = FileReader(entry)
    except Exception:
        # invalid paper format, skip (one bad file must not abort the whole run)
        skipped['unreadable'] += 1
        continue

    # get metadata information; no metadata, skip this paper
    meta_row = meta_by_sha.get(content.paper_id)
    if meta_row is None:
        skipped['no_metadata'] += 1
        continue
    if len(content.body_text) == 0:
        skipped['empty_body'] += 1
        continue

    # summary of the abstract, to be used in a plot
    abstract_words = content.abstract.split(' ')
    if len(content.abstract) == 0:
        # no abstract provided
        abstract_summary = "Not provided."
    elif len(abstract_words) > SUMMARY_WORDS:
        # too long for the plot: keep the first SUMMARY_WORDS words, append "..."
        abstract_summary = get_breaks(' '.join(abstract_words[:SUMMARY_WORDS]), LINE_WIDTH) + "..."
    else:
        # abstract is short enough
        abstract_summary = get_breaks(content.abstract, LINE_WIDTH)

    # authors: ';'-separated in the metadata; all names are kept, and line
    # breaks are added only when there are more than two of them
    authors_raw = meta_row['authors']
    if isinstance(authors_raw, str):
        authors = authors_raw.split(';')
        if len(authors) > 2:
            authors_text = get_breaks('. '.join(authors), LINE_WIDTH)
        else:
            authors_text = ". ".join(authors)
    else:
        # missing value: keep it unchanged
        authors_text = authors_raw

    # title: add breaks when it is a string, otherwise keep the missing value
    title_raw = meta_row['title']
    if isinstance(title_raw, str):
        title_text = get_breaks(title_raw, LINE_WIDTH)
    else:
        title_text = title_raw

    dict_['paper_id'].append(content.paper_id)
    dict_['doi'].append(meta_row['doi'])
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
    dict_['authors'].append(authors_text)
    dict_['title'].append(title_text)
    dict_['journal'].append(meta_row['journal'])
    dict_['abstract_summary'].append(abstract_summary)

print("skipped files:", skipped)

df_covid = pd.DataFrame(dict_, columns=['paper_id', 'doi', 'abstract', 'body_text',
                                        'authors', 'title', 'journal', 'abstract_summary'])
df_covid.head()

100%|██████████| 12000/12000 [00:59<00:00, 201.18it/s]


Unnamed: 0,paper_id,doi,abstract,body_text,authors,title,journal,abstract_summary
0,4abd0c9b745b665cd9677a3de0d7e8b9a38c0f23,10.1111/aphw.12261,"This study investigated how stress, physical a...",ity trackers were recruited via social media. ...,"Woodruff, Sarah J.. Coyne, Paige. St‐Pierre...","Stress, physical activity, and<br>screen‐rela...",Appl Psychol Health Well Being,"This study investigated how stress, physical<..."
1,c2aa3e817499fe7e08dc32dc8be7757ad710a50f,10.3762/bjoc.18.22,,"RNA-targeting oligonucleotides (e.g., antisens...","Kumar, Pawan. Brown, Tom",The role of chemistry in the success of<br>ol...,Beilstein J Org Chem,Not provided.
2,691a53fbf2a6ab87757dc315f558b52ddd1ff7c4,10.1177/1473325020973394,My emotional responses to this moment include ...,under the former Soviet Union with signs that ...,"Crampton, Alexandra",The lie of pandemic pivot and essential work,Qual Soc Work,My emotional responses to this moment include...
3,975e30a2e5e456c653a5be91cde1e7c6cf765071,10.4093/dmj.2020.0266,,"First of all, we would like to thank Kim et al...","Moon, Sun Joon. Rhee, Eun-Jung. Lee,<br>Won...",Independent Impact of Diabetes on the Severit...,Diabetes Metab J,Not provided.
4,6fed2795f636338b067cb926ec4750f983dea435,10.1007/978-3-030-60039-6_8,"In the current century, the novel coronavirus ...",Another aspect of wearable devices is general ...,"Krishnamurthi, Rajalakshmi. Gopinathan,<br>D...",Wearable Devices and COVID-19: State of the<b...,Emerging Technologies for Battling Covid-19,"In the current century, the novel coronavirus..."


In [8]:
# Column overview of the combined paper table: dtype and non-null count per column
df_covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9022 entries, 0 to 9021
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   paper_id          9022 non-null   object
 1   doi               8599 non-null   object
 2   abstract          9022 non-null   object
 3   body_text         9022 non-null   object
 4   authors           9009 non-null   object
 5   title             9022 non-null   object
 6   journal           8095 non-null   object
 7   abstract_summary  9022 non-null   object
dtypes: object(8)
memory usage: 564.0+ KB


In [9]:
# Keep only papers without any missing field.
# dropna() returns a new frame, so df_covid keeps all of its rows and this cell
# gives the same result when it is re-run. The extra .copy() makes df a fully
# independent frame, so columns can be added to it later without ambiguity
# about whether a view of df_covid is being modified.
df = df_covid.dropna().copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8070 entries, 0 to 9020
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   paper_id          8070 non-null   object
 1   doi               8070 non-null   object
 2   abstract          8070 non-null   object
 3   body_text         8070 non-null   object
 4   authors           8070 non-null   object
 5   title             8070 non-null   object
 6   journal           8070 non-null   object
 7   abstract_summary  8070 non-null   object
dtypes: object(8)
memory usage: 567.4+ KB


In [10]:
!pip install langdetect



In [11]:
from tqdm import tqdm
from langdetect import detect
from langdetect import DetectorFactory

# set seed so that langdetect gives the same label for the same text on every run
DetectorFactory.seed = 0

# number of leading body-text words handed to the detector
LANG_SAMPLE_WORDS = 50


def detect_language(body_text, abstract_summary):
    """Return the detected language code of a paper, or "unknown".

    Three inputs are tried in order, moving on whenever `detect` raises:
      1. the first LANG_SAMPLE_WORDS space-separated words of `body_text`;
      2. every distinct word of `body_text`;
      3. `abstract_summary`.
    """
    words = body_text.split(" ")

    try:
        return detect(" ".join(words[:LANG_SAMPLE_WORDS]))
    except Exception:
        # beginning of the document was not in a usable format
        pass

    try:
        # dict.fromkeys removes duplicates but keeps first-seen order, so the
        # text given to the detector is the same on every run (a set would
        # iterate in a hash-dependent order).
        return detect(" ".join(dict.fromkeys(words)))
    except Exception:
        pass

    try:
        # last resort: label the paper through its abstract summary
        return detect(abstract_summary)
    except Exception:
        return "unknown"


# hold label - language, one entry per row of df in row order
languages = [
    detect_language(body_text, abstract_summary)
    for body_text, abstract_summary in tqdm(
        zip(df['body_text'], df['abstract_summary']), total=len(df)
    )
]

100%|██████████| 8070/8070 [01:08<00:00, 117.20it/s]


In [12]:
from pprint import pprint

# Tally how many papers were assigned to each detected language
languages_dict = {code: languages.count(code) for code in set(languages)}

print("Total: {}\n".format(len(languages)))
pprint(languages_dict)

Total: 8070

{'de': 20, 'en': 8028, 'es': 13, 'fr': 6, 'id': 1, 'nl': 2}


In [13]:
# Attach the detected language to each paper; `languages` was built by walking
# df row by row, so it lines up with df positionally.
df['language'] = languages
# Keep only the papers detected as English
df = df[df['language'] == 'en']
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8028 entries, 0 to 9020
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   paper_id          8028 non-null   object
 1   doi               8028 non-null   object
 2   abstract          8028 non-null   object
 3   body_text         8028 non-null   object
 4   authors           8028 non-null   object
 5   title             8028 non-null   object
 6   journal           8028 non-null   object
 7   abstract_summary  8028 non-null   object
 8   language          8028 non-null   object
dtypes: object(9)
memory usage: 627.2+ KB


# Histogram of year / journal

In [None]:
# Histogram of publication years (all papers listed in the metadata table)

# Convert publish_time to datetime; values that cannot be parsed become NaT
meta_df['publish_time'] = pd.to_datetime(meta_df['publish_time'], errors='coerce')

# Drop NaT and cast to int: as soon as one date is NaT, .dt.year is a float
# column and the tick labels would read "2020.0" instead of "2020".
publication_years = meta_df['publish_time'].dt.year.dropna().astype(int)

# Plot histogram of publication years
plt.figure(figsize=(12, 6))
publication_years.value_counts().sort_index().plot(kind='bar')
plt.title('Histogram of Publication Years')
plt.xlabel('Year')
plt.ylabel('Number of Papers')
plt.show()

# Histogram of journals (papers kept in df), limited to the 20 most frequent

# Plot histogram of journals
plt.figure(figsize=(12, 6))
df['journal'].value_counts().head(20).plot(kind='bar')
plt.title('Top 20 Journals by Number of Papers')
plt.xlabel('Journal')
plt.ylabel('Number of Papers')
plt.xticks(rotation=90)
plt.show()


# Map-Reduce 

# Association Analysis

# Similarity Analysis

# Clustering Analysis