# About the Dataset

The original CORD-19 is a resource of over 1,000,000 scholarly articles, including over 400,000 with full text, about COVID-19, SARS-CoV-2, and related coronaviruses.

In this project, we work with a subset of roughly 10,000 articles sampled from CORD-19 to reduce the computational burden.

In [1]:
import os
import subprocess


# shared link: https://drive.google.com/drive/folders/1Td_ZTUVrsKeftDE5Zll7252YLJdWiNTk?usp=share_link 
# you can download the data via the shared link, and skip Step 0 and Step 1 if you want to run the code in your local machine 


# Step 0: add the shared folder to your google drive. e.g., /content/drive/MyDrive/CORD_19

# Step 1: Mount Google Drive
from google.colab import drive
drive.mount("/content/drive")


!echo $PWD

!ls /content/drive/MyDrive/CORD_19/

# Step 2: unzip json files 
subset_dir = os.path.join(os.getcwd(),  "CORD_19_subset")


zip_file_path="/content/drive/MyDrive/CORD_19/subset.zip"

# Check if the destination directory exists
if not os.path.exists(subset_dir):
    # Unzip the file
    cmd = "unzip {} -d {}".format(zip_file_path, subset_dir)
    proc = subprocess.Popen(cmd, shell=True)
else:
    print(f"Directory {subset_dir} already exists. Skipping extraction.")

ModuleNotFoundError: No module named 'google.colab'

In [2]:
# import packages


import os
import json
import glob
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt


# Data Load & Pre-processing

In [7]:
# Load the metadata table from meta_10k.csv (expected under ./input).
data_root = os.path.join(os.getcwd(), 'input')
metadata_path = os.path.join(data_root, 'meta_10k.csv')

# These identifier columns are passed through `str` while parsing, so they are
# kept as text rather than being type-inferred by pandas.
id_columns = ('pubmed_id', 'Microsoft Academic Paper ID', 'doi')
meta_df = pd.read_csv(
    metadata_path,
    index_col=0,
    converters={column: str for column in id_columns},
)

print(len(meta_df))
meta_df.head()

9022


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
44,m71xkuo9,c6bf372c094f035a514975c35a7f9c094abbe493,PMC,Sequence specific visual detection of LAMP rea...,10.1186/1472-6750-6-3,PMC1373654,16401354,cc-by,BACKGROUND: Development of a practical gene po...,2006-01-10,"Mori, Yasuyoshi; Hirano, Tsuyoshi; Notomi, Tsu...",BMC Biotechnol,,,,document_parses/pdf_json/c6bf372c094f035a51497...,document_parses/pmc_json/PMC1373654.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,
96,hqc7u9w3,c65f0939cf35a0f04bf93bd6e8f771b8521563a5,PMC,Transmission Parameters of the 2001 Foot and M...,10.1371/journal.pone.0000502,PMC1876810,17551582,cc-by,"Despite intensive ongoing research, key aspect...",2007-06-06,"Chis Ster, Irina; Ferguson, Neil M.",PLoS One,,,,document_parses/pdf_json/c65f0939cf35a0f04bf93...,document_parses/pmc_json/PMC1876810.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,
217,jzwcy7dr,a009c8efa4c5f13a5e604608d4e33e1dac078044,PMC,Results From a Hypothesis Generating Case-Cont...,10.1093/schbul/sbm139,PMC2632504,18156638,bronze-oa,Background: Herpes family viruses can cause ce...,2008-08-20,"Niebuhr, David W.; Millikan, Amy M.; Yolken, R...",Schizophrenia Bulletin,,,,document_parses/pdf_json/a009c8efa4c5f13a5e604...,document_parses/pmc_json/PMC2632504.xml.json,https://academic.oup.com/schizophreniabulletin...,
255,02opdk0m,b411e12b20d883ef2ee5ca19d48eff9fccedf05f,PMC,CVTree update: a newly designed phylogenetic s...,10.1093/nar/gkp278,PMC2703908,19398429,cc-by-nc,The CVTree web server (http://tlife.fudan.edu....,2009-07-01,"Xu, Zhao; Hao, Bailin",Nucleic Acids Res,,,,document_parses/pdf_json/b411e12b20d883ef2ee5c...,document_parses/pmc_json/PMC2703908.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,
342,094d0rn6,513d5ea4db4eb8e94c14c46b018c6041d78119cf,PMC,IPS-1 Is Essential for the Control of West Nil...,10.1371/journal.ppat.1000757,PMC2816698,20140199,cc-by,The innate immune response is essential for co...,2010-02-05,"Suthar, Mehul S.; Ma, Daphne Y.; Thomas, Sunil...",PLoS Pathog,,,,document_parses/pdf_json/513d5ea4db4eb8e94c14c...,document_parses/pmc_json/PMC2816698.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,


In [11]:
def glob_files(path, f_type=".json"):
    """Recursively collect files under ``path`` whose name ends with ``f_type``.

    Args:
        path: Directory to walk (sub-directories are included).
        f_type: Filename suffix to keep, ".json" by default.

    Returns:
        A list of file paths, each built by joining the directory in which the
        file was found with its name. The order follows ``os.walk``.
    """
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(path)
        for name in names
        if name.endswith(f_type)
    ]

# Locate the parsed-PDF JSON documents inside the extracted subset.
pdf_json_parts = ("subset", "subset", "document_parses", "pdf_json")
json_dir = os.path.join(data_root, *pdf_json_parts)
print(json_dir)

json_files = glob_files(json_dir, f_type=".json")
print("total json files:", len(json_files))

c:\Users\pywong\Desktop\PolyU\COMP5434 Big Data Computing\Assignment\input\subset\subset\document_parses\pdf_json
total json files: 12000


In [12]:
class FileReader:
    """Parse one paper JSON file into plain-text fields.

    Attributes:
        paper_id: Value of the document's ``paper_id`` key.
        abstract: ``text`` of every ``abstract`` entry, joined with newlines
            (empty string when the list is empty).
        body_text: ``text`` of every ``body_text`` entry, joined with newlines.
        title: ``metadata.title`` of the document.

    Raises:
        KeyError: If one of the keys read above is missing from the document.
        json.JSONDecodeError: If the file is not valid JSON.
    """

    def __init__(self, file_path):
        # Read as UTF-8 explicitly: open() otherwise uses the platform default
        # encoding, which is not UTF-8 on Windows and can corrupt or reject
        # non-ASCII characters.
        with open(file_path, encoding="utf-8") as file:
            content = json.load(file)

        # Top-level keys seen in these documents:
        # dict_keys(['paper_id', 'metadata', 'abstract', 'body_text',
        # 'bib_entries', 'ref_entries', 'back_matter'])
        self.paper_id = content['paper_id']
        self.abstract = '\n'.join(entry['text'] for entry in content['abstract'])
        self.body_text = '\n'.join(entry['text'] for entry in content['body_text'])
        self.title = content['metadata']['title']

    def __repr__(self):
        # Only the first 200 characters of abstract/body are shown.
        return f"{self.paper_id}: {self.title} : {self.abstract[:200]}... {self.body_text[:200]}..."


# Sanity check: parse the first discovered JSON file and print its summary
# (FileReader.__repr__ shows id, title and the start of abstract/body).
first_row = FileReader(json_files[0])
print(first_row)

00046b27022615aaec3782ea69c56da3d2fd2ffa: Vaccine design and delivery approaches for COVID-19 : ... process of vaccine design has been revolutionized by reverse vaccinology, which focuses on finding potential vaccine candidates through bioinformatics analysis of the protein-coding genome (proteome) ...


In [13]:
from tqdm import tqdm

def get_breaks(content, length):
    """Insert ``<br>`` markers into ``content`` roughly every ``length`` characters.

    The text is split on single spaces. A running count of word characters
    (spaces are not counted) is kept. Once the count exceeds ``length``, the
    current word is prefixed with ``<br>`` instead of a space and the count
    restarts at zero.

    Args:
        content: Text to wrap.
        length: Character budget between two breaks.

    Returns:
        The wrapped text. Every word is emitted with a one-space or ``<br>``
        prefix, so the result starts with a space unless the first word
        already exceeds ``length``.
    """
    pieces = []
    running = 0

    for word in content.split(' '):
        running += len(word)
        if running > length:
            pieces.append("<br>" + word)
            running = 0
        else:
            pieces.append(" " + word)

    return "".join(pieces)


# Layout constants for the plot-friendly text columns built below.
SUMMARY_MAX_WORDS = 100  # abstracts with more words than this are truncated
WRAP_WIDTH = 40          # character budget handed to get_breaks

dict_ = {'paper_id': [], 'doi':[], 'abstract': [], 'body_text': [],
         'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}

# Files that could not be parsed are skipped, but counted so the loss is visible.
unreadable_files = 0

for entry in tqdm(json_files, total=len(json_files)):
    try:
        content = FileReader(entry)
    except Exception:
        # invalid paper format, skip
        unreadable_files += 1
        continue

    # Metadata rows whose `sha` equals this paper id (looked up once per paper).
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    # no metadata, skip this paper
    if len(meta_data) == 0:
        continue
    # no body text, skip this paper
    if len(content.body_text) == 0:
        continue

    # Short, line-wrapped version of the abstract for use in a plot.
    abstract_words = content.abstract.split(' ')
    if len(content.abstract) == 0:
        # no abstract provided
        summary = "Not provided."
    elif len(abstract_words) > SUMMARY_MAX_WORDS:
        # too long for the plot: keep the first SUMMARY_MAX_WORDS words and append "..."
        summary = get_breaks(' '.join(abstract_words[:SUMMARY_MAX_WORDS]), WRAP_WIDTH) + "..."
    else:
        # abstract is short enough
        summary = get_breaks(content.abstract, WRAP_WIDTH)

    # Authors: the metadata value is split on ';'. A missing value
    # (non-string, e.g. NaN) is stored unchanged.
    raw_authors = meta_data['authors'].values[0]
    if isinstance(raw_authors, str):
        author_list = raw_authors.split(';')
        if len(author_list) > 2:
            # more than 2 authors: wrap the joined list so it fits in the plot
            authors = get_breaks('. '.join(author_list), WRAP_WIDTH)
        else:
            # authors will fit in plot
            authors = ". ".join(author_list)
    else:
        authors = raw_authors

    # Title: wrapped when present, stored unchanged when missing.
    raw_title = meta_data['title'].values[0]
    if isinstance(raw_title, str):
        title = get_breaks(raw_title, WRAP_WIDTH)
    else:
        title = raw_title

    dict_['paper_id'].append(content.paper_id)
    dict_['doi'].append(meta_data['doi'].values[0])
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
    dict_['authors'].append(authors)
    dict_['title'].append(title)
    dict_['journal'].append(meta_data['journal'].values[0])
    dict_['abstract_summary'].append(summary)

print(f"Skipped {unreadable_files} unreadable JSON file(s).")

df_covid = pd.DataFrame(dict_, columns=['paper_id', 'doi', 'abstract', 'body_text',
                                        'authors', 'title', 'journal', 'abstract_summary'])
df_covid.head()

  0%|          | 0/12000 [00:00<?, ?it/s]

100%|██████████| 12000/12000 [03:45<00:00, 53.29it/s]


Unnamed: 0,paper_id,doi,abstract,body_text,authors,title,journal,abstract_summary
0,0015cecc2298c3bdb9bda0e0b84b38ebdcca716f,10.1007/s40744-022-00425-0,Coronavirus disease 2019 caused by the severe ...,Systemic inflammation is the hallmark of coron...,"Drosos, Alexandros A.. Pelechas,<br>Elefther...",Colchicine Against SARS-CoV-2 Infection:<br>W...,Rheumatol Ther,Coronavirus disease 2019 caused by the severe...
1,0016a5c32e9cea6002cf6f352ecdb5231c7e3194,10.30699/ijp.2020.127312.2387,,Severe acute respiratory syndrome coronavirus ...,"Abdollahi, Alireza. shakoori, Abbas. <br>Kho...",Comparison of Patient-collected and Lab<br>Te...,Iran J Pathol,Not provided.
2,0020e5c796f50e927b96ff0b7ea375b4745261a3,10.1186/s12887-020-02112-x,Background: One major limitation for less inva...,Respiratory Distress Syndrome (RDS) affects 85...,"Chevallier, Marie. Durrmeyer, Xavier. Ego,<...",Propofol versus placebo (with rescue with<br>...,BMC Pediatr,Background: One major limitation for less<br>...
3,00234390dbc45e3370ac8f80cbd93cec3777ca92,10.1016/j.ajur.2021.03.006,Objective: To identify the impact of COVID-19 ...,The World Health Organization (WHO) has announ...,"Raheem Ali, Abdel. Ghazwani, Yahya. Alowida...",Impact of COVID-19 on endourology surgical<br...,Asian J Urol,Objective: To identify the impact of COVID-19...
4,0025fd87cfaf7953079b618771218b8de6893928,10.3389/fcell.2021.748905,,"The concept of specialized ""membrane microdoma...","Garofalo, Tina. Misasi, Roberta. Preta,<br>...",Editorial: Targeting Lipid Rafts as a Strateg...,Front Cell Dev Biol,Not provided.


In [14]:
# Column dtypes and non-null counts of the assembled frame.
df_covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9022 entries, 0 to 9021
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   paper_id          9022 non-null   object
 1   doi               9022 non-null   object
 2   abstract          9022 non-null   object
 3   body_text         9022 non-null   object
 4   authors           9009 non-null   object
 5   title             9022 non-null   object
 6   journal           8095 non-null   object
 7   abstract_summary  9022 non-null   object
dtypes: object(8)
memory usage: 564.0+ KB


In [15]:
# Drop rows with any missing value.
# dropna() returns a new frame, so df_covid stays intact and this cell gives
# the same result when re-run. `df = df_covid` followed by an in-place drop
# would mutate df_covid through the alias.
df = df_covid.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8083 entries, 0 to 9021
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   paper_id          8083 non-null   object
 1   doi               8083 non-null   object
 2   abstract          8083 non-null   object
 3   body_text         8083 non-null   object
 4   authors           8083 non-null   object
 5   title             8083 non-null   object
 6   journal           8083 non-null   object
 7   abstract_summary  8083 non-null   object
dtypes: object(8)
memory usage: 568.3+ KB


In [18]:
from tqdm import tqdm
from langdetect import detect
from langdetect import DetectorFactory

# set seed so langdetect gives repeatable results
DetectorFactory.seed = 0

# number of leading body-text words used for the first detection attempt
N_DETECT_WORDS = 50


def _detect_language(body_text, fallback_text):
    """Best-effort language label for one paper.

    Tries, in order: the first N_DETECT_WORDS words of the body, the distinct
    words of the whole body, and finally ``fallback_text``. Returns "unknown"
    when every attempt raises.
    """
    words = body_text.split(" ")

    try:
        # Slicing already copes with bodies shorter than N_DETECT_WORDS.
        return detect(" ".join(words[:N_DETECT_WORDS]))
    except Exception:
        # beginning of the document was not in a usable format
        pass

    try:
        # dict.fromkeys de-duplicates while keeping first-seen order. A plain
        # set() would order the words by hash, which varies between kernel
        # sessions and would make the detected language non-reproducible.
        return detect(" ".join(dict.fromkeys(words)))
    except Exception:
        pass

    try:
        # last resort: label the paper through its abstract summary
        return detect(fallback_text)
    except Exception:
        return "unknown"


# hold label - language (one entry per row of df, in row order)
languages = [
    _detect_language(body, summary)
    for body, summary in tqdm(zip(df['body_text'], df['abstract_summary']), total=len(df))
]

100%|██████████| 8083/8083 [00:46<00:00, 173.18it/s]


In [19]:
from pprint import pprint

# Count how many papers were labelled with each detected language code.
languages_dict = {code: languages.count(code) for code in set(languages)}

print("Total: {}\n".format(len(languages)))
pprint(languages_dict)

Total: 8083

{'de': 20, 'en': 8041, 'es': 13, 'fr': 6, 'id': 1, 'nl': 2}


In [20]:
# Attach the detected language and keep English papers only.
df['language'] = languages
# .copy() makes the filtered frame independent of its parent, so the column
# assignments in later cells (e.g. df['y'] = ...) do not trigger
# SettingWithCopyWarning.
df = df[df['language'] == 'en'].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8041 entries, 0 to 9021
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   paper_id          8041 non-null   object
 1   doi               8041 non-null   object
 2   abstract          8041 non-null   object
 3   body_text         8041 non-null   object
 4   authors           8041 non-null   object
 5   title             8041 non-null   object
 6   journal           8041 non-null   object
 7   abstract_summary  8041 non-null   object
 8   language          8041 non-null   object
dtypes: object(9)
memory usage: 628.2+ KB


NLP

In [None]:
# NOTE: This section is copied directly from the script shown in lecture 5.
# For reference only. Do not use the code directly for project submission.

In [None]:
!pip install nltk
!pip install spacy==2.3.5
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
!pip install pyresparser

In [None]:
#NLP
import spacy
from spacy.lang.en_stop_words import STOP_WORDS
# import en_core_sci_lg

In [None]:
import string

# Start from Python's ASCII punctuation characters and spaCy's English
# stop-word list. The list copy is extended with custom terms in a later cell.
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

# peek at a few entries
stopwords[:10]

In [None]:
# Publishing boilerplate terms to drop in addition to the standard stop words.
# Every entry is lower-case because spacy_tokenizer lower-cases each token
# before the stop-word lookup. Capitalised entries such as 'Elsevier' would
# never match.
custom_stop_words = [
    'doi','preprint','copyright','peer','reviewed','org','https','et','al','author','figure',
    'rights','reserved','permission','used','using','biorxiv','medrxiv','license','fig','fig.',
    'al.','elsevier','pmc','czi'
]

for w in custom_stop_words:
    if w not in stopwords:
        stopwords.append(w)

In [None]:
# Parser
# Alternative configurations, kept for reference:
#   parser = en_core_sci_lg.load(disable=["tagger","ner"])
#   nlp = spacy.load('en_core_web_sm',disable=["tagger","ner"])
# The tagger, parser and NER pipeline components are disabled on load, so POS
# tagging, syntactic parsing and entity recognition are skipped; only
# tokenization and lemmatization are performed.
disabled_components = ["tagger", "parser", "ner"]
nlp = spacy.load('en_core_web_sm', disable=disabled_components)
nlp.max_length = 7000000

# Sets give constant-time membership tests inside the tokenizer.
punctuations = set(punctuations)
stopwords = set(stopwords)
print('len(stopwords):',len(stopwords),'len(punctuations):',len(punctuations))

def spacy_tokenizer(sentence):
    """Lemmatize ``sentence`` and strip stop words and punctuation.

    Args:
        sentence: Raw text to process with the module-level ``nlp`` pipeline.

    Returns:
        A single string of the remaining lower-cased tokens joined by spaces.
    """
    kept = []
    for word in nlp(sentence):
        # Compare the *string* lemma (`lemma_`); `word.lemma` is an integer
        # hash and never equals a string. '-PRON-' is the placeholder lemma
        # spaCy 2.x uses for pronouns; for those, fall back to the lower-cased
        # surface form.
        if word.lemma_ != '-PRON-':
            token = word.lemma_.lower().strip()
        else:
            token = word.lower_
        if token not in stopwords and token not in punctuations:
            kept.append(token)
    return " ".join(kept)

# Register DataFrame.progress_apply so tokenization shows a progress bar.
tqdm.pandas()
# The column is named 'processed_text' because the TF-IDF cell reads
# df['processed_text'].
df['processed_text'] = df['body_text'].progress_apply(spacy_tokenizer)



Vectorization using TF-IDF

In [None]:
# NOTE: This section is copied directly from the script shown in lecture 5.
# For reference only. Do not use the code directly for project submission.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize(text,max_feature):
    """Fit a TF-IDF vectorizer on ``text`` and return the transformed matrix.

    Args:
        text: Iterable of documents (strings).
        max_feature: Passed to ``TfidfVectorizer`` as ``max_features``.

    Returns:
        The result of ``fit_transform`` on ``text``.
    """
    tfidf = TfidfVectorizer(max_features=max_feature)
    return tfidf.fit_transform(text)

# Vectorize the pre-processed body text with a vocabulary capped at 2**12 terms.
max_feature = 2**12
text = df['processed_text'].values

X = vectorize(text, max_feature)


PCA & Clustering

In [None]:
# NOTE: This section is copied directly from the script shown in lecture 5.
# For reference only. Do not use the code directly for project submission.

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans

# PCA: a float n_components keeps as many components as are needed to explain
# that fraction (95%) of the variance.
pca = PCA(n_components=0.95,random_state=42)
X_reduced = pca.fit_transform(X.toarray())
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print("X_reduced.shape:", X_reduced.shape)

# K-Means clustering
k = 10
# n_init is pinned so results do not depend on the scikit-learn version's default.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
y_pred = kmeans.fit_predict(X_reduced)
df['y'] = y_pred

TSNE Visualization

In [None]:
# NOTE: This section is copied directly from the script shown in lecture 5.
# For reference only. Do not use the code directly for project submission.

In [None]:
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
import seaborn as sns

# random_state fixes the t-SNE initialisation so the embedding (and both plots
# built from it) is reproducible across runs.
tsne = TSNE(verbose=1, perplexity=50, random_state=42) # Change perplexity from 100 to 50 per FAQ
X_embedded = tsne.fit_transform(X.toarray())

# sns setting
sns.set(rc={'figure.figsize':(15,15)})

# plot (no `palette` here: seaborn only applies a palette when `hue` is given)
sns.scatterplot(x=X_embedded[:,0],y=X_embedded[:,1])
plt.title('t-SNE with no Labels')
plt.savefig('t-sne_covid19.png')
plt.show()



In [None]:
# sns settings
sns.set(rc={'figure.figsize':(13,9)})

# color: one evenly spaced HLS hue per cluster label.
# hls_palette is the function that accepts lightness/saturation (`l`, `s`);
# color_palette has no such keywords and would raise TypeError.
n_labels = len(np.unique(y_pred))
palette = sns.hls_palette(n_labels, l=.4, s=.9)

# plot
sns.scatterplot(x=X_embedded[:,0],y=X_embedded[:,1],hue=y_pred, legend='full',palette=palette)
plt.title('t-SNE with Kmeans Labels')
plt.savefig('improved_cluster_tsne.png')
plt.show()


# Histogram of year / journal

# Map-Reduce 

# Association Analysis

# Similarity Analysis

# Clustering Analysis