# About the Dataset

The original CORD-19 is a resource of over 1,000,000 scholarly articles, including over 400,000 with full text, about COVID-19, SARS-CoV-2, and related coronaviruses.

For this project, we use a subset of about 10,000 articles sampled from CORD-19 to reduce the computational burden.

In [None]:
import os
import subprocess


# shared link: https://drive.google.com/drive/folders/1Td_ZTUVrsKeftDE5Zll7252YLJdWiNTk?usp=share_link 
# You can also download the data via the shared link above and skip Step 0 and Step 1 if you want to run the code on your local machine.


# Step 0: add the shared folder to your google drive. e.g., /content/drive/MyDrive/CORD_19

# Step 1: Mount Google Drive
# The Drive contents become available under /content/drive, which is where the
# shared CORD_19 folder from Step 0 is read from.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime -- skip this step (as the note above says) when running locally.
from google.colab import drive
drive.mount("/content/drive")


!echo $PWD

!ls /content/drive/MyDrive/CORD_19/

# Step 2: unzip json files
# Extraction target: a "CORD_19_subset" folder inside the current working directory.
# NOTE(review): the data-loading cells below read from <cwd>/input/subset/... --
# confirm that the extracted files end up where those cells expect them.
subset_dir = os.path.join(os.getcwd(), "CORD_19_subset")

zip_file_path = "/content/drive/MyDrive/CORD_19/subset.zip"

# Extract only once: an existing destination directory is taken to mean the
# archive was already unpacked.
if not os.path.exists(subset_dir):
    # Pass the command as an argument list (no shell) so paths containing
    # spaces or shell metacharacters are handled correctly.  run() blocks
    # until unzip has finished, so later cells never see a half-extracted
    # directory, and check=True raises CalledProcessError if unzip fails.
    subprocess.run(["unzip", zip_file_path, "-d", subset_dir], check=True)
else:
    print(f"Directory {subset_dir} already exists. Skipping extraction.")

In [11]:
# Import necessary packages
import os
import json
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import subprocess
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [None]:
# Download NLTK data
# 'stopwords' backs the stopwords.words('english') call used for text cleaning;
# 'punkt' is presumably the tokenizer model required by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

# Data Load & Pre-processing

In [None]:
# Read the sampled metadata table (meta_10k.csv) from the ./input folder
data_root = os.path.join(os.getcwd(), 'input')
metadata_path = os.path.join(data_root, 'meta_10k.csv')

# The identifier columns are passed through str so they stay textual;
# the first CSV column becomes the index.
meta_df = pd.read_csv(
    metadata_path,
    index_col=0,
    converters=dict.fromkeys(
        ['pubmed_id', 'Microsoft Academic Paper ID', 'doi'], str
    ),
)

# Row count, followed by a preview of the first rows
print(len(meta_df))
meta_df.head()

In [None]:
# Display the info of the metadata dataframe (column names, non-null counts, dtypes)
meta_df.info()

In [None]:
def glob_files(path, f_type=".json"):
    """Recursively collect the files below *path* whose name ends with *f_type*.

    Args:
        path: Root directory to walk.
        f_type: Filename suffix to match (default ".json").

    Returns:
        A list of paths (directory joined with file name) in os.walk order;
        empty when nothing matches.
    """
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(path)
        for name in names
        if name.endswith(f_type)
    ]

# Collect every parsed-PDF JSON document of the extracted subset
json_dir = os.path.join(data_root, "subset", "subset", "document_parses", "pdf_json")
print(json_dir)
json_files = glob_files(json_dir, f_type=".json")

print(f"total json files: {len(json_files)}")

In [None]:
# Class to read JSON files
class FileReader:
    """Parsed view of one paper JSON file.

    Attributes set by the constructor:
        paper_id:  value of the top-level 'paper_id' key.
        title:     value of content['metadata']['title'].
        abstract:  the 'text' of every entry under 'abstract', joined by newlines.
        body_text: the 'text' of every entry under 'body_text', joined by newlines.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the file is not valid UTF-8 JSON.
        KeyError: if one of the keys listed above is missing.
    """

    def __init__(self, file_path):
        # JSON is UTF-8 by specification; naming the encoding avoids decoding
        # with the platform's locale encoding.  The handle is only needed for
        # the load itself, so it is closed before any further processing.
        with open(file_path, encoding="utf-8") as file:
            content = json.load(file)

        self.paper_id = content['paper_id']
        # Each section is a list of paragraph dicts; keep only their text
        self.abstract = '\n'.join(entry['text'] for entry in content['abstract'])
        self.body_text = '\n'.join(entry['text'] for entry in content['body_text'])
        self.title = content['metadata']['title']

    def __repr__(self):
        # Id, title, then the first 200 characters of abstract and body
        return f"{self.paper_id}: {self.title} : {self.abstract[:200]}... {self.body_text[:200]}..."

# Read the first JSON file as a sanity check and print its summary
# (FileReader.__repr__: id, title, start of abstract and body).
# NOTE(review): json_files[0] raises IndexError when no JSON files were found.
first_row = FileReader(json_files[0])
print(first_row)

In [None]:
# Function to add breaks to text
from tqdm import tqdm

def get_breaks(content, length):
    """Insert "<br>" markers into *content* so it wraps roughly every *length* characters.

    The text is split on single spaces.  Word lengths are accumulated (the
    spaces themselves are not counted); the word that pushes the running
    total above *length* is prefixed with "<br>" and the total restarts at
    zero, every other word is prefixed with a single space.  As a result the
    returned string always begins with either " " or "<br>".

    Args:
        content: Text to wrap.
        length: Character budget per line.

    Returns:
        The re-assembled text with "<br>" separators.
    """
    pieces = []
    running = 0
    for word in content.split(' '):
        running += len(word)
        if running > length:
            pieces.append("<br>" + word)
            running = 0
        else:
            pieces.append(" " + word)
    return "".join(pieces)

# Column-wise accumulator: one list per output column, appended to in lockstep
dict_ = {'paper_id': [], 'doi': [], 'abstract': [], 'body_text': [],
         'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}

# Files FileReader could not parse; reported after the loop instead of
# disappearing silently.
unreadable_files = 0

# Iterate through json files and extract content
for entry in tqdm(json_files):
    try:
        content = FileReader(entry)
    except Exception:
        # Invalid paper format: skip the file but keep count of it
        unreadable_files += 1
        continue

    # Metadata row(s) whose sha equals this paper id; looked up once per paper.
    # NOTE(review): exact equality assumes each metadata row holds a single
    # sha value -- confirm against the CSV.
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]

    # Papers without metadata or without body text are not usable
    if meta_data.empty or not content.body_text:
        continue

    dict_['abstract'].append(content.abstract)
    dict_['paper_id'].append(content.paper_id)
    dict_['body_text'].append(content.body_text)

    # Short, line-broken version of the abstract to be used in a plot
    abstract_words = content.abstract.split(' ')
    if not content.abstract:
        # No abstract provided
        summary = "Not provided."
    elif len(abstract_words) > 100:
        # Too long for a plot: keep the first 100 words and mark the cut with ...
        summary = get_breaks(' '.join(abstract_words[:100]), 40) + "..."
    else:
        # Abstract is short enough
        summary = get_breaks(content.abstract, 40)
    dict_['abstract_summary'].append(summary)

    # Authors are stored as one ';'-separated string
    authors_raw = meta_data['authors'].values[0]
    if isinstance(authors_raw, str):
        authors = authors_raw.split(';')
        if len(authors) > 2:
            # Long author lists get <br> breaks so they wrap when plotted
            dict_['authors'].append(get_breaks('. '.join(authors), 40))
        else:
            # Authors will fit in plot
            dict_['authors'].append('. '.join(authors))
    else:
        # Missing value (not a string): keep it unchanged
        dict_['authors'].append(authors_raw)

    # Add the title information, add breaks when needed
    title_raw = meta_data['title'].values[0]
    if isinstance(title_raw, str):
        dict_['title'].append(get_breaks(title_raw, 40))
    else:
        # Title was not provided: keep the missing value unchanged
        dict_['title'].append(title_raw)

    # Add the journal information
    dict_['journal'].append(meta_data['journal'].values[0])

    # Add doi
    dict_['doi'].append(meta_data['doi'].values[0])

if unreadable_files:
    print(f"Skipped {unreadable_files} unreadable JSON file(s).")

# Create a DataFrame
df_covid = pd.DataFrame(dict_, columns=['paper_id', 'doi', 'abstract', 'body_text',
                                        'authors', 'title', 'journal', 'abstract_summary'])
print(df_covid.head())

In [None]:
# Display the info of the assembled paper dataframe (column names, non-null counts, dtypes)
df_covid.info()

In [None]:
# Drop every row that has a NaN in any column
# NOTE(review): `df` is a second name for the same object as `df_covid`, not a
# copy, so the in-place dropna below also removes the rows from `df_covid`.
# Use `df = df_covid.dropna()` instead if `df_covid` should stay untouched.
df = df_covid
df.dropna(inplace=True)
df.info()


In [None]:
# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Tokenize *text* and drop punctuation-bearing tokens and stop words.

    A token is kept only if it is purely alphanumeric (str.isalnum) and its
    lower-cased form is not in ``stop_words``.  Kept tokens retain their
    original casing and are joined with single spaces.

    NOTE(review): any token containing a non-alphanumeric character, such as a
    hyphen, fails isalnum() and is discarded -- this presumably affects terms
    like "SARS-CoV-2" if the tokenizer keeps them whole; confirm that is wanted.
    """
    kept = []
    for token in word_tokenize(text):
        if not token.isalnum():
            continue
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Clean both free-text columns in place with clean_text
for text_column in ('abstract', 'body_text'):
    df[text_column] = df[text_column].apply(clean_text)

In [None]:
!pip install langdetect

In [None]:
from tqdm import tqdm
from langdetect import detect
from langdetect import DetectorFactory

# Fix the detector seed so repeated runs produce the same labels
DetectorFactory.seed = 0


def _detect_language(body_text, fallback_text):
    """Return a language code for one paper, or "unknown" if detection fails.

    detect() is tried on successively weaker samples until one succeeds:
      1. the first 50 space-separated words of *body_text*;
      2. every distinct word of *body_text*, for documents whose beginning is
         not in a usable format;
      3. *fallback_text* (the abstract summary).

    The distinct words are de-duplicated with dict.fromkeys, which keeps
    first-occurrence order.  A plain set would hand the words to the detector
    in hash-randomised order, making the result differ between interpreter
    runs despite the fixed seed.
    """
    words = body_text.split(" ")

    def samples():
        # Lazy on purpose: the later, more expensive samples are only built
        # when the earlier ones failed.
        yield " ".join(words[:50])
        yield " ".join(dict.fromkeys(words))
        yield fallback_text

    for sample in samples():
        try:
            return detect(sample)
        except Exception:
            # Best effort: any detection failure moves on to the next sample
            continue
    return "unknown"


# hold label - language, one entry per row of df, in row order
languages = [
    _detect_language(body_text, abstract_summary)
    for body_text, abstract_summary in tqdm(
        zip(df['body_text'], df['abstract_summary']), total=len(df)
    )
]

In [None]:
from collections import Counter
from pprint import pprint

# Number of papers per detected language.  Counter tallies in a single pass
# instead of re-scanning the whole list once per distinct language; it is
# converted back to a plain dict so the name keeps its original type.
languages_dict = dict(Counter(languages))

print("Total: {}\n".format(len(languages)))
pprint(languages_dict)

In [None]:
# Attach the detected language to every paper, then keep only the English ones
df['language'] = languages
df = df.loc[df['language'].eq('en')]
df.info()

# Histogram of year / journal

In [None]:
def _bar_chart(counts, title, xlabel, rotate_xticks=False):
    """Show one 12x6 bar chart of *counts* with the shared y-axis label."""
    plt.figure(figsize=(12, 6))
    counts.plot(kind='bar')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Number of Papers')
    if rotate_xticks:
        plt.xticks(rotation=90)
    plt.show()


# Parse publish_time; values that cannot be parsed become NaT ...
meta_df['publish_time'] = pd.to_datetime(meta_df['publish_time'], errors='coerce')

# ... and rows without a usable date are dropped
meta_df = meta_df.dropna(subset=['publish_time'])

# Papers per publication year (taken from the metadata table), in year order
_bar_chart(
    meta_df['publish_time'].dt.year.value_counts().sort_index(),
    'Histogram of Publication Years',
    'Year',
)

# The 20 most frequent journals among the retained papers
_bar_chart(
    df['journal'].value_counts().head(20),
    'Top 20 Journals by Number of Papers',
    'Journal',
    rotate_xticks=True,
)

# Keep only years whose paper count lies within [min_threshold, max_threshold]
min_threshold = 10  # Minimum number of papers for a year to be considered
max_threshold = 1000  # Maximum number of papers for a year to be considered
year_counts = meta_df['publish_time'].dt.year.value_counts()
filtered_years = year_counts[year_counts.between(min_threshold, max_threshold)].index

# Metadata rows published in one of the retained years
filtered_meta_df = meta_df[meta_df['publish_time'].dt.year.isin(filtered_years)]

# Same year histogram, restricted to the retained years
_bar_chart(
    filtered_meta_df['publish_time'].dt.year.value_counts().sort_index(),
    'Filtered Histogram of Publication Years',
    'Year',
)

# Map-Reduce 

# Association Analysis

# Similarity Analysis

# Clustering Analysis