In [1]:
import glob
import json
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Approach:

- Parse the text from the body of each document using Natural Language Processing (NLP).
- Turn each document instance $d_i$ into a feature vector $X_i$ using Term Frequency–Inverse Document Frequency (TF-IDF).
- Apply Dimensionality Reduction to each feature vector $X_i$ using t-Distributed Stochastic Neighbor Embedding (t-SNE) to cluster similar research articles in the two-dimensional plane.
- Use Principal Component Analysis (PCA) to project $X$ down to the number of dimensions that keeps 95% of the variance while removing noise and outliers.
- Apply k-means clustering to label each cluster.
- Apply Topic Modeling using Latent Dirichlet Allocation (LDA) to discover keywords from each cluster.

## Read the metadata

In [4]:
# Root folder of the dataset release. Set the COVID_LIT_DATA_ROOT environment
# variable to run the notebook on another machine; when it is unset, the
# original local path is used, so existing runs behave exactly as before.
root_path = os.environ.get(
    'COVID_LIT_DATA_ROOT',
    '/Users/ruoyu/Desktop/22 Fall/INLS 690/Individual Project/2020-05-19',
)
metadata_path = f'{root_path}/metadata.csv'

# Read identifier columns as strings so they are not parsed as numbers.
# NOTE(review): the columns displayed for this release include `mag_id` but not
# 'Microsoft Academic Paper ID', so that key probably has no effect here - confirm.
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str, 
    'doi': str
})
meta_df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,,,,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686871,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,,,,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
4,9785vg6d,5f48792a5fa08bed9f56016f4981ae2ca6031b32,PMC,Gene expression in epithelial cells in respons...,10.1186/rr61,PMC59580,11686888,no-cc,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,,,,document_parses/pdf_json/5f48792a5fa08bed9f560...,document_parses/pmc_json/PMC59580.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,


In [5]:
# Show every metadata column with its dtype and non-null count.
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128492 entries, 0 to 128491
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   cord_uid          128492 non-null  object 
 1   sha               55751 non-null   object 
 2   source_x          128492 non-null  object 
 3   title             128464 non-null  object 
 4   doi               100586 non-null  object 
 5   pmcid             60771 non-null   object 
 6   pubmed_id         99124 non-null   object 
 7   license           128492 non-null  object 
 8   abstract          101611 non-null  object 
 9   publish_time      128477 non-null  object 
 10  authors           123725 non-null  object 
 11  journal           122195 non-null  object 
 12  mag_id            0 non-null       float64
 13  who_covidence_id  17071 non-null   object 
 14  arxiv_id          1395 non-null    object 
 15  pdf_json_files    55751 non-null   object 
 16  pmc_json_files    43

## Get all the JSON files

In [36]:
# Collect the path of every *.json file directly under document_parses/pdf_json.
# recursive=True only changes how a '**' component is matched; this pattern has
# none, so the flag does not widen the search here.
pdf_json_pattern = f'{root_path}/document_parses/pdf_json/*.json'
all_json = glob.glob(pdf_json_pattern, recursive=True)
len(all_json)

59561

In [37]:
# Peek at the first five collected paths.
all_json[:5]

['/Users/ruoyu/Desktop/22 Fall/INLS 690/Individual Project/2020-05-19/document_parses/pdf_json/4fcb95cc0c4ea6d1fa4137a4a087715ed6b68cea.json',
 '/Users/ruoyu/Desktop/22 Fall/INLS 690/Individual Project/2020-05-19/document_parses/pdf_json/86d4262de73cf81b5ea6aafb91630853248bff5f.json',
 '/Users/ruoyu/Desktop/22 Fall/INLS 690/Individual Project/2020-05-19/document_parses/pdf_json/b2f67d533f2749807f2537f3775b39da3b186051.json',
 '/Users/ruoyu/Desktop/22 Fall/INLS 690/Individual Project/2020-05-19/document_parses/pdf_json/9ec0b1175992879d5b8d3351ef40a28bb48f18ce.json',
 '/Users/ruoyu/Desktop/22 Fall/INLS 690/Individual Project/2020-05-19/document_parses/pdf_json/86a998617c077f4fe2ab26214995a3548fbc0fc5.json']

## Helper class for reading a paper's JSON file

In [38]:
class FileReader:
    """Load one paper's JSON file and expose its id, abstract and body text.

    Attributes:
        paper_id: value of the top-level ``paper_id`` key.
        abstract: the ``text`` of every entry under ``abstract``, joined with newlines
            (empty string when the list is empty).
        body_text: the ``text`` of every entry under ``body_text``, joined with newlines.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the file is not valid JSON or not valid UTF-8.
        KeyError: if ``paper_id``, ``abstract``, ``body_text`` or an entry's ``text`` is missing.
    """

    def __init__(self, file_path):
        """Parse the JSON document at ``file_path``."""
        # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
        # platform's locale encoding, which can mis-decode or fail on non-ASCII text.
        with open(file_path, encoding='utf-8') as file:
            content = json.load(file)
        self.paper_id = content['paper_id']
        # Each section is a list of paragraph dicts; keep only their text, one per line.
        self.abstract = '\n'.join(entry['text'] for entry in content['abstract'])
        self.body_text = '\n'.join(entry['text'] for entry in content['body_text'])

    def __repr__(self):
        """Return the paper id followed by the first 200 characters of abstract and body."""
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
# Sanity check: parse the first collected file and print its truncated repr.
first_row = FileReader(all_json[0])
print(first_row)

4fcb95cc0c4ea6d1fa4137a4a087715ed6b68cea: Abnormal levels of end-tidal carbon dioxide (EtCO 2 ) during resuscitation in the delivery suite are associated with intraventricular haemorrhage (IVH) development. Our aim was to determine whether ca... Improvements in neonatal intensive care have resulted in decreased mortality rates of preterm infants. The development of intraventricular haemorrhage (IVH), however, can result in long-term adverse o...


In [41]:
def get_breaks(content, length, break_token=" "):
    """Re-join the words of ``content``, inserting ``break_token`` about every ``length`` characters.

    The text is split on single spaces. A running total of word lengths (spaces
    are not counted) is kept; the word that pushes the total above ``length`` is
    prefixed with ``break_token`` instead of a space and the total restarts at 0.

    Args:
        content: text to process; must be a ``str`` (anything without ``.split``
            raises ``AttributeError``, which callers may rely on for missing values).
        length: number of characters after which a break is inserted.
        break_token: separator placed before the word that crosses the limit.
            The default ``" "`` reproduces the previous behaviour, where both
            branches added a plain space and no visible break was produced;
            pass e.g. ``"<br>"`` to get real line breaks in HTML hover text.

    Returns:
        The re-joined text. Every word is preceded by a separator, so the result
        starts with a leading space (or with ``break_token`` if the first word
        alone exceeds ``length``).
    """
    pieces = []
    total_chars = 0

    # Collect the pieces and join once instead of growing a string in the loop.
    for word in content.split(' '):
        total_chars += len(word)
        if total_chars > length:
            pieces.append(break_token + word)
            total_chars = 0
        else:
            pieces.append(" " + word)
    return "".join(pieces)

## Load data into dataframe

In [42]:
# Accumulate one list per output column and build the DataFrame once at the end.
columns = ['paper_id', 'doi', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary']
dict_ = {column: [] for column in columns}

# Report progress about every 10% of the files. The max() guard keeps the modulo
# below from dividing by zero when fewer than 10 files were found.
progress_step = max(len(all_json) // 10, 1)

# Build the sha -> metadata lookup once instead of scanning meta_df for every paper.
# Keeping the first row per sha matches what `.values[0]` on the filtered frame gave.
meta_by_sha = (
    meta_df.dropna(subset=['sha'])
    .drop_duplicates(subset='sha', keep='first')
    .set_index('sha')[['authors', 'title', 'journal', 'doi']]
    .to_dict('index')
)

for idx, entry in enumerate(all_json):
    if idx % progress_step == 0:
        print(f'Processing index: {idx} of {len(all_json)}')

    try:
        content = FileReader(entry)
    except Exception:
        continue  # invalid paper format, skip

    # no metadata row whose sha equals this paper id, skip this paper
    meta_row = meta_by_sha.get(content.paper_id)
    if meta_row is None:
        continue

    dict_['abstract'].append(content.abstract)
    dict_['paper_id'].append(content.paper_id)
    dict_['body_text'].append(content.body_text)

    # also create a column for the summary of abstract to be used in a plot
    abstract_words = content.abstract.split(' ')
    if len(content.abstract) == 0:
        # no abstract provided
        summary = "Not provided."
    elif len(abstract_words) > 100:
        # abstract provided is too long for plot, take first 100 words append with ...
        summary = get_breaks(' '.join(abstract_words[:100]), 40) + "..."
    else:
        # abstract is short enough
        summary = get_breaks(content.abstract, 40)
    dict_['abstract_summary'].append(summary)

    # authors are ';'-separated in the metadata; a missing value is not a str (NaN)
    authors_raw = meta_row['authors']
    if isinstance(authors_raw, str):
        authors = authors_raw.split(';')
        if len(authors) > 2:
            # more than 2 authors: pass the joined list through get_breaks for the plot
            dict_['authors'].append(get_breaks('. '.join(authors), 40))
        else:
            # authors will fit in plot
            dict_['authors'].append('. '.join(authors))
    else:
        # null value: keep it as-is so the column records the missing entry
        dict_['authors'].append(authors_raw)

    # add the title information, add breaks when needed
    title_raw = meta_row['title']
    if isinstance(title_raw, str):
        dict_['title'].append(get_breaks(title_raw, 40))
    else:
        # title was not provided
        dict_['title'].append(title_raw)

    # add the journal information
    dict_['journal'].append(meta_row['journal'])

    # add doi
    dict_['doi'].append(meta_row['doi'])

df_covid = pd.DataFrame(dict_, columns=columns)
df_covid.head()

Processing index: 0 of 59561
Processing index: 5956 of 59561
Processing index: 11912 of 59561
Processing index: 17868 of 59561
Processing index: 23824 of 59561
Processing index: 29780 of 59561
Processing index: 35736 of 59561
Processing index: 41692 of 59561
Processing index: 47648 of 59561
Processing index: 53604 of 59561
Processing index: 59560 of 59561


Unnamed: 0,paper_id,doi,abstract,body_text,authors,title,journal,abstract_summary
0,4fcb95cc0c4ea6d1fa4137a4a087715ed6b68cea,10.1007/s00431-019-03543-0,Abnormal levels of end-tidal carbon dioxide (E...,Improvements in neonatal intensive care have r...,"Tamura, Kentaro. Williams, Emma E. Dassios,...",End-tidal carbon dioxide levels during resusc...,Eur J Pediatr,Abnormal levels of end-tidal carbon dioxide (...
1,86d4262de73cf81b5ea6aafb91630853248bff5f,10.1016/j.bbamcr.2011.06.011,The endoplasmic reticulum (ER) is the biggest ...,The endoplasmic reticulum (ER) is a multi-func...,"Lynes, Emily M.. Simmen, Thomas",Urban planning of the endoplasmic reticulum (...,Biochimica et Biophysica Acta (BBA) - Molecula...,The endoplasmic reticulum (ER) is the biggest...
2,b2f67d533f2749807f2537f3775b39da3b186051,10.1016/j.fsiml.2020.100013,,There is a disproportionate number of individu...,"Liebrenz, Michael. Bhugra, Dinesh. Buadze, ...",Caring for persons in detention suffering wit...,Forensic Science International: Mind and Law,Not provided.
3,9ec0b1175992879d5b8d3351ef40a28bb48f18ce,10.1016/j.jhin.2019.07.001,,"Sir, Testing for respiratory virus infections ...","Gohil, S.. Donaghy, B.. Tature, D.. Kowal,...",Seasonal respiratory virus testing in managem...,J Hosp Infect,Not provided.
4,4ed70c27f14b7f9e6219fe605eae2b21a229f23c,10.1080/14787210.2017.1271712,,The Middle East respiratory syndrome coronavir...,"Al-Tawfiq, Jaffar A.. Memish, Ziad A.",Update on therapeutic options for Middle East...,Expert Rev Anti Infect Ther,Not provided.


## Adding more features to the dataset

In [43]:
# Word counts for abstract and body: tokens separated by any run of whitespace.
for source_col, count_col in (('abstract', 'abstract_word_count'), ('body_text', 'body_word_count')):
    df_covid[count_col] = df_covid[source_col].apply(lambda text: len(text.strip().split()))

# Number of distinct whitespace-separated tokens in the body (compared as exact strings).
df_covid['body_unique_words'] = df_covid['body_text'].apply(lambda text: len(set(str(text).split())))
df_covid.head()

Unnamed: 0,paper_id,doi,abstract,body_text,authors,title,journal,abstract_summary,abstract_word_count,body_word_count,body_unique_words
0,4fcb95cc0c4ea6d1fa4137a4a087715ed6b68cea,10.1007/s00431-019-03543-0,Abnormal levels of end-tidal carbon dioxide (E...,Improvements in neonatal intensive care have r...,"Tamura, Kentaro. Williams, Emma E. Dassios,...",End-tidal carbon dioxide levels during resusc...,Eur J Pediatr,Abnormal levels of end-tidal carbon dioxide (...,218,2601,830
1,86d4262de73cf81b5ea6aafb91630853248bff5f,10.1016/j.bbamcr.2011.06.011,The endoplasmic reticulum (ER) is the biggest ...,The endoplasmic reticulum (ER) is a multi-func...,"Lynes, Emily M.. Simmen, Thomas",Urban planning of the endoplasmic reticulum (...,Biochimica et Biophysica Acta (BBA) - Molecula...,The endoplasmic reticulum (ER) is the biggest...,234,8069,2282
2,b2f67d533f2749807f2537f3775b39da3b186051,10.1016/j.fsiml.2020.100013,,There is a disproportionate number of individu...,"Liebrenz, Michael. Bhugra, Dinesh. Buadze, ...",Caring for persons in detention suffering wit...,Forensic Science International: Mind and Law,Not provided.,0,1126,540
3,9ec0b1175992879d5b8d3351ef40a28bb48f18ce,10.1016/j.jhin.2019.07.001,,"Sir, Testing for respiratory virus infections ...","Gohil, S.. Donaghy, B.. Tature, D.. Kowal,...",Seasonal respiratory virus testing in managem...,J Hosp Infect,Not provided.,0,815,385
4,4ed70c27f14b7f9e6219fe605eae2b21a229f23c,10.1080/14787210.2017.1271712,,The Middle East respiratory syndrome coronavir...,"Al-Tawfiq, Jaffar A.. Memish, Ziad A.",Update on therapeutic options for Middle East...,Expert Rev Anti Infect Ther,Not provided.,0,2748,996


In [44]:
# Dtypes and non-null counts of the assembled frame, including the new count columns.
df_covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52471 entries, 0 to 52470
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   paper_id             52471 non-null  object
 1   doi                  50995 non-null  object
 2   abstract             52471 non-null  object
 3   body_text            52471 non-null  object
 4   authors              51301 non-null  object
 5   title                52471 non-null  object
 6   journal              48182 non-null  object
 7   abstract_summary     52471 non-null  object
 8   abstract_word_count  52471 non-null  int64 
 9   body_word_count      52471 non-null  int64 
 10  body_unique_words    52471 non-null  int64 
dtypes: int64(3), object(8)
memory usage: 4.4+ MB


## Check for duplicates

In [45]:
# Compare the row count with the number of unique abstracts; a gap means repeated values.
df_covid['abstract'].describe(include='all')

count     52471
unique    37150
top            
freq      15128
Name: abstract, dtype: object

In [46]:
# A row counts as a duplicate only when both its abstract and its body text repeat
# an earlier row; the frame is modified in place and its index is not renumbered.
dedup_keys = ['abstract', 'body_text']
df_covid.drop_duplicates(subset=dedup_keys, inplace=True)
df_covid['abstract'].describe(include='all')

count     52339
unique    37150
top            
freq      15034
Name: abstract, dtype: object

In [47]:
# Same summary for the body text: count, unique count and the most frequent value.
df_covid['body_text'].describe(include='all')

count                              52339
unique                             52334
top       J o u r n a l P r e -p r o o f
freq                                   3
Name: body_text, dtype: object

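Many papers have an empty abstract (the most frequent abstract value is the empty string, 15,034 rows), and a few body texts are only placeholder text such as "J o u r n a l P r e -p r o o f" (3 rows), so some abstracts and body texts are effectively missing.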

## Final Dataset

In [48]:
# Preview the first rows of the de-duplicated frame.
df_covid.head()

Unnamed: 0,paper_id,doi,abstract,body_text,authors,title,journal,abstract_summary,abstract_word_count,body_word_count,body_unique_words
0,4fcb95cc0c4ea6d1fa4137a4a087715ed6b68cea,10.1007/s00431-019-03543-0,Abnormal levels of end-tidal carbon dioxide (E...,Improvements in neonatal intensive care have r...,"Tamura, Kentaro. Williams, Emma E. Dassios,...",End-tidal carbon dioxide levels during resusc...,Eur J Pediatr,Abnormal levels of end-tidal carbon dioxide (...,218,2601,830
1,86d4262de73cf81b5ea6aafb91630853248bff5f,10.1016/j.bbamcr.2011.06.011,The endoplasmic reticulum (ER) is the biggest ...,The endoplasmic reticulum (ER) is a multi-func...,"Lynes, Emily M.. Simmen, Thomas",Urban planning of the endoplasmic reticulum (...,Biochimica et Biophysica Acta (BBA) - Molecula...,The endoplasmic reticulum (ER) is the biggest...,234,8069,2282
2,b2f67d533f2749807f2537f3775b39da3b186051,10.1016/j.fsiml.2020.100013,,There is a disproportionate number of individu...,"Liebrenz, Michael. Bhugra, Dinesh. Buadze, ...",Caring for persons in detention suffering wit...,Forensic Science International: Mind and Law,Not provided.,0,1126,540
3,9ec0b1175992879d5b8d3351ef40a28bb48f18ce,10.1016/j.jhin.2019.07.001,,"Sir, Testing for respiratory virus infections ...","Gohil, S.. Donaghy, B.. Tature, D.. Kowal,...",Seasonal respiratory virus testing in managem...,J Hosp Infect,Not provided.,0,815,385
4,4ed70c27f14b7f9e6219fe605eae2b21a229f23c,10.1080/14787210.2017.1271712,,The Middle East respiratory syndrome coronavir...,"Al-Tawfiq, Jaffar A.. Memish, Ziad A.",Update on therapeutic options for Middle East...,Expert Rev Anti Infect Ther,Not provided.,0,2748,996


In [49]:
# Summary statistics of the numeric word-count columns.
df_covid.describe()

Unnamed: 0,abstract_word_count,body_word_count,body_unique_words
count,52339.0,52339.0,52339.0
mean,157.998223,4769.102199,1409.134756
std,186.518579,9996.566795,1640.970446
min,0.0,1.0,1.0
25%,0.0,2008.0,802.0
50%,151.0,3359.0,1164.0
75%,234.0,5130.0,1606.0
max,7415.0,279623.0,38298.0


In [52]:
# Save the frame as CSV under a relative path (resolved against the current
# working directory); index=False leaves the row index out of the file.
df_covid.to_csv("COVID-19 Literature Clustering Cleaned Data.csv", index=False)