In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import json

https://www.kaggle.com/maksimeren/covid-19-literature-clustering#Load-the-Data

# Import

Import Metadata

In [2]:
meta_df = pd.read_csv("metadata.csv",
                          dtype={
                             "pubmed_id": str,
                              "Microsoft Academic Paper ID": str
                          }
                         )
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44220 entries, 0 to 44219
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   sha                          28462 non-null  object
 1   source_x                     44220 non-null  object
 2   title                        43996 non-null  object
 3   doi                          40750 non-null  object
 4   pmcid                        23319 non-null  object
 5   pubmed_id                    22943 non-null  object
 6   license                      44220 non-null  object
 7   abstract                     35806 non-null  object
 8   publish_time                 34197 non-null  object
 9   authors                      41074 non-null  object
 10  journal                      33173 non-null  object
 11  Microsoft Academic Paper ID  964 non-null    object
 12  WHO #Covidence               1767 non-null   object
 13  has_full_text                44

Get json paths

In [3]:
all_json = glob.glob(f'**/*.json', recursive=True)

In [4]:
def get_breaks(content, length=40):
    ''' 
    Make long summaries readable in the plot tooltip.
    Adds break after every word when line reaches a certain character amount, by default 40
    '''
    data = ""
    words = content.split(' ')
    total_chars = 0

    # add break every length characters
    for i in range(len(words)):
        total_chars += len(words[i]) # get length of each word
        if total_chars > length:
            # add 'enter' after each word if the total length up to that point is > length variable
            data = data + "<br>" + words[i] 
            total_chars = 0
        else:
            data = data + " " + words[i]
    return data

In [5]:
class FileReader:
    '''
    Extracts information from provided JSON file paths.
    
    Callable functions:
        * summarize_abstract(): returns the abstract with < 100 words, 40 characters per line
        * summarize_authors(): returns a max of 2 authors, with rest of authors denoted with "et al."
        * summarize_title(): returns the title with 40 characters per line
    
    Returns:
        * a print out of the paper id, abstract (200 characters), and body (200 characters)
    '''
    def __init__(self, file_path):
        '''
        Open the file. Extract the paper id, abstract, and body. Match it with the metadata.
        
        Callable variables:
            * content: the full, raw content of the json path
            * paper_id: the paper id stored in the json file
            * meta_data: the metadata that has a paper id match in the meta_df
            * journal: the journal paper was published in
            * abstract: the full abstract
            * body_text: the full body
        '''
        with open(file_path) as file:
            content = json.load(file)
            self.content = content
            self.paper_id = content['paper_id']
            self.meta_data = meta_df[meta_df['sha'] == self.paper_id]
            self.abstract = []
            self.body_text = []

            if len(self.meta_data) != 0:
                self.journal = self.meta_data['journal'].values[0]

                # Abstract
                for entry in content['abstract']:
                    self.abstract.append(entry['text'])
                self.abstract = '\n'.join(self.abstract)

                # Body text
                for entry in content['body_text']:
                    self.body_text.append(entry['text'])          
                self.body_text = '\n'.join(self.body_text)         


    def summarize_abstract(self):
        '''
        Limits abstract summary to <= 40 words for the plot tool tip.
        '''
        if len(self.abstract) == 0:
            abstract_summary = ("Not provided.")
            return abstract_summary
        elif len(self.abstract) > 100:
            # abstract is too long, take first 100 words and run through function
            info = self.abstract.split(' ')[:100]
            abstract_summary = get_breaks(' '.join(info)) + " ..."
            return abstract_summary
        else:
            # abstract is short enough, just run through function
            abstract_summary = get_breaks(self.abstract)
            return abstract_summary
            
    def summarize_authors(self):
        '''
        Limit number of authors to <= 2 for the plot tool tip
        '''
        try:
            # More than one author
            authors = self.meta_data['authors'].values[0].split(';')
            if len(authors) > 2:
                authors = ". ".join(authors[:2]) + ", et al."
                return authors
            else:
                authors = ". ".join(authors)
                return authors
        except Exception as e:
            # Just one author or no author
            authors = self.meta_data['authors'].values[0]
            return authors

    def summarize_title(self):
        '''
        Limit title length to <= 40 words for the plot tool tip
        '''
        try:
            # title is provided, run it through function
            title = get_breaks(self.meta_data['title'].values[0])
            return title
        except Exception as e:
            # no title provided
            title = self.meta_data['title'].values[0]
            return title
    
    def __repr__(self):
        '''
        This just prints out little summaries if you want to run a json through the class
        '''
        return f'PAPER ID: [{self.paper_id}] ABSTRACT: [{self.abstract[:200]}...] BODY: [{self.body_text[:200]}...]'

In [6]:
dict_ = {'paper_id':[],
         'abstract':[],
         'body_text':[],
         'authors':[],
         'title':[],
         'journal':[],
         'abstract_summary':[]
        }
for idx, entry in enumerate(all_json):
    # Progress Counter
    if idx % (len(all_json)//10) == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
        
    # Initialize FileReader with json
    content = FileReader(entry) 

    meta_data = content.meta_data
    # no metadata, skip paper
    if len(meta_data) == 0:
        continue
    else:
        # Append info to dictionary
        dict_['paper_id'].append(content.paper_id)
        dict_['abstract'].append(content.abstract)
        dict_['body_text'].append(content.body_text)
        dict_['abstract_summary'].append(content.summarize_abstract())
        dict_['authors'].append(content.summarize_authors())
        dict_['title'].append(content.summarize_title())
        dict_['journal'].append(content.journal)

print(f'Finished processing {len(all_json)} jsons!')

Processing index: 0 of 29315
Processing index: 2931 of 29315
Processing index: 5862 of 29315
Processing index: 8793 of 29315
Processing index: 11724 of 29315
Processing index: 14655 of 29315
Processing index: 17586 of 29315
Processing index: 20517 of 29315
Processing index: 23448 of 29315
Processing index: 26379 of 29315
Processing index: 29310 of 29315
Finished processing 29315 jsons!


# Clean

In [7]:
df_covid = pd.DataFrame(dict_)

In [8]:
df_covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27678 entries, 0 to 27677
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   paper_id          27678 non-null  object
 1   abstract          27678 non-null  object
 2   body_text         27678 non-null  object
 3   authors           26929 non-null  object
 4   title             27634 non-null  object
 5   journal           26784 non-null  object
 6   abstract_summary  27678 non-null  object
dtypes: object(7)
memory usage: 1.5+ MB


In [9]:
dict_ = None

df_covid['abstract_word_count'] = df_covid['abstract'].apply(lambda x: len(x.strip().split()))
df_covid['body_word_count'] = df_covid['body_text'].apply(lambda x: len(x.strip().split()))
df_covid.describe(include="all")

Unnamed: 0,paper_id,abstract,body_text,authors,title,journal,abstract_summary,abstract_word_count,body_word_count
count,27678,27678.0,27678,26929,27634,26784,27678,27678.0,27678.0
unique,27678,20191.0,27662,25576,27248,3324,20184,,
top,369914e87f682579eb3a5efeb43dc0184a88b5d6,,"In previous reports, workers have characterize...","Domingo, Esteban",Index,PLoS One,Not provided.,,
freq,1,7453.0,4,14,70,1511,7453,,
mean,,,,,,,,165.438399,4885.414047
std,,,,,,,,175.670244,7492.709215
min,,,,,,,,0.0,1.0
25%,,,,,,,,0.0,2503.0
50%,,,,,,,,162.0,3760.0
75%,,,,,,,,240.0,5531.75


In [10]:
df_covid.drop_duplicates(['abstract'], inplace=True)
df_covid.describe(include='all')

Unnamed: 0,paper_id,abstract,body_text,authors,title,journal,abstract_summary,abstract_word_count,body_word_count
count,20191,20191,20191,19953,20183,19416,20191,20191.0,20191.0
unique,20191,20191,20190,19309,20110,2460,20184,,
top,369914e87f682579eb3a5efeb43dc0184a88b5d6,Background: Transmission of respiratory infect...,"In a global world, knowledge of imported infec...","Decaro, Nicola. Mari, Viviana, et al.",Index,PLoS One,Virus infection is a complex biological<br>ph...,,
freq,1,1,2,7,9,1503,2,,
mean,,,,,,,,226.567382,4611.755683
std,,,,,,,,168.584552,5642.990019
min,,,,,,,,0.0,1.0
25%,,,,,,,,149.0,2748.0
50%,,,,,,,,203.0,3860.0
75%,,,,,,,,262.0,5456.0


In [11]:
df_covid.info()
df_covid.dropna(inplace=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20191 entries, 0 to 27677
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   paper_id             20191 non-null  object
 1   abstract             20191 non-null  object
 2   body_text            20191 non-null  object
 3   authors              19953 non-null  object
 4   title                20183 non-null  object
 5   journal              19416 non-null  object
 6   abstract_summary     20191 non-null  object
 7   abstract_word_count  20191 non-null  int64 
 8   body_word_count      20191 non-null  int64 
dtypes: int64(2), object(7)
memory usage: 1.5+ MB


In [12]:
import re

for col in df_covid.columns:
    # remove punctuation, make all lower case
    try:
        df_covid[col] = df_covid[col].str.lower()
        df_covid[col] = df_covid[col].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',str(x)))

    except Exception as e:
        print(e)

Can only use .str accessor with string values!
Can only use .str accessor with string values!


In [13]:
df_covid.head()

Unnamed: 0,paper_id,abstract,body_text,authors,title,journal,abstract_summary,abstract_word_count,body_word_count
0,b4db11ef895a9b39990f3b969f7ee8e9c4588ead,background little is known about antiviral res...,chronic rhinosinusitis crs is a heterogenous m...,hwang jae woong lee ki jeong et al,decreased expression of type i ifn and typebr...,journal of allergy and clinical immunology,background little is known about antiviralbrr...,264,6024
1,253d52a717629685ceadeab5430aedaaaee99566,,retroviruses are a unique class of viruses tha...,hatfield dolph l levin judith g et al,translational suppression in retroviral geneb...,advances in virus research,not provided,0,15139
2,88fd7896ce3b5dc6a078ed9a4bfa8f3825a33126,the 2003 severe acute respiratory syndrome sar...,restaurants in hong kong have already been put...,tse alan cb so stella et al,crisis management and recovery howbrrestauran...,international journal of hospitality management,the 2003 severe acute respiratory syndromebrs...,79,2630
3,8053681913fd4f6115645624eda127b533532e1b,negativepressure isolation rooms are required ...,summary negativepressure isolation rooms are r...,walker jt hoffman p et al,hospital and community acquired infection and...,journal of hospital infection,negativepressure isolation rooms arebrrequire...,188,3298
4,9f43b3d3fa582e9990f290c0ddf7696503f3e46a,objectives the rapid emergence of drugresistan...,shigella is a major gastrointestinal pathogen ...,gu bing xu ting et al,a 10year surveillance of antimicrobialbrsusce...,journal of global antimicrobial resistance,objectives the rapid emergence ofbrdrugresist...,245,3045


# 2-Grams

In [14]:
text = df_covid[["body_text"]].reset_index(drop = True)
text_arr = text.stack().tolist()

In [None]:
words = []
for i in range(0, len(text_arr)):
    words.append(text_arr[i].split(" "))

In [None]:
words[0][:10]

In [None]:
n_gram_all = []

for word in words:
    # get n-grams for the instance
    n_gram = []
    for i in range(len(word)-2+1):
        n_gram.append("".join(word[i:i+2]))
    n_gram_all.append(n_gram)

In [None]:
n_gram_all[0][:10]

In [None]:
type(words[0])