In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
import random
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.corpora import Dictionary
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 100)

In [2]:
documents_dir='../input/CORD-19-research-challenge/document_parses/pdf_json/'
filenames = os.listdir(documents_dir)
print("Number of documents :", len(filenames))

Number of documents : 401214


In [3]:
file = json.load(open('../input/CORD-19-research-challenge/document_parses/pdf_json/0000028b5cc154f68b8a269f6578f21e31f62977.json', 'rb'))

In [4]:
pprint(file["metadata"]["title"])

'"Multi-faceted" COVID-19: Russian experience'


Data Cleaning

In [5]:
def clean(text):
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    return text

Removing stopwords and tokens

In [6]:
def remove_stopwords_and_tokenize(text):
    my_stopwords = set(stopwords.words("english"))
    tokens = word_tokenize(text)  # tokenize 
    tokens = [t for t in tokens if not t in my_stopwords]  # Remove stopwords
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens

In [7]:
def parse_body_text(body_text):
    body =""
    for item in body_text:
        body += item["section"]
        body += "\n\n"
        body += item["text"]
        body += "\n\n"
    body=clean(body)
    tokens=remove_stopwords_and_tokenize(body)
    return body,tokens

In [8]:
all_text = []
all_tokens=[]
all_titles=[]
for i,filename in enumerate(filenames[:1000]):
    filepath = documents_dir + filename
    file = json.load(open(filepath, 'rb'))
    text,tokens=parse_body_text(file["body_text"])
    all_text.append(text)
    all_tokens.append(tokens)
    all_titles.append(file["metadata"]["title"])

In [9]:
data=pd.DataFrame()
data['text']=all_text
data['tokens']=all_tokens
data['doc_id']=filenames[:1000]
data['title']=all_titles
del all_text,all_tokens,all_titles

In [10]:
data.head(2)

Unnamed: 0,text,tokens,doc_id,title
0,expected deaths by year represented by blue squares plotted against observed fatalities depicte...,"[expected, deaths, year, represented, blue, squares, plotted, observed, fatalities, depicted, bl...",8f97e16f3842e4bbd2d5d1c0c95ac1e31993ec68.json,An update on excess mortality in the second year of the COVID-19 pandemic in Germany Ein Update ...
1,covidnegative psychiatric units mitigating sequelae of pandemic isolation research letter to the...,"[covidnegative, psychiatric, units, mitigating, sequelae, pandemic, isolation, research, letter,...",8187ea360c53a56ca2c579d758a5d6aa67716836.json,


**Applying the LDA model**

In [11]:
dictionary = Dictionary(data["tokens"])
dictionary.filter_extremes(no_below = 20, no_above = 0.5)

In [12]:
corpus = [dictionary.doc2bow(doc) for doc in data["tokens"]]

In [13]:
from gensim.models import LdaModel

# Build LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20, random_state=100,
                chunksize=200, passes=100)

In [14]:
lda_model.print_topics()[:5]

[(0,
  '0.054*"cells" + 0.025*"cell" + 0.013*"mice" + 0.013*"expression" + 0.010*"il" + 0.009*"immune" + 0.006*"effects" + 0.006*"activation" + 0.006*"cancer" + 0.006*"activity"'),
 (1,
  '0.052*"vaccine" + 0.032*"sarscov" + 0.030*"vaccination" + 0.021*"vaccines" + 0.017*"antibody" + 0.016*"antibodies" + 0.011*"igg" + 0.010*"responses" + 0.009*"vaccinated" + 0.008*"neutralizing"'),
 (2,
  '0.021*"model" + 0.008*"models" + 0.007*"set" + 0.007*"network" + 0.007*"figure" + 0.006*"proposed" + 0.006*"process" + 0.005*"values" + 0.005*"function" + 0.005*"parameters"'),
 (3,
  '0.214*"de" + 0.125*"la" + 0.059*"en" + 0.054*"et" + 0.049*"des" + 0.046*"le" + 0.036*"un" + 0.028*"il" + 0.024*"con" + 0.019*"al"'),
 (4,
  '0.029*"survey" + 0.029*"respondents" + 0.025*"food" + 0.016*"participants" + 0.014*"online" + 0.013*"knowledge" + 0.013*"satisfaction" + 0.012*"behavior" + 0.012*"perceived" + 0.011*"service"')]

In [15]:
lda_model[corpus][0]

[(2, 0.19270805),
 (5, 0.25132352),
 (8, 0.11034494),
 (11, 0.28510663),
 (19, 0.15733667)]

**Final Results**

In [16]:
def get_document_topic_table(lda_model, corpus, texts = data):
    document_topic_df = pd.DataFrame()
    
      # Get main topic in each document
    for i, row_list in enumerate(lda_model[corpus]):           
        row = sorted(row_list, key=lambda x: (x[1]), reverse=True)
        topic_num=row[0][0]
        prop_topic=row[0][1]
        wp = lda_model.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        document_topic_df.at[i,'best_topic']=topic_num
        document_topic_df.at[i,'prop_topic']=prop_topic
        document_topic_df.at[i,'topic_keywords']=topic_keywords
        document_topic_df.at[i,'document_num']=i
    return document_topic_df

document_topic_df = get_document_topic_table(lda_model=lda_model, corpus=corpus, texts=data["tokens"])

document_topic_df.head(2)


Unnamed: 0,best_topic,prop_topic,topic_keywords,document_num
0,11.0,0.285137,"social, work, people, public, example, us, way, like, life, context",0.0
1,11.0,0.227459,"social, work, people, public, example, us, way, like, life, context",1.0
