This project aims to understand the concept of "topic modeling" using the LDA (Latent Dirichlet Allocation) algorithm. This algorithm is used to assign the text in a document to a particular topic.

In [None]:
#import required libraries.
import os
import re
import numpy as np 
import pandas as pd 
import json
from pprint import pprint
import random
import string 

#To split the text into tokens or words.
from nltk import word_tokenize

#Stopwords are --> 'a','the'...etc
from nltk.corpus import stopwords

#maps words in document to unique integer IDs.
from gensim.corpora import Dictionary

#Raise pandas' display limits: up to 100 rows, 100 columns and
#100 characters per cell.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 100)


In [None]:
#Directory that holds the parsed papers (each one is later loaded as a JSON file).
documents_dir='../input/CORD-19-research-challenge/document_parses/pdf_json/'
#Names (not full paths) of every entry in that directory.
filenames = os.listdir(documents_dir)
print("Number of documents :", len(filenames))

In [None]:
#os.listdir() returns entries in arbitrary order, so sort first to get a stable
#starting point and then shuffle with a fixed seed. This makes the sample of
#documents taken from the front of `filenames` identical from run to run.
#A private Random instance is used so the global random state is left untouched.
filenames.sort()
random.Random(100).shuffle(filenames)

In [None]:
#Load one sample paper; the with-block closes the file handle as soon as the
#JSON has been parsed instead of leaving it open.
with open('../input/CORD-19-research-challenge/document_parses/pdf_json/0000028b5cc154f68b8a269f6578f21e31f62977.json', 'rb') as sample_fp:
    file = json.load(sample_fp)

In [None]:
#Show the title stored in the metadata of the sample paper loaded above.
pprint(file["metadata"]["title"])

**Step 1 : Data Cleaning**

In [None]:
#Patterns used by clean(); compiled once instead of on every call.
_SQUARE_BRACKETED_RE = re.compile(r'\[.*?\]')
_PARENTHESISED_RE = re.compile(r'\(.*?\)')
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')
_ELLIPSIS_RE = re.compile(r"\w+…|…")
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
_WHITESPACE_RE = re.compile(r"\s+")


#function to clean(preprocess) the text
def clean(text):
    """Normalise raw text before tokenization.

    The input is converted with ``str()`` and lower-cased, then the
    following are removed, in this order:

    1. ``[...]`` spans, brackets and content (e.g. citation markers);
    2. ``(...)`` spans, parentheses and content;
    3. every word that contains at least one digit;
    4. the ellipsis character together with the word directly before it;
    5. all characters in ``string.punctuation``.

    Bracketed and parenthesised spans are only matched within a single line,
    because ``.`` does not match a newline. Removed punctuation is replaced
    by nothing, so hyphenated words are joined ("covid-19" becomes "covid").

    Whitespace is collapsed last, so the removals above cannot leave runs of
    spaces or leading/trailing blanks in the result.

    Args:
        text: any object; it is converted with ``str()``.

    Returns:
        The cleaned, lower-case string with single spaces between words.
    """
    text = str(text).lower()
    text = _SQUARE_BRACKETED_RE.sub('', text)
    text = _PARENTHESISED_RE.sub('', text)
    text = _WORD_WITH_DIGIT_RE.sub('', text)
    text = _ELLIPSIS_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)
    #Collapse whitespace only after all removals, otherwise the gaps they
    #leave behind survive in the output.
    return _WHITESPACE_RE.sub(' ', text).strip()




In [None]:
#Tokenize the text, then drop stopwords and one-character tokens.
def remove_stopwords_and_tokenize(text):
    """Split ``text`` into tokens and keep only the informative ones.

    A token is kept when it is not in NLTK's English stopword list and is
    longer than one character.

    Args:
        text: string to tokenize.

    Returns:
        List of the remaining tokens, in their original order.
    """
    english_stopwords = set(stopwords.words("english"))
    return [
        token
        for token in word_tokenize(text)
        if token not in english_stopwords and len(token) > 1
    ]


In [None]:
#Build the cleaned text and the token list of one paper.
def parse_body_text(body_text):
    """Flatten a paper's body into cleaned text plus its tokens.

    Args:
        body_text: iterable of mappings, each with a "section" and a "text"
            string.

    Returns:
        Tuple ``(body, tokens)``: the cleaned concatenation of every section
        heading and paragraph, and the tokens extracted from it.
    """
    #Every heading and paragraph is followed by a blank line.
    pieces = [
        item["section"] + "\n\n" + item["text"] + "\n\n"
        for item in body_text
    ]
    cleaned_body = clean("".join(pieces))
    return cleaned_body, remove_stopwords_and_tokenize(cleaned_body)


In [None]:
#Parse the first 1000 files: keep the cleaned text, the tokens and the title
#of every paper in three parallel lists.
all_text = []
all_tokens = []
all_titles = []
for filename in filenames[:1000]:
    filepath = documents_dir + filename
    #The with-block closes each file right after parsing, so a thousand
    #handles are not left open.
    with open(filepath, 'rb') as fp:
        file = json.load(fp)
    text, tokens = parse_body_text(file["body_text"])
    all_text.append(text)
    all_tokens.append(tokens)
    all_titles.append(file["metadata"]["title"])
    

In [None]:
#Collect the extracted data in a DataFrame, one row per document.
data = pd.DataFrame({
    'text': all_text,
    'tokens': all_tokens,
    'doc_id': filenames[:1000],
    'title': all_titles,
})
#The lists are no longer needed once they are held by the DataFrame.
del all_text, all_tokens, all_titles

#Sanity check: there must be at least one document and no document may have
#an empty token list.
if data['tokens'].empty or not all(data['tokens']):
    print("Tokens are empty!!")
else:
    print("Tokens are populated correctly!")

In [None]:
#Preview the first two rows of the assembled DataFrame.
data.head(2)

**Step 2 : Apply LDA model**

In [None]:
#Create a dictionary that maps every distinct token of the documents to an integer id.
dictionary = Dictionary(data["tokens"])

#Drop the tokens that occur in fewer than 20 documents or in more than 50% of the documents.
dictionary.filter_extremes(no_below = 20, no_above = 0.5)

In [None]:
#Bag-of-words representation of the documents: one doc2bow() result per token list.
corpus = [dictionary.doc2bow(doc) for doc in data["tokens"]]

In [None]:
from gensim.models import LdaModel

#Train the LDA model on the bag-of-words corpus: 20 topics, 100 passes,
#200 documents per chunk. random_state is fixed for reproducible training.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    random_state=100,
    chunksize=200,
    passes=100,
)

In [None]:
#Display the first five topics identified by the LDA model.
#Each entry starts with an index that identifies the topic.
#Every word of a topic is assigned a probability, which defines the importance of the word in that topic.
lda_model.print_topics()[:5]

In [None]:
#Topic distribution of the first document (index 0 of the corpus).
lda_model[corpus][0]

**Step 3 : Results**

In [None]:
#Document - Topic Table
def get_document_topic_table(lda_model, corpus, texts=None):
    """Build a table holding the dominant topic of every document.

    Args:
        lda_model: trained topic model. ``lda_model[corpus]`` must yield, for
            each document, a sequence of ``(topic_id, probability)`` pairs,
            and ``lda_model.show_topic(topic_id)`` must return
            ``(word, weight)`` pairs.
        corpus: the documents to score, in the form the model expects.
        texts: unused; accepted only so that existing calls passing it keep
            working. It defaults to ``None`` so that defining the function
            does not depend on any global object.

    Returns:
        DataFrame with one row per document (index 0..n-1) and the columns
        ``best_topic`` (id of the most probable topic), ``prop_topic`` (its
        probability), ``topic_keyboards`` (comma-separated words of that
        topic) and ``document_num`` (position of the document in ``corpus``).
        ``best_topic`` and ``document_num`` are stored as integers.

    Raises:
        ValueError: if the model returns no topic at all for a document.
    """
    #NOTE: 'topic_keyboards' looks like a typo for 'topic_keywords'; the name
    #is kept because other code may already read this column.
    columns = ['best_topic', 'prop_topic', 'topic_keyboards', 'document_num']

    #Keyword string per topic, computed once per topic rather than per document.
    keywords_by_topic = {}
    records = []
    for document_num, topic_probs in enumerate(lda_model[corpus]):
        #max() returns the first of several equally probable topics, which is
        #also what a stable descending sort would put first.
        best = max(topic_probs, key=lambda pair: pair[1])
        topic_num = best[0]
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in lda_model.show_topic(topic_num)
            )
        records.append(
            (topic_num, float(best[1]), keywords_by_topic[topic_num], document_num)
        )

    #Build the frame in one go instead of enlarging it cell by cell.
    return pd.DataFrame(records, columns=columns)

#Build the document-topic table for the whole corpus (one row per document).
document_topic_df = get_document_topic_table(lda_model=lda_model, corpus=corpus, texts=data["tokens"])

In [None]:
#Preview the first two rows of the document-topic table.
document_topic_df.head(2)

**Recommend 'k' Topics**

In [None]:
def get_topic_id(doc_id):
    """Return the dominant topic of the document identified by ``doc_id``.

    The first row of ``data`` whose "doc_id" equals ``doc_id`` is located and
    the "best_topic" entry of ``document_topic_df`` at the same index label
    is returned. If no row matches, -1 is returned.
    """
    matching_labels = data.index[data["doc_id"] == doc_id]
    if len(matching_labels) == 0:
        return -1
    return document_topic_df["best_topic"][matching_labels[0]]

def get_matching_topics_docs(topic_id):
    """List the documents whose dominant topic is ``topic_id``.

    Returns:
        List of ``(topic_id, prop_topic, index_label)`` tuples, one for each
        row of ``document_topic_df`` whose "best_topic" equals ``topic_id``,
        in row order.
    """
    return [
        (topic_id, doc_row["prop_topic"], label)
        for label, doc_row in document_topic_df.iterrows()
        if doc_row["best_topic"] == topic_id
    ]
    
def get_top_k_topics(matched_topics,k):
    """Turn the ``k`` most probable matches into a DataFrame.

    Args:
        matched_topics: iterable of ``(topic_id, topic_prop, doc_num)``
            tuples.
        k: number of entries to keep.

    Returns:
        DataFrame with the columns "doc_id", "topic_id", "topic_prop" and
        "title", one row per kept entry, ordered by descending
        ``topic_prop``. "doc_id" and "title" are read from ``data`` at
        ``doc_num``. The kept tuples are also printed.
    """
    ranked = sorted(matched_topics, key=lambda entry: entry[1], reverse=True)
    best_k = ranked[:k]
    print(best_k)
    k_topics_df = pd.DataFrame(columns=["doc_id","topic_id","topic_prop","title"])
    for row_num, (topic_id, topic_prop, doc_num) in enumerate(best_k):
        k_topics_df.at[row_num, 'doc_id'] = data["doc_id"][doc_num]
        k_topics_df.at[row_num, 'topic_id'] = topic_id
        k_topics_df.at[row_num, 'topic_prop'] = topic_prop
        k_topics_df.at[row_num, 'title'] = data["title"][doc_num]
    return k_topics_df

def recommend_k_topics(doc_id,k):
    """Recommend up to ``k`` documents that share the dominant topic of ``doc_id``.

    Returns:
        The DataFrame produced by ``get_top_k_topics`` for the documents with
        the same dominant topic, or ``None`` when ``get_topic_id`` reports
        that ``doc_id`` is unknown (-1).
    """
    topic_id = get_topic_id(doc_id)
    if topic_id == -1:
        return None
    return get_top_k_topics(get_matching_topics_docs(topic_id), k)
    
    
#Recommend up to 5 documents for the given paper. The result is None when this
#doc_id is not among the documents loaded into `data`.
k_topics_df=recommend_k_topics('328401206bf2e3657e352ad5c5a2e566cc09736d.json',5)

In [None]:
#Display the recommendations computed above.
k_topics_df