In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this

# nltk
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# <font face="verdana" color="dodgerblue">How do we extract the hidden topics from large volumes of text?</font>
### <font face="verdana" color="red"> Dataset Description </font>
In response to the COVID-19 pandemic, the White House and a coalition of leading research groups have prepared the COVID-19 Open Research Dataset (CORD-19). CORD-19 is a resource of over 44,000 scholarly articles, including over 29,000 with full text, about COVID-19, SARS-CoV-2, and related coronaviruses. This freely available dataset is provided to the global research community to apply recent advances in natural language processing and other AI techniques to generate new insights in support of the ongoing fight against this infectious disease. There is a growing urgency for these approaches because of the rapid acceleration in new coronavirus literature, making it difficult for the medical research community to keep up.

### Contents
* Exploration of Text In Articles 
* Tokenize words and Clean-up text
* Creating Bigram and Trigram Models
* Remove Stopwords, Make Bigrams and Lemmatize
* Create the Dictionary and Corpus needed for Topic Modeling
* Building the Topic Model
* View the topics in LDA model
* Compute Model Perplexity and Coherence Score
* Visualize the topics-keywords
* Building LDA Mallet Model
* How to find the optimal number of topics for LDA?
* Finding the dominant topic in each sentence
* Find the most representative document for each topic
* Topic distribution across documents

# <font face="verdana" color="dodgerblue"> Exploration of Text In Articles </font>

I load the output files from **[xhlulu's kernel](https://www.kaggle.com/xhlulu/cord-19-eda-parse-json-and-generate-clean-csv)**, which converts the nested JSON article files into clean, CSV-readable tables. Go check it out, give it some credit and upvote the kernel!

In [None]:
# Read the pre-parsed article table and substitute a placeholder string for
# every missing value, as a single chained expression.
# NOTE(review): the variable is named `biorxiv` but the file read here is
# clean_comm_use.csv — confirm which subset is intended.
biorxiv = (
    pd.read_csv("/kaggle/input/cord-19-eda-parse-json-and-generate-clean-csv/clean_comm_use.csv")
    .fillna("No Information")
)
biorxiv.head()

In [None]:
# English stop words, held in a set; clean_text() below tests tokens against it.
# NOTE: a later cell rebinds `stop_words` to a list extended with 'et' and 'al',
# so clean_text() sees that list if it is called after that cell has run.
stop_words = set(stopwords.words("english"))

def clean_text(s):
    """Normalise raw text into a space-separated string of alphabetic tokens.

    Steps, in order: lower-case; drop square-bracketed spans; drop URLs; drop
    HTML-like tags; strip ASCII punctuation; turn newlines into spaces; drop
    every token containing a digit; tokenize with ``word_tokenize``; discard
    tokens found in the module-level ``stop_words`` and tokens that are not
    purely alphabetic.

    Parameters
    ----------
    s : object
        Text to clean. It is converted with ``str()`` first, so non-string
        values are accepted.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    words = str(s).lower()
    # Raw-string patterns: backslashes reach the regex engine verbatim instead
    # of relying on Python passing unknown string escapes through.
    words = re.sub(r'\[.*?\]', '', words)
    words = re.sub(r'https?://\S+|www\.\S+', '', words)
    words = re.sub(r'<.*?>+', '', words)
    words = re.sub('[%s]' % re.escape(string.punctuation), '', words)
    # Replace newlines with a space rather than deleting them; deleting glued
    # the last word of one line onto the first word of the next ("endstart").
    words = re.sub(r'\n', ' ', words)
    words = re.sub(r'\w*\d\w*', '', words)
    tokens = word_tokenize(words)
    tokens = [w for w in tokens if w not in stop_words and w.isalpha()]
    return ' '.join(tokens)

#source: https://www.kaggle.com/shahules/basic-eda-cleaning-and-glove
def _get_top_ngrams(corpus, ngram_size, n=None):
    """Count word n-grams of one fixed size over `corpus`, most frequent first.

    Shared implementation behind the four public ``get_top_*`` helpers.

    Parameters
    ----------
    corpus : iterable of str
        Documents handed to ``CountVectorizer``.
    ngram_size : int
        Number of words per n-gram; used as both bounds of ``ngram_range``.
    n : int or None
        Keep only the first `n` pairs; ``None`` keeps all of them.

    Returns
    -------
    list of (str, count) tuples
        Sorted by descending count. Ties keep the iteration order of the
        vectorizer's ``vocabulary_`` mapping.
    """
    vec = CountVectorizer(ngram_range=(ngram_size, ngram_size))
    # Learn the vocabulary and build the document-term matrix in one pass
    # instead of a separate fit() followed by transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column sums of the document-term matrix: total occurrences per n-gram.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


def get_top_unigrams(corpus, n=None):
    """Return the `n` most frequent single words in `corpus` as (word, count) pairs."""
    return _get_top_ngrams(corpus, 1, n)


def get_top_bigrams(corpus, n=None):
    """Return the `n` most frequent two-word sequences in `corpus` as (bigram, count) pairs."""
    return _get_top_ngrams(corpus, 2, n)


def get_top_threegrams(corpus, n=None):
    """Return the `n` most frequent three-word sequences in `corpus` as (trigram, count) pairs."""
    return _get_top_ngrams(corpus, 3, n)


def get_top_fourgrams(corpus, n=None):
    """Return the `n` most frequent four-word sequences in `corpus` as (fourgram, count) pairs."""
    return _get_top_ngrams(corpus, 4, n)

## <font face="verdana" color="dodgerblue">Ngrams Analysis</font>
In the fields of computational linguistics and probability, an n-gram is a contiguous sequence of n items from a given sample of text or speech. The items can be phonemes, syllables, letters, words or base pairs according to the application. The n-grams typically are collected from a text or speech corpus [source](https://en.wikipedia.org/wiki/N-gram).
<p>Ngrams can be a very useful tool when trying to figure out which words and phrases are used in English. They can help show when certain phrases entered into the vernacular, and when they fell out of favor. But they have their limitations.</p>

### <font face="verdana" color="red">Most Common Words in Title</font>

In [None]:
%%time
title = biorxiv['title'].apply(clean_text)

plt.style.use('ggplot')
fig, axes = plt.subplots(2, 2, figsize=(18, 20), dpi=100)

# One panel per n-gram size: (extractor, bar colour, label used in the panel title).
ngram_panels = [
    (get_top_unigrams, 'dodgerblue', 'unigrams'),
    (get_top_bigrams, 'orangered', 'bigrams'),
    (get_top_threegrams, 'limegreen', 'threegrams'),
    (get_top_fourgrams, 'red', 'fourgrams'),
]

# axes.flat walks the 2x2 grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for panel_ax, (extract_top, bar_color, ngram_label) in zip(axes.flat, ngram_panels):
    # Split the (ngram, count) pairs into two parallel lists for plotting.
    ngrams, counts = map(list, zip(*extract_top(title, n=20)))
    sns.barplot(x=counts, y=ngrams, ax=panel_ax, color=bar_color)
    panel_ax.set_ylabel(' ')
    panel_ax.yaxis.set_tick_params(labelsize=15)
    panel_ax.set_title('Top 20 most common {} in title'.format(ngram_label), fontsize=15)

plt.tight_layout()
plt.show()

### <font face="verdana" color="red">Most Common Words in Abstract</font>

In [None]:
abstract = biorxiv['abstract'].apply(clean_text)

plt.style.use('ggplot')
fig, axes = plt.subplots(2, 2, figsize=(18, 20), dpi=100)

# One panel per n-gram size: (extractor, bar colour, label used in the panel title).
ngram_panels = [
    (get_top_unigrams, 'dodgerblue', 'unigrams'),
    (get_top_bigrams, 'orangered', 'bigrams'),
    (get_top_threegrams, 'limegreen', 'threegrams'),
    (get_top_fourgrams, 'red', 'fourgrams'),
]

# axes.flat walks the 2x2 grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for panel_ax, (extract_top, bar_color, ngram_label) in zip(axes.flat, ngram_panels):
    # Split the (ngram, count) pairs into two parallel lists for plotting.
    ngrams, counts = map(list, zip(*extract_top(abstract, n=20)))
    sns.barplot(x=counts, y=ngrams, ax=panel_ax, color=bar_color)
    panel_ax.set_ylabel(' ')
    panel_ax.yaxis.set_tick_params(labelsize=15)
    panel_ax.set_title('Top 20 most common {} in abstract'.format(ngram_label), fontsize=15)

# Lay the figure out once, after every panel has been drawn.
plt.tight_layout()
plt.show()

### <font face="verdana" color="red">Most Common Words in Text Body</font>

In [None]:
# Cleaned article bodies as a pandas Series. The word-count histogram cell
# further down reads this Series before `text` is rebound to a raw list.
text = biorxiv['text'].apply(clean_text)

plt.style.use('ggplot')
fig, axes = plt.subplots(2, 2, figsize=(18, 20), dpi=100)

# One panel per n-gram size: (extractor, bar colour, label used in the panel title).
ngram_panels = [
    (get_top_unigrams, 'dodgerblue', 'unigrams'),
    (get_top_bigrams, 'orangered', 'bigrams'),
    (get_top_threegrams, 'limegreen', 'threegrams'),
    (get_top_fourgrams, 'red', 'fourgrams'),
]

# axes.flat walks the 2x2 grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for panel_ax, (extract_top, bar_color, ngram_label) in zip(axes.flat, ngram_panels):
    # Split the (ngram, count) pairs into two parallel lists for plotting.
    ngrams, counts = map(list, zip(*extract_top(text, n=20)))
    sns.barplot(x=counts, y=ngrams, ax=panel_ax, color=bar_color)
    panel_ax.set_ylabel(' ')
    panel_ax.yaxis.set_tick_params(labelsize=15)
    panel_ax.set_title('Top 20 most common {} in text'.format(ngram_label), fontsize=15)

# Lay the figure out once, after every panel has been drawn.
plt.tight_layout()
plt.show()

### <font face="verdana" color="red">Distribution of Word Counts</font>

In [None]:
plt.style.use('fivethirtyeight')
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(18, 5), dpi=100)

# Histogram of the number of cleaned tokens per document, one panel per field.
# `title`, `abstract` and `text` must still be the cleaned pandas Series built
# in the n-gram cells above (a later cell rebinds `text` to a plain list).
for panel_ax, cleaned, panel_title in (
    (ax1, title, 'Title'),
    (ax2, abstract, 'Abstract'),
    (ax3, text, 'Text'),
):
    length = cleaned.str.split().map(len)
    panel_ax.hist(length, bins=20, color='black')
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Number of words')
    panel_ax.set_ylabel('Number of documents')

plt.tight_layout()
plt.show()

# <font face="verdana" color="dodgerblue">Topic Modeling with Gensim</font>
*Topic Modeling is a technique to extract the hidden topics from large volumes of text. Latent Dirichlet Allocation(LDA) is a popular algorithm for topic modeling with excellent implementations in the Python’s Gensim package. The challenge, however, is **how to extract good quality of topics that are clear, segregated and meaningful.** This depends heavily on the quality of text preprocessing and the strategy of finding the optimal number of topics. This tutorial attempts to tackle both of these problems.*

<p>I will be using the Latent Dirichlet Allocation (LDA) from Gensim package along with the Mallet’s implementation (via Gensim). Mallet has an efficient implementation of the LDA. It is known to run faster and gives better topics segregation.</p>
<p>Let’s begin!</p>

### <font face="verdana" color="red">Tokenize words and Clean-up text</font>
Let’s tokenize each sentence into a list of words, removing punctuations and unnecessary characters altogether.

Gensim’s simple_preprocess() is great for this. Additionally, I have set deacc=True, which strips accent marks from the tokens; punctuation is already dropped by the tokenization itself.

<div class="alert alert-info" role="alert">
OPEN THE HIDDEN BOX TO SEE THE OUTPUT
</div>

In [None]:
# Convert the raw (uncleaned) article bodies into a plain Python list.
# NOTE: this rebinds `text`, which until now held the cleaned pandas Series.
text = biorxiv['text'].tolist()
print(text[:1])

In [None]:
# cleaning the text
# Applied to every document in a single pass, in this order:
#   1. remove e-mail-like tokens (anything containing '@') and one trailing whitespace character,
#   2. collapse every run of whitespace into a single space,
#   3. delete single-quote characters.
# The patterns are raw strings so that \S and \s reach the regex engine
# verbatim rather than as invalid Python string escapes.
text = [
    re.sub(r'\s+', ' ', re.sub(r'\S*@\S*\s?', '', doc)).replace("'", "")
    for doc in text
]

print(text[:1])

In [None]:
# Tokenize words
def sent_to_words(sentences):
    """Lazily yield one token list per item of `sentences`.

    Each item is converted with ``str()`` and tokenized by
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    # deacc=True asks gensim to strip accent marks from the tokens
    # (punctuation is dropped by the tokenization itself).
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )

# Run the generator to completion: one token list per document.
text_words = list(sent_to_words(text))

# Show only the first document's tokens.
print(text_words[:1])

### <font face="verdana" color="red">Creating Bigram and Trigram Models</font>
Bigrams are two words that frequently occur together in the document. Trigrams are three words that frequently occur together.

Gensim’s Phrases model can build and implement the bigrams, trigrams, quadgrams and more. The two important arguments to Phrases are min_count and threshold. The higher the values of these parameters, the harder it is for words to be combined into bigrams.

In [None]:

"""Build the bigram and trigram models"""
# The training code is disabled by default; the next cell loads previously
# exported models from disk instead (presumably because training is slow).
# Set the flag to True to retrain. The models are then saved under
# /kaggle/working, which is NOT the folder the next cell loads from.
TRAIN_PHRASE_MODELS = False

if TRAIN_PHRASE_MODELS:
    bigram = gensim.models.Phrases(text_words, min_count=5, threshold=100)  # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[text_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # See trigram example
    print(trigram_mod[bigram_mod[text_words[0]]])

    # Save an exported collocation model.
    bigram_mod.save("/kaggle/working/my_bigram_model.pkl")
    trigram_mod.save("/kaggle/working/my_trigram_model.pkl")


In [None]:
"""load an exported collocation model"""
# Load both exported phrase models in order: bigram first, then trigram.
# NOTE(review): the files are .pkl, so loading presumably unpickles them —
# only load model files from a trusted source.
bigram_reloaded, trigram_reloaded = (
    gensim.models.phrases.Phraser.load(model_path)
    for model_path in (
        "../input/bi-and-tri-model/my_bigram_model.pkl",
        "../input/bi-and-tri-model/my_trigram_model.pkl",
    )
)
# Sanity check: apply bigram, then trigram, merging to the first document.
print(trigram_reloaded[bigram_reloaded[text_words[0]]])

### <font face="verdana" color="red">Remove Stopwords, Make Bigrams and Lemmatize</font>
The bigrams model is ready. Let’s define the functions to remove the stopwords, make bigrams and lemmatization and call them sequentially.


<div class="alert alert-info" role="alert">
OPEN THE OUTPUT BOX TO SEE THE HIDDEN OUTPUT
</div>

In [None]:
# NLTK stop words plus 'et' and 'al' (presumably left over from "et al." citations).
# NOTE: this rebinds `stop_words`, which an earlier cell defined as a set, to a list.
from nltk.corpus import stopwords
stop_words = stopwords.words('english') + ['et', 'al']

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token found in `stop_words`.

    Parameters
    ----------
    texts : iterable
        Documents. Each one is converted with ``str()`` and passed through
        ``simple_preprocess`` before filtering.

    Returns
    -------
    list of list
        One filtered token list per input document, in input order.
    """
    # Snapshot the module-level stop words into a set once per call, so each
    # token costs a hash lookup instead of a scan of the whole list.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the reloaded bigram phrase model to every document in `texts`.

    Returns a list with one transformed document per input document.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_reloaded[tokens])
    return phrased_docs

def make_trigrams(texts):
    """Apply the bigram model and then the trigram model to every document in `texts`.

    Returns a list with one transformed document per input document.
    """
    phrased_docs = []
    for tokens in texts:
        with_bigrams = bigram_reloaded[tokens]
        phrased_docs.append(trigram_reloaded[with_bigrams])
    return phrased_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only tokens with an allowed part of speech.

    Each document's tokens are joined with spaces and run through the
    module-level spaCy pipeline ``nlp``, which must exist before this is called.
    Tag names: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents.
    allowed_postags : list of str
        Values of ``token.pos_`` to keep.

    Returns
    -------
    list of list of str
        For each document, the ``lemma_`` of every kept token.
    """
    def lemmas_of(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [lemmas_of(tokens) for tokens in texts]

In [None]:
# Remove Stop Words
text_words_nostops = remove_stopwords(text_words)

# Form Bigrams
text_words_bigrams = make_bigrams(text_words_nostops)

# Initialize the spaCy English pipeline with the parser and NER components
# disabled (for efficiency); lemmatization() only reads token.lemma_ and token.pos_.
# 'en' is a shortcut link that exists only on spaCy 2.x; spaCy 3 removed
# shortcuts and raises OSError for it, so fall back to the full package name
# that the 'en' shortcut conventionally pointed to.
try:
    nlp = spacy.load('en', disable=['parser', 'ner'])
except OSError:
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
text_words_lemmatized = lemmatization(text_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(text_words_lemmatized[:1])

### <font face="verdana" color="red">Create the Dictionary and Corpus needed for Topic Modeling</font>

<p>The two main inputs to the LDA topic model are the dictionary(id2word) and the corpus. Let’s create them. </p>
<p>Gensim creates a unique id for each word in the document. The corpus produced below is, for each document, a list of (word_id, word_frequency) pairs.

For example, (0, 1) implies that word id 0 occurs once in the first document. Likewise, word id 1 occurs twice and so on.

This is used as the input by the LDA model.

If you want to see what word a given id corresponds to, pass the id as a key to the dictionary.</p>


In [None]:
# Create Dictionary from the lemmatized documents
id2word = corpora.Dictionary(text_words_lemmatized)

# Create Corpus (alias of the lemmatized documents)
texts = text_words_lemmatized

# Term Document Frequency: one bag-of-words per document
corpus = list(map(id2word.doc2bow, texts))

# View the first document's bag-of-words
print(corpus[:1])

In [None]:
# Human readable format of corpus (term-frequency): swap each id for its word.
# Uses `token_id` rather than `id` so the builtin is not shadowed.
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

Alright, without digressing further let’s jump back on track with the next step: Building the topic model.

### <font face="verdana" color="red">Building the Topic Model</font>
We have everything required to train the LDA model. In addition to the corpus and dictionary, you need to provide the number of topics as well.

Apart from that, alpha and eta are hyperparameters that affect the sparsity of the topics. According to the Gensim docs, both default to a prior of 1.0/num_topics.

chunksize is the number of documents to be used in each training chunk. update_every determines how often the model parameters should be updated and passes is the total number of training passes.

Source: https://radimrehurek.com/gensim/models/ldamodel.html

In [None]:
"""Build LDA model"""
# Training is disabled by default; the next code cell loads an already trained
# model from disk instead. Set the flag to True to retrain. The model is then
# saved under /kaggle/working, which is NOT the folder the next cell loads from.
TRAIN_LDA_MODEL = False

if TRAIN_LDA_MODEL:
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=20,
                                               random_state=100,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True)
    # save model
    lda_model.save('/kaggle/working/lda_model.model')

### <font face="verdana" color="red">View the topics in LDA model</font>
The above LDA model is built with 20 different topics where each topic is a combination of keywords and each keyword contributes a certain weightage to the topic.

You can see the keywords for each topic and the weightage(importance) of each keyword using lda_model.print_topics() as shown next.

In [None]:
# Load the previously trained LDA model from file instead of retraining it.
# NOTE(review): gensim model files are presumably pickle-based — only load
# files from a trusted source.
model_reloaded =  gensim.models.LdaModel.load('../input/bi-and-tri-model/lda_model.model')

# Print the top keywords (with weights) of the model's topics, using the
# default arguments of print_topics().
print(model_reloaded.print_topics())
# Apply the model to the whole corpus; the result is kept in `doc_lda`.
doc_lda = model_reloaded[corpus]

How do we interpret this?

Topic 0 is represented as '0.049*"vaccine" + 0.032*"antibody" + 0.022*"response" + 0.017*"epitope" + 0.016*"specific" + 0.016*"use" + 0.014*"immune" + 0.013*"mouse" + 0.012*"vaccination" + 0.012*"vector"'.

It means the top 10 keywords that contribute to this topic are: ‘vaccine’, ‘antibody’, ‘response’, 'epitope'.. and so on and the weight of ‘vaccine’ on topic 0 is 0.049.

The weights reflect how important a keyword is to that topic.

**<font face="verdana" color="green">Looking at these keywords, can we guess what this topic could be? I would summarise it as either ‘vaccine’ or ‘antibody’.</font>**
Likewise, can you go through the remaining topic keywords and judge what the topic is?

<div class="alert alert-info" role="alert">
DOMAIN KNOWLEDGE WILL BE HELPFUL 
</div>



### <font face="verdana" color="red">Compute Model Perplexity and Coherence Score</font>
Model perplexity and [topic coherence](https://rare-technologies.com/what-is-topic-coherence/) provide a convenient measure to judge how good a given topic model is. Topic coherence score, in particular, has been more helpful.

source: https://radimrehurek.com/gensim/models/coherencemodel.html

In [None]:
"""Compute Perplexity"""
# Scoring is disabled by default; set the flag to True to recompute both scores.
COMPUTE_MODEL_SCORES = False

if COMPUTE_MODEL_SCORES:
    # NOTE(review): per the gensim docs, log_perplexity() returns a per-word
    # likelihood bound, not the perplexity itself (perplexity = 2 ** -bound),
    # so for the printed value closer to zero is better — confirm the label.
    print('\nPerplexity: ', model_reloaded.log_perplexity(corpus))

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=model_reloaded, texts=text_words_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)


Running the CoherenceModel is time-consuming, so I am not going to compute it again here; these are the scores from an earlier run:
**<p><font face="verdana" color="green">Perplexity:  -8.669969213095087</font></p>**
**<p><font face="verdana" color="green">Coherence Score:  0.48574600791139133</font></p>**