<a href="https://colab.research.google.com/github/souradipta93/NLP/blob/main/gridsearch_topic_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LDA in Python – How to grid search best topic models?

In [None]:
import numpy as np
import pandas as pd
import re
import nltk

In [None]:
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint


# Stop-word list and lemmatizer both come from NLTK
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Both NLTK corpora are separate downloads and are absent on a fresh runtime
# (e.g. Colab), where using them raises LookupError. Fetch only what is
# missing so the notebook survives Restart & Run All without re-downloading.
for _resource, _package in (("corpora/stopwords", "stopwords"),
                            ("corpora/wordnet", "wordnet")):
    try:
        nltk.data.find(_resource)
    except LookupError:
        nltk.download(_package, quiet=True)

# English stop words as a set for fast membership tests during pre-processing
stop_words = set(stopwords.words("english"))

In [None]:
# Load the drug-review dataset; 'drug.csv' is resolved relative to the notebook's working directory
df = pd.read_csv('drug.csv')

In [None]:
# Preview the first rows; the free-text 'Review' column is the input to the topic model
df.head()

Unnamed: 0,urlDrugName,rating,Review,score
0,enalapril,4,enalapril management of congestive heart failu...,Low
1,ortho-tri-cyclen,1,ortho-tri-cyclen birth prevention - Although t...,Low
2,ponstel,10,ponstel menstrual cramps - I was used to havin...,high
3,prilosec,3,prilosec acid reflux - The acid reflux went aw...,Low
4,lyrica,2,lyrica fibromyalgia - I think that the Lyrica ...,Low


In [None]:
# Custom stop words layered on top of NLTK's English list: web/social tokens,
# generic filler words, and domain terms that show up in nearly every drug
# review and would otherwise dominate every topic.
new_words = [
    # web / social-media residue
    'http', 'bit', 'ly', 'rt', 'com', 'via',
    # generic filler
    'could', 'would', 'said', 'told', 'yet', 'even', 'shall', 'let',
    'one', 'never', 'might', 'upon', 'first', 'day', 'either', 'rather',
    'thing', 'must', 'saw', 'like', 'know', 'time', 'thought', 'made',
    'found', 'seemed', 'year', 'mr', 'also', 'last', 'two', 'say', 'make',
    'get', 'back', 'take', 'away',
    # domain-specific
    'drug', 'mg', 'side', 'effect', 'medication', 'pill',
]
# Rebind to a new, enlarged set (the previous set object is left untouched)
stop_words = stop_words | set(new_words)

In [None]:
#Text pre-processing: build one cleaned, space-joined string per review
lm = WordNetLemmatizer()  # loop-invariant, so create it once rather than per review
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the column values rather than looking up labels 0..n-1, so the
# cell also works when df does not carry the default RangeIndex. Missing
# reviews become empty documents, keeping corpus aligned row-for-row with df.
for review in df['Review'].fillna(''):
    # Replace everything that is not an ASCII letter with a space,
    # then lowercase and split into tokens
    tokens = non_letters.sub(' ', review).lower().split()
    # Drop stop words first, then lemmatize what is left
    lemmas = [lm.lemmatize(token) for token in tokens if token not in stop_words]
    corpus.append(" ".join(lemmas))

In [None]:
#Most frequently occurring words
def get_top_n_words(corpus, n=None):
    """Return the `n` most frequent single words in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count words in.
    n : int or None, default None
        Number of (word, count) pairs to return; None returns every word.

    Returns
    -------
    list of (word, count) tuples
        Sorted by count, highest first.

    Notes
    -----
    Reads the module-level `stop_words`. Words that appear in more than 70%
    of the documents are ignored (max_df=0.7).
    """
    # CountVectorizer documents stop_words as 'english', a list, or None;
    # recent scikit-learn releases reject a set, so hand it a list.
    vec = CountVectorizer(stop_words=list(stop_words), ngram_range=(1,1), max_df=0.7)
    bag_of_words = vec.fit_transform(corpus)
    # Column sums of the document-term matrix = total count per vocabulary term
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [None]:
#Tabulate the 20 most frequent words and their counts for inspection
top_words = get_top_n_words(corpus, n=20)
# Naming the columns in the constructor also works when top_words is empty;
# assigning .columns afterwards raises a length-mismatch error in that case.
top_df = pd.DataFrame(top_words, columns=["Word", "Freq"])
top_df.head(20)

Unnamed: 0,Word,Freq
0,taking,2278
1,pain,2038
2,week,1762
3,month,1551
4,treatment,1402
5,skin,1354
6,depression,1317
7,took,1264
8,sleep,1101
9,night,1093


In [None]:
# Bag-of-words features for LDA
vectorizer = CountVectorizer(analyzer='word',
                             min_df=0.001,                  # drop terms found in fewer than 0.1% of documents
                             # documented types are 'english', list or None;
                             # recent scikit-learn releases reject a set
                             stop_words=list(stop_words),
                             token_pattern='[a-zA-Z]{3,}',  # tokens are runs of at least 3 ASCII letters
                             ngram_range=(1,1))             # unigrams only
# Sparse document-term count matrix: one row per review, one column per term
data_vectorized = vectorizer.fit_transform(corpus)

In [None]:
# (number of documents, vocabulary size)
print(data_vectorized.shape)

(4143, 3884)


In [None]:
# Materialize the sparse document-term matrix as a dense matrix.
# NOTE: this allocates every cell, zeros included, so memory use grows with
# documents x vocabulary size.
data_dense = data_vectorized.todense()

In [None]:
# Percentage of non-zero cells in the document-term matrix. Strictly speaking
# this is the matrix density; the printed "Sparsity" label is kept unchanged.
# Counted on the sparse matrix directly, so no dense copy is required.
print("Sparsity: ", (data_vectorized.count_nonzero()/(data_vectorized.shape[0] * data_vectorized.shape[1]))*100, "%")

Sparsity:  1.0230239583698435 %


In [None]:
# Baseline Latent Dirichlet Allocation model with a fixed number of topics
lda_model = LatentDirichletAllocation(
    n_components=5,            # number of topics
    learning_method='online',  # mini-batch variational Bayes
    batch_size=128,            # documents per mini-batch
    max_iter=10,               # passes over the data
    evaluate_every=-1,         # do not evaluate perplexity during training
    random_state=123,          # fixed seed for reproducible topics
    n_jobs=-1,                 # use all available cores
)

# Fit the model and get the per-document topic weights in one step
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_output.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(4143, 5)


### Let us look at the top 8 words of each topic

In [None]:
n_top_words = 8

# Resolve the vocabulary once instead of once per printed word.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for topic_idx, topic in enumerate(lda_model.components_):
    # argsort is ascending, so the reversed slice yields the indices of the
    # n_top_words highest-weighted terms, heaviest first
    top_term_ids = topic.argsort()[:-n_top_words - 1:-1]
    print("Topic {}:".format(topic_idx), end=' ')
    print(" ".join(feature_names[i] for i in top_term_ids))

Topic 0: infection taking symptom pressure took allergy treatment doctor
Topic 1: pain headache migraine hour severe taking nausea relief
Topic 2: depression taking sleep feel anxiety week night felt
Topic 3: period month blood control level taking week patient
Topic 4: skin acne face use hair dry treatment month


### Add custom stopwords and repeat pre-processing for better topic word mix

In [None]:
# Log Likelihood: Higher the better
# (scored on the same matrix the model was fitted on, so this is an in-sample figure)
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

Log Likelihood:  -1476992.192972205
Perplexity:  1231.1485239208414


### Setting up the grid search

In [None]:
# Define Search Param: number of topics x learning method (6 candidates)
search_params = {'n_components': [3,4,5], 'learning_method':['online','batch']}

# Init the Model. Fix the seed (same value as the baseline model) so the
# search picks the same winner on every run; LDA initialisation is random.
lda = LatentDirichletAllocation(random_state=123)

# Init Grid Search Class. No `scoring` is given, so candidates are ranked by
# the estimator's own score() on the held-out folds. The fits are independent,
# so run them on all cores; verbose=1 reports progress without one line per fit.
model = GridSearchCV(lda, param_grid=search_params, n_jobs=-1, verbose=1)

# Do the Grid Search
model.fit(data_vectorized)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] learning_method=online, n_components=3 ..........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........... learning_method=online, n_components=3, total=  22.2s
[CV] learning_method=online, n_components=3 ..........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.1s remaining:    0.0s


[CV] ........... learning_method=online, n_components=3, total=  23.2s
[CV] learning_method=online, n_components=3 ..........................
[CV] ........... learning_method=online, n_components=3, total=  25.6s
[CV] learning_method=online, n_components=3 ..........................
[CV] ........... learning_method=online, n_components=3, total=  20.2s
[CV] learning_method=online, n_components=3 ..........................
[CV] ........... learning_method=online, n_components=3, total=  21.9s
[CV] learning_method=online, n_components=4 ..........................
[CV] ........... learning_method=online, n_components=4, total=  23.5s
[CV] learning_method=online, n_components=4 ..........................
[CV] ........... learning_method=online, n_components=4, total=  21.3s
[CV] learning_method=online, n_components=4 ..........................
[CV] ........... learning_method=online, n_components=4, total=  21.0s
[CV] learning_method=online, n_components=4 ..........................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 13.4min finished


GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_method': ['online', 'batch'],
                         'n_components': [3, 4, 5]},
             verbose=2)

### Best model parameters

In [None]:
# Best Model: the estimator GridSearchCV exposes for the winning parameter combination
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score: best_score_ is the mean cross-validated score of the
# winning candidate, i.e. computed on held-out folds only. Its magnitude is
# therefore not comparable with a log-likelihood scored on the full matrix.
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity of the best model, evaluated on the full document-term matrix
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_method': 'online', 'n_components': 3}
Best Log Likelihood Score:  -313149.01337389974
Model Perplexity:  1258.1070683100563
