<a href="https://colab.research.google.com/github/souradipta93/NLP/blob/main/topic_model_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LDA in Python – How to grid search best topic models?

In [None]:
import numpy as np
import pandas as pd
import re
import nltk

In [None]:
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint


#remove stopwords
from nltk.corpus import stopwords

# A fresh kernel (e.g. a new Colab runtime) may not have the NLTK corpora installed, in which
# case stopwords.words() / WordNetLemmatizer raise LookupError. Fetch them only when missing.
for resource_path, package in (("corpora/stopwords", "stopwords"),
                               ("corpora/wordnet", "wordnet")):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package, quiet=True)

# English stop words as a set for fast membership tests during pre-processing
stop_words = set(stopwords.words("english"))

from nltk.stem.wordnet import WordNetLemmatizer 

In [None]:
# Load the drug reviews; 'drug.csv' is a relative path, so it is resolved against the kernel's working directory.
df = pd.read_csv('drug.csv')

In [None]:
# Preview the first rows to check the columns that were read from the CSV
df.head()

Unnamed: 0,urlDrugName,rating,Review,score
0,enalapril,4,enalapril management of congestive heart failu...,Low
1,ortho-tri-cyclen,1,ortho-tri-cyclen birth prevention - Although t...,Low
2,ponstel,10,ponstel menstrual cramps - I was used to havin...,high
3,prilosec,3,prilosec acid reflux - The acid reflux went aw...,Low
4,lyrica,2,lyrica fibromyalgia - I think that the Lyrica ...,Low


In [None]:
#Adding custom stop words
# Web/Twitter boilerplate, filler words and terms that occur in nearly every drug review
new_words = [
    'http', 'bit', 'ly', 'rt', 'com', 'via', 'could', 'would', 'said', 'told',
    'yet', 'even', 'shall', 'let', 'one', 'never', 'might', 'upon', 'first', 'day',
    'either', 'rather', 'thing', 'must', 'saw', 'like', 'know', 'time', 'thought', 'made',
    'found', 'seemed', 'year', 'mr', 'also', 'last', 'two', 'say', 'make', 'get',
    'back', 'take', 'away', 'drug', 'mg', 'side', 'effect', 'medication', 'pill', 'take',
]
# Build a new set (the previous one is left untouched) holding both word lists
stop_words = stop_words | set(new_words)

In [None]:
#Text pre-processing
# One lemmatizer is enough for the whole corpus; it does not need to be rebuilt per review.
lm = WordNetLemmatizer()

corpus = []
# Iterate over the column values (not df['Review'][i]) so the loop works for any index,
# and map missing / non-string reviews to text so re.sub does not raise on them.
for review in df['Review'].fillna('').astype(str):
    #Remove punctuations (every character that is not an ASCII letter becomes a space)
    text = re.sub('[^a-zA-Z]', ' ', review)

    #Convert to lowercase and split the string into a list of tokens
    tokens = text.lower().split()

    ##Lemmatizing
    # Stop words are removed before lemmatizing (the raw forms are what the stop list contains)
    # and again afterwards, because a lemma can itself be a stop word (e.g. a plural whose
    # singular is listed).
    lemmas = (lm.lemmatize(word) for word in tokens if word not in stop_words)
    text = " ".join(lemma for lemma in lemmas if lemma not in stop_words)

    # Exactly one entry per row of df, so corpus stays aligned with the dataframe
    corpus.append(text)

In [None]:
#Most frequently occurring words
def get_top_n_words(corpus, n=None):
    """Return the most frequent single words in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Pre-processed documents, one string per document.
    n : int or None, default None
        Number of (word, count) pairs to return; None returns the whole vocabulary.

    Returns
    -------
    list of (str, count) tuples, sorted by count in descending order.

    Notes
    -----
    Reads the notebook-level `stop_words` collection. Terms found in more than 70% of the
    documents are ignored (max_df=0.7).
    """
    # scikit-learn validates `stop_words` as 'english', a list or None; the notebook keeps
    # a set, so hand over a (sorted, hence reproducible) list.
    vec = CountVectorizer(stop_words=sorted(stop_words), ngram_range=(1,1), max_df=0.7)
    bag_of_words = vec.fit_transform(corpus)
    # Column sums of the document-term matrix = total occurrences of each vocabulary term
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda x: x[1], reverse=True)
    return words_freq[:n]

In [None]:
#Convert most freq words to dataframe for plotting bar plot
top_words = get_top_n_words(corpus, n=20)
# Each (word, count) pair becomes one row; the two positional columns are then labelled
top_df = pd.DataFrame(top_words).set_axis(["Word", "Freq"], axis="columns")
top_df.head(20)

Unnamed: 0,Word,Freq
0,taking,2278
1,pain,2038
2,week,1762
3,month,1551
4,treatment,1402
5,skin,1354
6,depression,1317
7,took,1264
8,sleep,1101
9,night,1093


In [None]:
# TF-IDF document-term matrix used as the topic model's input.
# - min_df=0.001 keeps only terms present in at least 0.1% of the documents
# - token_pattern keeps purely alphabetic tokens of three or more letters
# scikit-learn validates `stop_words` as 'english', a list or None; the notebook keeps a set,
# so it is converted to a (sorted, hence reproducible) list here.
# NOTE(review): LDA is usually fitted on raw term counts rather than TF-IDF weights - confirm
# that TF-IDF is intended before interpreting the likelihood / perplexity values.
vectorizer = TfidfVectorizer(analyzer='word',
                             min_df=0.001,
                             stop_words=sorted(stop_words),
                             token_pattern='[a-zA-Z]{3,}',
                             ngram_range=(1,1))
data_vectorized = vectorizer.fit_transform(corpus)

In [None]:
# Shape of the vectorized corpus: (number of documents, vocabulary size)
print(data_vectorized.shape)

(4143, 3902)


In [None]:
# Materialize the sparse data
# NOTE: todense() allocates every cell of the documents x vocabulary matrix, zeros included.
data_dense = data_vectorized.todense()

In [None]:
# Percentage of non-zero cells in the document-term matrix. Strictly speaking this is the
# matrix density; the printed label stays "Sparsity" so the output format is unchanged.
# The count is taken on the sparse matrix itself, so no dense copy is required. TF-IDF
# weights are never negative, hence "non-zero" and "greater than zero" select the same cells.
print("Sparsity: ", (data_vectorized.count_nonzero()/(data_vectorized.shape[0]*data_vectorized.shape[1]))*100, "%")

Sparsity:  1.0504215455834245 %


In [None]:
# Build a Latent Dirichlet Allocation Model
lda_kwargs = {
    "n_components": 5,            # number of topics to extract
    "max_iter": 10,
    "learning_method": 'online',
    "batch_size": 128,
    "evaluate_every": -1,
    "random_state": 123,          # fixed seed so repeated runs give the same topics
    "n_jobs": -1,
}
lda_model = LatentDirichletAllocation(**lda_kwargs)

# Fit the model and get the per-document topic weights in one step
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_output.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(4143, 5)


### Let us look at the top 8 words of each topic

In [None]:
n_top_words = 8

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when the
# installed version provides it. The vocabulary is looked up once instead of once per word.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for topic_idx, topic in enumerate(lda_model.components_):
  print("Topic {}:".format(topic_idx), end = ' ')
  # argsort is ascending, so the reversed slice walks the n_top_words largest weights first
  print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

Topic 0: blood pressure cholesterol acid reflux hair thyroid high
Topic 1: acne skin face cream retin use applied redness
Topic 2: flash hot bone lyrica menopause patch premarin estrogen
Topic 3: pain taking effect depression week day sleep month
Topic 4: outbreak valtrex herpes zovirax genital blister preventive recomended


### Add custom stopwords and repeat pre-processing for better topic word mix

In [None]:
# Log Likelihood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

Log Likelihood:  -190955.32915979958
Perplexity:  4374.574540779342


## View the dominant topic in each document

In [None]:
# Create Document - Topic Matrix
# Applies the already fitted model to the same matrix; this rebinds the lda_output that
# fit_transform returned earlier. One row per document, one column per topic.
lda_output = lda_model.transform(data_vectorized)
lda_output


array([[0.07140381, 0.03617521, 0.03615454, 0.82009434, 0.0361721 ],
       [0.03005548, 0.03026861, 0.05813579, 0.8515377 , 0.03000242],
       [0.03336503, 0.03353801, 0.05702168, 0.84277678, 0.03329849],
       ...,
       [0.03700974, 0.14465004, 0.03698113, 0.74437349, 0.0369856 ],
       [0.02887398, 0.36454784, 0.16207799, 0.41593276, 0.02856743],
       [0.04051172, 0.83633753, 0.04043356, 0.04228475, 0.04043244]])

In [None]:
# column names
# One label per topic of the fitted model: Topic0, Topic1, ...
topicnames = ["Topic{}".format(topic_idx) for topic_idx in range(lda_model.n_components)]
topicnames


['Topic0', 'Topic1', 'Topic2', 'Topic3', 'Topic4']

In [None]:
# index names
# One label per row of df: Doc0, Doc1, ...
docnames = ["Doc{}".format(doc_idx) for doc_idx in range(len(df))]
docnames[10]


'Doc10'

In [None]:
# Make the pandas dataframe
# Topic weights rounded to two decimals; rows are documents, columns are topics
rounded_weights = np.round(lda_output, 2)
df_document_topic = pd.DataFrame(data=rounded_weights, index=docnames, columns=topicnames)

In [None]:
# Display the document-topic weights (pandas truncates the middle rows of long frames)
df_document_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4
Doc0,0.07,0.04,0.04,0.82,0.04
Doc1,0.03,0.03,0.06,0.85,0.03
Doc2,0.03,0.03,0.06,0.84,0.03
Doc3,0.20,0.03,0.03,0.71,0.03
Doc4,0.04,0.04,0.19,0.70,0.04
...,...,...,...,...,...
Doc4138,0.12,0.18,0.03,0.65,0.03
Doc4139,0.04,0.04,0.04,0.86,0.04
Doc4140,0.04,0.14,0.04,0.74,0.04
Doc4141,0.03,0.36,0.16,0.42,0.03


In [None]:
# Get dominant topic for each document
# argmax - Returns indices of the max element of the array in a particular axis
# The unrounded weights in lda_output are used on purpose: after rounding to two decimals,
# near-equal topics tie and argmax silently returns the lower topic index. Reading from
# lda_output instead of df_document_topic.values also keeps this cell re-runnable once the
# 'dominant_topic' column has been added to that dataframe.
dominant_topic = np.argmax(lda_output, axis=1)
dominant_topic[12]

3

In [None]:
#Add this as a column to the dataframe
# Mutates df_document_topic in place: from here on its values are no longer topic weights only.
df_document_topic['dominant_topic'] = dominant_topic

In [None]:
# Display the weights together with the newly added 'dominant_topic' column
df_document_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,dominant_topic
Doc0,0.07,0.04,0.04,0.82,0.04,3
Doc1,0.03,0.03,0.06,0.85,0.03,3
Doc2,0.03,0.03,0.06,0.84,0.03,3
Doc3,0.20,0.03,0.03,0.71,0.03,3
Doc4,0.04,0.04,0.19,0.70,0.04,3
...,...,...,...,...,...,...
Doc4138,0.12,0.18,0.03,0.65,0.03,3
Doc4139,0.04,0.04,0.04,0.86,0.04,3
Doc4140,0.04,0.14,0.04,0.74,0.04,3
Doc4141,0.03,0.36,0.16,0.42,0.03,3


In [None]:
# Styling
def color_red(val):
    """Return the CSS text colour for one cell: red above 0.1, black otherwise."""
    if val > .1:
        shade = 'red'
    else:
        shade = 'black'
    return 'color: {col}'.format(col=shade)

def make_bold(val):
    """Return the CSS font weight for one cell: 700 (bold) above 0.1, 400 (normal) otherwise."""
    if val > .1:
        thickness = 700
    else:
        thickness = 400
    return 'font-weight: {weight}'.format(weight=thickness)

# Apply Style
# The 0.1 threshold is meant for topic weights. 'dominant_topic' holds a topic index, not a
# weight, so it is excluded; otherwise every index >= 1 would be highlighted as well.
# NOTE(review): newer pandas releases rename Styler.applymap to Styler.map - switch once the
# minimum supported pandas version allows it.
weight_columns = [col for col in df_document_topic.columns if col != 'dominant_topic']
df_document_topics = (
    df_document_topic.head(5)
    .style
    .applymap(color_red, subset=weight_columns)
    .applymap(make_bold, subset=weight_columns)
)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,dominant_topic
Doc0,0.07,0.04,0.04,0.82,0.04,3
Doc1,0.03,0.03,0.06,0.85,0.03,3
Doc2,0.03,0.03,0.06,0.84,0.03,3
Doc3,0.2,0.03,0.03,0.71,0.03,3
Doc4,0.04,0.04,0.19,0.7,0.04,3


## Review the topic distribution across documents

In [None]:
# Share of documents per dominant topic (normalize=True returns fractions instead of counts)
df_topic_distribution = df_document_topic['dominant_topic'].value_counts(normalize=True)
df_topic_distribution

3    0.886073
1    0.088342
0    0.022448
2    0.003138
Name: dominant_topic, dtype: float64

In [None]:
# Absolute number of documents per dominant topic. This rebinds df_topic_distribution
# (until now the normalized Series) to a two-column dataframe.
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")


In [None]:
# Label both columns explicitly: first the topic index, then the number of documents
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,3,3671
1,1,366
2,0,93
3,2,13


In [None]:
# Topic-Keyword Matrix
# One row per topic, one column per vocabulary term, values are the fitted topic-term weights
df_topic_keywords = pd.DataFrame(lda_model.components_)

# Assign Column and Index
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when the
# installed version provides it and fall back to the old method otherwise.
if hasattr(vectorizer, "get_feature_names_out"):
    df_topic_keywords.columns = vectorizer.get_feature_names_out()
else:
    df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames

# View
df_topic_keywords.head()

Unnamed: 0,abated,abdomen,abdominal,abilify,ability,able,abnormal,abnormality,abruptly,absence,...,zocor,zofran,zoloft,zombie,zomig,zone,zovirax,zyban,zyprexa,zyrtec
Topic0,0.70502,0.237443,0.201231,0.200098,0.201884,0.201029,0.737843,0.217773,0.200646,0.200076,...,5.610294,0.200666,0.200111,0.200111,0.200073,0.202437,0.200413,0.200063,0.200058,0.200884
Topic1,0.200078,0.201015,0.200664,0.200057,0.200222,0.622672,0.204528,0.200089,0.200075,0.200062,...,0.200027,0.200057,0.200046,0.20006,0.200073,0.631187,0.207575,0.200042,0.200038,0.20088
Topic2,0.200167,1.701398,0.201122,0.200117,0.200541,0.200588,0.20963,0.200134,0.200095,0.201381,...,0.200057,0.201665,0.200207,0.200141,0.200173,0.200138,0.20033,0.200101,0.20012,0.200106
Topic3,1.434778,2.052775,9.369212,6.413028,16.12986,52.567443,1.426209,1.164844,1.171766,1.392382,...,0.202094,3.322752,18.685107,5.585479,7.49394,0.866432,1.499163,5.065145,3.255718,13.05654
Topic4,0.200183,0.200202,0.200344,0.200172,0.200129,0.200179,0.200218,0.200197,0.200139,0.200172,...,0.200079,0.200148,0.200127,0.200124,0.200164,0.2002,2.793731,0.20014,0.200127,0.200151


In [None]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=10):
    """Return the `n_words` highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Supplies the vocabulary; its feature order must match the columns of
        `lda_model.components_`.
    lda_model : fitted topic model exposing `components_` (topics x terms weights).
    n_words : int, default 10
        Number of keywords to return per topic.

    Returns
    -------
    list of numpy string arrays, one per topic, ordered from highest to lowest weight.

    Notes
    -----
    The default arguments are bound when this cell runs, so re-run the cell after refitting
    `vectorizer` / `lda_model`, or pass the objects explicitly.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement when present.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # Built from a plain list so the array gets a fixed-width string dtype on every version
    keywords = np.array(list(feature_names))
    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negating the weights makes the ascending argsort list the largest weights first
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Ten keywords per topic; the fitted objects are passed explicitly rather than relying on the defaults
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=10)        
topic_keywords


[array(['blood', 'pressure', 'cholesterol', 'acid', 'reflux', 'hair',
        'thyroid', 'high', 'synthroid', 'level'], dtype='<U19'),
 array(['acne', 'skin', 'face', 'cream', 'retin', 'use', 'applied',
        'redness', 'dry', 'apply'], dtype='<U19'),
 array(['flash', 'hot', 'bone', 'lyrica', 'menopause', 'patch', 'premarin',
        'estrogen', 'density', 'osteoporosis'], dtype='<U19'),
 array(['pain', 'taking', 'effect', 'depression', 'week', 'day', 'sleep',
        'month', 'took', 'anxiety'], dtype='<U19'),
 array(['outbreak', 'valtrex', 'herpes', 'zovirax', 'genital', 'blister',
        'preventive', 'recomended', 'discussed', 'gastritis'], dtype='<U19')]

In [None]:
# Topic - Keywords Dataframe
# One row per topic, one column per keyword rank. Note that this rebinds df_topic_keywords.
df_topic_keywords = pd.DataFrame(topic_keywords)
n_topics, n_keywords = df_topic_keywords.shape
df_topic_keywords.columns = ['Word {}'.format(rank) for rank in range(n_keywords)]
df_topic_keywords.index = ['Topic {}'.format(topic_idx) for topic_idx in range(n_topics)]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,blood,pressure,cholesterol,acid,reflux,hair,thyroid,high,synthroid,level
Topic 1,acne,skin,face,cream,retin,use,applied,redness,dry,apply
Topic 2,flash,hot,bone,lyrica,menopause,patch,premarin,estrogen,density,osteoporosis
Topic 3,pain,taking,effect,depression,week,day,sleep,month,took,anxiety
Topic 4,outbreak,valtrex,herpes,zovirax,genital,blister,preventive,recomended,discussed,gastritis
