Source: https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

Datasource: https://www.kaggle.com/datasets/elvinrustam/books-dataset

In [1]:
import pandas as pd
import os
import io

In [None]:
df = pd.read_csv('../Data/BooksDataset.csv')
df.sample(5)

In [None]:
df.dropna(subset=['Description'], inplace=True)
df.dropna(subset=['Category'], inplace=True)

In [57]:
# Keep only the first 500 rows in df
#df = df.iloc[:500]
df = df.sample(n=500, random_state=1)

# Rename 'title' column to 'Comment'
df.rename(columns={'Description': 'Comment'}, inplace=True)
df['Comment']

In [58]:
df.shape

(500, 2)

In [3]:
pip install git+https://github.com/rwalk/gsdmm.git


Collecting git+https://github.com/rwalk/gsdmm.git
  Cloning https://github.com/rwalk/gsdmm.git to /private/var/folders/p3/s4n39zb50rb2l96w9n054ch80000gn/T/pip-req-build-bii31zk3
  Running command git clone --filter=blob:none --quiet https://github.com/rwalk/gsdmm.git /private/var/folders/p3/s4n39zb50rb2l96w9n054ch80000gn/T/pip-req-build-bii31zk3
  Resolved https://github.com/rwalk/gsdmm.git to commit 4ad1b6b6976743681ee4976b4573463d359214ee
  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [1]:
from gsdmm import MovieGroupProcess
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora

In [59]:
# Download stopwords
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# def preprocess(text):
#     stop_words = set(stopwords.words('english'))
#     words = word_tokenize(text.lower())
#     return [word for word in words if word.isalpha() and word not in stop_words]

def preprocess(text):
    # Tokenize, remove stopwords and lemmatize
    return [lemmatizer.lemmatize(word) for word in gensim.utils.simple_preprocess(text) if word not in stop_words]

processed_data = [preprocess(text) for text in df['title']]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [69]:
# Create an instance of the GSDMM model
# num_topics = k = 5
mgp = MovieGroupProcess(K=10, alpha=0.1, beta=0.1, n_iters=15)

# Fit the model on the data
vocab = set(x for doc in processed_data for x in doc)
n_terms = len(vocab)
y = mgp.fit(processed_data, n_terms)

# To see the topics for the first 10 documents
#for i in range(5):
    #print(f"Document: {processed_data[i]}, Topic: {mgp.choose_best_label(processed_data[i])}")

In stage 0: transferred 393 clusters with 10 clusters populated
In stage 1: transferred 247 clusters with 10 clusters populated
In stage 2: transferred 209 clusters with 10 clusters populated
In stage 3: transferred 184 clusters with 10 clusters populated
In stage 4: transferred 182 clusters with 10 clusters populated
In stage 5: transferred 172 clusters with 10 clusters populated
In stage 6: transferred 191 clusters with 10 clusters populated
In stage 7: transferred 161 clusters with 10 clusters populated
In stage 8: transferred 168 clusters with 10 clusters populated
In stage 9: transferred 170 clusters with 10 clusters populated
In stage 10: transferred 164 clusters with 10 clusters populated
In stage 11: transferred 142 clusters with 10 clusters populated
In stage 12: transferred 147 clusters with 10 clusters populated
In stage 13: transferred 146 clusters with 10 clusters populated
In stage 14: transferred 146 clusters with 10 clusters populated


In [70]:
from collections import defaultdict
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

# Assuming mgp is your fitted GSDMM model and processed_data is your documents

# Find the dominant topic for each document
doc_topic = [mgp.choose_best_label(doc) for doc in processed_data]

# Word frequencies per topic
topic_word_freq = defaultdict(lambda: defaultdict(int))
for doc, topic in zip(processed_data, doc_topic):
    for word in doc:
        topic_word_freq[topic[0]][word] += 1

# Extract top N words for each topic
top_n = 10
top_words_per_topic = {}
for topic, word_freq in topic_word_freq.items():
    top_words = sorted(word_freq, key=word_freq.get, reverse=True)[:top_n]
    top_words_per_topic[topic] = top_words

# Create a dictionary and corpus for coherence calculation
dictionary = Dictionary(processed_data)
corpus = [dictionary.doc2bow(doc) for doc in processed_data]

#list(top_words_per_topic.values())

In [71]:

# Using the top words for each topic to calculate coherence
coherence_model = CoherenceModel(topics=list(top_words_per_topic.values()),texts=processed_data,dictionary=dictionary, coherence='c_v')
#coherence_model = CoherenceModel(topics=list(top_words_per_topic.values()), corpus = corpus, dictionary=dictionary, coherence='u_mass')
#coherence_model = CoherenceModel(topics=list(top_words_per_topic.values()), texts=processed_data, dictionary=dictionary, coherence='c_uci')
coherence_score = coherence_model.get_coherence()
print('Coherence Score:', coherence_score)

Coherence Score: 0.3819018753029405


# No of data sample = 10000

3 topic = Coherence Score: 0.3154939587778532

5 topic = Coherence Score: 0.39488720656846593

10 topic = Coherence Score: 0.3541673664052408

15 topic = Coherence Score: 0.36738970687231437

20 topic = Coherence Score: 0.3531226663611407

25 topic = Coherence Score: 0.333633515439587

30 topic = Coherence Score: 0.35206420000236205

35 topic = Coherence Score: 0.35284489843398603

40 topic = Coherence Score: 0.3301376346654848

# No of data sample = top 500 | random 500 

3 topic = Coherence Score: 0.46768588924709203 | Coherence Score: 0.4515472297700338

5 topic = Coherence Score: 0.436618900342459 | Coherence Score: 0.3992964486716794

10 topic = Coherence Score: 0.3809790533337555 | Coherence Score: 0.3819018753029405

15 topic = Coherence Score: 0.3679961871652472

20 topic = Coherence Score: 0.320415482006279

25 topic = Coherence Score: 0.31854773646225865