https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [None]:
pip install git+https://github.com/rwalk/gsdmm.git

In [1]:
import io
import os
from collections import defaultdict

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gsdmm import MovieGroupProcess
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the NLTK resources this notebook relies on:
#   - stopwords:         English stop-word list used during preprocessing
#   - punkt / punkt_tab: tokenizer data required by word_tokenize; newer NLTK
#                        releases look for 'punkt_tab', older ones for 'punkt'
#   - wordnet:           data used by WordNetLemmatizer
# Without the tokenizer data, word_tokenize raises LookupError on a fresh machine.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the feedback CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../Data/Final_Synthetic_Patient_Feedback_Dataset.csv')

In [3]:
# Quick sanity check: (rows, columns) of the loaded frame.
df.shape

(450, 1)

In [4]:
# Shared NLP resources: a WordNet lemmatizer and the English stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Lower-case and tokenize ``text``, keeping alphabetic non-stop-words.

    Parameters
    ----------
    text : str
        Raw feedback text.

    Returns
    -------
    list of str
        Tokens from ``word_tokenize(text.lower())`` that are purely
        alphabetic (``str.isalpha``) and not in the module-level
        ``stop_words`` set.
    """
    # Reuse the module-level ``stop_words`` set rather than rebuilding it from
    # the NLTK corpus on every call (i.e. once per document).
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token.isalpha() and token not in stop_words]

# def preprocess(text):
#     # Tokenize, remove stopwords and lemmatize
#     return [lemmatizer.lemmatize(word) for word in gensim.utils.simple_preprocess(text) if word not in stop_words]

# One token list per entry of the 'Patient Feedback' column
processed_data = list(map(preprocess, df['Patient Feedback']))

In [None]:
# Grid to evaluate: number of GSDMM clusters (K) x number of top words per topic
num_topics_list = [3, 5, 10, 15, 20]
top_n_list = [5, 10, 15, 20]

# Fixed seed so the stochastic GSDMM fits, and hence the scores, are repeatable.
# gsdmm samples cluster assignments through numpy.random, which reads NumPy's
# global RNG state.
SEED = 42

# These depend only on the corpus, not on K, so build them once up front
# instead of on every pass through the loop.
vocab = {word for doc in processed_data for word in doc}
n_terms = len(vocab)
dictionary = Dictionary(processed_data)
corpus = [dictionary.doc2bow(doc) for doc in processed_data]

# One record per (num_topics, top_n) combination
coherence_scores = []

for num_topics in num_topics_list:
    # Re-seed before each fit so every K is reproducible on its own,
    # independent of which other values of K were run before it.
    np.random.seed(SEED)

    mgp = MovieGroupProcess(K=num_topics, alpha=0.1, beta=0.1, n_iters=15)
    mgp.fit(processed_data, n_terms)

    # choose_best_label returns a tuple whose first element is the cluster
    # label; the remaining element is not needed here.
    doc_topic = [mgp.choose_best_label(doc) for doc in processed_data]

    # Word frequencies per dominant topic
    topic_word_freq = defaultdict(lambda: defaultdict(int))
    for doc, best in zip(processed_data, doc_topic):
        label = best[0]
        for word in doc:
            topic_word_freq[label][word] += 1

    # Rank each topic's words once (most frequent first); every top_n below is
    # just a prefix of this ranking, so there is no need to re-sort per top_n.
    ranked_words_per_topic = {
        label: sorted(word_freq, key=word_freq.get, reverse=True)
        for label, word_freq in topic_word_freq.items()
    }

    for top_n in top_n_list:
        # Top N words for each populated topic
        top_words_per_topic = {
            label: ranked[:top_n]
            for label, ranked in ranked_words_per_topic.items()
        }

        # c_v coherence of those word lists against the tokenized corpus
        coherence_model = CoherenceModel(
            topics=list(top_words_per_topic.values()),
            texts=processed_data,
            dictionary=dictionary,
            coherence='c_v'
        )
        coherence_score = coherence_model.get_coherence()

        coherence_scores.append({
            "num_topics": num_topics,
            "top_n": top_n,
            "coherence": coherence_score
        })

# Collect the grid results into a dataframe for inspection
df_coherence = pd.DataFrame(coherence_scores)

In stage 0: transferred 262 clusters with 3 clusters populated
In stage 1: transferred 105 clusters with 3 clusters populated
In stage 2: transferred 31 clusters with 3 clusters populated
In stage 3: transferred 17 clusters with 3 clusters populated
In stage 4: transferred 8 clusters with 3 clusters populated
In stage 5: transferred 10 clusters with 3 clusters populated
In stage 6: transferred 14 clusters with 3 clusters populated
In stage 7: transferred 4 clusters with 3 clusters populated
In stage 8: transferred 11 clusters with 3 clusters populated
In stage 9: transferred 9 clusters with 3 clusters populated
In stage 10: transferred 11 clusters with 3 clusters populated
In stage 11: transferred 14 clusters with 3 clusters populated
In stage 12: transferred 12 clusters with 3 clusters populated
In stage 13: transferred 15 clusters with 3 clusters populated
In stage 14: transferred 15 clusters with 3 clusters populated
In stage 0: transferred 326 clusters with 5 clusters populated
In 

In [6]:
# Show the coherence grid. A bare last expression uses the notebook's rich
# table rendering instead of the plain-text dump that print() produces.
df_coherence

    num_topics  top_n  coherence
0            3      5   0.805581
1            3     10   0.580945
2            3     15   0.398460
3            3     20   0.333099
4            5      5   0.807994
5            5     10   0.544567
6            5     15   0.357743
7            5     20   0.327948
8           10      5   0.848167
9           10     10   0.537886
10          10     15   0.418089
11          10     20   0.302280
12          15      5   0.888646
13          15     10   0.654489
14          15     15   0.499532
15          15     20   0.376754
16          20      5   0.902087
17          20     10   0.630237
18          20     15   0.464550
19          20     20   0.386240
