https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [1]:
import pandas as pd
import os
import io

In [2]:
# Read the patient review CSV, then display its 'Comment' column as the cell output.
df = pd.read_csv(filepath_or_buffer='../Data/patient_review_data.csv')
df.loc[:, 'Comment']

0      The nurses were exceptionally compassionate an...
1      I appreciated the clear communication from the...
2        The cleanliness of the hospital was remarkable.
3           My pain was managed well throughout my stay.
4                  I felt rushed during my consultation.
                             ...                        
295    Experiencing anger from a caregiver was both s...
296    The awful experience I had with the staff has ...
297    The delay in treatment had a harmful effect on...
298    I hate to say it, but the service here has det...
299    Witnessing cops within the hospital premises w...
Name: Comment, Length: 300, dtype: object

In [None]:
# Read only the first 500 rows: `nrows` stops parsing early instead of loading the
# whole file and slicing it afterwards.
# The 'title' column is renamed to 'Comment' with a chained (non in-place) rename,
# so `df` is built in a single expression.
df = (
    pd.read_csv('book_title_test_data.csv', nrows=500)
    .rename(columns={'title': 'Comment'})
)
df['Comment']

0      15 Highly Important Questions About Adulthood,...
1      250 Nuns Just Cycled All The Way From Kathmand...
2      Australian comedians "could have been shot" du...
3      Lycos launches screensaver to increase spammer...
4      Fußball-Bundesliga 2008–09: Goalkeeper Butt si...
                             ...                        
495    '90s Cartoons With Human Eyes May Ruin Your Ch...
496    Wait, WTF Is "I Saw Mommy Kissing Santa Claus"...
497           First Women Win Seats in Kuwait Parliament
498                   Were You A Scene Kid Or An Emo Kid
499    Opposition leader takes early lead in Sierra L...
Name: Comment, Length: 500, dtype: object

In [None]:
# Previously "./Data/patient_review_data.csv" was read first and immediately
# overwritten by the read below, so only this assignment ever took effect.
# Swap the path to "./Data/patient_review_data.csv" to analyse the patient reviews.
# NOTE(review): later cells read df['Comment'] -- confirm this file has that column.
df = pd.read_csv("./Data/test_data.csv")

In [None]:
pip install git+https://github.com/rwalk/gsdmm.git


Collecting git+https://github.com/rwalk/gsdmm.git
  Cloning https://github.com/rwalk/gsdmm.git to /tmp/pip-req-build-8u25wyh7
  Running command git clone --filter=blob:none --quiet https://github.com/rwalk/gsdmm.git /tmp/pip-req-build-8u25wyh7
  Resolved https://github.com/rwalk/gsdmm.git to commit 4ad1b6b6976743681ee4976b4573463d359214ee
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gsdmm
  Building wheel for gsdmm (setup.py) ... [?25l[?25hdone
  Created wheel for gsdmm: filename=gsdmm-0.1-py3-none-any.whl size=4585 sha256=e0ff82c286bb9a4f1cffa941a5e5195d457cf6cae338da28eb3277d28bbd40c3
  Stored in directory: /tmp/pip-ephem-wheel-cache-b_mnqefj/wheels/da/d3/6e/a612d7cff0fcfb6470b8c113fc04931ecffb466ac19b9c5f3c
Successfully built gsdmm
Installing collected packages: gsdmm
Successfully installed gsdmm-0.1


In [None]:
import gensim
import nltk
import numpy as np
import pandas as pd
from gensim import corpora
from gsdmm import MovieGroupProcess
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK resources used in this cell: the stopword lists and WordNet.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Module-level helpers read by preprocess(): a WordNet lemmatizer and the
# English stopword list turned into a set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# def preprocess(text):
#     stop_words = set(stopwords.words('english'))
#     words = word_tokenize(text.lower())
#     return [word for word in words if word.isalpha() and word not in stop_words]

def preprocess(text):
    """Convert one raw comment into a list of lemmatized, stopword-free tokens.

    Args:
        text: The raw comment. Values that are neither ``str`` nor ``bytes``
            (for example ``None`` or the float NaN pandas uses for an empty
            CSV cell) are treated as an empty document.

    Returns:
        list: Tokens produced by ``gensim.utils.simple_preprocess`` that are not
        in the module-level ``stop_words`` set, each passed through
        ``lemmatizer.lemmatize``. Empty for non-text input.
    """
    # Guard first: simple_preprocess cannot decode non-text values such as NaN.
    if not isinstance(text, (str, bytes)):
        return []
    tokens = gensim.utils.simple_preprocess(text)
    # Stopwords are filtered on the raw token, before lemmatization.
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

# Run preprocess() over every entry of the 'Comment' column, keeping document order.
processed_data = list(map(preprocess, df['Comment']))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# GSDMM settings, gathered in one place so they are easy to find and tune.
NUM_CLUSTERS = 3        # passed as K to MovieGroupProcess
ALPHA = 0.1
BETA = 0.1
NUM_ITERATIONS = 30
RANDOM_SEED = 42
NUM_PREVIEW_DOCS = 5    # how many documents to print after fitting

# The fit is stochastic. Presumably gsdmm samples from NumPy's global random state,
# so seeding it should make the clustering repeatable on Restart & Run All -- TODO confirm.
np.random.seed(RANDOM_SEED)

# Create an instance of the GSDMM model
mgp = MovieGroupProcess(K=NUM_CLUSTERS, alpha=ALPHA, beta=BETA, n_iters=NUM_ITERATIONS)

# Fit the model; the vocabulary size is the number of distinct tokens across all documents.
vocab = {token for doc in processed_data for token in doc}
n_terms = len(vocab)
y = mgp.fit(processed_data, n_terms)

# Show the best label for the first NUM_PREVIEW_DOCS documents.
# Slicing (rather than indexing by position) stays safe when fewer documents exist.
for doc in processed_data[:NUM_PREVIEW_DOCS]:
    print(f"Document: {doc}, Topic: {mgp.choose_best_label(doc)}")

In stage 0: transferred 263 clusters with 3 clusters populated
In stage 1: transferred 137 clusters with 3 clusters populated
In stage 2: transferred 114 clusters with 3 clusters populated
In stage 3: transferred 99 clusters with 3 clusters populated
In stage 4: transferred 86 clusters with 3 clusters populated
In stage 5: transferred 74 clusters with 3 clusters populated
In stage 6: transferred 75 clusters with 3 clusters populated
In stage 7: transferred 72 clusters with 3 clusters populated
In stage 8: transferred 68 clusters with 3 clusters populated
In stage 9: transferred 70 clusters with 3 clusters populated
In stage 10: transferred 82 clusters with 3 clusters populated
In stage 11: transferred 76 clusters with 3 clusters populated
In stage 12: transferred 72 clusters with 3 clusters populated
In stage 13: transferred 64 clusters with 3 clusters populated
In stage 14: transferred 69 clusters with 3 clusters populated
In stage 15: transferred 66 clusters with 3 clusters populated

In [None]:
from collections import defaultdict
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

# Inputs: `mgp` (the fitted GSDMM model) and `processed_data` (the tokenized documents).

# Best label per document, in document order; element [0] of each result is used as the topic key.
doc_topic = list(map(mgp.choose_best_label, processed_data))

# Count, for every topic, how often each word occurs in the documents assigned to it.
topic_word_freq = defaultdict(lambda: defaultdict(int))
for tokens, assignment in zip(processed_data, doc_topic):
    topic_id = assignment[0]
    for token in tokens:
        # Keys are created only here, so a document with no tokens adds no topic entry.
        topic_word_freq[topic_id][token] += 1

# For each topic keep its top_n words, ordered by descending frequency.
top_n = 10
top_words_per_topic = {
    topic_id: sorted(counts, key=counts.get, reverse=True)[:top_n]
    for topic_id, counts in topic_word_freq.items()
}

# Dictionary and bag-of-words corpus for the coherence calculation.
dictionary = Dictionary(processed_data)
corpus = [dictionary.doc2bow(tokens) for tokens in processed_data]

list(top_words_per_topic.values())

[['know',
  'based',
  'new',
  'life',
  'actually',
  'thing',
  'question',
  'would',
  'like',
  'show'],
 ['u',
  'two',
  'crash',
  'iraq',
  'court',
  'return',
  'fire',
  'first',
  'team',
  'british'],
 ['new',
  'make',
  'kid',
  'people',
  'say',
  'gift',
  'fall',
  'like',
  'tweet',
  'plan']]

In [None]:

# Compute a coherence score from each topic's top words.
# NOTE(review): the negative score recorded under this cell does not look like a
# c_v result; it may come from an earlier u_mass run -- re-run to refresh the output.
topic_top_words = list(top_words_per_topic.values())
coherence_model = CoherenceModel(
    topics=topic_top_words,
    texts=processed_data,
    dictionary=dictionary,
    coherence='c_v',
)
# Alternative coherence measures, kept for reference:
#coherence_model = CoherenceModel(topics=list(top_words_per_topic.values()), corpus = corpus, dictionary=dictionary, coherence='u_mass')
#coherence_model = CoherenceModel(topics=list(top_words_per_topic.values()), texts=processed_data, dictionary=dictionary, coherence='c_uci')
coherence_score = coherence_model.get_coherence()
print('Coherence Score:', coherence_score)

Coherence Score: -13.869615911568884
