Source: https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [None]:
pip install git+https://github.com/rwalk/gsdmm.git

In [1]:
import pandas as pd
import os
import io
from gsdmm import MovieGroupProcess
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.datasets import fetch_20newsgroups
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora
from collections import defaultdict
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

In [2]:
# Step 1: Load the dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Step 2: Create a DataFrame for better handling
df = pd.DataFrame({
    'Text': newsgroups.data,
    'Category': [newsgroups.target_names[label] for label in newsgroups.target]
})

In [3]:
print(df.shape)

# Step 1: Count words in the 'Text' column
df['WordCount'] = df['Text'].apply(lambda x: len(x.split()))

# Step 2: Calculate the thresholds for the bottom and top 10%
lower_threshold = df['WordCount'].quantile(0.1)
upper_threshold = df['WordCount'].quantile(0.9)

# Step 3: Filter the DataFrame to exclude bottom 10% and top 10%
df = df[(df['WordCount'] > lower_threshold) & (df['WordCount'] < upper_threshold)]

# Step 2: Calculate statistics
median_word_count = df['WordCount'].median()
min_word_count = df['WordCount'].min()
max_word_count = df['WordCount'].max()

# Display the results
print("Median Word Count:", median_word_count)
print("Minimum Word Count:", min_word_count)
print("Maximum Word Count:", max_word_count)

# Display the filtered DataFrame
print(df.shape)

(18846, 2)
Median Word Count: 83.0
Minimum Word Count: 19
Maximum Word Count: 330
(15021, 3)


In [2]:
df = df = pd.read_csv('../Data/20NewsGroup_500.csv')
df.dropna(subset=['Text'], inplace=True)
# Randomly sample 500 rows (use the actual dataset size if it's smaller than 500)
#df = df.sample(n=min(500, len(df)), random_state=42)

# Rename 'title' column to 'Comment'
df.rename(columns={'Text': 'Comment'}, inplace=True)

df.shape

(493, 2)

In [3]:
# Download stopwords
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# def preprocess(text):
#     stop_words = set(stopwords.words('english'))
#     words = word_tokenize(text.lower())
#     return [word for word in words if word.isalpha() and word not in stop_words]

def preprocess(text):
    # Tokenize, remove stopwords and lemmatize
    return [lemmatizer.lemmatize(word) for word in gensim.utils.simple_preprocess(text) if word not in stop_words]

processed_data = [preprocess(text) for text in df['Comment']]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Create an instance of the GSDMM model
# num_topics = k = 5
mgp = MovieGroupProcess(K=20, alpha=0.1, beta=0.1, n_iters=15)

# Fit the model on the data
vocab = set(x for doc in processed_data for x in doc)
n_terms = len(vocab)
y = mgp.fit(processed_data, n_terms)

# To see the topics for the first 10 documents
#for i in range(5):
    #print(f"Document: {processed_data[i]}, Topic: {mgp.choose_best_label(processed_data[i])}")


# Assuming mgp is your fitted GSDMM model and processed_data is your documents

# Find the dominant topic for each document
doc_topic = [mgp.choose_best_label(doc) for doc in processed_data]

# Word frequencies per topic
topic_word_freq = defaultdict(lambda: defaultdict(int))
for doc, topic in zip(processed_data, doc_topic):
    for word in doc:
        topic_word_freq[topic[0]][word] += 1

# Extract top N words for each topic
top_n = 5
top_words_per_topic = {}
for topic, word_freq in topic_word_freq.items():
    top_words = sorted(word_freq, key=word_freq.get, reverse=True)[:top_n]
    top_words_per_topic[topic] = top_words

# Create a dictionary and corpus for coherence calculation
dictionary = Dictionary(processed_data)
corpus = [dictionary.doc2bow(doc) for doc in processed_data]

#list(top_words_per_topic.values())

# Using the top words for each topic to calculate coherence
coherence_model = CoherenceModel(topics=list(top_words_per_topic.values()),texts=processed_data,dictionary=dictionary, coherence='c_v')
#coherence_model = CoherenceModel(topics=list(top_words_per_topic.values()), corpus = corpus, dictionary=dictionary, coherence='u_mass')
#coherence_model = CoherenceModel(topics=list(top_words_per_topic.values()), texts=processed_data, dictionary=dictionary, coherence='c_uci')
coherence_score = coherence_model.get_coherence()
print('Coherence Score:', coherence_score)

In stage 0: transferred 449 clusters with 20 clusters populated
In stage 1: transferred 221 clusters with 19 clusters populated
In stage 2: transferred 96 clusters with 14 clusters populated
In stage 3: transferred 76 clusters with 12 clusters populated
In stage 4: transferred 63 clusters with 12 clusters populated
In stage 5: transferred 53 clusters with 10 clusters populated
In stage 6: transferred 57 clusters with 10 clusters populated
In stage 7: transferred 48 clusters with 9 clusters populated
In stage 8: transferred 41 clusters with 11 clusters populated
In stage 9: transferred 41 clusters with 7 clusters populated
In stage 10: transferred 39 clusters with 9 clusters populated
In stage 11: transferred 46 clusters with 8 clusters populated
In stage 12: transferred 44 clusters with 9 clusters populated
In stage 13: transferred 34 clusters with 10 clusters populated
In stage 14: transferred 40 clusters with 7 clusters populated
Coherence Score: 0.5941684511114177
