# Topic modeling

### Step we follow:
- Set up the environment and imported necessary libraries.
- Loaded and preprocessed the 20 Newsgroups dataset.
- Created a dictionary and corpus for Gensim.
- Trained an LDA (Latent Dirichlet Allocation) model.
- Evaluated the model using coherence score.
- Visualized the topics using pyLDAvis.
- Printed the top words for each topic.
- Assigned topics to documents.
- Analyzed and visualized the topic distribution.
- Created a function to get the topic for new text.

In [16]:
import os
import dill
import nltk
import joblib
import gensim
from gensim import corpora
from gensim.models import LdaMulticore
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import matplotlib.pyplot as plt

# Download required NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /home/ram/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ram/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ram/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Load and preprocess the 20 Newsgroups dataset:

In [2]:
# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(
    subset='all', remove=('headers', 'footers', 'quotes'))

# Create a DataFrame
df = pd.DataFrame({'text': newsgroups.data, 'target': newsgroups.target})

# Define preprocessing function


def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stopwords and non-alphabetic tokens
    tokens = [token for token in tokens if token.isalpha()
              and token not in STOPWORDS]

    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens


# Apply preprocessing to the text column
df['processed_text'] = df['text'].apply(preprocess_text)

### Create a dictionary and corpus for Gensim:

In [3]:
# Create a dictionary
dictionary = corpora.Dictionary(df['processed_text'])

# Filter out extreme words (appearing in less than 5 documents or more than 50% of documents)
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Create the corpus
corpus = [dictionary.doc2bow(text) for text in df['processed_text']]

### Train the LDA model:

In [4]:
# Set LDA parameters
num_topics = 20
chunksize = 2000
passes = 20
iterations = 400
eval_every = None

# Train the LDA model
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    chunksize=chunksize,
    passes=passes,
    iterations=iterations,
    eval_every=eval_every,
    workers=4  # Adjust based on your CPU cores
)

### Evaluate the model using coherence score:

In [6]:
from gensim.models import CoherenceModel

# Calculate coherence score
coherence_model_lda = CoherenceModel(
    model=lda_model, texts=df['processed_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')

Coherence Score: 0.6204193118079318


### Visualize the topics:

In [7]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Prepare the visualization
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)

# Save the visualization as an HTML file
pyLDAvis.save_html(vis_data, 'lda_visualization.html')
print("LDA visualization saved as 'lda_visualization.html'")

LDA visualization saved as 'lda_visualization.html'


### Print the top words for each topic:

In [8]:
def print_topics(lda_model, num_words=10):
    for idx, topic in lda_model.print_topics(-1, num_words):
        print(f"Topic: {idx}")
        print(", ".join([word.split("*")[1].strip().replace('"', '')
              for word in topic.split("+")]))
        print()


print_topics(lda_model)

Topic: 0
max, q, p, r, g, n, giz, bhj, m, w

Topic: 1
state, government, law, right, weapon, new, united, firearm, american, gun

Topic: 2
think, people, moral, right, value, human, society, yes, objective, point

Topic: 3
said, people, q, know, think, gun, time, child, president, going

Topic: 4
file, image, window, program, x, use, available, user, version, ftp

Topic: 5
m, g, r, p, c, b, n, w, s, o

Topic: 6
space, launch, nasa, mission, satellite, research, orbit, science, data, shuttle

Topic: 7
homosexual, sex, homosexuality, men, paul, church, male, sexual, gay, woman

Topic: 8
drive, card, disk, window, problem, use, know, thanks, work, like

Topic: 9
wire, power, use, ground, circuit, wiring, box, light, cable, outlet

Topic: 10
x, armenian, turkish, planet, turk, earth, year, entry, russian, genocide

Topic: 11
car, engine, new, know, like, problem, mile, tire, dealer, oil

Topic: 12
like, water, energy, time, battery, think, greek, american, right, mean

Topic: 13
book, theo

### Assign topics to documents:

In [9]:
def get_dominant_topic(lda_model, corpus):
    topic_assignments = []
    for doc in corpus:
        topic_dist = lda_model.get_document_topics(doc)
        dominant_topic = max(topic_dist, key=lambda x: x[1])[0]
        topic_assignments.append(dominant_topic)
    return topic_assignments


df['dominant_topic'] = get_dominant_topic(lda_model, corpus)

### Analyze topic distribution:

In [10]:
topic_distribution = df['dominant_topic'].value_counts().sort_index()

plt.figure(figsize=(12, 6))
topic_distribution.plot(kind='bar')
plt.title('Topic Distribution')
plt.xlabel('Topic')
plt.ylabel('Number of Documents')
plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig('topic_distribution.png')
plt.close()
print("Topic distribution plot saved as 'topic_distribution.png'")

Topic distribution plot saved as 'topic_distribution.png'


### Create a function to get topic for new text:

In [12]:
def get_topic_for_text(text, lda_model, dictionary):
    processed_text = preprocess_text(text)
    bow = dictionary.doc2bow(processed_text)
    topic_dist = lda_model.get_document_topics(bow)
    dominant_topic = max(topic_dist, key=lambda x: x[1])[0]
    return dominant_topic


# Example usage
new_text = "This is a sample text about computer science and programming."
topic = get_topic_for_text(new_text, lda_model, dictionary)
print(f"Dominant topic for the new text: {topic}")

Dominant topic for the new text: 4


### Save the model and other neccessary stuff for later use

In [27]:
def save_topic_model_components(lda_model, dictionary, corpus, df, preprocess_text_func, base_path="topic_model_data"):
    os.makedirs(base_path, exist_ok=True)

    joblib.dump(lda_model, os.path.join(base_path, "lda_model.joblib"))
    joblib.dump(dictionary, os.path.join(base_path, "dictionary.joblib"))
    joblib.dump(corpus, os.path.join(base_path, "corpus.joblib"))
    joblib.dump(df, os.path.join(base_path, "preprocessed_df.joblib"))

    with open(os.path.join(base_path, "preprocess_text_function.dill"), "wb") as f:
        dill.dump(preprocess_text_func, f)

    print("All topic model components saved successfully.")


# Usage
save_topic_model_components(lda_model, dictionary, corpus, df, preprocess_text)

All topic model components saved successfully.


### To load the saved components later:

In [19]:
def load_topic_model_components(base_path="topic_model_data"):
    lda_model = joblib.load(os.path.join(base_path, "lda_model.joblib"))
    dictionary = joblib.load(os.path.join(base_path, "dictionary.joblib"))
    corpus = joblib.load(os.path.join(base_path, "corpus.joblib"))
    df = joblib.load(os.path.join(base_path, "preprocessed_df.joblib"))

    with open(os.path.join(base_path, "preprocess_text_function.dill"), "rb") as f:
        preprocess_text_func = dill.load(f)

    print("All topic model components loaded successfully.")
    return lda_model, dictionary, corpus, df, preprocess_text_func


# Usage
loaded_lda_model, loaded_dictionary, loaded_corpus, loaded_df, loaded_preprocess_text = load_topic_model_components()

All topic model components loaded successfully.


In [20]:
# Example: Print topics using the loaded model
def print_topics(lda_model, num_words=10):
    for idx, topic in lda_model.print_topics(-1, num_words):
        print(f"Topic: {idx}")
        print(", ".join([word.split("*")[1].strip().replace('"', '')
              for word in topic.split("+")]))
        print()


print_topics(loaded_lda_model)

# Example: Get topic for new text using loaded components


def get_topic_for_text(text, lda_model, dictionary, preprocess_func):
    processed_text = preprocess_func(text)
    bow = dictionary.doc2bow(processed_text)
    topic_dist = lda_model.get_document_topics(bow)
    dominant_topic = max(topic_dist, key=lambda x: x[1])[0]
    return dominant_topic


new_text = "This is a sample text about computer science and programming."
topic = get_topic_for_text(new_text, loaded_lda_model,
                           loaded_dictionary, loaded_preprocess_text)
print(f"Dominant topic for the new text: {topic}")

Topic: 0
max, q, p, r, g, n, giz, bhj, m, w

Topic: 1
state, government, law, right, weapon, new, united, firearm, american, gun

Topic: 2
think, people, moral, right, value, human, society, yes, objective, point

Topic: 3
said, people, q, know, think, gun, time, child, president, going

Topic: 4
file, image, window, program, x, use, available, user, version, ftp

Topic: 5
m, g, r, p, c, b, n, w, s, o

Topic: 6
space, launch, nasa, mission, satellite, research, orbit, science, data, shuttle

Topic: 7
homosexual, sex, homosexuality, men, paul, church, male, sexual, gay, woman

Topic: 8
drive, card, disk, window, problem, use, know, thanks, work, like

Topic: 9
wire, power, use, ground, circuit, wiring, box, light, cable, outlet

Topic: 10
x, armenian, turkish, planet, turk, earth, year, entry, russian, genocide

Topic: 11
car, engine, new, know, like, problem, mile, tire, dealer, oil

Topic: 12
like, water, energy, time, battery, think, greek, american, right, mean

Topic: 13
book, theo

In [23]:
import unittest
import joblib
import dill
import numpy as np
import pandas as pd
from gensim.models import LdaMulticore
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel


class TestLDATopicModeling(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Load the trained model and necessary data
        cls.lda_model = joblib.load("topic_model_data/lda_model.joblib")
        cls.dictionary = joblib.load("topic_model_data/dictionary.joblib")
        cls.corpus = joblib.load("topic_model_data/corpus.joblib")
        cls.df = joblib.load("topic_model_data/preprocessed_df.joblib")

        with open("topic_model_data/preprocess_text_function.dill", "rb") as f:
            cls.preprocess_text = dill.load(f)

        # Load a small test set
        cls.test_texts = cls.df['text'].head(100).tolist()
        cls.test_processed = [cls.preprocess_text(
            text) for text in cls.test_texts]

    def test_model_type(self):
        self.assertIsInstance(self.lda_model, LdaMulticore)
        self.assertIsInstance(self.dictionary, corpora.Dictionary)
        self.assertIsInstance(self.corpus, list)

    def test_model_output_format(self):
        for doc in self.test_processed[:5]:  # Test with first 5 documents
            bow = self.dictionary.doc2bow(doc)
            topics = self.lda_model.get_document_topics(bow)
            self.assertTrue(all(isinstance(topic[0], int) and isinstance(
                topic[1], float) for topic in topics))
            self.assertTrue(all(0 <= prob <= 1 for _, prob in topics))

    def test_topic_coherence(self):
        coherence_model = CoherenceModel(
            model=self.lda_model, texts=self.test_processed, dictionary=self.dictionary, coherence='c_v')
        coherence_score = coherence_model.get_coherence()
        self.assertGreater(coherence_score, 0.0)  # Adjust threshold as needed

    def test_topic_diversity(self):
        N = 20
        topic_words = [word for topic in self.lda_model.show_topics(num_topics=-1, num_words=N, formatted=False)
                       for word, _ in topic[1]]
        diversity_score = len(set(topic_words)) / \
            (self.lda_model.num_topics * N)
        self.assertGreater(diversity_score, 0.0)  # Adjust threshold as needed

    def test_new_text_assignment(self):
        new_text = "This is a sample text about artificial intelligence and machine learning."
        processed_text = self.preprocess_text(new_text)
        bow = self.dictionary.doc2bow(processed_text)
        topics = self.lda_model.get_document_topics(bow)
        self.assertTrue(len(topics) > 0)
        self.assertIsInstance(topics[0][0], int)
        self.assertIsInstance(topics[0][1], float)
        self.assertTrue(0 <= topics[0][1] <= 1)

    def test_model_consistency(self):
        text = "This is a test text for consistency."
        processed_text = self.preprocess_text(text)
        bow = self.dictionary.doc2bow(processed_text)
        topics1 = self.lda_model.get_document_topics(bow)
        topics2 = self.lda_model.get_document_topics(bow)
        self.assertEqual(topics1, topics2)

    def test_preprocessing_function(self):
        text = "This is a TEST sentence with UPPERCASE words and punctuation!"
        processed = self.preprocess_text(text)
        self.assertTrue(all(word.islower() for word in processed))
        self.assertTrue(all(word.isalpha() for word in processed))

    def test_dictionary_filter(self):
        # Check if extreme words have been filtered out
        self.assertTrue(
            all(self.dictionary.dfs[id] >= 5 for id in self.dictionary.dfs))
        self.assertTrue(all(
            self.dictionary.dfs[id] / len(self.corpus) <= 0.5 for id in self.dictionary.dfs))

    def test_corpus_format(self):
        self.assertTrue(all(isinstance(doc, list)
                        for doc in self.corpus[:5]))  # Test with first 5 documents
        self.assertTrue(all(isinstance(word_id, int) and isinstance(count, int)
                            for doc in self.corpus[:5] for word_id, count in doc))


if __name__ == '__main__':
    unittest.main()

usage: ipykernel_launcher.py [-h] [-v] [-q] [--locals] [-f] [-c] [-b]
                             [-k TESTNAMEPATTERNS]
                             [tests ...]
ipykernel_launcher.py: error: argument -f/--failfast: ignored explicit argument '/home/ram/.local/share/jupyter/runtime/kernel-v33878badd108547c084ff41c9f1bf87ea8239424a.json'


AttributeError: 'tuple' object has no attribute 'tb_frame'

In [24]:
df.head()

Unnamed: 0,text,target,processed_text,dominant_topic
0,\n\nI am sure some bashers of Pens fans are pr...,10,"[sure, bashers, pen, fan, pretty, confused, la...",14
1,My brother is in the market for a high-perform...,3,"[brother, market, video, card, support, vesa, ...",8
2,\n\n\n\n\tFinally you said what you dream abou...,17,"[finally, said, dream, mediterranean, new, are...",3
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,"[think, scsi, card, dma, transfer, disk, scsi,...",8
4,1) I have an old Jasmine drive which I cann...,4,"[old, jasmine, drive, use, new, understanding,...",8


In [25]:
df['tokenized_text'] = df['text'].apply(word_tokenize)

In [26]:
df.head()

Unnamed: 0,text,target,processed_text,dominant_topic,tokenized_text
0,\n\nI am sure some bashers of Pens fans are pr...,10,"[sure, bashers, pen, fan, pretty, confused, la...",14,"[I, am, sure, some, bashers, of, Pens, fans, a..."
1,My brother is in the market for a high-perform...,3,"[brother, market, video, card, support, vesa, ...",8,"[My, brother, is, in, the, market, for, a, hig..."
2,\n\n\n\n\tFinally you said what you dream abou...,17,"[finally, said, dream, mediterranean, new, are...",3,"[Finally, you, said, what, you, dream, about, ..."
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,"[think, scsi, card, dma, transfer, disk, scsi,...",8,"[Think, !, It, 's, the, SCSI, card, doing, the..."
4,1) I have an old Jasmine drive which I cann...,4,"[old, jasmine, drive, use, new, understanding,...",8,"[1, ), I, have, an, old, Jasmine, drive, which..."


In [28]:
import unittest
import joblib
import dill
import numpy as np
import pandas as pd
from gensim.models import LdaMulticore
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('stopwords', quiet=True)


class TestLDATopicModeling(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Load the trained model and necessary data
        cls.lda_model = joblib.load("topic_model_data/lda_model.joblib")
        cls.dictionary = joblib.load("topic_model_data/dictionary.joblib")
        cls.corpus = joblib.load("topic_model_data/corpus.joblib")
        cls.df = joblib.load("topic_model_data/preprocessed_df.joblib")

        with open("topic_model_data/preprocess_text_function.dill", "rb") as f:
            cls.preprocess_text = dill.load(f)

        # Load a small test set, now using the pre-tokenized texts
        cls.test_texts = cls.df['tokenized_text'].head(100).tolist()
        cls.test_processed = [cls.preprocess_text(
            text) for text in cls.test_texts]

    def test_model_type(self):
        self.assertIsInstance(self.lda_model, LdaMulticore)
        self.assertIsInstance(self.dictionary, corpora.Dictionary)
        self.assertIsInstance(self.corpus, list)

    def test_model_output_format(self):
        for doc in self.test_processed[:5]:  # Test with first 5 documents
            bow = self.dictionary.doc2bow(doc)
            topics = self.lda_model.get_document_topics(bow)
            self.assertTrue(all(isinstance(topic[0], int) and isinstance(
                topic[1], float) for topic in topics))
            self.assertTrue(all(0 <= prob <= 1 for _, prob in topics))

    def test_topic_coherence(self):
        coherence_model = CoherenceModel(
            model=self.lda_model, texts=self.test_processed, dictionary=self.dictionary, coherence='c_v')
        coherence_score = coherence_model.get_coherence()
        self.assertGreater(coherence_score, 0.0)  # Adjust threshold as needed

    def test_topic_diversity(self):
        N = 20
        topic_words = [word for topic in self.lda_model.show_topics(num_topics=-1, num_words=N, formatted=False)
                       for word, _ in topic[1]]
        diversity_score = len(set(topic_words)) / \
            (self.lda_model.num_topics * N)
        self.assertGreater(diversity_score, 0.0)  # Adjust threshold as needed

    def test_new_text_assignment(self):
        new_text = "This is a sample text about artificial intelligence and machine learning."
        processed_text = self.preprocess_text(new_text)
        bow = self.dictionary.doc2bow(processed_text)
        topics = self.lda_model.get_document_topics(bow)
        self.assertTrue(len(topics) > 0)
        self.assertIsInstance(topics[0][0], int)
        self.assertIsInstance(topics[0][1], float)
        self.assertTrue(0 <= topics[0][1] <= 1)

    def test_model_consistency(self):
        text = "This is a test text for consistency."
        processed_text = self.preprocess_text(text)
        bow = self.dictionary.doc2bow(processed_text)
        topics1 = self.lda_model.get_document_topics(bow)
        topics2 = self.lda_model.get_document_topics(bow)
        self.assertEqual(topics1, topics2)

    def test_preprocessing_function(self):
        text = "This is a TEST sentence with UPPERCASE words and punctuation!"
        processed = self.preprocess_text(text)
        self.assertTrue(all(word.islower() for word in processed))
        self.assertTrue(all(word.isalpha() for word in processed))

    def test_dictionary_filter(self):
        # Check if extreme words have been filtered out
        self.assertTrue(
            all(self.dictionary.dfs[id] >= 5 for id in self.dictionary.dfs))
        self.assertTrue(all(
            self.dictionary.dfs[id] / len(self.corpus) <= 0.5 for id in self.dictionary.dfs))

    def test_corpus_format(self):
        self.assertTrue(all(isinstance(doc, list)
                        for doc in self.corpus[:5]))  # Test with first 5 documents
        self.assertTrue(all(isinstance(word_id, int) and isinstance(count, int)
                            for doc in self.corpus[:5] for word_id, count in doc))


unittest.main()

usage: ipykernel_launcher.py [-h] [-v] [-q] [--locals] [-f] [-c] [-b]
                             [-k TESTNAMEPATTERNS]
                             [tests ...]
ipykernel_launcher.py: error: argument -f/--failfast: ignored explicit argument '/home/ram/.local/share/jupyter/runtime/kernel-v33878badd108547c084ff41c9f1bf87ea8239424a.json'


AttributeError: 'tuple' object has no attribute 'tb_frame'