LDA topic model for content in each category

source: https://developer.ibm.com/tutorials/awb-lda-topic-modeling-text-analysis-python/

In [1]:
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvisualize

In [2]:
def preprocess_text(text):
    """Turn a raw document string into a list of cleaned, lemmatized tokens.

    Steps, in order: lower-case the text, delete every character that is not
    an ASCII letter or whitespace, tokenize with NLTK's ``word_tokenize``,
    drop tokens found in NLTK's English stop-word list, and lemmatize the
    remaining tokens with ``WordNetLemmatizer``.

    Args:
        text: The document to process, as a ``str``.

    Returns:
        A list of token strings, in their original order.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation and special characters
    tokens = word_tokenize(text)  # Tokenize

    # Load the stop-word list once per call and keep it in a set. Calling
    # stopwords.words('english') inside the comprehension re-read the list
    # for every single token and then searched it linearly.
    stop_words = set(stopwords.words('english'))

    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

LDA model for all documents

In [4]:
# LDA modeling for all documents
data = pd.read_csv('dataset/category_with_document.csv')

# Clean every document into a token list (see preprocess_text).
data['processed_Document'] = data['Document'].apply(preprocess_text)
texts = data['processed_Document'].tolist()

# Build the token <-> id dictionary, then drop tokens that occur in fewer
# than 2 documents (filter_extremes' other thresholds stay at their defaults).
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=2)

# Bag-of-words corpus: one list of (token_id, count) pairs per document.
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model. random_state pins the random initialisation so the
# topics and the coherence score are reproducible on Restart & Run All;
# without it every run yields different topics.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    chunksize=20,
    num_topics=5,
    passes=200,
    iterations=100,
    random_state=42,
)

for topic in lda_model.print_topics(num_topics=5, num_words=10):
    print(topic)

# c_v coherence is computed from the tokenized texts, not the BoW corpus.
coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print('Coherence Score:', coherence_score)

(0, '0.025*"tr" + 0.018*"model" + 0.017*"td" + 0.016*"using" + 0.014*"spark" + 0.014*"data" + 0.012*"v" + 0.012*"user" + 0.009*"application" + 0.009*"access"')
(1, '0.163*"version" + 0.112*"supported" + 0.082*"x" + 0.053*"release" + 0.036*"whitecheckmark" + 0.032*"latest" + 0.030*"update" + 0.022*"support" + 0.019*"currently" + 0.014*"patch"')
(2, '0.028*"please" + 0.023*"issue" + 0.021*"bug" + 0.019*"code" + 0.017*"file" + 0.016*"via" + 0.015*"bounty" + 0.015*"key" + 0.014*"found" + 0.012*"u"')
(3, '0.106*"policy" + 0.098*"vulnerability" + 0.034*"reporting" + 0.029*"please" + 0.026*"report" + 0.022*"project" + 0.019*"use" + 0.018*"disclosure" + 0.014*"fix" + 0.014*"information"')
(4, '0.060*"issue" + 0.060*"vulnerability" + 0.052*"report" + 0.028*"reporting" + 0.025*"please" + 0.020*"github" + 0.018*"email" + 0.014*"public" + 0.012*"project" + 0.012*"team"')
Coherence Score: 0.5324110355105856


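λ = 1; the terms are ranked purely by their probability within the topic, so words that are frequent across the whole corpus tend to appear at the top of every topic.

λ = 0; the terms are ranked purely by their lift, i.e. the ratio of a term's probability within the topic to its overall probability in the corpus, which favours terms that are nearly exclusive to the topic (even if they are rare).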

In [5]:
# Build the pyLDAvis data for the model trained on all documents.
# mds='mmds' selects the projection used to lay the topics out in 2-D.
dickens_visual = gensimvisualize.prepare(
    lda_model,
    corpus,
    dictionary,
    mds='mmds',
)
# Last expression of the cell: renders the interactive visualisation inline.
pyLDAvis.display(dickens_visual)



LDA modeling for each category

In [None]:

data = pd.read_csv('dataset/category_with_document.csv')

# Clean every document into a token list (see preprocess_text).
data['processed_Document'] = data['Document'].apply(preprocess_text)
# group documents by category: one row per category holding a list of token lists
grouped_data = data.groupby('Category')['processed_Document'].apply(list).reset_index()

# category name -> {"lda_model", "corpus", "dictionary", "coherence_score"}
models = {}

for _, row in grouped_data.iterrows():
    category = row['Category']
    texts = row['processed_Document']

    # Build the dictionary from this category's documents only, then drop
    # tokens that occur in fewer than 2 of them.
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=2)

    print(f"Category: {category}")

    # A category with very few documents can lose its whole vocabulary to
    # the filter above. LdaModel raises on an empty vocabulary, which would
    # abort the loop for every remaining category, so skip it instead.
    if len(dictionary) == 0:
        print('Skipped: no terms left after filtering the dictionary')
        print('\n')
        continue

    # generate corpus as BoW
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Train the LDA model. random_state pins the random initialisation so
    # each category's topics are reproducible between runs.
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        chunksize=20,
        num_topics=7,
        passes=200,
        iterations=40,
        random_state=42,
    )

    for topic in lda_model.print_topics(num_topics=7, num_words=10):
        print(topic)

    coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    print('Coherence Score:', coherence_score)

    # Keep everything needed to visualise this category later.
    models[category] = {
        "lda_model": lda_model,
        "corpus": corpus,
        "dictionary": dictionary,
        "coherence_score": coherence_score
    }

    print('\n')

In [None]:
# Category whose model should be visualised; must be a key of `models`.
category = 'Projects practise'

# Fetch the stored bundle once and unpack the three pieces pyLDAvis needs.
selected = models[category]
lda_model, corpus, dictionary = (
    selected[key] for key in ('lda_model', 'corpus', 'dictionary')
)

dickens_visual = gensimvisualize.prepare(
    lda_model,
    corpus,
    dictionary,
    mds='mmds',
)
# Last expression of the cell: renders the interactive visualisation inline.
pyLDAvis.display(dickens_visual)