https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [1]:
!pip install gensim nltk pyLDAvis



In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim_models

In [2]:
nltk.download('stopwords', force=True)
nltk.download('punkt', force=True)
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
import pandas as pd
import os
import io

In [4]:
# Read the CSV file into a DataFrame
df = pd.read_csv('../Data/YT_title_test_data.csv')
#df = df = pd.read_csv('../Data/YT_title_test_data_500.csv')
df.shape

(10000, 2)

In [6]:
# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,title,label
0,"15 Highly Important Questions About Adulthood,...",clickbait
1,250 Nuns Just Cycled All The Way From Kathmand...,clickbait
2,"Australian comedians ""could have been shot"" du...",not-clickbait
3,Lycos launches screensaver to increase spammer...,not-clickbait
4,Fußball-Bundesliga 2008–09: Goalkeeper Butt si...,not-clickbait


In [5]:
# Step 1: Preprocess Data
def preprocess(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())
    return [word for word in words if word.isalpha() and word not in stop_words]

processed_data = df['title'].apply(preprocess)

In [7]:
# Create Dictionary and Corpus
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

# Step 2: LDA Model Training
lda_model = LdaModel(corpus, num_topics=25, id2word=dictionary, passes=15)

# Step 4: Coherence Evaluation with top_n words
top_n = 10  # Define the number of top words to consider
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=processed_data,
    dictionary=dictionary,
    coherence='c_v',
    #topn=top_n
)
coherence_lda = coherence_model_lda.get_coherence()
print(f'\nCoherence Score with top {top_n} words:', coherence_lda)


Coherence Score with top 10 words: 0.6173446331999689


In [63]:
# Step 3: Visualization
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

In [40]:
num_words = 5
topics = lda_model.print_topics(num_topics=-1, num_words=num_words)
for topic in topics:
    print(topic)

(0, '0.084*"us" + 0.026*"tell" + 0.022*"may" + 0.016*"reasons" + 0.016*"season"')
(1, '0.027*"sex" + 0.024*"feel" + 0.023*"old" + 0.018*"making" + 0.018*"books"')
(2, '0.087*"things" + 0.081*"know" + 0.048*"one" + 0.038*"based" + 0.036*"people"')
(3, '0.032*"american" + 0.022*"woman" + 0.022*"tv" + 0.020*"single" + 0.020*"die"')
(4, '0.032*"ways" + 0.032*"everyone" + 0.024*"canadian" + 0.024*"halloween" + 0.022*"want"')
(5, '0.035*"look" + 0.033*"harry" + 0.029*"potter" + 0.024*"questions" + 0.023*"fans"')
(6, '0.030*"india" + 0.029*"hilarious" + 0.025*"party" + 0.024*"friends" + 0.023*"work"')
(7, '0.173*"new" + 0.020*"york" + 0.020*"coast" + 0.020*"big" + 0.019*"report"')
(8, '0.041*"man" + 0.036*"british" + 0.024*"fire" + 0.021*"us" + 0.020*"bush"')
(9, '0.047*"love" + 0.027*"music" + 0.025*"years" + 0.017*"iran" + 0.016*"prison"')
(10, '0.065*"like" + 0.028*"video" + 0.026*"fall" + 0.025*"looks" + 0.022*"west"')
(11, '0.066*"life" + 0.030*"christmas" + 0.027*"would" + 0.024*"name" 