<a href="https://colab.research.google.com/github/srinikha193/Text_Analysis-Topic_Modeling/blob/main/4modelcomparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install top2vec




In [5]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.16.4


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from top2vec import Top2Vec
from bertopic import BERTopic
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel, nmf
import numpy as np

# Ensure necessary NLTK resources are downloaded
nltk.download('stopwords')

# Location of the combined dataset (Colab default); edit here when running elsewhere.
DATA_PATH = '/content/combined_dataset.csv'

# Load only the 'text' column of the single CSV file. Rows whose text is
# missing are dropped because the regex-based preprocessing cannot handle NaN.
data = (
    pd.read_csv(DATA_PATH, usecols=['text'])
    .dropna(subset=['text'])
    .reset_index(drop=True)
)

# Preprocessing function
_ENGLISH_STOPWORDS = None  # filled lazily by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stopword list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # A frozenset gives O(1) membership tests; the raw NLTK list would be
        # re-read and scanned linearly for every single token otherwise.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Clean a raw text and split it into lowercase, stopword-free tokens.

    Steps: strip URLs (``http...`` / ``www....``), delete every character that
    is neither an ASCII letter nor whitespace, lowercase, trim, split on
    whitespace, and drop NLTK English stopwords.

    Args:
        text: The raw text. Anything that is not a ``str`` (e.g. ``None`` or
            the float NaN pandas uses for missing cells) is treated as empty.

    Returns:
        list[str]: The remaining tokens in their original order; ``[]`` for
        non-string or fully filtered input.
    """
    if not isinstance(text, str):
        return []
    text = re.sub(r'http\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower().strip()  # Normalise case and outer whitespace
    stop_words = _english_stopwords()
    return [word for word in text.split() if word not in stop_words]

# Apply preprocessing: each text becomes a list of cleaned tokens
data['processed_tweets'] = data['text'].apply(preprocess_text)

# Prepare corpus and dictionary for Gensim coherence calculations
dictionary = Dictionary(data['processed_tweets'])
corpus = [dictionary.doc2bow(text) for text in data['processed_tweets']]

# Re-join the tokens once: the scikit-learn vectorizers, Top2Vec and BERTopic
# below are all fed these space-separated strings.
documents = [" ".join(tweet) for tweet in data['processed_tweets']]

# Vectorization using TF-IDF
vectorizer_tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer_tfidf.fit_transform(documents)

# Vectorization using Count Vectorizer
vectorizer_count = CountVectorizer(max_features=5000)
X_count = vectorizer_count.fit_transform(documents)

# Baseline 3-topic LDA: a Gensim model on the bag-of-words corpus and a
# scikit-learn model on the count matrix
gensim_lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=3, random_state=42)
lda_model = LatentDirichletAllocation(n_components=3, random_state=42)
lda_model.fit(X_count)

# Baseline 3-topic NMF: a Gensim model on the bag-of-words corpus and a
# scikit-learn model on the TF-IDF matrix
gensim_nmf_model = nmf.Nmf(corpus=corpus, num_topics=3, id2word=dictionary, random_state=42)
nmf_model = NMF(n_components=3, random_state=42)
nmf_model.fit(X_tfidf)

# Top2Vec model
top2vec_model = Top2Vec(documents=documents, speed="learn", workers=4)

# BERTopic model
bertopic_model = BERTopic()
topics, probs = bertopic_model.fit_transform(documents)

# Coherence score calculation
def calculate_coherence(model, texts, dictionary, corpus, measure='c_v'):
    """Score a topic model with a CoherenceModel.

    Args:
        model: The topic model, forwarded as ``model=``.
        texts: The texts, forwarded as ``texts=``.
        dictionary: The dictionary, forwarded as ``dictionary=``.
        corpus: The corpus, forwarded as ``corpus=``.
        measure: Name of the coherence measure, forwarded as ``coherence=``
            (default ``'c_v'``).

    Returns:
        Whatever ``CoherenceModel(...).get_coherence()`` returns.
    """
    scorer_kwargs = {
        'model': model,
        'texts': texts,
        'dictionary': dictionary,
        'corpus': corpus,
        'coherence': measure,
    }
    return CoherenceModel(**scorer_kwargs).get_coherence()

# Fine-tuning and coherence score evaluation
# Grid-search the Gensim LDA over the number of topics and the alpha prior,
# keeping the candidate with the highest c_v coherence.
best_coherence = float('-inf')  # any real-valued score beats the initial value
best_lda_model = None
for num_topics in [2, 3, 5, 10]:
    for alpha in ['symmetric', 'asymmetric', 0.1, 0.5]:
        # A dedicated name keeps the scikit-learn `lda_model` fitted earlier intact.
        candidate_lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, alpha=alpha, random_state=42)
        coherence = calculate_coherence(candidate_lda, data['processed_tweets'], dictionary, corpus)
        print(f"LDA - Num Topics: {num_topics}, Alpha: {alpha}, Coherence: {coherence}")

        if coherence > best_coherence:
            best_coherence = coherence
            best_lda_model = candidate_lda

print(f"Best LDA Coherence: {best_coherence}")

# Grid-search the scikit-learn NMF over the number of components, scoring each
# candidate by the c_v coherence of its top words per topic.
TOP_N_WORDS = 10  # words per topic handed to the coherence model
best_nmf_coherence = float('-inf')  # any real-valued score beats the initial value
best_nmf_model = None
# The vocabulary does not change between candidates, so look it up once.
feature_names = vectorizer_tfidf.get_feature_names_out()
for n_components in [2, 3, 5, 10]:
    # A dedicated name keeps the 3-component `nmf_model` fitted earlier intact.
    candidate_nmf = NMF(n_components=n_components, init='random', random_state=42)
    candidate_nmf.fit(X_tfidf)
    # Highest-weighted terms first: reverse the ascending argsort, then truncate.
    topic_words = [
        [feature_names[i] for i in topic.argsort()[::-1][:TOP_N_WORDS]]
        for topic in candidate_nmf.components_
    ]
    coherence = CoherenceModel(topics=topic_words, texts=data['processed_tweets'], dictionary=dictionary, coherence='c_v').get_coherence()
    print(f"NMF - Components: {n_components}, Coherence: {coherence}")

    if coherence > best_nmf_coherence:
        best_nmf_coherence = coherence
        best_nmf_model = candidate_nmf

print(f"Best NMF Coherence: {best_nmf_coherence}")

# Coherence score for Top2Vec
# get_topics() returns a tuple (topic_words, word_scores, topic_nums); only the
# first element, one array of words per topic, is needed here.
top2vec_topics = top2vec_model.get_topics()
top2vec_topic_words = [[str(word) for word in words] for words in top2vec_topics[0]]
top2vec_coherence = CoherenceModel(topics=top2vec_topic_words, texts=data['processed_tweets'], dictionary=dictionary, coherence='c_v').get_coherence()
print(f"Top2Vec Coherence Score: {top2vec_coherence}")

# Coherence score for BERTopic
bertopic_topics = bertopic_model.get_topic_info()
# Use the real topic ids from the info table and skip -1, BERTopic's outlier
# topic. Positional indices would miss -1 and run past the last real id.
bertopic_topic_ids = [topic_id for topic_id in bertopic_topics['Topic'] if topic_id != -1]
# get_topic() yields (word, score) pairs; keep the top 10 words and drop the
# empty-string padding BERTopic emits for topics with fewer words.
bertopic_topic_words = [
    [word for word, _score in bertopic_model.get_topic(topic_id)[:10] if word]
    for topic_id in bertopic_topic_ids
]
bertopic_topic_words = [words for words in bertopic_topic_words if words]
bertopic_coherence = CoherenceModel(topics=bertopic_topic_words, texts=data['processed_tweets'], dictionary=dictionary, coherence='c_v').get_coherence()
print(f"BERTopic Coherence Score: {bertopic_coherence}")

# Explanation:
# Coherence scores (c_v) are calculated for LDA, NMF, Top2Vec and BERTopic. The coherence score is used to evaluate
# the quality of the topics generated by each model, helping to determine which model performs best for the dataset.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
