### 🚀 Task 1: Load Dataset & Seed Words

In [1]:
import pandas as pd
import json
import nltk
nltk.download("stopwords")

# Load the emotion dataset from a CSV file located next to the notebook.
dataset_path = "emotion_dataset.csv"
df = pd.read_csv(dataset_path)

# Preview the first rows. A bare `df.head()` in the middle of a cell is
# evaluated and thrown away (only the LAST expression of a cell is rendered),
# so the preview is shown explicitly. `display` is the built-in that the
# IPython/Jupyter kernel injects into every notebook namespace.
display(df.head())

print("✅ Emotion dataset loaded successfully!")

# Load seed words (if available).
# The JSON file is expected to map each emotion name to a list of seed words
# (see the printed output of this cell).
seed_words_path = "seed_words.json"

try:
    with open(seed_words_path, "r", encoding="utf-8") as file:
        seed_words = json.load(file)
except FileNotFoundError:
    # `seed_words` is deliberately left undefined here: the Seeded LDA cell
    # checks for the name's existence before training.
    print("⚠️ Seed words file not found. Please upload 'seed_words.json'!")
else:
    # Only reached when the file was opened and parsed successfully.
    print("✅ Seed words loaded successfully!")
    print(seed_words)


✅ Emotion dataset loaded successfully!
✅ Seed words loaded successfully!
{'anger': ['outrageous', 'infuriating', 'ridiculous', 'absurd', 'exasperating', 'disgusting', 'insulting', 'offensive', 'intolerable', 'unacceptable', 'outrage', 'insane', 'angry', 'upset', 'boiling', 'seething', 'frustrated', 'mad', 'irritated', 'livid', 'indignant', 'agitated', 'annoyed', 'pissed', 'irate', 'aggravated', 'enraged', 'bitter', 'displeased', 'disgruntled', 'vexed', 'temperamental', 'cross', 'testy', 'impatient', 'belligerent', 'furious', 'hostile', 'offended', 'exasperated', 'resentful'], 'sadness': ['sad', 'unhappy', 'melancholy', 'dejected', 'mournful', 'downcast', 'despondent', 'blue', 'dismal', 'gloomy', 'forlorn', 'heartbroken', 'woeful', 'crestfallen', 'disheartened', 'grief', 'sorrowful', 'tearful', 'somber', 'bereaved', 'lamenting', 'doleful', 'mournful', 'lugubrious', 'pensive', 'heavyhearted', 'woebegone', 'troubled', 'depressed', 'brokenhearted', 'weepy', 'funereal', 'downhearted', 'low-

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rajubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 🚀 Task 2: Preprocessing the Text

In [2]:
import spacy
import re
from nltk.corpus import stopwords

# spaCy English pipeline used by `preprocess_text` for tokenization and lemmatization
nlp = spacy.load("en_core_web_sm")

# NLTK's English stopword list, held in a set so membership tests are cheap
stop_words = set()
stop_words.update(stopwords.words("english"))

# Preprocessing function
def preprocess_text(text):
    """Normalize a raw text string into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is neither a word character nor
         whitespace (the characters are removed, not replaced by a space);
      3. run the module-level spaCy pipeline `nlp` over the result;
      4. keep the lemma of each token that is purely alphabetic and whose
         surface text is not in the module-level `stop_words` set.

    Args:
        text: The input string (must support `.lower()`).

    Returns:
        The kept lemmas joined by single spaces; an empty string when no
        token survives the filters.
    """
    cleaned = re.sub(r"[^\w\s]", "", text.lower())

    lemmas = []
    for token in nlp(cleaned):
        # Skip numbers/mixed tokens and stopwords (the stopword check is on
        # the token text, not on its lemma).
        if not token.is_alpha or token.text in stop_words:
            continue
        lemmas.append(token.lemma_)

    return " ".join(lemmas)

# Apply preprocessing to every document.
# Missing texts are replaced by "" first: `astype(str)` alone would turn NaN
# into the literal string "nan", which would survive preprocessing as a
# bogus word and end up in the vocabulary.
df["processed_text"] = df["text"].fillna("").astype(str).apply(preprocess_text)

# Show a small sample of raw vs. cleaned text. A bare DataFrame expression in
# the middle of a cell is not rendered (only the last expression is), so it is
# displayed explicitly, limited to the first rows to keep the output short.
display(df[["text", "processed_text"]].head())

print("✅ Text preprocessing completed!")


✅ Text preprocessing completed!


### 🚀 Task 3: Train 5 LDAs with K=4 Topics

In [None]:
from lda_model import LDA_Model  # Load custom LDA model from Moodle
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix (DTM) from the preprocessed text,
# keeping at most 10,000 vocabulary terms.
vectorizer = CountVectorizer(max_features=10000)
X = vectorizer.fit_transform(df["processed_text"])

# Fit five LDA models with 4 topics each; the loop index doubles as the
# random seed, so the runs differ only in their seed.
lda_models = []
for i in range(5):
    lda = LDA_Model(n_topics=4, n_iter=500, random_state=i)  # 500 iterations per run
    lda.fit(X)
    lda_models.append(lda)
    print(f"✅ LDA Model {i+1} trained successfully!")

# Keep the fitted models addressable by name ("LDA_1" ... "LDA_5").
# Note: this is an in-memory dictionary; nothing is written to disk here.
lda_models_dict = {}
for i, lda in enumerate(lda_models):
    lda_models_dict[f"LDA_{i+1}"] = lda
print("✅ All 5 LDA models trained and saved!")


### 🚀 Task 4: Train a Seeded LDA

In [None]:
from lda_model import SeededLDA_Model  # Load Seeded LDA model from Moodle

# Seeded LDA can only be trained when the seed-word dictionary was loaded
# earlier; bail out with a warning when the name does not exist.
if "seed_words" not in locals():
    print("⚠️ Seed words not found. Please check 'seed_words.json'!")
else:
    # Fit a 4-topic Seeded LDA on the same document-term matrix `X`
    seeded_lda = SeededLDA_Model(n_topics=4, n_iter=500, random_state=42, seed_words=seed_words)
    seeded_lda.fit(X)

    # `display_topics` is iterated below as a mapping of
    # topic number -> sequence of word strings.
    topics = seeded_lda.display_topics(vectorizer)
    print("✅ Seeded LDA trained successfully! Top words per topic:")
    for topic_num, words in topics.items():
        print(f"Topic {topic_num}: {', '.join(words)}")


### 🚀 Task 5: Evaluate with Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def get_dominant_topic(model, X):
    """Return the model's topic assignment for the documents in `X`.

    Thin wrapper that delegates to `model.documents_to_topic_model(X)` and
    returns its result unchanged.

    Args:
        model: Any object exposing a `documents_to_topic_model(X)` method.
        X: The document representation accepted by that method.

    Returns:
        Whatever `documents_to_topic_model` returns.
        NOTE(review): presumably one dominant topic index per document;
        confirm against the `lda_model` module.
    """
    dominant_topics = model.documents_to_topic_model(X)
    return dominant_topics

# Compute dominant topics for each of the five unseeded LDA models,
# keyed by model name ("LDA_1" ... "LDA_5").
lda_results = {f"LDA_{i+1}": get_dominant_topic(lda, X) for i, lda in enumerate(lda_models)}

# Compute dominant topics for Seeded LDA
seeded_lda_results = get_dominant_topic(seeded_lda, X)

# Convert true labels to numerical format
label_mapping = {"anger": 0, "joy": 1, "sadness": 2, "optimism": 3}
true_labels = df["emotion"].map(label_mapping)

# `Series.map` yields NaN for any emotion that is not a key of
# `label_mapping`. Fail here with an explicit message instead of letting the
# NaNs surface later as an opaque error inside `confusion_matrix`.
_unmapped_rows = true_labels.isna()
if _unmapped_rows.any():
    _unknown_emotions = sorted(df.loc[_unmapped_rows, "emotion"].astype(str).unique())
    raise ValueError(
        f"df['emotion'] contains values missing from label_mapping: {_unknown_emotions}"
    )

def _plot_confusion_matrix(cm, title, cmap, xticklabels, yticklabels):
    """Draw one confusion matrix as an annotated 6x6-inch heatmap.

    Args:
        cm: Square matrix of integer counts (rows = true, columns = predicted).
        title: Figure title.
        cmap: Name of the matplotlib/seaborn colormap.
        xticklabels: Labels for the predicted (column) axis.
        yticklabels: Labels for the true (row) axis.
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap=cmap,
                xticklabels=xticklabels, yticklabels=yticklabels, ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title(title)
    plt.show()
    plt.close(fig)  # release the figure; several are created in a row


# Fix the class ids and their order explicitly. Without `labels=`,
# `confusion_matrix` only uses the values that actually occur in its inputs,
# so a topic or emotion that never appears would shrink the matrix and shift
# it against the four tick labels.
# NOTE(review): assumes the models number their topics 0..3; confirm against
# the `lda_model` module.
class_ids = list(label_mapping.values())
emotion_names = list(label_mapping.keys())

# Topic ids of an unseeded LDA are arbitrary: topic k has no built-in link to
# the emotion with id k. The predicted axis is therefore labelled with neutral
# topic names rather than emotion names.
topic_names = [f"Topic {k}" for k in class_ids]

# Compare each LDA model using confusion matrix
for model_name, preds in lda_results.items():
    cm = confusion_matrix(true_labels, preds, labels=class_ids)
    _plot_confusion_matrix(cm, f"Confusion Matrix - {model_name}", "Blues",
                           xticklabels=topic_names, yticklabels=emotion_names)

    print(f"✅ Confusion matrix for {model_name} displayed!")

# Compare Seeded LDA using confusion matrix
# NOTE(review): the emotion names on the predicted axis assume that seeded
# topic k corresponds to the emotion with id k in `label_mapping` — confirm
# against SeededLDA_Model and the key order of `seed_words`.
cm_seeded = confusion_matrix(true_labels, seeded_lda_results, labels=class_ids)

# Plot Confusion Matrix for Seeded LDA
_plot_confusion_matrix(cm_seeded, "Confusion Matrix - Seeded LDA", "Greens",
                       xticklabels=emotion_names, yticklabels=emotion_names)

print("✅ Confusion matrix for Seeded LDA displayed!")
