In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score, classification_report

# Load Dataset
df = pd.read_csv("../data/SocialMediaSentimentsAnalysisDataset.csv")

# Keep only relevant columns
df = df[['Text', 'Sentiment']].copy()

# Drop missing values
df.dropna(inplace=True)

# Standardize sentiment labels (remove extra spaces, convert to lowercase)
df["Sentiment"] = df["Sentiment"].str.strip().str.lower()

# Check class distribution before mapping
print("Original Sentiment Distribution:\n", df["Sentiment"].value_counts())

# Define sentiment grouping
sentiment_map = {
    # Positive Sentiments
    "joy": "positive", "happiness": "positive", "excitement": "positive",
    "contentment": "positive", "admiration": "positive", "love": "positive",
    "optimism": "positive", "gratitude": "positive", "relief": "positive",
    "celestial wonder": "positive", "nature's beauty": "positive", "thrilling journey": "positive",
    
    # Neutral Sentiments
    "acceptance": "neutral", "calm": "neutral", "whispers of the past": "neutral",

    # Negative Sentiments
    "anger": "negative", "fear": "negative", "sadness": "negative",
    "disgust": "negative", "disappointment": "negative", "guilt": "negative",
    "shame": "negative", "frustration": "negative", "anxiety": "negative",
    "betrayal": "negative"
}

# Map sentiments to broader categories
df["Sentiment"] = df["Sentiment"].map(sentiment_map)
df = df.dropna(subset=["Sentiment"])  # Drops any rows with unmapped sentiments

# Check class distribution after mapping
print("New Sentiment Distribution:\n", df["Sentiment"].value_counts())

# Vectorize text
vectorizer = TfidfVectorizer(stop_words="english", max_features=3000, ngram_range=(1,2))
X = vectorizer.fit_transform(df["Text"])
y = df["Sentiment"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Complement Naive Bayes
nb_classifier = ComplementNB()
nb_classifier.fit(X_train, y_train)

# Evaluate Model
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Step 6: Test on New Sentences
new_texts = ["I love this product!", "This is the worst experience ever.", "It's okay, not great."]
new_texts_vectorized = vectorizer.transform(new_texts)
predictions = nb_classifier.predict(new_texts_vectorized)
print("Predictions:", predictions)


Original Sentiment Distribution:
 Sentiment
positive                45
joy                     44
excitement              37
contentment             19
neutral                 18
                        ..
celestial wonder         1
nature's beauty          1
thrilling journey        1
whispers of the past     1
relief                   1
Name: count, Length: 191, dtype: int64
New Sentiment Distribution:
 Sentiment
positive    135
negative     29
neutral       9
Name: count, dtype: int64
Accuracy: 0.4000
Classification Report:
               precision    recall  f1-score   support

    negative       0.75      1.00      0.86         3
     neutral       0.17      1.00      0.29         4
    positive       1.00      0.25      0.40        28

    accuracy                           0.40        35
   macro avg       0.64      0.75      0.51        35
weighted avg       0.88      0.40      0.43        35

Predictions: ['positive' 'negative' 'positive']
