In [None]:
import sys
import os
# Ensure project root is on Python path so `src` can be imported from notebooks
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from src.preprocess import clean_text

In [3]:
# Load the correct emotions dataset file
df = pd.read_csv("../datasets/emotions/go_emotions_dataset.csv")
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

Dataset shape: (211225, 31)
Columns: ['id', 'text', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [4]:
# Preprocess: clean text and create emotion label
df["cleaned"] = df["text"].apply(clean_text)

# Create a single 'emotion' label from the one-hot emotion columns
exclude_cols = {'id', 'text', 'example_very_unclear'}
candidate_cols = [c for c in df.columns if c not in exclude_cols]
# Keep only numeric columns (the one-hot flags)
emotion_cols = [c for c in candidate_cols if pd.api.types.is_numeric_dtype(df[c])]
if not emotion_cols:
    raise RuntimeError('No numeric emotion columns found. Columns present: ' + ','.join(candidate_cols))
# Use idxmax to pick the emotion with the highest value; if no emotion flagged, set 'neutral'
df['emotion'] = df[emotion_cols].idxmax(axis=1)
no_label = df[emotion_cols].sum(axis=1) == 0
df.loc[no_label, 'emotion'] = 'neutral'
print(f"Emotion distribution:\n{df['emotion'].value_counts()}")

Emotion distribution:
emotion
neutral           58709
admiration        17131
approval          15530
annoyance         11929
disapproval        8917
amusement          8862
gratitude          8437
anger              7956
curiosity          7707
disappointment     6769
confusion          6600
love               5310
caring             5147
realization        5125
joy                5120
optimism           4994
excitement         4375
sadness            3863
surprise           3472
disgust            3420
desire             3002
fear               2514
embarrassment      1720
remorse            1648
nervousness         946
relief              814
pride               714
grief               494
Name: count, dtype: int64


In [5]:
# Vectorize and train model
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["cleaned"])
y = df["emotion"]

model = LogisticRegression(max_iter=1000)
model.fit(X, y)
print(f"Model trained. Classes: {model.classes_}")

Model trained. Classes: ['admiration' 'amusement' 'anger' 'annoyance' 'approval' 'caring'
 'confusion' 'curiosity' 'desire' 'disappointment' 'disapproval' 'disgust'
 'embarrassment' 'excitement' 'fear' 'gratitude' 'grief' 'joy' 'love'
 'nervousness' 'neutral' 'optimism' 'pride' 'realization' 'relief'
 'remorse' 'sadness' 'surprise']


In [6]:
# Save models
joblib.dump(model, "../models/emotion_model.pkl")
joblib.dump(tfidf, "../models/emotion_vectorizer.pkl")
print("✅ Emotion model and vectorizer saved.")

✅ Emotion model and vectorizer saved.
