In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import neattext.functions as nfx
import joblib
import warnings
warnings.filterwarnings('ignore')


In [2]:

# Load and preprocess data
def load_and_preprocess_data(file_path):
    """Load the emotion dataset from CSV and add a lightly cleaned text column.

    The file's own header row is discarded (``header=0``) and the columns are
    renamed to ``id``, ``text`` and ``emotion``. A ``clean_text`` column is
    then derived from ``text`` by lower-casing and stripping special
    characters and punctuation. Stopwords are intentionally kept so the
    emotional context of the sentence is retained.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame with the extra ``clean_text`` column, or ``None``
        when loading or validation fails (the reason is printed).
    """
    try:
        df = pd.read_csv(file_path, names=['id', 'text', 'emotion'], header=0)

        # Basic validation: refuse to continue on empty or incomplete data.
        if df.empty:
            raise ValueError("Dataset is empty.")
        if df['text'].isnull().any() or df['emotion'].isnull().any():
            raise ValueError("Dataset contains missing values.")

        # Coerce to str before cleaning: if pandas inferred a non-string dtype
        # for the text column (e.g. every entry looks numeric), the `.str`
        # accessor would raise and the whole load would be reported as failed.
        # Nulls were rejected above, so this never turns NaN into "nan".
        clean_text = df['text'].astype(str).str.lower()
        for cleaner in (nfx.remove_special_characters, nfx.remove_punctuations):
            clean_text = clean_text.apply(cleaner)
        df['clean_text'] = clean_text

        return df
    except Exception as e:
        # Best-effort loader: callers check for None instead of handling
        # exceptions, so report the problem and signal failure that way.
        print(f"Error loading data: {e}")
        return None

In [3]:
# Visualize emotion distribution
def plot_emotion_distribution(df, save_path='emotion_distribution.png'):
    """Save a bar chart of label counts and print the relative class shares.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``emotion`` column.
    save_path : str
        Where the figure is written; the figure is closed afterwards rather
        than shown.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x='emotion', data=df, ax=ax)
    ax.set_title('Distribution of Emotions')
    ax.set_xlabel('Emotion')
    ax.set_ylabel('Count')
    # `ax` is the current axes, so this rotates its tick labels.
    plt.xticks(rotation=45)
    fig.savefig(save_path)
    plt.close(fig)

    # Fraction of rows per emotion, most frequent first.
    class_shares = df['emotion'].value_counts(normalize=True)
    print("\nClass Distribution:")
    print(class_shares)

In [4]:
# Build and train model
def train_model(X, y):
    """Cross-validate and then fit a TF-IDF + random-forest text classifier.

    Parameters
    ----------
    X : iterable of str
        Training documents.
    y : array-like
        Target labels aligned with ``X``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The pipeline fitted on all of ``X`` / ``y``. The 5-fold macro-F1
        scores are only printed for reporting; they are not returned.
    """
    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    pipeline = Pipeline(steps=[('tfidf', vectorizer), ('clf', forest)])

    # Report generalisation estimate before the final fit.
    fold_scores = cross_val_score(pipeline, X, y, cv=5, scoring='f1_macro')
    mean_f1 = fold_scores.mean()
    spread = fold_scores.std() * 2  # two standard deviations
    print(f"\nCross-Validation F1-Macro Scores: {fold_scores}")
    print(f"Average CV F1-Macro: {mean_f1:.3f} (+/- {spread:.3f})")

    pipeline.fit(X, y)
    return pipeline


In [None]:
# Evaluate model
def evaluate_model(model, X_test, y_test, save_path='confusion_matrix.png'):
    """Print a classification report and save a confusion-matrix heatmap.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict`` and ``classes_``.
    X_test, y_test
        Held-out documents and their true labels.
    save_path : str
        Where the heatmap image is written; the figure is closed afterwards.
    """
    predicted = model.predict(X_test)

    print("\nClassification Report:")
    print(classification_report(y_test, predicted))

    # Use the model's own label order for both the matrix and the tick labels
    # so rows/columns and their captions cannot get out of sync.
    class_labels = model.classes_
    matrix = confusion_matrix(y_test, predicted, labels=class_labels)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    fig.savefig(save_path)
    plt.close(fig)


In [None]:
# Debug predictions with sample texts
def debug_predictions(model):
    """Print the model's predictions for a few hand-written sentences.

    Each sentence is passed through ``preprocess_text`` before prediction;
    the original, uncleaned sentence is what gets printed next to the label.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict`` for a list of strings.
    """
    sample_texts = [
        "I am so happy today!",
        "This is really frustrating.",
        "I feel so sad and alone.",
        "I love this so much!",
        "I'm terrified of what's happening.",
    ]
    cleaned_samples = list(map(preprocess_text, sample_texts))
    predicted_labels = model.predict(cleaned_samples)

    print("\nDebug Sample Predictions:")
    for raw_sentence, label in zip(sample_texts, predicted_labels):
        print(f"Text: {raw_sentence} => Predicted Emotion: {label}")
    

In [6]:

def main(file_path):
    """Run the whole workflow: load, plot, split, train, evaluate, save.

    Parameters
    ----------
    file_path : str
        CSV file handed to ``load_and_preprocess_data``. If loading fails
        (``None`` is returned) the function exits without doing anything else.

    Side effects: writes the plots produced by the helper functions and
    serialises the fitted model to ``model.pkl`` in the working directory.
    """
    df = load_and_preprocess_data(file_path)
    if df is None:
        return

    plot_emotion_distribution(df)

    # 80/20 split, stratified on the label so both parts keep the class mix.
    features, labels = df['clean_text'], df['emotion']
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42, stratify=labels
    )

    model = train_model(X_train, y_train)
    evaluate_model(model, X_test, y_test)
    debug_predictions(model)

    joblib.dump(model, 'model.pkl')
    print("\nModel saved as 'model.pkl'")


In [7]:
def preprocess_text(text):
    """Clean a single string the same way the training text is cleaned.

    Applies, in order: lower-casing, special-character removal and
    punctuation removal. Stopwords are deliberately left in place.

    Parameters
    ----------
    text : str
        Raw input sentence.

    Returns
    -------
    The cleaned text.
    """
    lowered = text.lower()
    return nfx.remove_punctuations(nfx.remove_special_characters(lowered))


In [None]:
# Entry point: run the full workflow on 'Emotion Dataset.csv'. The path is
# relative, so the file must be in the kernel's current working directory.
if __name__ == "__main__":
    main('Emotion Dataset.csv')


Class Distribution:
emotion
joy         0.338445
sadness     0.290749
anger       0.137514
fear        0.114470
love        0.082901
surprise    0.035921
Name: proportion, dtype: float64
