In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import neattext.functions as nfx
import joblib
import warnings
warnings.filterwarnings('ignore')


In [9]:
# Load and preprocess data
def load_and_preprocess_data(file_path):
    """Read the emotion CSV and add a ``clean_text`` column.

    The file's own header row is discarded (``header=0``) and the columns are
    renamed to ``id``, ``text`` and ``emotion``.

    Rows with a missing ``text`` or ``emotion`` value are dropped: pandas
    parses an empty CSV field as float NaN, which the string-cleaning helpers
    cannot process, and a row without a label cannot be used for training.
    The surviving rows keep their original index values.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        DataFrame with columns ``id``, ``text``, ``emotion`` and
        ``clean_text``. ``clean_text`` is ``text`` after stopword removal,
        punctuation removal and special-character removal, in that order.
    """
    raw = pd.read_csv(file_path, names=['id', 'text', 'emotion'], header=0)

    # .copy() so the column assignment below writes to an independent frame.
    df = raw.dropna(subset=['text', 'emotion']).copy()

    # Coerce to str so a value pandas inferred as numeric is still cleanable.
    cleaned = df['text'].astype(str)
    # Keep this order in sync with any code that cleans text for prediction.
    for clean_step in (
        nfx.remove_stopwords,
        nfx.remove_punctuations,
        nfx.remove_special_characters,
    ):
        cleaned = cleaned.apply(clean_step)
    df['clean_text'] = cleaned

    return df


In [10]:
# Visualize emotion distribution
def plot_emotion_distribution(df, save_path='emotion_distribution.png'):
    """Write a bar chart of per-emotion row counts to ``save_path``.

    Args:
        df: DataFrame with an ``emotion`` column.
        save_path: Destination image file for the figure.

    Returns:
        None. The figure is saved to disk and then closed, so nothing is
        rendered inline.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x='emotion', data=df, ax=ax)

    ax.set_title('Distribution of Emotions')
    ax.set_xlabel('Emotion')
    ax.set_ylabel('Count')
    # Tilt the category labels, as the x tick labels are the emotion names.
    plt.setp(ax.get_xticklabels(), rotation=45)

    fig.savefig(save_path)
    plt.close(fig)

In [None]:
# Build and train model
def train_model(X, y, max_features=5000, max_iter=1000):
    """Fit a TF-IDF + SVC pipeline and return it.

    Args:
        X: Training documents, passed unchanged to ``Pipeline.fit``.
        y: Training labels, passed unchanged to ``Pipeline.fit``.
        max_features: Vocabulary size limit given to ``TfidfVectorizer``.
        max_iter: Hard limit on SVC solver iterations. The default of 1000
            bounds training time but can stop the solver before it has
            converged; scikit-learn uses ``-1`` to mean "no limit".

    Returns:
        The fitted ``Pipeline`` with steps ``'tfidf'`` and ``'clf'``.
    """
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=max_features)),
        ('clf', SVC(decision_function_shape='ovo', max_iter=max_iter)),
    ])

    pipeline.fit(X, y)

    # SVC records fit_status_ == 1 when the solver stopped early. It also emits
    # a ConvergenceWarning, but this file silences all warnings, so report the
    # truncated fit explicitly instead of letting it pass unnoticed.
    if getattr(pipeline.named_steps['clf'], 'fit_status_', 0) == 1:
        print(
            f"Warning: SVC stopped at max_iter={max_iter} before converging; "
            "reported metrics may understate what the model can achieve."
        )

    return pipeline


In [16]:
# Evaluate model
def evaluate_model(model, X_test, y_test, save_path='confusion_matrix.png'):
    """Print a classification report and save a confusion-matrix heatmap.

    Args:
        model: Fitted estimator exposing ``predict`` and ``classes_``.
        X_test: Held-out inputs, passed unchanged to ``model.predict``.
        y_test: True labels for ``X_test``.
        save_path: Destination image file for the heatmap.

    Returns:
        None. The report goes to stdout; the figure is saved and closed.
    """
    predicted = model.predict(X_test)

    # Text summary of per-class precision / recall / F1.
    print("\nClassification Report:")
    print(classification_report(y_test, predicted))

    # Fix row/column order to the model's class order so the tick labels
    # below line up with the matrix cells.
    class_names = model.classes_
    matrix = confusion_matrix(y_test, predicted, labels=class_names)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

    fig.savefig(save_path)
    plt.close(fig)


In [17]:
def main(file_path, model_path='emotion_detection_model.pkl'):
    """Run the whole workflow: load, plot, split, train, evaluate, save, demo.

    Args:
        file_path: CSV file handed to ``load_and_preprocess_data``.
        model_path: Where the fitted pipeline is written with ``joblib.dump``.

    Returns:
        None. Side effects are the saved plots, the saved model file and the
        report / sample predictions printed to stdout.
    """
    # Load and preprocess data
    df = load_and_preprocess_data(file_path)

    # Plot emotion distribution
    plot_emotion_distribution(df)

    # Split data: 80% train / 20% test, fixed seed for a repeatable split
    X_train, X_test, y_train, y_test = train_test_split(
        df['clean_text'], df['emotion'], test_size=0.2, random_state=42
    )

    # Train model
    model = train_model(X_train, y_train)

    # Evaluate model
    evaluate_model(model, X_test, y_test)

    # Save model
    joblib.dump(model, model_path)

    # Example prediction
    sample_text = ["I am so happy today!", "This is really frustrating."]
    # Clean in the same order load_and_preprocess_data uses for training text
    # (stopwords -> punctuation -> special characters). A different order can
    # leave different tokens, e.g. "it," is only recognised as the stopword
    # "it" once its comma has been stripped.
    cleaned_samples = [
        nfx.remove_special_characters(
            nfx.remove_punctuations(nfx.remove_stopwords(text))
        )
        for text in sample_text
    ]
    predictions = model.predict(cleaned_samples)

    print("\nSample Predictions:")
    for text, pred in zip(sample_text, predictions):
        print(f"Text: {text} => Predicted Emotion: {pred}")


In [18]:

if __name__ == "__main__":
    # Entry point: run the full workflow on the dataset CSV.
    # Replace 'Emotion Dataset.csv' with the path to your own CSV file if it differs.
    main('Emotion Dataset.csv')


Classification Report:
              precision    recall  f1-score   support

       anger       0.56      0.93      0.70     11339
        fear       0.42      0.88      0.57      9376
         joy       0.78      0.84      0.81     28247
        love       0.82      0.49      0.61      6853
     sadness       0.96      0.32      0.48     24504
    surprise       0.75      0.62      0.68      3043

    accuracy                           0.67     83362
   macro avg       0.72      0.68      0.64     83362
weighted avg       0.77      0.67      0.65     83362


Sample Predictions:
Text: I am so happy today! => Predicted Emotion: joy
Text: This is really frustrating. => Predicted Emotion: anger
