In [11]:
import numpy as np
import matplotlib.pyplot as plt
import json
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
import joblib

In [9]:
import matplotlib

# Record the versions of the main libraries this notebook was run with,
# so the results below can be reproduced in a matching environment.
for lib_name, lib_module in (
    ("NumPy", np),
    ("Matplotlib", matplotlib),
    ("Scikit-learn", sklearn),
    ("Joblib", joblib),
):
    print(f"{lib_name} Version:", lib_module.__version__)

NumPy Version: 1.26.4
Matplotlib Version: 3.8.3
Scikit-learn Version: 1.4.1.post1
Joblib Version: 1.3.2


In [10]:
# Load the "split" configuration of the dair-ai/emotion dataset; the cells
# below read its 'train', 'validation' and 'test' entries.
from datasets import load_dataset

dataset = load_dataset(path="dair-ai/emotion", name="split")

In [13]:
# Handles to the three splits of the loaded dataset.
train, val, test = (dataset[split_key] for split_key in ('train', 'validation', 'test'))

# Emotion names indexed by integer label id: get_tweet looks up
# label_names[example['label']], so the order of this list matters.
label_names = [
    'sadness',
    'joy',
    'love',
    'anger',
    'fear',
    'surprise',
]

def get_tweet(data, names=None):
    """Split an iterable of examples into parallel lists of texts and emotion names.

    Args:
        data: Iterable of mappings, each providing a 'text' entry and an
            integer 'label' entry. It is consumed exactly once, so one-shot
            iterators and generators are supported.
        names: Optional sequence translating a label id into its name.
            When omitted, the notebook-level ``label_names`` list is used,
            which matches the original behaviour.

    Returns:
        A ``(tweets, labels)`` tuple of two equally long lists: the raw texts
        and the corresponding emotion names, in input order.

    Raises:
        KeyError: If an example lacks a 'text' or 'label' entry.
        IndexError: If a label id falls outside ``names``.
    """
    if names is None:
        names = label_names

    tweets, labels = [], []
    # Single pass: iterating twice would decode every example twice and would
    # silently return an empty label list for a one-shot iterable.
    for example in data:
        tweets.append(example['text'])
        labels.append(names[example['label']])
    return tweets, labels

In [14]:
# Display the emotion names; a name's position in this list is the integer
# label id that get_tweet translates into that name.
label_names

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

In [15]:
# Raw texts and emotion names for the training and validation splits
train_tweets, train_labels = get_tweet(train)
val_tweets, val_labels = get_tweet(val)

# Targets: turn emotion names into integer class ids. The encoder is fitted on
# the training labels and then reused, unchanged, for the validation labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_labels)
y_val = label_encoder.transform(val_labels)

# Features: TF-IDF vectorization (used here instead of a Tokenizer), limited to
# 1000 features with English stop words removed. The vectorizer is fitted on
# the training tweets only; validation tweets are transformed with that same
# fit. Both matrices are converted to dense arrays.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train = vectorizer.fit_transform(train_tweets).toarray()
X_val = vectorizer.transform(val_tweets).toarray()

# Persist the fitted vectorizer and label encoder next to the notebook so that
# text can later be preprocessed the same way as during training.
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

### 1. Logistic Regression

In [16]:
# Fit a logistic-regression classifier on the dense TF-IDF training features
# (X_train) and the encoded emotion labels (y_train). max_iter=1000 is the
# iteration limit handed to the solver.
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)

### 2. Support Vector Machine (SVM)

In [17]:
# Linear kernel works well for text data.
# probability=True makes scikit-learn fit probability estimates using an
# internal, randomly shuffled cross-validation; random_state pins that
# shuffling so refitting (and the model saved later) is reproducible. The seed
# matches the one used for the random-forest and gradient-boosting models.
model_svm = SVC(kernel='linear', probability=True, random_state=42)
model_svm.fit(X_train, y_train)

### 3. Random Forest Classifier

In [18]:
# Fit a random forest of 100 trees on the TF-IDF training features and encoded
# labels; random_state=42 fixes the seed so repeated runs build the same forest.
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

### 4. Gradient Boosting Classifier

In [19]:
# Fit a gradient-boosting classifier with 100 boosting stages and a learning
# rate of 0.1 on the same training data; random_state=42 fixes the seed.
model_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
model_gb.fit(X_train, y_train)

### 5. K-Nearest Neighbors (KNN)

In [20]:
# Fit a k-nearest-neighbours classifier that votes among the 5 nearest
# training samples. No distance metric is passed, so the estimator's default
# is used on the TF-IDF features.
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_train, y_train)

### 6. Multinomial Naive Bayes

In [22]:
# Train a multinomial Naive Bayes model with its default settings on the
# TF-IDF training features (X_train) and encoded emotion labels (y_train).
model_mnb = MultinomialNB()
model_mnb.fit(X_train, y_train)

## Model Evaluation

In [24]:
# Validation accuracy of every fitted classifier, reported in training order.
# Keys are the display names used in the printed report.
fitted_models = {
    "Logistic Regression": model_lr,
    "Support Vector Machine (SVM)": model_svm,
    "Random Forest Classifier": model_rf,
    "Gradient Boosting Classifier": model_gb,
    "K-Nearest Neighbors (KNN)": model_knn,
    "Multinomial Naive Bayes": model_mnb,
}

for model_title, fitted_model in fitted_models.items():
    # .score() returns mean accuracy on the given features/labels
    accuracy = fitted_model.score(X_val, y_val)
    print(f"Validation Accuracy {model_title}: {accuracy:.4f}")

Validation Accuracy Logistic Regression: 0.8610
Validation Accuracy Support Vector Machine (SVM): 0.8615
Validation Accuracy Random Forest Classifier: 0.8610
Validation Accuracy Gradient Boosting Classifier: 0.8525
Validation Accuracy K-Nearest Neighbors (KNN): 0.5515
Validation Accuracy Multinomial Naive Bayes: 0.8225


In [25]:
# Save the trained SVM to disk with joblib. Only this model is persisted,
# presumably because it had the highest validation accuracy in the recorded
# run; confirm before relying on that choice. The returned list of written
# file names is shown as the cell output.
joblib.dump(model_svm, "model_textSVM.pkl")

['model_textSVM.pkl']

In [28]:
# Function to predict emotion
def predict_text_emotion(text):
    """Predict the emotion name for a single piece of text.

    Relies on the notebook-level fitted objects ``vectorizer``, ``model_svm``
    and ``label_encoder``.

    Args:
        text: The text to classify. It is wrapped in a one-element list
            before being passed to the vectorizer.

    Returns:
        The decoded label for ``text``: the first (and only) element produced
        by ``label_encoder.inverse_transform`` on the SVM's prediction.
    """
    # Vectorize as a batch of one and densify, matching how the training
    # features were built (dense arrays).
    dense_features = vectorizer.transform([text]).toarray()
    class_ids = model_svm.predict(dense_features)
    # Map the predicted class id back to its emotion name.
    return label_encoder.inverse_transform(class_ids)[0]

# Example prediction: classify one sample sentence and print the emotion name
example_text = "I am angry!"
print(predict_text_emotion(example_text))

anger
