In [None]:
import pandas as pd

# Read the emotion-annotated corpus from disk
df = pd.read_csv("eng.csv")

# Number of positive examples per emotion label (shows how imbalanced the classes are)
label_counts = df[["Anger", "Fear", "Joy", "Sadness", "Surprise"]].sum()
print(label_counts)

Anger        333
Fear        1611
Joy          674
Sadness      878
Surprise     839
dtype: int64


In [None]:
import re

def clean_text(text):
    """Normalize a raw text value before vectorization.

    The value is lower-cased, stripped of punctuation and digits, and runs of
    whitespace are collapsed to a single space.

    Args:
        text: The raw value. Non-string values are converted with ``str``;
            missing values (``None`` or a float NaN) are treated as empty text.

    Returns:
        The cleaned string with no leading or trailing whitespace.
    """
    # Without this guard a missing value is stringified to the literal token
    # "none"/"nan" and ends up in the vocabulary. NaN is the only float that
    # does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)  # Drop punctuation (underscore counts as \w, so it stays)
    text = re.sub(r'\d+', '', text)      # Drop digits
    text = re.sub(r'\s+', ' ', text)     # Collapse whitespace runs
    return text.strip()

# Run the cleaner over every raw text entry and keep the result in a new column
df["cleaned_text"] = df["text"].map(clean_text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Emotion columns used as the multi-label targets
label_columns = ["Anger", "Fear", "Joy", "Sadness", "Surprise"]

# TF-IDF features, capped at 5000 terms to limit overfitting.
# NOTE: the vectorizer is fit here on every row of df, i.e. on the whole corpus.
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["cleaned_text"])
y = df[label_columns]

In [None]:
from sklearn.model_selection import train_test_split

# Split the cleaned text (rather than the pre-computed matrix X) so that the
# vectorizer can be refit on the training rows only. Fitting TF-IDF on the
# full corpus lets test documents influence the vocabulary and IDF weights,
# which leaks test information into training.
text_train, text_test, y_train, y_test = train_test_split(
    df["cleaned_text"], y, test_size=0.2, random_state=42
)

# Learn vocabulary and IDF weights from the training split, then apply that
# fitted transform unchanged to the held-out split.
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

# One independent logistic-regression classifier is fit per emotion column
base_classifier = LogisticRegression(max_iter=1000)
model = MultiOutputClassifier(base_classifier)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import hamming_loss, classification_report

y_pred = model.predict(X_test)

# Hamming Loss (lower is better)
print("Hamming Loss:", hamming_loss(y_test, y_pred))

# Detailed per-label metrics. zero_division=0 states explicitly that a label
# with no predicted positives scores 0, which avoids the repeated
# undefined-metric warnings the report otherwise emits for such labels.
print(
    classification_report(
        y_test,
        y_pred,
        target_names=list(y.columns),
        zero_division=0,
    )
)

Hamming Loss: 0.2563176895306859
              precision    recall  f1-score   support

       Anger       0.00      0.00      0.00        61
        Fear       0.62      0.89      0.73       314
         Joy       0.67      0.01      0.03       134
     Sadness       0.65      0.09      0.15       171
    Surprise       0.75      0.22      0.34       172

   micro avg       0.63      0.39      0.49       852
   macro avg       0.54      0.24      0.25       852
weighted avg       0.61      0.39      0.37       852
 samples avg       0.51      0.36      0.40       852



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Push one hand-written sentence through the same clean -> vectorize -> predict path
sample_text = "I felt a sharp pain in my chest and couldn't breathe."
cleaned_sample = clean_text(sample_text)
sample_tfidf = tfidf.transform([cleaned_sample])
prediction = model.predict(sample_tfidf)

# Report every emotion whose label was predicted as 1 for the sample
emotions = y.columns
predicted_emotions = [
    label for label, flag in zip(emotions, prediction[0]) if flag == 1
]
for label in predicted_emotions:
    print(f"Predicted Emotion: {label}")

Predicted Emotion: Fear
Predicted Emotion: Sadness


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Pretrained BERT tokenizer plus a sequence-classification model with five
# outputs (one per emotion), configured for multi-label targets.
# NOTE: this rebinds `model`, replacing the MultiOutputClassifier assigned earlier.
BERT_CHECKPOINT = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(
    BERT_CHECKPOINT,
    problem_type="multi_label_classification",
    num_labels=5,  # 5 emotions
)
# Still to do: fine-tune on your dataset (requires GPU)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
