# BERT - Training on the dair-ai/emotion Dataset (Ran on Colab)

## Install requirements and import packages

In [2]:
!pip install pandas numpy scikit-learn torch transformers datasets
## If running locally - uncomment and run the below package install requirements.
"""
!pip install "transformers[torch]"
!pip install "accelerate>=0.26.0"
"""


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


'\n!pip install "transformers[torch]"\n!pip install "accelerate>=0.26.0"\n'

In [3]:
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, multilabel_confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


## Load the Dataset

In [6]:
# Fetch all splits of the emotion corpus.
dataset = load_dataset("dair-ai/emotion")

# The class names are read from the `label` feature of the training split;
# their count sizes the classification head later on.
train_features = dataset["train"].features
label_names = train_features["label"].names
num_labels = len(label_names)

print(label_names)

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


## Tokenizer and Data Preprocessing

In [7]:
# Tokenizer belonging to the same checkpoint that is fine-tuned further down.
TOKENIZER_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def preprocess(examples, max_length=128):
    """
    Tokenize a batch of text examples and attach their labels for PyTorch training.

    Args:
        examples (dict):
            A batch from a HuggingFace Dataset containing:
              - "text": List[str], raw input strings to be classified.
              - "label": List[int], integer class labels corresponding to each text.
        max_length (int, optional):
            Number of tokens every sequence is padded or truncated to.
            Defaults to 128. When mapping over a dataset it can be overridden
            with `dataset.map(preprocess, batched=True, fn_kwargs={"max_length": ...})`.

    Returns:
        The tokenizer's output for the batch with one extra key:
          - every key produced by the tokenizer (for example "input_ids" and
            "attention_mask"), each sequence padded/truncated to `max_length`.
          - "labels": List[int], the original integer labels.
    """
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    # Copy the targets over under the key "labels" (plural); the input column
    # is called "label".
    tokens["labels"] = examples["label"]
    return tokens

# Run `preprocess` batch-wise over every split of the dataset.
tokenized = dataset.map(preprocess, batched=True)

# Return only the model inputs and the targets, as PyTorch tensors.
model_columns = ["input_ids", "attention_mask", "labels"]
tokenized.set_format(type="torch", columns=model_columns)

# One handle per split, used for training, model selection and final testing.
train_ds, val_ds, test_ds = (
    tokenized[split_name] for split_name in ("train", "validation", "test")
)

Map: 100%|██████████| 16000/16000 [00:00<00:00, 27427.64 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 35817.99 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 36068.40 examples/s]


## Training the model

In [9]:
def compute_metrics(eval_pred):
    """
    Compute evaluation metrics for single-label (multi-class) classification.

    The predicted class of each sample is the argmax over its logits; no
    sigmoid or probability threshold is involved.

    Args:
        eval_pred (tuple):
            A tuple of (logits, labels)
            - logits: np.ndarray of shape (n_samples, num_labels)
              Raw outputs from the model's classification head. A tuple whose
              first element is that array is accepted as well.
            - labels: np.ndarray of shape (n_samples,)
              Ground-truth integer class ids.

    Returns:
        dict:
            {
                "accuracy": float,
                    The fraction of samples whose predicted class equals
                    the ground-truth class.
                "f1_macro": float,
                    The macro-averaged F1 score (unweighted mean of the
                    per-class F1 scores).
            }
    """
    logits, labels = eval_pred
    # Predictions may arrive as a tuple of arrays when a model returns more
    # than the logits; the logits are taken from the first position.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro")
    }

# Seed torch before the model is built: the classification head is newly
# initialised (see the "newly initialized" notice printed on load), so without
# a fixed seed each run starts from different head weights.
SEED = 42
torch.manual_seed(SEED)

# Load the pretrained encoder with a `num_labels`-way classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels
)

# Evaluate and checkpoint once per epoch so that the checkpoint with the best
# validation macro-F1 can be restored when training ends.
training_args = TrainingArguments(
    output_dir="./bert_emotion",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,  # F1 is a score, not a loss: higher is better
    logging_steps=50,
    report_to="none",
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated in recent transformers releases;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
)

trainer.train()

# Report metrics on the validation split, then on the held-out test split.
print("Validation Metrics:", trainer.evaluate())
print("Test Metrics      :", trainer.predict(test_ds).metrics)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.1823,0.168896,0.9295,0.904892




Validation Metrics: {'eval_loss': 0.16889630258083344, 'eval_accuracy': 0.9295, 'eval_f1_macro': 0.9048923431164505, 'eval_runtime': 15.6894, 'eval_samples_per_second': 127.474, 'eval_steps_per_second': 4.015, 'epoch': 1.0}




Test Metrics      : {'test_loss': 0.19651179015636444, 'test_accuracy': 0.9215, 'test_f1_macro': 0.8830407173564044, 'test_runtime': 15.7298, 'test_samples_per_second': 127.147, 'test_steps_per_second': 4.005}


## Results

In [14]:
from sklearn.metrics import confusion_matrix
import pandas as pd

# Run inference on the device the model already lives on. Picking a device
# independently of the model (e.g. "mps"/"cpu") fails with a device-mismatch
# error when the model sits on a CUDA GPU.
device = model.device
model.eval()

ds = load_dataset("dair-ai/emotion", split="test")

# Explicit label ids keep every report row/column aligned with `label_names`
# even if some class never occurs in `y_true` or `y_pred`.
label_ids = list(range(len(label_names)))

EVAL_BATCH_SIZE = 32

y_true, y_pred = [], []
for start in range(0, len(ds), EVAL_BATCH_SIZE):
    # Slicing a Dataset yields a dict of column -> list for that row range.
    batch = ds[start:start + EVAL_BATCH_SIZE]
    enc = tokenizer(
        batch["text"],
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    ).to(device)

    with torch.no_grad():
        logits = model(**enc).logits

    # Single-label prediction: softmax is monotonic, so the argmax over the
    # logits is the same class as the argmax over the probabilities.
    y_pred.extend(logits.argmax(dim=-1).cpu().tolist())
    y_true.extend(batch["label"])

y_true = np.array(y_true)
y_pred = np.array(y_pred)

# Overall accuracy
acc = accuracy_score(y_true, y_pred)
print(f"Overall Accuracy: {acc:.4f}\n")

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(
    y_true,
    y_pred,
    labels=label_ids,
    target_names=label_names,
    zero_division=0
))

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true, y_pred, labels=label_ids)

# One-vs-rest counts per class, derived from the matrix in one shot.
tp = np.diag(cm)
fp = cm.sum(axis=0) - tp   # predicted as the class, but actually another
fn = cm.sum(axis=1) - tp   # actually the class, but predicted as another
tn = cm.sum() - (tp + fp + fn)

# Display
df_conf_stats = pd.DataFrame(
    {"TN": tn, "FP": fp, "FN": fn, "TP": tp},
    index=pd.Index(label_names, name="Class"),
)
display(df_conf_stats)

Overall Accuracy: 0.9215

Classification Report:
              precision    recall  f1-score   support

     sadness       0.96      0.96      0.96       581
         joy       0.96      0.93      0.94       695
        love       0.78      0.89      0.83       159
       anger       0.92      0.90      0.91       275
        fear       0.88      0.89      0.89       224
    surprise       0.72      0.82      0.77        66

    accuracy                           0.92      2000
   macro avg       0.87      0.90      0.88      2000
weighted avg       0.92      0.92      0.92      2000



Unnamed: 0_level_0,TN,FP,FN,TP
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
sadness,1397,22,26,555
joy,1278,27,50,645
love,1801,40,18,141
anger,1704,21,27,248
fear,1750,26,24,200
surprise,1913,21,12,54


## Testing on Text

In [15]:
# Put the model in evaluation mode before running the ad-hoc predictions below.
model.eval()

def predict_emotion_local(text: str):
    """
    Predict the most likely emotion for a given piece of text.

    The input string is tokenized (padded/truncated to 128 tokens), passed
    through the fine-tuned model, and the logits are turned into class
    probabilities with a softmax. The most probable label is returned
    together with its probability.

    Args:
        text (str):
            A single input string to classify.

    Returns:
        tuple[str, float]:
            - emotion: The name of the predicted emotion label (from `label_names`).
            - confidence: The softmax probability of the predicted label,
              as a plain Python float in [0.0, 1.0].
    """
    # Send the inputs to the device the model is on, rather than relying on a
    # `device` global from another cell that may not match the model.
    target_device = model.device

    enc = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    ).to(target_device)

    with torch.no_grad():
        logits = model(**enc).logits          # shape (1, num_labels)
        probs = torch.softmax(logits, dim=-1)[0].cpu()

    best_id = int(probs.argmax())
    # float(...) converts the 0-d tensor into the Python float the docstring promises.
    return label_names[best_id], float(probs[best_id])

# A few hand-written sentences as a quick smoke test of the classifier.
sample_texts = [
    "I just got my dream job—feeling on top of the world!",
    "Why does everything always go wrong for me?",
    "That plot twist in the movie made me jump!",
    "I can not wait to go home",
]

for txt in sample_texts:
    emo, conf = predict_emotion_local(txt)
    # Echo the input, then the predicted label with its confidence as a percentage.
    print(f"> {txt}\n→ {emo} ({conf:.1%})\n")

> I just got my dream job—feeling on top of the world!
→ joy (99.3%)

> Why does everything always go wrong for me?
→ anger (79.2%)

> That plot twist in the movie made me jump!
→ surprise (59.1%)

> I can not wait to go home
→ joy (65.8%)

