# In [None]:
# Task 2: Multi-Label Emotion Recognition
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, hamming_loss
import torch
import numpy as np

# Step 1: Load Dataset
dataset = load_dataset("go_emotions")
# Class names declared by the `labels` feature of the train split; their
# count determines the size of the model's output layer.
label_feature = dataset['train'].features['labels']
labels = label_feature.feature.names

# Step 2: Preprocessing
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def tokenize(batch):
    """Tokenize the ``text`` column of a batch of examples.

    Inputs are truncated to the tokenizer's maximum length but not padded
    here: padding inside ``map`` only pads to the longest text of each map
    chunk, which wastes space and still leaves chunks with different lengths.
    The Trainer is given the tokenizer, so each batch is padded when collated.

    Args:
        batch: Mapping with a ``'text'`` key holding a list of strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch['text'], truncation=True)


def encode_labels(batch):
    """Turn each example's list of class ids into a multi-hot float vector.

    The model is configured for multi-label classification, which needs one
    fixed-width float target per example (one slot per class) instead of a
    variable-length list of class ids.

    Args:
        batch: Mapping with a ``'labels'`` key holding, per example, a list of
            integer class ids.

    Returns:
        ``{'multi_hot': [...]}`` with one list of ``len(labels)`` floats per
        example: 1.0 at every listed class id, 0.0 elsewhere.
    """
    num_labels = len(labels)
    vectors = []
    for label_ids in batch['labels']:
        vector = [0.0] * num_labels
        for label_id in label_ids:
            vector[label_id] = 1.0
        vectors.append(vector)
    return {'multi_hot': vectors}


dataset = dataset.map(tokenize, batched=True)
# Write the targets to a new column and swap it in afterwards: storing floats
# straight into the existing integer `labels` column may get them cast back to
# that column's original integer type.
dataset = dataset.map(encode_labels, batched=True, remove_columns=['labels'])
dataset = dataset.rename_column('multi_hot', 'labels')
dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Step 3: Model Definition
# Pretrained BERT encoder with a classification head sized to the number of
# emotion classes and configured for multi-label classification.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(labels),
)

# Step 4: Metrics
def compute_metrics(p):
    """Score thresholded sigmoid predictions against the reference labels.

    ``p.predictions`` is passed through a sigmoid and binarized at 0.5, then
    compared with ``p.label_ids``.

    Returns:
        Dict with the micro-averaged F1 score under ``'f1'`` and the Hamming
        loss under ``'hamming_loss'``.
    """
    logits = torch.tensor(p.predictions)
    probabilities = torch.sigmoid(logits).numpy()
    predicted = probabilities > 0.5
    reference = p.label_ids

    micro_f1 = f1_score(reference, predicted, average='micro')
    hamming = hamming_loss(reference, predicted)
    return {'f1': micro_f1, 'hamming_loss': hamming}

# Step 5: Trainer
args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_steps=100,
    save_steps=500,
    save_total_limit=1,
)

# Small subsets: 1000 shuffled training examples (fixed seed) and the first
# 200 test examples.
train_subset = dataset['train'].shuffle(seed=42).select(range(1000))
eval_subset = dataset['test'].select(range(200))

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, then run a final evaluation pass and show its metrics.
trainer.train()
metrics = trainer.evaluate()
print(metrics)
