In [None]:
# Install dependencies (run this in the same Colab cell before the rest of the script)
!pip install datasets transformers torch scikit-learn numpy pandas tqdm joblib

"""
roberta_emotion.py

Multi‑label emotion classification on GoEmotions using RoBERTa.

Usage:
    (In Colab you can just run this cell)

Prerequisites:
    pip install datasets transformers torch scikit-learn numpy pandas tqdm joblib
"""

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

'\nroberta_emotion.py\n\nMulti‑label emotion classification on GoEmotions using RoBERTa.\n\nUsage:\n    (In Colab you can just run this cell)\n\nPrerequisites:\n    pip install datasets transformers torch scikit-learn numpy pandas tqdm joblib\n'

In [None]:
#!/usr/bin/env python3
"""
roberta_emotion.py

Multi‑label emotion classification on GoEmotions using RoBERTa.

Usage:
    python roberta_emotion.py

Prerequisites:
    pip install datasets transformers torch scikit-learn numpy pandas tqdm joblib
"""

import os
import time
import joblib
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from datasets import load_dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, f1_score, precision_recall_curve
from tqdm import tqdm

# Set device
# Prefer the GPU when CUDA is available and fall back to the CPU otherwise.
# `device` is a module-level name that the training, evaluation and prediction
# functions in this file read when moving batches and the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


class GoEmotionsDataset(Dataset):
    """Map-style dataset that pairs raw texts with multi-label target vectors.

    Tokenization happens lazily in ``__getitem__``; nothing is pre-computed.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of per-sample label vectors, aligned with
            ``texts`` by position.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True, return_tensors="pt")``. It must return a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors that carry a
            leading batch dimension of size 1.
        max_length: Length every sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer
            output without its batch dimension) and ``"labels"`` (the label
            vector as a float32 tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            # squeeze(0) removes only the batch dimension added by the
            # tokenizer. A bare squeeze() would also drop the sequence
            # dimension when max_length == 1 and yield a 0-d tensor.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            # Explicit dtype instead of the legacy torch.FloatTensor constructor.
            "labels": torch.as_tensor(self.labels[idx], dtype=torch.float32),
        }


def load_and_preprocess():
    """Fetch GoEmotions and turn its label-id lists into multi-hot matrices.

    Returns:
        Tuple ``(train_texts, y_train, val_texts, y_val, test_texts, y_test,
        labels, mlb)`` where the ``*_texts`` entries are the "text" columns of
        the train/validation/test splits, the ``y_*`` entries are the
        binarized label matrices, ``labels`` is the list of label names taken
        from the train split's features, and ``mlb`` is the
        MultiLabelBinarizer fitted on the training labels.
    """
    began = time.time()
    print("Loading dataset...")

    # Load GoEmotions and read the label vocabulary off the train split
    dataset = load_dataset("go_emotions")
    labels = dataset["train"].features["labels"].feature.names

    # Pull the text / label-id columns out of each split
    train_split, val_split, test_split = (
        dataset["train"], dataset["validation"], dataset["test"]
    )
    train_texts, val_texts, test_texts = (
        train_split["text"], val_split["text"], test_split["text"]
    )

    # One column per label id, in label-id order
    mlb = MultiLabelBinarizer(classes=list(range(len(labels))))
    y_train = mlb.fit_transform(train_split["labels"])
    y_val = mlb.transform(val_split["labels"])
    y_test = mlb.transform(test_split["labels"])

    elapsed = time.time() - began
    print(f"Dataset loaded and preprocessed in {elapsed:.2f} seconds")
    n_train, n_val, n_test = len(train_texts), len(val_texts), len(test_texts)
    print(
        f"Train samples: {n_train}, Validation samples: {n_val}, Test samples: {n_test}")

    return train_texts, y_train, val_texts, y_val, test_texts, y_test, labels, mlb


def create_dataloaders(train_texts, y_train, val_texts, y_val, test_texts, y_test, tokenizer,
                       batch_size=16):
    """Wrap the three splits in GoEmotionsDataset objects and DataLoaders.

    Only the training loader shuffles; validation and test keep dataset order.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``.
    """
    began = time.time()
    print("Creating dataloaders...")

    # (texts, targets, shuffle) for train / validation / test, in that order
    split_specs = (
        (train_texts, y_train, True),
        (val_texts, y_val, False),
        (test_texts, y_test, False),
    )
    train_dataloader, val_dataloader, test_dataloader = (
        DataLoader(
            GoEmotionsDataset(texts, targets, tokenizer),
            batch_size=batch_size,
            shuffle=shuffle,
        )
        for texts, targets, shuffle in split_specs
    )

    print(f"Dataloaders created in {time.time() - began:.2f} seconds")

    return train_dataloader, val_dataloader, test_dataloader


def train_epoch(model, dataloader, optimizer, scheduler, epoch):
    """Run one optimisation pass over ``dataloader``.

    For every batch: forward pass with labels (the model computes the loss),
    backward pass, gradient-norm clipping at 1.0, then one optimizer step and
    one scheduler step.

    Returns:
        The mean of the per-batch losses over the epoch.
    """
    model.train()
    running_loss = 0

    batches = tqdm(dataloader, desc=f"Epoch {epoch}")
    for batch in batches:
        optimizer.zero_grad()

        result = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device),
        )
        loss = result.loss

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Read the scalar once and reuse it for the total and the progress bar
        batch_loss = loss.item()
        running_loss += batch_loss
        batches.set_postfix({"loss": f"{batch_loss:.4f}"})

    return running_loss / len(dataloader)


def evaluate(model, dataloader):
    """Collect the model's raw logits and the reference labels for a loader.

    The model is switched to eval mode and run without gradient tracking.
    No sigmoid or thresholding is applied here.

    Returns:
        Tuple ``(all_preds, all_labels)`` of NumPy arrays, each built by
        stacking the per-batch arrays vertically in loader order.
    """
    model.eval()
    logit_chunks, label_chunks = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            targets = batch["labels"].to(device)
            result = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )

            logit_chunks.append(result.logits.cpu().numpy())
            label_chunks.append(targets.cpu().numpy())

    return np.vstack(logit_chunks), np.vstack(label_chunks)


def find_optimal_thresholds(val_preds, val_labels):
    """Pick, for each label column, the score threshold that maximises F1.

    Args:
        val_preds: 2-D array of scores, one column per label. The thresholds
            are in the same units as these scores (``train_model`` in this
            file passes the raw logits returned by ``evaluate``).
        val_labels: 2-D array of binary ground-truth labels with the same
            column layout as ``val_preds``.

    Returns:
        1-D NumPy array with one threshold per label column.
    """
    thresholds = []

    for i in range(val_labels.shape[1]):
        precision, recall, thresh = precision_recall_curve(val_labels[:, i], val_preds[:, i])

        # precision/recall have one more entry than thresh: the final point
        # (precision=1, recall=0) has no associated threshold. Drop it so every
        # candidate index is a valid index into thresh.
        precision, recall = precision[:-1], recall[:-1]
        f1 = 2 * precision * recall / (precision + recall + 1e-8)

        # Treat undefined F1 values as 0 so argmax never lands on a NaN
        # (NOTE(review): presumably only possible for a label with no positive
        # samples, depending on the scikit-learn version — confirm).
        optimal_idx = np.argmax(np.nan_to_num(f1, nan=0.0))
        thresholds.append(thresh[optimal_idx])

    return np.array(thresholds)


def train_model(train_dataloader, val_dataloader, test_dataloader, num_labels, epochs=4):
    """Fine-tune ``roberta-base`` for multi-label classification.

    After each epoch the model is scored on the validation loader with
    per-label thresholds tuned on that same validation output. The weights and
    thresholds of the epoch with the highest validation micro-F1 are kept,
    restored at the end, and used for the final test-set evaluation.

    Args:
        train_dataloader: Loader used for optimisation.
        val_dataloader: Loader used for threshold tuning and model selection.
        test_dataloader: Loader used once, after training, for reporting.
        num_labels: Size of the classification head.
        epochs: Number of passes over ``train_dataloader`` (at least 1).

    Returns:
        Tuple ``(model, best_thresholds)``: the model carrying the best
        epoch's weights and the thresholds tuned at that epoch.

    Raises:
        ValueError: If ``epochs`` is less than 1.
    """
    # Function-scope import: only the snapshot below needs it.
    import copy

    if epochs < 1:
        raise ValueError(f"epochs must be at least 1, got {epochs}")

    # Initialize model
    model_name = "roberta-base"
    model = RobertaForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        problem_type="multi_label_classification"
    )
    model.to(device)

    # Training parameters
    optimizer = AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # Start below any reachable F1 so the first epoch always produces a
    # snapshot, even if its micro-F1 is exactly 0.
    best_f1 = -1.0
    best_model = None
    best_thresholds = None

    start_time = time.time()
    print(f"Starting training for {epochs} epochs...")

    for epoch in range(1, epochs + 1):
        epoch_start = time.time()

        # Train
        train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, epoch)

        # Validate
        val_preds, val_labels = evaluate(model, val_dataloader)
        thresholds = find_optimal_thresholds(val_preds, val_labels)
        val_binary_preds = (val_preds >= thresholds).astype(int)

        # Calculate metrics
        val_micro_f1 = f1_score(val_labels, val_binary_preds, average="micro")
        val_macro_f1 = f1_score(val_labels, val_binary_preds, average="macro")

        epoch_time = time.time() - epoch_start

        print(f"Epoch {epoch}: Train Loss = {train_loss:.4f}, Val Micro-F1 = {val_micro_f1:.4f}, "
              f"Val Macro-F1 = {val_macro_f1:.4f}, Time = {epoch_time:.2f}s")

        # Save best model
        if val_micro_f1 > best_f1:
            best_f1 = val_micro_f1
            # Deep copy: state_dict() hands back references to the live
            # parameter tensors, so a shallow dict copy would be silently
            # overwritten by the following epochs.
            best_model = copy.deepcopy(model.state_dict())
            best_thresholds = thresholds
            print(f"New best model with Micro-F1 = {val_micro_f1:.4f}")

    total_time = time.time() - start_time
    print(f"Training completed in {total_time:.2f} seconds ({total_time / 60:.2f} minutes)")

    # Load best model for final evaluation
    model.load_state_dict(best_model)

    # Evaluate on test set
    print("Evaluating on test set...")
    test_preds, test_labels = evaluate(model, test_dataloader)
    test_binary_preds = (test_preds >= best_thresholds).astype(int)

    test_micro_f1 = f1_score(test_labels, test_binary_preds, average="micro")
    test_macro_f1 = f1_score(test_labels, test_binary_preds, average="macro")

    print(f"Test Micro-F1: {test_micro_f1:.4f}")
    print(f"Test Macro-F1: {test_macro_f1:.4f}")

    return model, best_thresholds


def save_model(model, tokenizer, thresholds, emotion_labels, mlb, model_dir="roberta_model"):
    """Write the trained model and everything needed to reuse it to ``model_dir``.

    Layout produced under ``model_dir``:
        model/                       - ``model.save_pretrained`` output
        tokenizer/                   - ``tokenizer.save_pretrained`` output
        thresholds.joblib            - ``thresholds``
        emotion_labels.joblib        - ``emotion_labels``
        multilabel_binarizer.joblib  - ``mlb``
        model_info.joblib            - small metadata dict (model type, label
                                       count, problem type, label names)

    Args:
        model: Object exposing ``save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        thresholds: Per-label decision thresholds.
        emotion_labels: Label names; also determines ``num_labels`` in the
            metadata.
        mlb: The label binarizer to persist alongside the model.
        model_dir: Target directory; created if it does not exist.
    """
    # exist_ok=True replaces the race-prone "check, then create" sequence.
    os.makedirs(model_dir, exist_ok=True)

    # Save model and tokenizer
    model.save_pretrained(os.path.join(model_dir, "model"))
    tokenizer.save_pretrained(os.path.join(model_dir, "tokenizer"))

    # Save thresholds and other components
    joblib.dump(thresholds, os.path.join(model_dir, "thresholds.joblib"))
    joblib.dump(emotion_labels, os.path.join(model_dir, "emotion_labels.joblib"))
    joblib.dump(mlb, os.path.join(model_dir, "multilabel_binarizer.joblib"))

    # Save model info
    model_info = {
        "model_type": "roberta-base",
        "num_labels": len(emotion_labels),
        "problem_type": "multi_label_classification",
        "labels": emotion_labels,
    }
    joblib.dump(model_info, os.path.join(model_dir, "model_info.joblib"))

    print(f"Model and components saved to {model_dir}/")


def predict_sample(text, model, tokenizer, thresholds, emotion_labels):
    """Return the emotion names whose logit reaches its per-label threshold.

    The text is tokenized to a fixed length of 128 tokens, pushed through the
    model in eval mode without gradients, and each label's logit is compared
    with the matching entry of ``thresholds``.
    """
    model.eval()

    # Tokenize (single sample, so the tensors have a batch dimension of 1)
    encoded = tokenizer(
        text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    # Forward pass on the configured device
    with torch.no_grad():
        result = model(
            input_ids=encoded["input_ids"].to(device),
            attention_mask=encoded["attention_mask"].to(device),
        )
        scores = result.logits.cpu().numpy()[0]

    # A label is predicted when its score is at or above its threshold
    is_on = scores >= thresholds
    return [emotion_labels[pos] for pos, flag in enumerate(is_on) if flag]


def main():
    """Train, save and spot-check the RoBERTa emotion classifier.

    Steps: load and binarize GoEmotions, build the tokenizer and dataloaders,
    fine-tune the model, persist it to ``roberta_model/``, print predictions
    for a few sample sentences, and finally print a timing summary.
    """
    # Start timer for the whole process
    total_start_time = time.time()

    # Load and preprocess data (timed separately for the summary below)
    load_start = time.time()
    train_texts, y_train, val_texts, y_val, test_texts, y_test, emotion_labels, mlb = load_and_preprocess()
    load_time = time.time() - load_start

    # Initialize tokenizer
    print("Loading RoBERTa tokenizer...")
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    # Create dataloaders
    batch_size = 16  # Adjust based on available GPU memory
    train_dataloader, val_dataloader, test_dataloader = create_dataloaders(
        train_texts, y_train, val_texts, y_val, test_texts, y_test, tokenizer, batch_size
    )

    # Train model. The measured time covers everything train_model does,
    # including its final test-set evaluation.
    epochs = 4
    train_start = time.time()
    model, thresholds = train_model(
        train_dataloader, val_dataloader, test_dataloader, num_labels=len(emotion_labels), epochs=epochs
    )
    train_time = time.time() - train_start

    # Save model and components
    save_model(model, tokenizer, thresholds, emotion_labels, mlb, model_dir="roberta_model")

    # Test on sample
    print("\nTesting model on sample texts:")
    samples = [
        "I'm so happy today! Everything is going great.",
        "This makes me angry and frustrated.",
        "I'm not sure how to feel about this news."
    ]

    for sample in samples:
        emotions = predict_sample(sample, model, tokenizer, thresholds, emotion_labels)
        print(f"\nText: {sample}")
        print(f"Predicted emotions: {emotions}")

    # Print total time
    total_time = time.time() - total_start_time
    print(f"\nTotal execution time: {total_time:.2f} seconds ({total_time / 60:.2f} minutes)")

    # Print expected run time for future runs, using the per-phase timings
    # rather than the overall wall time for both lines.
    print("\nExpected run times for future training:")
    print(f"Dataset loading: ~{load_time / 60:.1f} minutes")
    print(f"Training ({epochs} epochs): ~{train_time / 60:.1f} minutes on {device}")
    print("Note: Times will vary based on hardware, especially if using GPU vs CPU")

# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Using device: cuda
Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.40k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/350k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Dataset loaded and preprocessed in 14.75 seconds
Train samples: 43410, Validation samples: 5426, Test samples: 5427
Loading RoBERTa tokenizer...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Creating dataloaders...
Dataloaders created in 0.00 seconds


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training for 4 epochs...


Epoch 1: 100%|██████████| 2714/2714 [15:45<00:00,  2.87it/s, loss=0.0946]
Evaluating: 100%|██████████| 340/340 [00:34<00:00,  9.73it/s]


Epoch 1: Train Loss = 0.1196, Val Micro-F1 = 0.5937, Val Macro-F1 = 0.4851, Time = 980.07s
New best model with Micro-F1 = 0.5937


Epoch 2: 100%|██████████| 2714/2714 [15:47<00:00,  2.86it/s, loss=0.0683]
Evaluating: 100%|██████████| 340/340 [00:34<00:00,  9.75it/s]


Epoch 2: Train Loss = 0.0851, Val Micro-F1 = 0.6151, Val Macro-F1 = 0.5359, Time = 982.31s
New best model with Micro-F1 = 0.6151


Epoch 3: 100%|██████████| 2714/2714 [15:48<00:00,  2.86it/s, loss=0.1255]
Evaluating: 100%|██████████| 340/340 [00:34<00:00,  9.73it/s]


Epoch 3: Train Loss = 0.0755, Val Micro-F1 = 0.6188, Val Macro-F1 = 0.5442, Time = 983.35s
New best model with Micro-F1 = 0.6188


Epoch 4: 100%|██████████| 2714/2714 [15:48<00:00,  2.86it/s, loss=0.1176]
Evaluating: 100%|██████████| 340/340 [00:34<00:00,  9.75it/s]


Epoch 4: Train Loss = 0.0688, Val Micro-F1 = 0.6179, Val Macro-F1 = 0.5453, Time = 983.73s
Training completed in 3929.47 seconds (65.49 minutes)
Evaluating on test set...


Evaluating: 100%|██████████| 340/340 [00:34<00:00,  9.71it/s]


Test Micro-F1: 0.6116
Test Macro-F1: 0.5179
Model and components saved to roberta_model/

Testing model on sample texts:

Text: I'm so happy today! Everything is going great.
Predicted emotions: ['joy']

Text: This makes me angry and frustrated.
Predicted emotions: ['anger']

Text: I'm not sure how to feel about this news.
Predicted emotions: ['confusion']

Total execution time: 3995.38 seconds (66.59 minutes)

Expected run times for future training:
Dataset loading: ~66.6 minutes
Training (4 epochs): ~66.6 minutes on cuda
Note: Times will vary based on hardware, especially if using GPU vs CPU


In [None]:
from google.colab import files
import shutil
import os

# Zip the roberta_model directory
# Arguments are (base_name, format, root_dir, base_dir): the archive is written
# as roberta_model.zip, and paths inside it are taken relative to '.', so they
# keep the roberta_model/ prefix.
shutil.make_archive('roberta_model', 'zip', '.', 'roberta_model')

# Download the zip file
# `files` is the google.colab helper imported at the top of this cell, so this
# line only works inside a Colab runtime.
files.download('roberta_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def generate_detailed_report(model, test_dataloader, thresholds, emotion_labels):
    """Evaluate ``model`` on ``test_dataloader`` and print a per-emotion report.

    Logits are collected for the whole loader, binarized with the per-label
    ``thresholds``, and scored per label (precision, recall, F1, support) as
    well as with micro / macro / weighted / samples averaging and exact-match
    accuracy.

    ``precision_score`` and ``recall_score`` are resolved at call time from
    the module namespace, so they must have been imported before this
    function is called.

    Args:
        model: Model whose forward output exposes ``.logits``.
        test_dataloader: Loader yielding dicts with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"``.
        thresholds: Per-label decision thresholds, compared against the logits.
        emotion_labels: Label names, one per label column.

    Returns:
        dict with per-class lists (``precision_per_class``,
        ``recall_per_class``, ``f1_per_class``, ``support_per_class``), one
        ``(precision, recall, f1)`` tuple each under ``micro``, ``macro``,
        ``weighted`` and ``samples``, and ``raw_accuracy``.
    """
    def format_row(name, precision, recall, f1, support):
        # Pad every metric to the 10-character column width used by the
        # header so the table columns line up.
        return f"{name:20} {precision:<10.2f} {recall:<10.2f} {f1:<10.2f} {support}"

    model.eval()
    all_preds = []
    all_labels = []

    # Collect all predictions and labels
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            all_preds.append(outputs.logits.cpu().numpy())
            # Labels are only compared on the host, so they are not moved to
            # the compute device first.
            all_labels.append(batch["labels"].cpu().numpy())

    # Stack predictions and labels
    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)

    # Apply thresholds to get binary predictions
    binary_preds = (all_preds >= thresholds).astype(int)

    # Per-label counts, computed for all label columns at once
    is_true = all_labels == 1
    is_predicted = binary_preds == 1
    true_positives = (is_true & is_predicted).sum(axis=0)
    predicted_positives = is_predicted.sum(axis=0)
    actual_positives = is_true.sum(axis=0)

    # The maximum() guards make empty denominators yield 0 instead of dividing
    # by zero; a label with no actual positives therefore scores 0 / 0 / 0
    # with support 0.
    precision = true_positives / np.maximum(predicted_positives, 1)
    recall = true_positives / np.maximum(actual_positives, 1)
    f1 = 2 * precision * recall / np.maximum(precision + recall, 1e-8)

    precision_per_class = precision.tolist()
    recall_per_class = recall.tolist()
    f1_per_class = f1.tolist()
    support_per_class = actual_positives.tolist()

    # Calculate overall metrics. zero_division=0 keeps the numeric result of
    # the default setting (0 for undefined metrics) without emitting an
    # UndefinedMetricWarning.
    averaged = {}
    for average in ("micro", "macro", "weighted", "samples"):
        averaged[average] = (
            precision_score(all_labels, binary_preds, average=average, zero_division=0),
            recall_score(all_labels, binary_preds, average=average, zero_division=0),
            f1_score(all_labels, binary_preds, average=average, zero_division=0),
        )

    micro_f1 = averaged["micro"][2]
    macro_f1 = averaged["macro"][2]
    weighted_f1 = averaged["weighted"][2]

    total_samples = len(all_labels)

    # Calculate raw accuracy (exact matches over all labels of a sample)
    exact_matches = np.all(binary_preds == all_labels, axis=1).sum()
    raw_accuracy = exact_matches / total_samples

    # Print the report
    header = f"{'':20} {'precision':10} {'recall':10} {'f1-score':10} {'support':10}"
    print(header)
    print("-" * len(header))

    # Print metrics for each emotion
    for i, emotion in enumerate(emotion_labels):
        print(format_row(emotion, precision_per_class[i], recall_per_class[i],
                         f1_per_class[i], support_per_class[i]))

    print("\n")
    # The support column of the average rows is the number of evaluated samples.
    for average in ("micro", "macro", "weighted", "samples"):
        avg_precision, avg_recall, avg_f1 = averaged[average]
        print(format_row(f"{average} avg", avg_precision, avg_recall, avg_f1, total_samples))

    print("\n")
    print(f"Raw Accuracy: {raw_accuracy:.4f}")
    print(f"Micro-F1:    {micro_f1:.4f}")
    print(f"Macro-F1:    {macro_f1:.4f}")
    print(f"Weighted-F1: {weighted_f1:.4f}")

    return {
        'precision_per_class': precision_per_class,
        'recall_per_class': recall_per_class,
        'f1_per_class': f1_per_class,
        'support_per_class': support_per_class,
        'micro': averaged["micro"],
        'macro': averaged["macro"],
        'weighted': averaged["weighted"],
        'samples': averaged["samples"],
        'raw_accuracy': raw_accuracy
    }

# First, load the saved model and components
import os
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import precision_score, recall_score, f1_score
import joblib
import numpy as np
from tqdm import tqdm

# Set device
# Same selection rule as the training cell: GPU when CUDA is available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and components
# The sub-paths and file names below mirror what save_model() writes.
model_dir = "roberta_model"  # Path to your saved model
model = RobertaForSequenceClassification.from_pretrained(os.path.join(model_dir, "model"))
model.to(device)
tokenizer = RobertaTokenizer.from_pretrained(os.path.join(model_dir, "tokenizer"))
# NOTE(review): joblib files are presumably pickle-based — only load artifacts
# produced by your own training run; confirm before pointing this at other files.
thresholds = joblib.load(os.path.join(model_dir, "thresholds.joblib"))
emotion_labels = joblib.load(os.path.join(model_dir, "emotion_labels.joblib"))
mlb = joblib.load(os.path.join(model_dir, "multilabel_binarizer.joblib"))

# Load the test data
# load_and_preprocess() is defined in the training cell, so that cell must have
# been run in this session. Its returned label names and freshly fitted
# binarizer are discarded here in favour of the ones loaded from disk above.
train_texts, y_train, val_texts, y_val, test_texts, y_test, _, _ = load_and_preprocess()

# Create test dataset and dataloader
from torch.utils.data import Dataset, DataLoader

class GoEmotionsDataset(Dataset):
    """Dataset that tokenizes texts on access and pairs them with label vectors.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of per-sample label vectors, aligned with
            ``texts`` by position.
        tokenizer: Callable invoked with ``max_length``,
            ``padding="max_length"``, ``truncation=True`` and
            ``return_tensors="pt"``; must return a mapping holding
            ``"input_ids"`` and ``"attention_mask"`` tensors with a leading
            batch dimension of size 1.
        max_length: Length every sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized inputs and float32 label tensor for ``idx``."""
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            # Remove only the tokenizer's batch dimension; a bare squeeze()
            # would collapse the sequence dimension too when max_length == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            # Explicit dtype instead of the legacy torch.FloatTensor constructor.
            "labels": torch.as_tensor(self.labels[idx], dtype=torch.float32),
        }

# Wrap the test split with the tokenizer loaded from disk. No shuffle argument
# is passed to the loader.
test_dataset = GoEmotionsDataset(test_texts, y_test, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Generate the detailed report
# Prints the table and keeps the returned metrics dict in `report`.
report = generate_detailed_report(model, test_dataloader, thresholds, emotion_labels)

Loading dataset...
Dataset loaded and preprocessed in 7.52 seconds
Train samples: 43410, Validation samples: 5426, Test samples: 5427


Evaluating: 100%|██████████| 340/340 [00:33<00:00, 10.16it/s]


                     precision  recall     f1-score   support   
-------------------------------------------------------
admiration           0.68         0.76         0.72         504
amusement            0.76         0.91         0.83         264
anger                0.61         0.43         0.50         198
annoyance            0.33         0.47         0.39         320
approval             0.50         0.40         0.45         351
caring               0.35         0.56         0.43         135
confusion            0.38         0.56         0.46         153
curiosity            0.48         0.71         0.57         284
desire               0.60         0.45         0.51         83
disappointment       0.26         0.41         0.31         151
disapproval          0.39         0.51         0.44         267
disgust              0.48         0.50         0.49         123
embarrassment        0.74         0.38         0.50         37
excitement           0.41         0.42         0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
