# **Emotion Detection with RoBERTa-Large**

In [None]:
from pathlib import Path
import sys
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import wandb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    get_linear_schedule_with_warmup
)
!pip install datasets
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), exports ``PYTHONHASHSEED`` and configures
    cuDNN for deterministic kernel selection.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Exported for child processes; the hash seed of the interpreter that is
    # already running was fixed at start-up and is not changed by this.
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    # Disable the cuDNN autotuner explicitly: if something earlier in the
    # session enabled it, kernel choice could differ from run to run.
    torch.backends.cudnn.benchmark = False

set_seed(42)

if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)

    base_folder = Path('/content/drive/MyDrive/data')
    data_folder = Path('/content/inclass_kaggle_data')
    kaggle_api = base_folder/'.kaggle'
    model_folder = base_folder/'models/nlp_spring_2025/inclass_kaggle'
    archive_folder = data_folder/'archive'

    !pip install wandb -U -qq
    !pip install datasets -U -qq
    !pip install --upgrade transformers

    os.environ['KAGGLE_CONFIG_DIR'] = str(kaggle_api)
    !chmod 600 "{kaggle_api}/kaggle.json"
else:
    print("Not running in Colab — adjust paths accordingly.")

data_folder.mkdir(exist_ok=True, parents=True)
kaggle_api.mkdir(exist_ok=True, parents=True)
model_folder.mkdir(exist_ok=True, parents=True)
archive_folder.mkdir(exist_ok=True, parents=True)


Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [None]:
!kaggle competitions download emotion-detection-spring-2025 -p {archive_folder}

import zipfile
with zipfile.ZipFile(archive_folder / "emotion-detection-spring-2025.zip", 'r') as zip_ref:
    zip_ref.extractall(data_folder)

In [None]:
# Load the three competition tables from the extracted data folder.
train_df, test_df, sample_submission = (
    pd.read_csv(data_folder / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)


In [None]:
# The eleven emotion columns, in the order used for every label vector
label_cols = [
    'anger',
    'anticipation',
    'disgust',
    'fear',
    'joy',
    'love',
    'optimism',
    'pessimism',
    'sadness',
    'surprise',
    'trust',
]

# Start a Weights & Biases run with a unique, generated suffix in its name
run_name = f"roberta_hw5_run-{wandb.util.generate_id()}"
wandb.init(project="emotion_detection_encoder", name=run_name, reinit=True)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshreevershith[0m ([33mmy-wandb-account[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Data augmentation via WordNet synonym replacement
def augment_data(df, target_labels, aug_multiplier=1.5, augmenter=None):
    """Oversample minority-label rows with text-augmented copies.

    A label counts as "minority" when its number of positives is below the
    median across ``target_labels``. For each such label,
    ``int(n_positive * (aug_multiplier - 1))`` rows are drawn from its
    positive rows (with replacement only when more rows are requested than
    exist), their ``Tweet`` text is passed through the augmenter, and the
    resulting rows are appended to the frame.

    Args:
        df: Frame with a ``Tweet`` text column and one 0/1 column per label.
        target_labels: Names of the label columns to balance.
        aug_multiplier: Target size of each minority class relative to its
            current size; values <= 1 generate nothing.
        augmenter: Optional object exposing ``augment(text)``. When omitted,
            ``nlpaug``'s WordNet ``SynonymAug`` is used.

    Returns:
        ``df`` itself when nothing was generated (including when ``nlpaug``
        cannot be imported), otherwise a new frame with the augmented rows
        appended and the index reset.
    """
    if augmenter is None:
        try:
            import nlpaug.augmenter.word as naw

            augmenter = naw.SynonymAug(aug_src='wordnet')
        except ImportError:
            print("nlpaug not available, skipping augmentation")
            return df

    # Labels whose positive count falls below the median are the minorities.
    class_counts = df[target_labels].sum().sort_values()
    minority_labels = class_counts[class_counts < class_counts.median()].index.tolist()

    augmented_rows = []
    for label in minority_labels:
        positive_samples = df[df[label] == 1]
        num_to_generate = int(len(positive_samples) * (aug_multiplier - 1))
        if num_to_generate <= 0:
            continue

        # Draw exactly the requested number of rows. The previous
        # min(num_to_generate, len(...)) cap meant multipliers above 2 never
        # produced more than one copy per row despite replace=True.
        samples_to_augment = positive_samples.sample(
            n=num_to_generate,
            replace=num_to_generate > len(positive_samples),
        )

        for _, row in samples_to_augment.iterrows():
            try:
                augmented_text = augmenter.augment(row['Tweet'])
            except Exception as e:
                # Best effort: one failing tweet must not abort the whole pass.
                print(f"Error augmenting text: {e}")
                continue

            # Some augmenters return a list of candidates rather than a
            # string; keep the first so the column stays plain text.
            if isinstance(augmented_text, (list, tuple)):
                if not augmented_text:
                    continue
                augmented_text = augmented_text[0]

            new_row = row.copy()
            new_row['Tweet'] = augmented_text
            augmented_rows.append(new_row)

    if not augmented_rows:
        return df
    return pd.concat([df, pd.DataFrame(augmented_rows)], ignore_index=True)

In [None]:
def preprocess_data(df):
    """Normalize the ``Tweet`` column in place and return the same frame.

    Three regex substitutions are applied in order: URLs are removed,
    @-mentions are replaced with ``@user``, and hashtags lose their ``#``
    while keeping the tag text.
    """
    substitutions = (
        (r'http\S+', ''),        # strip URLs
        (r'@\w+', '@user'),      # collapse every mention to one token
        (r'#(\w+)', r'\1'),      # keep the hashtag word, drop the symbol
    )

    tweets = df['Tweet']
    for pattern, replacement in substitutions:
        tweets = tweets.str.replace(pattern, replacement, regex=True)
    df['Tweet'] = tweets

    return df

# Clean the tweet text of both splits
train_df, test_df = preprocess_data(train_df), preprocess_data(test_df)

# Oversample minority labels in the training data (returns the frame
# unchanged when nlpaug is not installed)
train_df = augment_data(train_df, label_cols)

# Store one binary label vector per row in a single list-valued column
train_df['labels'] = train_df[label_cols].to_numpy().tolist()

# Materialize the model inputs: a 2-D label array and a plain list of texts
all_texts = train_df['Tweet'].tolist()
all_labels = np.array(train_df['labels'].tolist())

nlpaug not available, skipping augmentation


In [None]:
# Hold out 15% of the rows for validation. Stratification uses the single
# 'joy' column as a proxy for the full multi-label distribution.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.15,
    random_state=42,
    stratify=train_df['joy'],
)

print("train_labels shape:", train_labels.shape)
print("val_labels shape:", val_labels.shape)


train_labels shape: (6565, 11)
val_labels shape: (1159, 11)


In [None]:
# Choose model - RoBERTa Large generally outperforms base version
# The same checkpoint name is used for both the tokenizer (here) and the
# classification model loaded further down.
MODEL_NAME = "roberta-large"  # Upgrade to large for better performance
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Derive the padding length from the corpus instead of hard-coding it
def get_max_length(texts, percentile=95):
    """Return the requested percentile of token counts over ``texts``.

    Each text is encoded with the module-level ``tokenizer`` and the
    percentile of the resulting lengths is truncated to an ``int``.
    """
    token_counts = list(map(len, map(tokenizer.encode, texts)))
    cutoff = np.percentile(token_counts, percentile)
    return int(cutoff)

max_length = get_max_length(all_texts, percentile=99)
print(f"Using max_length: {max_length}")

# Every split is tokenized with identical settings: truncate and pad to the
# same fixed length and return PyTorch tensors.
tokenize_kwargs = dict(
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt',
)

train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)
test_encodings = tokenizer(test_df['Tweet'].tolist(), **tokenize_kwargs)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Using max_length: 45


In [None]:
# Dataset wrapper
class MultiLabelDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with optional multi-label targets.

    ``encodings`` is a mapping of field name to an indexable batch (it must
    contain ``"input_ids"``, which defines the length). ``labels`` may be
    omitted for inference data, in which case items carry no ``"labels"``.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        input_ids = self.encodings["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]

        if self.labels is None:
            return sample

        # Targets are returned as float tensors.
        target = torch.tensor(self.labels[idx])
        sample["labels"] = target.float()
        return sample

# Wrap each split. The test split is built without labels, so its items
# contain only the tokenizer fields.
train_dataset = MultiLabelDataset(train_encodings, train_labels)
val_dataset = MultiLabelDataset(val_encodings, val_labels)
test_dataset = MultiLabelDataset(test_encodings)


In [None]:
# Compute per-label positive-class weights for the weighted loss.
# Only the training split is used: deriving the weights from the full frame
# would let validation label frequencies influence the training objective.
label_array = np.asarray(train_labels)

class_weights = [
    compute_class_weight(
        class_weight="balanced",
        classes=np.array([0, 1]),
        y=label_array[:, col],
    )[1]  # index 1 is the weight of the positive class
    for col in range(label_array.shape[1])
]

# Convert to tensor
class_weights_tensor = torch.tensor(class_weights).float()
print("Class weights:", class_weights_tensor)


Class weights: tensor([1.3508, 3.5045, 1.3221, 2.8335, 1.3424, 4.6418, 1.6857, 4.3151, 1.6991,
        9.7525, 9.6550])


In [None]:
# Initialize model with more sophisticated configuration
# Loads the pretrained encoder with a classification head that has one
# output per emotion label; both dropout probabilities are passed explicitly.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_cols),
    problem_type="multi_label_classification",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1
)

# Custom metrics function with per-class analytics
def compute_metrics(pred):
    """Compute multi-label evaluation metrics from raw logits.

    Logits are passed through a sigmoid and thresholded at 0.5. Macro, micro
    and weighted F1 plus accuracy are returned; per-label F1 scores are sent
    to Weights & Biases when a run is active.

    Args:
        pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        Dict with the keys ``f1_macro``, ``f1_micro``, ``f1_weighted`` and
        ``accuracy``.
    """
    logits, labels = pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    preds = (probs >= 0.5).astype(int)

    # zero_division=0 yields the same value as the default but without a
    # warning for every label that has no predicted or true positives.
    metrics = {
        f"f1_{average}": f1_score(labels, preds, average=average, zero_division=0)
        for average in ("macro", "micro", "weighted")
    }
    # NOTE: for multi-label input scikit-learn's accuracy_score is subset
    # accuracy (all labels of a row must match), hence the low values.
    metrics["accuracy"] = accuracy_score(labels, preds)

    per_class_f1 = f1_score(labels, preds, average=None, zero_division=0)

    # Log all per-label scores in ONE call. Calling wandb.log once per label
    # advanced the wandb step counter eleven times per evaluation.
    if wandb.run is not None:
        wandb.log({
            f"f1_{label}": score
            for label, score in zip(label_cols, per_class_f1)
        })

    return metrics


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Custom trainer with focal loss option
class EmotionTrainer(Trainer):
    """Trainer whose loss is a class-weighted focal or BCE loss.

    Args:
        class_weights: Optional 1-D tensor with one positive-class weight per
            label. In focal mode it scales the loss of positive targets only;
            in BCE mode it is passed as ``pos_weight``.
        use_focal_loss: Select the focal loss (default) or plain
            ``binary_cross_entropy_with_logits``.
        focal_gamma: Focusing exponent of the focal term ``(1 - p_t) ** gamma``.
            Defaults to 2.0, the value that used to be hard-coded.

    All other positional and keyword arguments are forwarded to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, use_focal_loss=True,
                 focal_gamma=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        self.use_focal_loss = use_focal_loss
        self.focal_gamma = focal_gamma

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on ``inputs`` and return the multi-label loss.

        ``labels`` is popped from ``inputs`` before the forward pass. Extra
        keyword arguments are accepted and ignored. Returns ``loss`` or, when
        ``return_outputs`` is true, ``(loss, outputs)``.
        """
        # BCE requires floating-point targets; this is a no-op when the
        # dataset already yields float labels.
        labels = inputs.pop("labels").float()
        outputs = model(**inputs)
        logits = outputs.logits

        weights = None
        if self.class_weights is not None:
            weights = self.class_weights.to(labels.device)

        if self.use_focal_loss:
            probs = torch.sigmoid(logits)
            # p_t: probability the model assigns to the true outcome.
            pt = torch.where(labels == 1, probs, 1 - probs)
            focal_weight = (1 - pt) ** self.focal_gamma

            if weights is not None:
                # Up-weight positives per label; negatives keep weight 1.
                class_weight = torch.where(
                    labels == 1,
                    weights,
                    torch.ones_like(labels),
                )
                focal_weight = focal_weight * class_weight

            bce_loss = nn.functional.binary_cross_entropy_with_logits(
                logits, labels, reduction='none'
            )
            loss = (focal_weight * bce_loss).mean()
        else:
            loss = nn.functional.binary_cross_entropy_with_logits(
                logits, labels, pos_weight=weights
            )

        return (loss, outputs) if return_outputs else loss


In [None]:
# Batch and schedule settings
batch_size = 8  # Smaller batch size for better generalization with large model
num_epochs = 5
gradient_accumulation_steps = 2  # Accumulate gradients to simulate larger batch

# Estimated number of optimizer updates for the whole run on a single device.
# One update happens per `gradient_accumulation_steps` batches, which the
# earlier `len // batch_size * epochs` formula ignored (it was off by 2x).
batches_per_epoch = -(-len(train_dataset) // batch_size)  # ceil division
updates_per_epoch = max(batches_per_epoch // gradient_accumulation_steps, 1)
total_steps = updates_per_epoch * num_epochs

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_ratio=0.1,  # Percentage of steps for warmup
    weight_decay=0.01,
    learning_rate=1e-5,  # Start with a smaller learning rate for large model
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # Cap on checkpoints kept on disk (not "the 2 best")
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",  # Focus on macro F1 score
    greater_is_better=True,
    report_to="wandb",  # Enable wandb integration
    fp16=torch.cuda.is_available(),  # Mixed precision needs a CUDA device
    dataloader_num_workers=4,  # Parallelize data loading
)

# Initialize trainer with our custom implementation; training stops early
# after 2 evaluations without improvement of the tracked metric.
trainer = EmotionTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    class_weights=class_weights_tensor,
    use_focal_loss=True,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Train model
trainer.train()



Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,F1 Weighted,Accuracy
1,0.1435,0.15092,0.572117,0.654533,0.677305,0.174288
2,0.1338,0.145047,0.58549,0.676456,0.692737,0.182053
3,0.1253,0.144283,0.589242,0.671666,0.696453,0.166523
4,0.09,0.159255,0.600421,0.686651,0.703878,0.195858




TrainOutput(global_step=2050, training_loss=0.13285866176209798, metrics={'train_runtime': 948.286, 'train_samples_per_second': 34.615, 'train_steps_per_second': 2.162, 'total_flos': 2683061017031880.0, 'train_loss': 0.13285866176209798, 'epoch': 4.989037758830694})

In [None]:
# Predict with threshold optimization
def optimize_thresholds(trainer, dataset, labels, thresholds=None):
    """Pick, per class, the decision threshold that maximizes F1.

    Args:
        trainer: Object whose ``predict(dataset).predictions`` yields the raw
            logits, one column per class.
        dataset: Dataset to score (normally the validation split).
        labels: 2-D array of 0/1 ground-truth values aligned with the logits.
        thresholds: Optional iterable of candidate thresholds. Defaults to
            0.30, 0.35, ..., 0.65.

    Returns:
        List of Python floats, one threshold per class. A class keeps the
        default 0.5 when no candidate achieves an F1 above zero.
    """
    if thresholds is None:
        # Built from integers and rounded, so the candidates are exactly
        # 0.30 ... 0.65. np.arange(0.3, 0.7, 0.05) produced drifted values
        # such as 0.39999999999999997 wrapped in np.float64.
        thresholds = [round(0.30 + 0.05 * step, 2) for step in range(8)]
    else:
        thresholds = [float(t) for t in thresholds]

    raw_predictions = trainer.predict(dataset).predictions
    probs = torch.sigmoid(torch.tensor(raw_predictions)).numpy()
    labels = np.asarray(labels)

    optimal_thresholds = []
    for class_probs, class_labels in zip(probs.T, labels.T):
        best_f1 = 0
        best_threshold = 0.5  # Default threshold

        for threshold in thresholds:
            class_preds = (class_probs >= threshold).astype(int)
            # zero_division=0: a threshold that predicts no positives scores
            # 0 without emitting a warning.
            f1 = f1_score(class_labels, class_preds, zero_division=0)

            # Strict '>' keeps the lowest threshold among ties.
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold

        optimal_thresholds.append(best_threshold)

    return optimal_thresholds

# Tune the per-class cut-offs on the validation split
optimal_thresholds = optimize_thresholds(trainer, val_dataset, val_labels)
print("Optimized thresholds:", optimal_thresholds)

# Score the test set and convert logits to probabilities
raw_predictions = trainer.predict(test_dataset).predictions
probs = torch.sigmoid(torch.tensor(raw_predictions)).numpy()

# Binarize each class column with its own tuned cut-off
preds = np.zeros_like(probs, dtype=int)
for col, cutoff in enumerate(optimal_thresholds):
    preds[:, col] = (probs[:, col] >= cutoff).astype(int)



Optimized thresholds: [np.float64(0.39999999999999997), np.float64(0.49999999999999994), np.float64(0.44999999999999996), np.float64(0.5999999999999999), np.float64(0.49999999999999994), np.float64(0.5999999999999999), np.float64(0.49999999999999994), np.float64(0.49999999999999994), np.float64(0.44999999999999996), np.float64(0.5999999999999999), np.float64(0.6499999999999999)]




In [None]:
# Assemble the submission frame: one 0/1 column per emotion, ID in front
submission = pd.DataFrame(preds, columns=label_cols)
submission.insert(loc=0, column="ID", value=test_df["ID"])

# Make the column order explicit: ID first, then the labels
submission = submission.loc[:, ["ID", *label_cols]]

# Write the CSV next to the saved models
submission_path = model_folder / "improved_emotion_submission.csv"
submission.to_csv(submission_path, index=False)
print(f"Submission saved to {submission_path}")

Submission saved to /content/drive/MyDrive/data/models/nlp_spring_2025/inclass_kaggle/improved_emotion_submission.csv


In [None]:
# Submit to Kaggle
comp = 'emotion-detection-spring-2025'
# IPython shell escape: {comp} and {submission_path} are interpolated from
# the Python namespace before the command runs.
!kaggle competitions submit -c {comp} -f {submission_path} -m "Improved RoBERTa implementation with focal loss and threshold optimization"

# Finalize wandb run
wandb.finish()

100% 105k/105k [00:00<00:00, 184kB/s]
Successfully submitted to Emotion Detection Spring2025

0,1
eval/accuracy,▃▅▁▆█
eval/f1_macro,▁▄▅▇█
eval/f1_micro,▁▆▅▇█
eval/f1_weighted,▁▅▆▇█
eval/loss,▄▁▁▄█
eval/runtime,▇█▁▄█
eval/samples_per_second,▂▁█▅▁
eval/steps_per_second,▂▁█▅▁
f1_anger,▁█▆▆██
f1_anticipation,▁▁▇█▇▇

0,1
eval/accuracy,0.19586
eval/f1_macro,0.60042
eval/f1_micro,0.68665
eval/f1_weighted,0.70388
eval/loss,0.15925
eval/runtime,3.5195
eval/samples_per_second,329.307
eval/steps_per_second,20.742
f1_anger,0.81104
f1_anticipation,0.40704
