# **Emotion Detection with DistilBERT**

## **Download and Extract Dataset**

In [1]:
from pathlib import Path
import sys
import os

if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)

    base_folder = Path('/content/drive/MyDrive/data')
    data_folder = Path('/content/inclass_kaggle_data')
    kaggle_api = base_folder/'.kaggle'
    model_folder = base_folder/'models/nlp_spring_2025/inclass_kaggle'
    archive_folder = data_folder/'archive'

    !pip install wandb -U -qq
    !pip install datasets -U -qq
    !pip install --upgrade transformers

    os.environ['KAGGLE_CONFIG_DIR'] = str(kaggle_api)
    !chmod 600 "{kaggle_api}/kaggle.json"
else:
    print("Not running in Colab — adjust paths accordingly.")

data_folder.mkdir(exist_ok=True, parents=True)
kaggle_api.mkdir(exist_ok=True, parents=True)
model_folder.mkdir(exist_ok=True, parents=True)
archive_folder.mkdir(exist_ok=True, parents=True)


Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform

## **Load Dataset**

In [2]:
!kaggle competitions download emotion-detection-spring-2025 -p {archive_folder}

import zipfile
with zipfile.ZipFile(archive_folder / "emotion-detection-spring-2025.zip", 'r') as zip_ref:
    zip_ref.extractall(data_folder)

## **Preprocess Text and Labels**

In [3]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, accuracy_score

train_df = pd.read_csv(data_folder / "train.csv")
test_df = pd.read_csv(data_folder / "test.csv")
sample_submission = pd.read_csv(data_folder / "sample_submission.csv")


## **Tokenization**

In [4]:
from sklearn.model_selection import train_test_split
import numpy as np

# Define emotion label columns
label_cols = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
              'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

# Convert label columns to list of binary vectors
train_df['labels'] = train_df[label_cols].values.tolist()
print("Label preview:", train_df['labels'].iloc[0])
print("Shape of all labels:", np.array(train_df['labels'].tolist()).shape)

# Convert to numpy array
all_labels = np.array(train_df['labels'].tolist())
all_texts = train_df['Tweet'].tolist()

# Train/validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42
)

# Confirm shape
print("train_labels shape:", train_labels.shape)
print("val_labels shape:", val_labels.shape)

Label preview: [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1]
Shape of all labels: (7724, 11)
train_labels shape: (6179, 11)
val_labels shape: (1545, 11)


## **Dataset Wrapper**

In [5]:
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_df['Tweet'].tolist(), truncation=True, padding=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## **Model Definition**

In [6]:
class MultiLabelDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx]).float()
        return item

train_dataset = MultiLabelDataset(train_encodings, train_labels)
val_dataset = MultiLabelDataset(val_encodings, val_labels)
test_dataset = MultiLabelDataset(test_encodings)

## **Compute Class Weights**

In [7]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

label_array = np.array(train_df[label_cols])

class_weights = []
for i in range(len(label_cols)):
    weights = compute_class_weight(class_weight="balanced", classes=np.array([0, 1]), y=label_array[:, i])
    class_weights.append(weights[1])  # positive class weight

# Convert to tensor
class_weights_tensor = torch.tensor(class_weights).float()
if torch.cuda.is_available():
    class_weights_tensor = class_weights_tensor.to("cuda")

print(class_weights_tensor)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_cols),
    problem_type="multi_label_classification"
)


tensor([1.3508, 3.5045, 1.3221, 2.8335, 1.3424, 4.6418, 1.6857, 4.3151, 1.6991,
        9.7525, 9.6550], device='cuda:0')


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##**Custom Trainer with Weighted Loss**

In [8]:

def compute_metrics(pred):
    logits, labels = pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    preds = (probs >= 0.5).astype(int)
    f1 = f1_score(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    return {"f1": f1, "accuracy": acc}


In [9]:
from transformers import TrainingArguments, Trainer
import torch.nn as nn

# Define TrainingArguments
training_args = TrainingArguments(
    run_name="distilbert_hw5_run1",
    output_dir="./results",
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

# Define custom WeightedTrainer to apply class imbalance handling
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = nn.BCEWithLogitsLoss(pos_weight=class_weights_tensor)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Instantiate the trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshreevershith[0m ([33mmy-wandb-account[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5817,0.641047,0.536015,0.16699
2,0.554,0.597743,0.556112,0.161165
3,0.4449,0.621062,0.575978,0.188997
4,0.3647,0.689732,0.570804,0.211003
5,0.245,0.74555,0.585846,0.215534
6,0.2221,0.801814,0.575773,0.21165


TrainOutput(global_step=2322, training_loss=0.43862167818345044, metrics={'train_runtime': 300.2468, 'train_samples_per_second': 123.478, 'train_steps_per_second': 7.734, 'total_flos': 623579097026340.0, 'train_loss': 0.43862167818345044, 'epoch': 6.0})

## **Train Model**

In [10]:
preds = trainer.predict(test_dataset).predictions
preds = (torch.sigmoid(torch.tensor(preds)).numpy() >= 0.5).astype(int)

## **Evaluation Metrics**

In [11]:
# Make sure prediction labels are assigned to correct columns
submission = pd.DataFrame(preds, columns=label_cols)
submission.insert(0, "ID", test_df["ID"])

# Ensure all required columns are present
for col in label_cols:
    if col not in submission.columns:
        submission[col] = 0

# Ensure column order matches Kaggle's requirement
submission = submission[["ID"] + label_cols]

# Save submission file
submission_path = model_folder / "emotion_submission.csv"
submission.to_csv(submission_path, index=False)

# Preview
!head {submission_path}

ID,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
2018-01559,1,0,1,1,0,0,0,0,0,0,0
2018-03739,1,0,1,0,0,0,0,0,1,0,0
2018-00385,1,0,1,0,0,0,0,0,1,0,0
2018-03001,0,1,0,0,1,0,0,0,0,0,0
2018-01988,0,0,0,1,0,0,0,1,1,0,0
2018-03463,1,1,1,1,0,0,0,0,0,0,0
2018-04315,0,1,0,0,0,0,1,0,0,0,1
2018-01426,0,0,0,0,1,1,1,0,0,0,0
2018-03332,0,0,0,0,1,1,1,0,0,0,0


## **Generate Submission File**

In [12]:
# Define competition slug
comp = 'emotion-detection-spring-2025'

# Submit to Kaggle
!kaggle competitions submit -c {comp} -f {submission_path} -m "baseline submission"

100% 105k/105k [00:00<00:00, 115kB/s]
Successfully submitted to Emotion Detection Spring2025