In [None]:
import os
import ast
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import functional as F
from transformers import (
    BertTokenizer, BertModel,
    get_scheduler
)
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm


In [None]:
# Mount Google Drive so files under /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Read the project's parquet dataset from the mounted Drive into a DataFrame.
df = pd.read_parquet("/content/drive/MyDrive/Project/augmented_dataset.parquet")

# Emotion names this notebook builds its multi-hot targets from.
# The order matters: a label's position here is its column in the target vector.
rare_emotion_list = [
    'grief',
    'relief',
    'curiosity',
    'realization',
    'pride',
    'nervousness',
    'confusion',
    'caring',
    'disappointment',
    'annoyance',
    'approval',
    'disapproval',
]


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Emotion name -> column index, following the order of rare_emotion_list.
rare_label_map = dict(zip(rare_emotion_list, range(len(rare_emotion_list))))

def labels_to_multi_hot_rare(label_entry, index_map, num_classes):
    """Encode one raw label entry as a multi-hot vector over ``index_map``.

    Parameters
    ----------
    label_entry : str, int, list, tuple, set, numpy.ndarray or None
        The raw value of one row's label field. Accepted forms:

        * a sequence / set / NumPy array of labels (list-valued parquet
          columns are typically materialised as NumPy arrays, so these must
          be treated exactly like Python lists);
        * a string holding a Python literal (``"['grief', 'pride']"``,
          ``"3"``, ``"'grief'"``), which is parsed with ``ast.literal_eval``;
        * a string that is not a literal (``"grief"``), used as a single label;
        * a single ``int`` / NumPy integer label.

        Anything else (``None``, floats such as NaN, ...) yields no labels.
    index_map : dict
        Maps a label to its column index in the output vector. Labels that
        are not keys of this mapping are ignored.
    num_classes : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(num_classes,)`` with 1 at the index of every
        label found in ``index_map`` and 0 elsewhere.
    """
    parsed = label_entry
    if isinstance(label_entry, str):
        try:
            parsed = ast.literal_eval(label_entry)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (e.g. a bare label name): keep the raw text
            # and treat it as one label below.
            parsed = label_entry

    if isinstance(parsed, np.ndarray):
        # ravel() also covers 0-d arrays; tolist() yields plain Python scalars.
        label_list = parsed.ravel().tolist()
    elif isinstance(parsed, (list, tuple, set, frozenset)):
        label_list = list(parsed)
    elif isinstance(parsed, (str, int, np.integer)):
        # A scalar label. Strings must be wrapped, otherwise they would be
        # iterated character by character.
        label_list = [parsed]
    else:
        label_list = []

    multi_hot = np.zeros(num_classes, dtype=int)
    for lab in label_list:
        if lab in index_map:
            multi_hot[index_map[lab]] = 1
    return multi_hot


# Encode each row's `labels` value as a multi-hot vector over the rare emotions
# and store the result in a new `rare_multi_hot` column.
df["rare_multi_hot"] = df["labels"].apply(
    labels_to_multi_hot_rare,
    args=(rare_label_map, len(rare_emotion_list)),
)


In [None]:
# Tokenizer for the "bert-base-uncased" checkpoint; the first call downloads the
# tokenizer files (see the progress output below this cell).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class RareEmotionDataset(Dataset):
    """Map-style dataset pairing raw texts with multi-hot label vectors.

    Each item is tokenized on access, so no encoded copy of the corpus is
    kept in memory.

    Parameters
    ----------
    texts : sequence of str
        Input texts, indexable by integer position.
    labels : sequence
        One label vector per text (list, NumPy array or tensor), indexable by
        the same integer position as ``texts``.
    tokenizer : callable
        Called as ``tokenizer(text, padding='max_length', truncation=True,
        max_length=max_len, return_tensors='pt')`` and expected to return a
        mapping of tensors with a leading batch dimension of 1.
    max_len : int, default 128
        Length every text is padded / truncated to.

    Raises
    ------
    ValueError
        If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Catch misaligned inputs here rather than as an IndexError (or as
        # silently ignored trailing labels) in the middle of an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for item ``idx`` plus a float ``labels`` tensor."""
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        # The tokenizer returns tensors shaped (1, max_len); drop the batch
        # dimension so the DataLoader can stack items itself.
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        # torch.as_tensor replaces the legacy torch.FloatTensor constructor:
        # it converts lists, NumPy arrays and tensors of any dtype to float32
        # (the legacy constructor rejects tensors of another dtype).
        item["labels"] = torch.as_tensor(self.labels[idx], dtype=torch.float32)
        return item


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split
# identical across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["cleaned_text"].tolist(),
    df["rare_multi_hot"].tolist(),
    test_size=0.1,
    random_state=42,
    shuffle=True
)

# Single source of truth for the batch size. This variable used to be set to
# 64 while both loaders hard-coded 32, so it had no effect; it now holds the
# value that was actually in use and is passed to both loaders.
# NOTE(review): raise this to 64 here if that was the intended batch size.
batch_size = 32
train_dataset = RareEmotionDataset(train_texts, train_labels, tokenizer)
val_dataset = RareEmotionDataset(val_texts, val_labels, tokenizer)

# Shuffle only the training data; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [None]:
class EmotionClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    Parameters
    ----------
    num_labels : int
        Number of output logits (one per label).
    model_name : str, default "bert-base-uncased"
        Checkpoint passed to ``BertModel.from_pretrained``.
    dropout : float, default 0.3
        Dropout probability applied to the pooled representation before the
        linear head.
    """

    def __init__(self, num_labels, model_name="bert-base-uncased", dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so a checkpoint with a different hidden width still works.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) logits, one row per input sequence.

        ``input_ids`` and ``attention_mask`` are forwarded unchanged to the
        encoder.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the pooled representation.
        pooled = outputs.last_hidden_state[:, 0]
        return self.classifier(self.dropout(pooled))


In [None]:
class FocalLoss(nn.Module):
    """Element-wise binary focal loss computed from logits.

    For every element the loss is ``alpha * (1 - p_t) ** gamma * BCE``, where
    ``p_t`` is the predicted probability of the element's target value
    (``sigmoid(logit)`` where the target equals 1, ``1 - sigmoid(logit)``
    elsewhere).

    Parameters
    ----------
    alpha : float, default 1.0
        Constant multiplier applied to every element's loss.
    gamma : float, default 2.0
        Focusing exponent; larger values down-weight well-classified elements.
    reduction : {'mean', 'sum', 'none'}, default 'mean'
        How the element-wise loss is reduced. ``'none'`` returns the
        unreduced tensor.

    Raises
    ------
    ValueError
        If ``reduction`` is not one of the supported values.
    """

    _REDUCTIONS = ('mean', 'sum', 'none')

    def __init__(self, alpha=1.0, gamma=2.0, reduction='mean'):
        super().__init__()
        # Reject typos up front; previously any value other than 'mean'
        # (including 'none') silently fell back to a sum.
        if reduction not in self._REDUCTIONS:
            raise ValueError(
                f"reduction must be one of {self._REDUCTIONS}, got {reduction!r}"
            )
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss for logits ``inputs`` against ``targets``.

        ``inputs`` and ``targets`` must have the same shape. The result is a
        scalar for ``'mean'`` / ``'sum'`` and a tensor shaped like ``inputs``
        for ``'none'``.
        """
        bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        probs = torch.sigmoid(inputs)
        # Probability assigned to the target value of each element.
        pt = torch.where(targets == 1, probs, 1 - probs)
        loss = self.alpha * (1 - pt) ** self.gamma * bce_loss

        if self.reduction == 'mean':
            return loss.mean()
        if self.reduction == 'sum':
            return loss.sum()
        if self.reduction == 'none':
            return loss
        # Only reachable if `reduction` was reassigned after construction.
        raise ValueError(
            f"reduction must be one of {self._REDUCTIONS}, got {self.reduction!r}"
        )


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# torch.amp supersedes torch.cuda.amp; importing here rebinds the `autocast`
# and `GradScaler` names that the first cell took from torch.cuda.amp.
from torch.amp import autocast, GradScaler

# `os`, `torch` and `tqdm` come from the notebook's first (imports) cell.

# Fail fast, before touching any file on Drive, if the training objects are
# missing. None of these four names is created in the cells shown in this
# notebook, so they must come from a setup cell that has been run beforehand.
_missing = [
    name for name in ("model", "optimizer", "criterion", "device")
    if name not in globals()
]
if _missing:
    raise NameError(
        "Run the model/optimizer setup cell first; undefined: " + ", ".join(_missing)
    )

scaler = GradScaler(device='cuda')  # scales the loss so fp16 gradients do not underflow
num_epochs = 5

drive_path = "/content/drive/MyDrive/Project/"
metrics_path = os.path.join(drive_path, 'rare_emotion_metrics_per_epoch.csv')
# NOTE(review): nothing in this cell writes `metrics_path`; it is only cleared
# here. Confirm a later cell recreates it, otherwise this just deletes the
# previous run's metrics.
if os.path.exists(metrics_path):
    os.remove(metrics_path)  # Start fresh

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass and loss under mixed precision.
        with autocast(device_type='cuda'):
            logits = model(input_ids=input_ids, attention_mask=attention_mask)  # Already returns logits
            loss = criterion(logits, labels)

        # Backward on the scaled loss; scaler.step() skips the optimizer step
        # if the gradients contain inf/NaN, and update() adjusts the scale.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"\nEpoch {epoch+1} Training Loss: {avg_train_loss:.4f}")

    # Saving model checkpoint
    model_path = os.path.join(drive_path, f'rare_classifier_epoch_{epoch+1}.pt')
    torch.save(model.state_dict(), model_path)
    print(f"Saved model checkpoint to: {model_path}")

# Saving final model
model_name = "rare_emotion_model.pt"
torch.save(model.state_dict(), model_name)  # Saving locally
torch.save(model.state_dict(), os.path.join(drive_path, model_name))  # Saving to Drive

print(f"\nModel saved locally as '{model_name}' and to Drive at '{drive_path}'")
