In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from torch import nn, optim
from tqdm.auto import tqdm
import os
import pickle

In [None]:
# Pick the compute device: CUDA when PyTorch reports a usable GPU, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# ✅ Step 1: Load the dataset
def load_data():
    """Upload the three GoEmotions CSV files in Colab and return them as one frame.

    Opens the Colab upload dialog, then reads ``goemotions_1.csv``,
    ``goemotions_2.csv`` and ``goemotions_3.csv`` by relative path and stacks
    them row-wise with a fresh 0..n-1 index.

    Returns:
        pandas.DataFrame: the concatenation of the three CSV files.
    """
    # google.colab is a Colab-specific package, so it is imported lazily here.
    from google.colab import files

    files.upload()  # Upload all three CSV files

    csv_names = ("goemotions_1.csv", "goemotions_2.csv", "goemotions_3.csv")
    frames = [pd.read_csv(csv_name) for csv_name in csv_names]
    combined = pd.concat(frames, ignore_index=True)
    print(f"✅ Data loaded. Shape: {combined.shape}")
    return combined


# ✅ Step 2: Preprocess the data (multi-label)
def preprocess_data(data):
    """Filter the raw GoEmotions frame and attach a multi-hot label column.

    Steps:
      1. Fail fast if the ``text`` column is absent.
      2. Keep only rows where at least one emotion column is non-zero.
      3. Keep ``text`` plus the emotion columns and drop rows with missing values.
      4. Add ``multi_hot_labels``: one list per row holding that row's
         emotion-column values, in the order of ``emotion_columns`` below.

    The caller's frame is left untouched; a new frame is returned.

    Args:
        data: pandas.DataFrame containing a ``text`` column and one column per
            emotion name listed in ``emotion_columns``.

    Returns:
        pandas.DataFrame with columns ``text``, the emotion columns and
        ``multi_hot_labels``. The original row index is preserved.

    Raises:
        ValueError: if ``data`` has no ``text`` column.
        KeyError: raised by pandas if any emotion column is missing.
    """
    print("✅ Preprocessing data...")

    emotion_columns = [
        'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
        'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
        'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
        'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
        'remorse', 'sadness', 'surprise', 'neutral'
    ]

    # Validate before touching the frame so the error is raised up front.
    if 'text' not in data.columns:
        raise ValueError("❌ Missing 'text' column in the dataset.")

    # Rows with no emotion flagged carry no training signal for this task.
    has_label = data[emotion_columns].sum(axis=1) > 0

    # .copy() yields an independent frame, so adding a column below neither
    # triggers pandas' SettingWithCopyWarning nor writes into the caller's data.
    cleaned = data.loc[has_label, ['text'] + emotion_columns].dropna().copy()
    cleaned['multi_hot_labels'] = cleaned[emotion_columns].values.tolist()

    print(f"✅ Sample multi-hot labels:\n{cleaned['multi_hot_labels'].head()}")
    return cleaned

Using device: cuda


In [None]:
# Mount Google Drive at /content/drive; train_model() saves the trained weights
# to a folder underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

# Dataset that tokenizes each text on demand
class GoEmotionsDataset(Dataset):
    """Map-style dataset pairing each text with its multi-hot label vector.

    Texts are tokenized per item inside ``__getitem__``; nothing is encoded
    ahead of time in the constructor.
    """

    def __init__(self, data, tokenizer, max_len=128):
        """Keep the raw texts, labels and tokenizer settings.

        Args:
            data: object indexable by ``'text'`` and ``'multi_hot_labels'``
                whose values expose ``.tolist()`` (a pandas DataFrame in this
                file).
            tokenizer: callable invoked as ``tokenizer(text, padding=...,
                truncation=..., max_length=..., return_tensors=...)`` that
                returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
            max_len: value forwarded to the tokenizer as ``max_length``.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = data['text'].tolist()
        self.labels = data['multi_hot_labels'].tolist()

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode example ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (tokenizer
            outputs with a leading size-1 dimension squeezed away, if present)
            and ``'labels'`` (float tensor built from the stored label list).
        """
        target = torch.tensor(self.labels[idx], dtype=torch.float)

        tokens = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
        )

        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'labels': target,
        }

# Custom multi-label BERT model
class BertForMultiLabelClassification(nn.Module):
    """BERT encoder followed by one linear layer that emits a logit per label."""

    def __init__(self, num_labels=28):
        """Load the ``bert-base-uncased`` encoder and build the linear head.

        Args:
            num_labels: number of output logits produced by the head.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits for a batch.

        The encoder's ``pooler_output`` is passed through the linear head; no
        sigmoid is applied here, so the result is raw logits.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(encoder_out.pooler_output)

# Train the model (manual loop)
def train_model(train_loader, val_loader, num_epochs=5, lr=2e-5,
                save_dir='/content/drive/My Drive/models/lyrics_emotion_model'):
    """Fine-tune the multi-label BERT classifier and save its weights.

    Each epoch runs one pass over ``train_loader`` with AdamW and
    ``BCEWithLogitsLoss``, prints the mean per-batch training loss, then calls
    ``validate`` on ``val_loader``. After the last epoch the model's
    ``state_dict`` is written to ``<save_dir>/bert_multilabel.pt``.

    Args:
        train_loader: iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
            Must support ``len()``.
        val_loader: iterable of batches in the same format, passed to
            ``validate``.
        num_epochs: number of passes over ``train_loader``.
        lr: learning rate handed to AdamW.
        save_dir: directory that receives ``bert_multilabel.pt``; created if
            missing. The default lives on Google Drive, so Drive must be
            mounted when the default is used.

    Returns:
        The trained ``BertForMultiLabelClassification`` instance, so callers can
        use it directly instead of reloading the checkpoint.
    """
    model = BertForMultiLabelClassification(num_labels=28).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(num_epochs):
        # validate() leaves the model in eval mode, so re-enable training mode
        # at the start of every epoch.
        model.train()
        total_loss = 0.0
        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            # Read the scalar once; each .item() call copies it off the device.
            batch_loss = loss.item()
            total_loss += batch_loss
            loop.set_postfix(loss=batch_loss)

        # An empty loader would otherwise divide by zero.
        num_batches = len(train_loader)
        avg_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"✅ Epoch {epoch+1} training loss: {avg_loss:.4f}")

        validate(model, val_loader)

    # Persist only the weights (state_dict), not the whole module object.
    os.makedirs(save_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(save_dir, 'bert_multilabel.pt'))
    print(f"✅ Model saved at {save_dir}")
    return model

def validate(model, val_loader):
    """Compute and print the mean per-batch BCE-with-logits loss on ``val_loader``.

    The model is switched to eval mode and gradients are disabled for the
    pass. The model is left in eval mode afterwards.

    Args:
        model: callable module invoked as ``model(input_ids, attention_mask)``
            and returning logits.
        val_loader: iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        float: the average of the per-batch losses, or ``nan`` when
        ``val_loader`` yields no batches.
    """
    model.eval()
    criterion = nn.BCEWithLogitsLoss()
    total_loss = 0.0
    # Count batches while iterating so loaders without len() also work and an
    # empty loader can be detected instead of dividing by zero.
    num_batches = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask)
            total_loss += criterion(logits, labels).item()
            num_batches += 1

    if num_batches == 0:
        print("🔎 Validation skipped: validation loader is empty.")
        return float('nan')

    avg_loss = total_loss / num_batches
    print(f"🔎 Validation loss: {avg_loss:.4f}")
    return avg_loss

def main(sample_size=7000, seed=42, batch_size=16):
    """Run the pipeline: load, preprocess, subsample, split 90/10, and train.

    Args:
        sample_size: maximum number of preprocessed rows to train on. Capped at
            the number of rows available.
        seed: seed used for both the row subsample and the train/validation
            split, so repeated runs see the same partitions.
        batch_size: batch size for both data loaders.

    Returns:
        Whatever ``train_model`` returns.
    """
    data = load_data()
    data = preprocess_data(data)

    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (sampling without replacement), so never ask for more than exist.
    n_rows = min(sample_size, len(data))
    data = data.sample(n=n_rows, random_state=seed).reset_index(drop=True)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    dataset = GoEmotionsDataset(data, tokenizer)
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size

    # A dedicated seeded generator makes the split reproducible without
    # touching PyTorch's global RNG state.
    split_rng = torch.Generator().manual_seed(seed)
    train_dataset, val_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size], generator=split_rng
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    return train_model(train_loader, val_loader)

# Run the full pipeline when this code executes as the top-level module.
if __name__ == "__main__":
    main()


Mounted at /content/drive


Saving goemotions_1.csv to goemotions_1.csv
Saving goemotions_2.csv to goemotions_2.csv
Saving goemotions_3.csv to goemotions_3.csv
✅ Data loaded. Shape: (211225, 37)
✅ Preprocessing data...
✅ Sample multi-hot labels:
0    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: multi_hot_labels, dtype: object


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/5:   0%|          | 0/394 [00:00<?, ?it/s]

✅ Epoch 1 training loss: 0.2201
🔎 Validation loss: 0.1618


Epoch 2/5:   0%|          | 0/394 [00:00<?, ?it/s]

✅ Epoch 2 training loss: 0.1518
🔎 Validation loss: 0.1500


Epoch 3/5:   0%|          | 0/394 [00:00<?, ?it/s]

✅ Epoch 3 training loss: 0.1386
🔎 Validation loss: 0.1420


Epoch 4/5:   0%|          | 0/394 [00:00<?, ?it/s]

✅ Epoch 4 training loss: 0.1230
🔎 Validation loss: 0.1383


Epoch 5/5:   0%|          | 0/394 [00:00<?, ?it/s]

✅ Epoch 5 training loss: 0.1070
🔎 Validation loss: 0.1381
✅ Model saved at /content/drive/My Drive/models/lyrics_emotion_model
