In [2]:
# Installing necessary packages
!pip install -q numpy pandas matplotlib seaborn scikit-learn tensorflow transformers tf-keras datasets pyarrow
!pip install fsspec==2024.12.0 gcsfs==2024.12.0 --no-cache-dir
import fsspec
import gcsfs
print(f"fsspec version: {fsspec.__version__}")
print(f"gcsfs version: {gcsfs.__version__}")
import torch
if torch.cuda.is_available():
    try:
        device_name = torch.cuda.get_device_name(0)
        print(f"Using GPU: {device_name}")
    except Exception as e:
        print("CUDA is available, but couldn't get GPU name:", e)
    device = torch.device("cuda")
else:
    print("CUDA not available. Using CPU.")
    device = torch.device("cpu")
print(f"Device selected: {device}")

# TensorFlow threading optimization
import os
import tensorflow as tf
tf.config.threading.set_intra_op_parallelism_threads(4)
tf.config.threading.set_inter_op_parallelism_threads(2)
tf.config.set_soft_device_placement(True)
_ = tf.compat.v1.losses.sparse_softmax_cross_entropy  # Safe reference to avoid future errors

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from torch.cuda.amp import autocast, GradScaler
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)
from datasets import load_dataset


fsspec version: 2024.12.0
gcsfs version: 2024.12.0
Using GPU: Tesla T4
Device selected: cuda


In [3]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
base_path = "/content/drive/My Drive/Project/"

# Loading datasets
train_data = pd.read_parquet(base_path + "go_emotions_train_preprocessed.parquet")
test_data = pd.read_parquet(base_path + "go_emotions_test_preprocessed.parquet")
valid_data = pd.read_parquet(base_path + "go_emotions_validation_preprocessed.parquet")

# Merging to create full_train_data
full_train_data = pd.concat([train_data, valid_data, test_data], ignore_index=True)
print("Full dataset size:", full_train_data.shape)

# Displaying first few rows
print(full_train_data.head())


Mounted at /content/drive
Full dataset size: (54263, 4)
                                                text labels       id  \
0  My favourite food is anything I didn't have to...   [27]  eebbqej   
1  Now if he does off himself, everyone will thin...   [27]  ed00q6i   
2                     WHY THE FUCK IS BAYLESS ISOING    [2]  eezlygj   
3                        To make her feel threatened   [14]  ed7ypvh   
4                             Dirty Southern Wankers    [3]  ed0bdzj   

                                        cleaned_text  
0                 favourite food anything didnt cook  
1  everyone think he laugh screwing people instea...  
2                                fuck bayless isoing  
3                               make feel threatened  
4                              dirty southern wanker  


In [4]:
full_train_data.to_parquet('/content/drive/MyDrive/Project/full_train_data.parquet', index=False)
print("Full merged dataset saved to full_train_data.parquet ")

Full merged dataset saved to full_train_data.parquet 


In [5]:
print("Train Data Columns:", full_train_data.columns)


Train Data Columns: Index(['text', 'labels', 'id', 'cleaned_text'], dtype='object')


In [6]:
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer

# Fitting binarizer on all labels from full_train_data
mlb = MultiLabelBinarizer(classes=list(range(28)))
mlb.fit(full_train_data['labels'])  # <-- This is correct now

# Initializing tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
from torch.utils.data import Dataset
import torch

class EmotionDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx]).float()
        return item


In [8]:
import torch

def get_dynamic_batch_size(initial_batch_size=64, fallback_batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Running batch size test on {device}")

    batch_size = initial_batch_size
    try:
        # Simulate a dummy BERT input: [batch_size, seq_len]
        dummy_input = torch.randint(0, 30522, (batch_size, 64), device=device)  # 30522 is BERT's vocab size
        dummy_mask = torch.ones((batch_size, 64), device=device)
        from transformers import BertModel
        model = BertModel.from_pretrained("bert-base-uncased").to(device)
        with torch.no_grad():
            model(input_ids=dummy_input, attention_mask=dummy_mask)
    except (RuntimeError, OSError) as e:
        if "out of memory" in str(e).lower() or "cannot allocate memory" in str(e).lower():
            print(f"Memory issue on {device}, reducing batch size to {fallback_batch_size}")
            batch_size = fallback_batch_size
            if device.type == 'cuda':
                torch.cuda.empty_cache()
        else:
            raise e
    return batch_size

batch_size = get_dynamic_batch_size()


Running batch size test on cuda


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [10]:
import torch
import torch.nn as nn
from transformers import BertModel

class EmotionClassifier(nn.Module):
    def __init__(self, num_labels):
        super(EmotionClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(768, num_labels)  # One output per emotion

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_token_output = outputs.last_hidden_state[:, 0, :]
        cls_token_output = self.dropout(cls_token_output)
        logits = self.fc(cls_token_output)
        return logits  # raw logits, applying BCEWithLogitsLoss later


In [11]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_masks = [item["attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]

    # Padding sequences 
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)
    labels = torch.stack(labels)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels,
    }

# Dataloaders
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


In [12]:
import numpy as np
import pandas as pd
import ast
from sklearn.model_selection import StratifiedKFold

# 28 emotions
label_cols = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion", "curiosity", "desire",
    "disappointment", "disapproval", "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief", "joy",
    "love", "nervousness", "optimism", "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"
]

# Converting to multi-hot
def labels_to_multi_hot(label_entry, num_classes=28):
    if isinstance(label_entry, str):
        label_list = ast.literal_eval(label_entry)
    else:
        label_list = label_entry
    multi_hot = np.zeros(num_classes, dtype=int)
    for lbl in label_list:
        multi_hot[int(lbl)] = 1
    return multi_hot

# Applying multi-hot conversion
multi_hot_labels = full_train_data["labels"].apply(labels_to_multi_hot)
full_train_data[label_cols] = pd.DataFrame(multi_hot_labels.tolist(), index=full_train_data.index)

full_train_data["main_label"] = full_train_data[label_cols].values.argmax(axis=1)


In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

encodings = tokenizer(
    full_train_data["cleaned_text"].tolist(),
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings["input_ids"]
attention_masks = encodings["attention_mask"]
labels = torch.tensor(full_train_data[label_cols].values).float()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
from torch.utils.data import Dataset

class EmotionDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_masks[idx],
            "labels": self.labels[idx]
        }

full_dataset = EmotionDataset(input_ids, attention_masks, labels)


In [15]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

num_folds = 5
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
texts = full_train_data["cleaned_text"].tolist()

# For stratification, reducing multi-labels to a single label temporarily
labels = full_train_data["labels"].apply(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x).astype(int).tolist()

print("Label sample:", labels[:5])
print("Type check:", type(labels[0]))

# Cross-validation 
for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
    print(f"\n Fold {fold + 1} ")
    print("Train size:", len(train_idx), "Validation size:", len(val_idx))

    fold_train_data = full_train_data.iloc[train_idx].reset_index(drop=True)
    fold_val_data = full_train_data.iloc[val_idx].reset_index(drop=True)
    
    print("Example train text:", fold_train_data["cleaned_text"].iloc[0])
    print("Example val text:", fold_val_data["cleaned_text"].iloc[0])


Label sample: [27, 27, 2, 14, 3]
Type check: <class 'int'>

 Fold 1 
Train size: 43410 Validation size: 10853
Example train text: favourite food anything didnt cook
Example val text: make feel threatened

 Fold 2 
Train size: 43410 Validation size: 10853
Example train text: make feel threatened
Example val text: favourite food anything didnt cook

 Fold 3 
Train size: 43410 Validation size: 10853
Example train text: favourite food anything didnt cook
Example val text: rsleeptrain might time sleep training take look try feel whats right family

 Fold 4 
Train size: 43411 Validation size: 10852
Example train text: favourite food anything didnt cook
Example val text: omg peyton isnt good enough help u playoff dumbass bronco fan circa december

 Fold 5 
Train size: 43411 Validation size: 10852
Example train text: favourite food anything didnt cook
Example val text: thank friend


In [16]:
import os
import time
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast

epochs = 5
num_folds = 5
initial_batch_size = 64
fallback_batch_size = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
log_file = "training_log.txt"
os.makedirs("models", exist_ok=True)
os.makedirs("plots", exist_ok=True)
with open(log_file, "w") as f:
    f.write(f"Training started at {time.ctime()}\n")
# Logging
def log(msg):
    print(msg)
    with open(log_file, "a") as f:
        f.write(msg + "\n")
# Seeding
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

set_seed(42)


In [17]:
print("Full dataset size:", full_train_data.shape)

# Safely extracting the main label from the list
def safe_label_extraction(label):
    if isinstance(label, (list, tuple, np.ndarray)) and len(label) > 0:
        return int(label[0])
    elif isinstance(label, (int, float)) and not pd.isna(label):
        return int(label)
    return np.nan
    
# Dropping rows with invalid/missing labels
full_train_data = full_train_data.dropna(subset=["main_label"])
full_train_data["main_label"] = full_train_data["main_label"].astype(int)

# Finalized data for Stratified K-Fold
texts = full_train_data["cleaned_text"].tolist()
labels = full_train_data["main_label"].tolist()

log(f"Loaded {len(texts)} samples with {len(set(labels))} unique labels.")


Full dataset size: (54263, 4)
Loaded 54263 samples with 28 unique labels.


In [18]:
!pip install iterative-stratification

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer

# Converting multi-label column to binary matrix
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(full_train_data["labels"])

texts = full_train_data["cleaned_text"].tolist()

# Initializing Multi-label Stratified K-Fold
mskf = MultilabelStratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(mskf.split(texts, binary_labels)):
    print(f"\nFold {fold + 1}")
    print("Train size:", len(train_idx), "Validation size:", len(val_idx))

    fold_train_data = full_train_data.iloc[train_idx].reset_index(drop=True)
    fold_val_data = full_train_data.iloc[val_idx].reset_index(drop=True)

    print("Example train labels:", fold_train_data['labels'].iloc[0])
    print("Example val labels:", fold_val_data['labels'].iloc[0])


Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9

Fold 1
Train size: 43386 Validation size: 10877
Example train labels: [27]
Example val labels: [26]

Fold 2
Train size: 43382 Validation size: 10881
Example train labels: [27]
Example val labels: [6]

Fold 3
Train size: 43442 Validation size: 10821
Example train labels: [2]
Example val labels: [27]

Fold 4
Train size: 43458 Validation size: 10805
Example train labels: [27]
Example val labels: [2]

Fold 5
Train size: 43384 Validation size: 10879
Example train labels: [27]
Example val labels: [ 8 20]


In [19]:
from torch.utils.data import TensorDataset
from transformers import BertTokenizerFast
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import numpy as np
import torch

# Tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Converting to multi-hot label vectors
def to_multihot(label_list, num_classes=28):
    multihot = np.zeros(num_classes, dtype=int)
    for label in label_list:
        if isinstance(label, int):
            multihot[label] = 1
        elif isinstance(label, list):
            for l in label:
                multihot[l] = 1
    return multihot

# Extracting inputs and labels
X = full_train_data["cleaned_text"].tolist()
y = np.array([to_multihot(lbl, num_classes=28) for lbl in full_train_data["labels"]])

# Stratified folds
mlskf = MultilabelStratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
folds = []

for fold, (train_idx, val_idx) in enumerate(mlskf.split(X, y)):
    print(f"\nPreparing Fold {fold + 1}")

    # Tokenizing
    train_enc = tokenizer([X[i] for i in train_idx], truncation=True, padding=True, return_tensors="pt")
    val_enc = tokenizer([X[i] for i in val_idx], truncation=True, padding=True, return_tensors="pt")

    # Converting labels to tensors
    train_labels = torch.tensor(y[train_idx], dtype=torch.float32)
    val_labels = torch.tensor(y[val_idx], dtype=torch.float32)

    # Combining into datasets
    train_dataset = TensorDataset(train_enc["input_ids"], train_enc["attention_mask"], train_labels)
    val_dataset = TensorDataset(val_enc["input_ids"], val_enc["attention_mask"], val_labels)

    folds.append((train_dataset, val_dataset))



Preparing Fold 1

Preparing Fold 2

Preparing Fold 3

Preparing Fold 4

Preparing Fold 5


In [20]:
from torch.cuda.amp import GradScaler  # Correct import

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
scaler = GradScaler()  

total_start_time = time.time()
fold_metrics = []  


  scaler = GradScaler()  # No need to specify device


In [None]:
drive_model_dir = "/content/drive/MyDrive/Project/models"
os.makedirs(drive_model_dir, exist_ok=True)

import os
import time
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, accuracy_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from torch.cuda.amp import autocast, GradScaler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
scaler = GradScaler()
num_folds = 5
epochs = 5
initial_batch_size = 16
fallback_batch_size = 8

# Logging
def log(msg):
    print(msg)

# Multi-label binarization
mlb = MultiLabelBinarizer(classes=list(range(28)))
full_train_data["binary_labels"] = mlb.fit_transform(full_train_data["labels"]).tolist()

texts = full_train_data["text"].tolist()
labels = full_train_data["binary_labels"]

skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
fold_metrics = []
total_start_time = time.time()

os.makedirs("models", exist_ok=True)
os.makedirs("plots", exist_ok=True)

# Stratified K-Fold
for fold, (train_idx, val_idx) in enumerate(skf.split(texts, [l[0] for l in labels])):
    model_path = f"models/bert_fold_{fold + 1}.pt"
    if os.path.exists(model_path):
        print(f"Skipping Fold {fold + 1} — model already exists.")
        continue

    print(f"\n==================== Fold {fold + 1} ====================")
    fold_start_time = time.time()

    log(f"\n{'='*20} Fold {fold + 1} {'='*20}")
    fold_start_time = time.time()

    train_texts = [texts[i] for i in train_idx]
    val_texts = [texts[i] for i in val_idx]
    train_labels = [labels[i] for i in train_idx]
    val_labels = [labels[i] for i in val_idx]

    # Tokenize
    train_enc = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
    val_enc = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
    train_labels_tensor = torch.tensor(train_labels, dtype=torch.float)
    val_labels_tensor = torch.tensor(val_labels, dtype=torch.float)

    train_dataset = TensorDataset(train_enc["input_ids"], train_enc["attention_mask"], train_labels_tensor)
    val_dataset = TensorDataset(val_enc["input_ids"], val_enc["attention_mask"], val_labels_tensor)

    batch_size = initial_batch_size
    try:
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)
    except RuntimeError:
        log(f"Falling back to smaller batch size: {fallback_batch_size}")
        batch_size = fallback_batch_size
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Model
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=28, problem_type="multi_label_classification"
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0,
                               num_training_steps=epochs * len(train_loader))

    best_val_f1 = 0.0
    model_path = f"models/bert_fold_{fold+1}.pt"
        
    # Training Loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids, attention_mask, labels_batch = [x.to(device) for x in batch]
            optimizer.zero_grad()
            with autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels_batch)
                loss = outputs.loss
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)

        # Evaluation
        model.eval()
        all_preds, all_true = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attn_mask, labels_batch = [b.to(device) for b in batch]
                with autocast():
                    logits = model(input_ids, attention_mask=attn_mask).logits
                probs = torch.sigmoid(logits)
                all_preds.extend((probs.cpu().numpy() > 0.5).astype(int))
                all_true.extend(labels_batch.cpu().numpy())

        val_f1 = f1_score(all_true, all_preds, average="macro", zero_division=0)
        val_acc = accuracy_score(np.array(all_true), np.array(all_preds))

        log(f"Fold {fold + 1} | Epoch {epoch+1}/{epochs} | Loss: {avg_loss:.4f} | F1: {val_f1:.4f} | Acc: {val_acc:.4f}")

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            # Save locally
            torch.save(model.state_dict(), model_path)
            log(f"Saved best model for Fold {fold + 1} to {model_path}")
            # Save to Google Drive
            drive_model_path = os.path.join(drive_model_dir, f"bert_fold_{fold+1}.pt")
            torch.save(model.state_dict(), drive_model_path)
            log(f"Also saved model to Google Drive: {drive_model_path}")


    # Confusion Matrix
    cm = multilabel_confusion_matrix(np.array(all_true), np.array(all_preds))
    cm_sum = np.sum(cm, axis=0)  # Summing across labels

    plt.figure(figsize=(12, 10))
    sns.heatmap(cm_sum, annot=False, fmt="d", cmap="Blues")
    plt.title(f"Multi-label Confusion Matrix - Fold {fold + 1}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    cm_path = f"plots/conf_matrix_fold_{fold + 1}.png"
    plt.savefig(cm_path)
    plt.close()
    log(f"Saved confusion matrix to {cm_path}")

    # Metrics
    fold_metrics.append({"fold": fold+1, "accuracy": val_acc, "f1_score": best_val_f1})
    fold_time = time.time() - fold_start_time
    log(f"Fold {fold + 1} training time: {fold_time:.2f} seconds")
    log(f"Completed Epoch {epoch+1} of Fold {fold + 1}")


total_time = time.time() - total_start_time
log(f"\nCross-validation completed in {total_time/60:.2f} min")
log("Fold-wise metrics:")
for m in fold_metrics:
    log(f"Fold {m['fold']} | Accuracy: {m['accuracy']:.4f} | F1: {m['f1_score']:.4f}")
# Saving fold metrics as CSV file
import pandas as pd

# Saving locally
metrics_df = pd.DataFrame(fold_metrics)
metrics_df.to_csv("fold_metrics.csv", index=False)
log("Saved fold metrics to fold_metrics.csv")

# Saving to Google Drive
drive_metrics_path = "/content/drive/MyDrive/Project/fold_metrics.csv"
metrics_df.to_csv(drive_metrics_path, index=False)
log(f"Also saved fold metrics CSV to: {drive_metrics_path}")


  scaler = GradScaler()


Skipping Fold 1 — model already exists.
Skipping Fold 2 — model already exists.
Skipping Fold 3 — model already exists.
Skipping Fold 4 — model already exists.




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with autocast():
  with autocast():


Fold 5 | Epoch 1/5 | Loss: 0.1285 | F1: 0.2429 | Acc: 0.3588
Saved best model for Fold 5 to models/bert_fold_5.pt
Also saved model to Google Drive: /content/drive/MyDrive/Project/models/bert_fold_5.pt


  with autocast():
  with autocast():


Fold 5 | Epoch 2/5 | Loss: 0.0854 | F1: 0.3695 | Acc: 0.4398
Saved best model for Fold 5 to models/bert_fold_5.pt
Also saved model to Google Drive: /content/drive/MyDrive/Project/models/bert_fold_5.pt


  with autocast():
  with autocast():


Fold 5 | Epoch 3/5 | Loss: 0.0722 | F1: 0.4249 | Acc: 0.4415
Saved best model for Fold 5 to models/bert_fold_5.pt
Also saved model to Google Drive: /content/drive/MyDrive/Project/models/bert_fold_5.pt


  with autocast():
  with autocast():


Fold 5 | Epoch 4/5 | Loss: 0.0620 | F1: 0.4301 | Acc: 0.4528
Saved best model for Fold 5 to models/bert_fold_5.pt
Also saved model to Google Drive: /content/drive/MyDrive/Project/models/bert_fold_5.pt


  with autocast():
  with autocast():


Fold 5 | Epoch 5/5 | Loss: 0.0549 | F1: 0.4380 | Acc: 0.4583
Saved best model for Fold 5 to models/bert_fold_5.pt
Also saved model to Google Drive: /content/drive/MyDrive/Project/models/bert_fold_5.pt
Saved confusion matrix to plots/conf_matrix_fold_5.png
Fold 5 training time: 1758.51 seconds
Completed Epoch 5 of Fold 5

Cross-validation completed in 29.31 min
Fold-wise metrics:
Fold 5 | Accuracy: 0.4583 | F1: 0.4380
Saved fold metrics to fold_metrics.csv
Also saved fold metrics CSV to: /content/drive/MyDrive/Project/fold_metrics.csv


In [26]:
import shutil
shutil.make_archive("bert_fold_models", "zip", "models")

from google.colab import files
files.download("bert_fold_models.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import torch
import numpy as np

def predict_emotions_multilabel(text, model, tokenizer, device, threshold=0.5):
    model.eval()
    model.to(device)

    # Tokenizing and moving input tensors to device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.sigmoid(logits)  
        probs = probs.cpu().numpy()[0]

    # Applying thresholds
    predicted_indices = np.where(probs >= threshold)[0]

    return predicted_indices, probs


In [28]:
label_to_emotion = {
    0: "admiration", 1: "amusement", 2: "anger", 3: "annoyance", 4: "approval",
    5: "caring", 6: "confusion", 7: "curiosity", 8: "desire", 9: "disappointment",
    10: "disapproval", 11: "disgust", 12: "embarrassment", 13: "excitement",
    14: "fear", 15: "gratitude", 16: "grief", 17: "joy", 18: "love",
    19: "nervousness", 20: "optimism", 21: "pride", 22: "realization",
    23: "relief", 24: "remorse", 25: "sadness", 26: "surprise", 27: "neutral"
}


In [31]:
text = "I'm so proud of my work!"

# Prediction
predicted_indices, probs = predict_emotions_multilabel(text, model, tokenizer, device)

# Mapping to emotion names
predicted_emotions = [label_to_emotion[idx] for idx in predicted_indices]

print("Predicted Emotions:", predicted_emotions)


Predicted Emotions: ['admiration']
