# Settings Script

In [1]:
# When True, an existing models/<MODEL_NAME>/ folder is required and its
# model.pt checkpoint is loaded; when False, that folder must not exist yet
# and is created for this run.
LOAD_MODEL = False
# Name of the per-version output folder under BASE_PATH.
MODEL_NAME = "V7"

# Number of passes over the training split in the training loop.
EPOCHS = 15
# Number of output classes produced by the classifier head.
NUM_LABELS = 5
# Token length every example is padded/truncated to by the tokenizer.
MAX_LEN = 128
# Batch size shared by the train, validation and test DataLoaders.
BATCH_SIZE = 16
# Learning rate passed to the AdamW optimizer.
LR = 2e-5
# Worker process count passed to every DataLoader.
NUM_WORKERS = 4


# Imports

In [2]:
import os
import json
import torch
import numpy as np
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter


  from .autonotebook import tqdm as notebook_tqdm


# Connect GPU

In [3]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)
print("Using device:", DEVICE)


Using device: cuda


# Set file path

In [4]:
BASE_PATH = "./models/"   # local folder
version_dir = os.path.join(BASE_PATH, MODEL_NAME)

# Loading requires the version folder to be present; a fresh run requires it
# to be absent so an earlier run's artefacts are never overwritten.
version_dir_exists = os.path.exists(version_dir)
if LOAD_MODEL and not version_dir_exists:
    raise RuntimeError(f"Model '{MODEL_NAME}' does not exist.")
if not LOAD_MODEL and version_dir_exists:
    raise RuntimeError(f"Model '{MODEL_NAME}' already exists.")
if not LOAD_MODEL:
    os.makedirs(version_dir)


# Setup logging

In [5]:
combined_log_path = os.path.join(version_dir, "run_output.txt")
# Append mode on purpose:
#   * with LOAD_MODEL=True the version folder already holds a run_output.txt
#     from the earlier run, and "w" would silently truncate it;
#   * an append-mode handle always writes at the current end of the file, so
#     log() stays correct even if other code rewrites the file while this
#     handle is still open.
# For a fresh run the file does not exist yet, so "a" simply creates it.
combined_log_file = open(combined_log_path, "a", encoding="utf-8")

def log(msg):
    """Print ``msg`` and append it as one line to the combined run log.

    Args:
        msg: Message to record. Non-string values are converted with
            ``str()`` so they are written instead of raising ``TypeError``.
    """
    text = str(msg)
    print(text)
    combined_log_file.write(text + "\n")
    # Flush after every line so the log survives a kernel crash or interrupt.
    combined_log_file.flush()

# Load and Inspect dataset

In [None]:
# Load the "SetFit/sst5" dataset; the "train", "validation" and "test" splits
# of the returned object are used by the cells below.
dataset = load_dataset("SetFit/sst5")

# Show the distinct human-readable label names found in the training split.
print("Labels:", set(dataset["train"]["label_text"]))

def print_dist(ds, name):
    """Print the number of examples per ``label_text`` value in ``ds``.

    Args:
        ds: Object whose ``ds['label_text']`` is an iterable of label names.
        name: Title used in the header line (for example ``"Train"``).
    """
    label_counts = Counter(ds['label_text'])
    # Header first, then one "label: count" line per label in first-seen order.
    report_lines = [f"\n{name} distribution:"]
    report_lines.extend(f"{label}: {count}" for label, count in label_counts.items())
    print("\n".join(report_lines))

# Report the label distribution of each split, in train -> validation -> test order.
for split_key, split_title in (("train", "Train"), ("validation", "Val"), ("test", "Test")):
    print_dist(dataset[split_key], split_title)


Labels: {'very negative', 'positive', 'neutral', 'very positive', 'negative'}

Train distribution:
very positive: 1288
negative: 2218
neutral: 1624
positive: 2322
very negative: 1092

Val distribution:
neutral: 229
negative: 289
very negative: 139
positive: 279
very positive: 165

Test distribution:
negative: 633
very negative: 279
neutral: 389
very positive: 399
positive: 510


# Create DataLoaders

In [8]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    """Tokenize the ``"text"`` entry of ``batch`` to a fixed length.

    Every example is padded to, and truncated at, ``MAX_LEN`` tokens.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw input text.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    fixed_length_options = dict(
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
    )
    return tokenizer(batch["text"], **fixed_length_options)

# Tokenize every split, then expose the label column under the name "labels".
datasetMap = dataset.map(tokenize, batched=True).rename_column("label", "labels")
# Only these three columns are returned, as torch tensors.
datasetMap.set_format("torch", columns=["input_ids","attention_mask","labels"])

# Options shared by all three loaders; only the training loader shuffles.
_loader_options = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True)
train_loader = DataLoader(datasetMap["train"], shuffle=True, **_loader_options)
val_loader   = DataLoader(datasetMap["validation"], **_loader_options)
test_loader  = DataLoader(datasetMap["test"], **_loader_options)


# Define Model

In [9]:
# # V1 - Base model 
# class CustomBertClassifier(nn.Module):
#     def __init__(self, num_labels=NUM_LABELS):
#         super().__init__()
#         self.bert = BertModel.from_pretrained("bert-base-uncased")
#         self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

#     def forward(self, input_ids, attention_mask):
#         outputs = self.bert(
#             input_ids=input_ids,
#             attention_mask=attention_mask
#         )
#         cls = outputs.last_hidden_state[:, 0, :]  # CLS
#         return self.classifier(cls)

In [10]:
# # V2 — BERT + small MLP classifier head
# class CustomBertClassifier(nn.Module):
#     def __init__(self, num_labels=NUM_LABELS):
#         super().__init__()
#         self.bert = BertModel.from_pretrained("bert-base-uncased")
#         self.classifier = nn.Sequential(
#             nn.Dropout(0.3),
#             nn.Linear(self.bert.config.hidden_size, 256),
#             nn.ReLU(),
#             nn.Dropout(0.2),
#             nn.Linear(256, num_labels)
#         )

#     def forward(self, input_ids, attention_mask):
#         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         cls = outputs.last_hidden_state[:, 0, :]
#         return self.classifier(cls)


In [11]:
# # === V3 — BERT + MLP head + partial freezing (best next step) ===
# class CustomBertClassifier(nn.Module):
#     def __init__(self, num_labels=NUM_LABELS):
#         super().__init__()

#         # Load base BERT
#         self.bert = BertModel.from_pretrained("bert-base-uncased")

#         # ----- PARTIAL FREEZING (V3 upgrade) -----
#         # Freeze bottom 8 encoder layers
#         for layer in self.bert.encoder.layer[:8]:
#             for param in layer.parameters():
#                 param.requires_grad = False

#         # Keep the top 4 layers trainable
#         # (no action needed; they default to requires_grad=True)

#         # ----- Improved classifier head (inherits V2 but slightly more stable) -----
#         self.classifier = nn.Sequential(
#             nn.Dropout(0.3),
#             nn.LayerNorm(self.bert.config.hidden_size),
#             nn.Linear(self.bert.config.hidden_size, 256),
#             nn.ReLU(),
#             nn.Dropout(0.2),
#             nn.Linear(256, num_labels)
#         )

#     def forward(self, input_ids, attention_mask):
#         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         cls = outputs.last_hidden_state[:, 0, :]
#         return self.classifier(cls)


In [12]:
# # V4 — deeper three-layer MLP head with GELU activations (no layer freezing or LayerNorm, unlike V3)
# class CustomBertClassifier(nn.Module):
#     def __init__(self, num_labels=NUM_LABELS):
#         super().__init__()
#         self.bert = BertModel.from_pretrained("bert-base-uncased")
#         hidden = self.bert.config.hidden_size

#         self.classifier = nn.Sequential(
#             nn.Dropout(0.3),
#             nn.Linear(hidden, 256),
#             nn.GELU(),
#             nn.Dropout(0.2),
#             nn.Linear(256, 128),
#             nn.GELU(),
#             nn.Dropout(0.1),
#             nn.Linear(128, num_labels)
#         )

#     def forward(self, input_ids, attention_mask):
#         out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         cls = out.last_hidden_state[:, 0, :]
#         return self.classifier(cls)


In [13]:
# # V5 — V4 + partial BERT freezing
# class CustomBertClassifier(nn.Module):
#     def __init__(self, num_labels=NUM_LABELS):
#         super().__init__()
#         self.bert = BertModel.from_pretrained("bert-base-uncased")

#         # Freeze all BERT layers except the last one
#         for param in self.bert.parameters():
#             param.requires_grad = False
#         for param in self.bert.encoder.layer[-1].parameters():
#             param.requires_grad = True

#         hidden = self.bert.config.hidden_size

#         self.classifier = nn.Sequential(
#             nn.Dropout(0.3),
#             nn.Linear(hidden, 256),
#             nn.GELU(),
#             nn.Dropout(0.2),
#             nn.Linear(256, 128),
#             nn.GELU(),
#             nn.Dropout(0.1),
#             nn.Linear(128, num_labels)
#         )

#     def forward(self, input_ids, attention_mask):
#         out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         cls = out.last_hidden_state[:, 0, :]
#         return self.classifier(cls)


In [14]:
# # V6 — V5 + LayerNorm stabilization
# class CustomBertClassifier(nn.Module):
#     def __init__(self, num_labels=NUM_LABELS):
#         super().__init__()
#         self.bert = BertModel.from_pretrained("bert-base-uncased")

#         # Same partial freeze as V5
#         for p in self.bert.parameters():
#             p.requires_grad = False
#         for p in self.bert.encoder.layer[-1].parameters():
#             p.requires_grad = True

#         hidden = self.bert.config.hidden_size

#         self.classifier = nn.Sequential(
#             nn.LayerNorm(hidden),
#             nn.Dropout(0.3),
#             nn.Linear(hidden, 256),
#             nn.GELU(),
#             nn.Dropout(0.2),
#             nn.Linear(256, 128),
#             nn.GELU(),
#             nn.Dropout(0.1),
#             nn.Linear(128, num_labels)
#         )

#     def forward(self, input_ids, attention_mask):
#         out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         cls = out.last_hidden_state[:, 0, :]
#         return self.classifier(cls)


In [15]:
# V7 — V6 + CLS projection layer for long sequences
class CustomBertClassifier(nn.Module):
    """BERT encoder whose projected first-token vector feeds an MLP head.

    Every ``bert-base-uncased`` parameter is frozen except those of the last
    encoder layer. The first-token hidden state of the final layer is
    projected to 384 features, layer-normalised, and passed through a
    three-layer GELU MLP that outputs ``num_labels`` logits.
    """

    def __init__(self, num_labels=NUM_LABELS):
        """Build the partially frozen encoder, the projection and the head.

        Args:
            num_labels: Size of the final logit vector.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")

        # Same partial freeze: disable gradients for the whole encoder, then
        # re-enable them for the last encoder layer only.
        self.bert.requires_grad_(False)
        self.bert.encoder.layer[-1].requires_grad_(True)

        encoder_width = self.bert.config.hidden_size
        projected_width = 384

        # Original author's note: intended to reduce noise from long sequences.
        self.project = nn.Linear(encoder_width, projected_width)

        head_layers = [
            nn.LayerNorm(projected_width),
            nn.Dropout(0.3),
            nn.Linear(projected_width, 256),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(128, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of tokenized inputs.

        Args:
            input_ids: Token ids forwarded to BERT.
            attention_mask: Attention mask forwarded to BERT.

        Returns:
            Output of the classifier head applied to the projected
            first-token hidden state.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0, :]
        projected = self.project(first_token_state)  # <-- NEW in V7
        return self.classifier(projected)


# Load or Create Model

In [17]:
model = CustomBertClassifier(NUM_LABELS).to(DEVICE)

# Restore saved weights when an existing model version was requested.
if LOAD_MODEL:
    checkpoint_path = os.path.join(version_dir, "model.pt")
    ckpt = torch.load(checkpoint_path, map_location=DEVICE)
    model.load_state_dict(ckpt["model_state_dict"])

# Evaluation mode for a loaded model, training mode for a new one
# (train(False) is what eval() does).
model.train(not LOAD_MODEL)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)


# Train Model

In [None]:
def train_one_epoch(model, loader, opt, crit, device, sample_print=500):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of logits per example.
        loader: Sized iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
            ``loader.dataset`` is only read when a progress line is emitted.
        opt: Optimizer stepped once per batch.
        crit: Loss callable ``crit(logits, labels)``. It is assumed to return
            the mean loss over the batch, as the default
            ``nn.CrossEntropyLoss`` does.
        device: Device the batch tensors are moved to.
        sample_print: A progress line is written through ``log`` each time
            this many further samples have been processed.

    Returns:
        Tuple ``(mean_loss, accuracy)``, both averaged per sample. Both are
        ``nan`` when ``loader`` yields no samples.
    """
    model.train()
    loss_sum, correct, total = 0.0, 0, 0
    next_print = sample_print

    for batch in loader:
        opt.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        loss = crit(logits, labels)
        loss.backward()
        opt.step()

        # Weight the batch-mean loss by the batch size so a short final batch
        # counts per sample, the same way accuracy does, instead of counting
        # as much as a full batch.
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        preds = logits.argmax(1)
        correct += (preds == labels).sum().item()
        total += batch_size

        if total >= next_print:
            log(f"[Train] {total}/{len(loader.dataset)} "
                f"| Loss={loss_sum/total:.4f} | Acc={correct/total:.4f}")
            next_print += sample_print

    if total == 0:
        # Nothing was processed: report "no data" rather than dividing by zero.
        return float("nan"), float("nan")
    return loss_sum / total, correct / total

def validate(model, loader, crit, device, sample_print=500):
    """Evaluate ``model`` on ``loader`` without computing gradients.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of logits per example.
        loader: Sized iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
            ``loader.dataset`` is only read when a progress line is emitted.
        crit: Loss callable ``crit(logits, labels)``. It is assumed to return
            the mean loss over the batch, as the default
            ``nn.CrossEntropyLoss`` does.
        device: Device the batch tensors are moved to.
        sample_print: A progress line is written through ``log`` each time
            this many further samples have been processed.

    Returns:
        Tuple ``(mean_loss, accuracy)``, both averaged per sample. Both are
        ``nan`` when ``loader`` yields no samples.
    """
    model.eval()
    loss_sum, correct, total = 0.0, 0, 0
    next_print = sample_print

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits = model(input_ids, attention_mask)
            loss = crit(logits, labels)

            # Weight the batch-mean loss by the batch size so a short final
            # batch counts per sample instead of as much as a full batch.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            preds = logits.argmax(1)
            correct += (preds == labels).sum().item()
            total += batch_size

            if total >= next_print:
                log(f"[Val] {total}/{len(loader.dataset)} "
                    f"| Loss={loss_sum/total:.4f} | Acc={correct/total:.4f}")
                next_print += sample_print

    if total == 0:
        # Nothing was processed: report "no data" rather than dividing by zero.
        return float("nan"), float("nan")
    return loss_sum / total, correct / total

In [None]:
# Per-epoch history; the plotting cell reads these lists.
train_losses = []
val_losses = []
train_accs = []
val_accs = []

# Training loop
for epoch in range(EPOCHS):
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, DEVICE)
    val_loss, val_acc     = validate(model, val_loader, criterion, DEVICE)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accs.append(train_acc)
    val_accs.append(val_acc)

    # Use log() rather than print() so the epoch summary is written to
    # run_output.txt alongside the [Train]/[Val] progress lines.
    log(f"Epoch {epoch+1}: Train Acc={train_acc:.4f} | Val Acc={val_acc:.4f}")


[Train] 512/8544 | Loss=1.6004 | Acc=0.2500
[Train] 1008/8544 | Loss=1.5887 | Acc=0.2579
[Train] 1504/8544 | Loss=1.5785 | Acc=0.2773
[Train] 2000/8544 | Loss=1.5738 | Acc=0.2860
[Train] 2512/8544 | Loss=1.5600 | Acc=0.3073
[Train] 3008/8544 | Loss=1.5514 | Acc=0.3185
[Train] 3504/8544 | Loss=1.5399 | Acc=0.3251
[Train] 4000/8544 | Loss=1.5232 | Acc=0.3342
[Train] 4512/8544 | Loss=1.5062 | Acc=0.3429
[Train] 5008/8544 | Loss=1.4897 | Acc=0.3508
[Train] 5504/8544 | Loss=1.4738 | Acc=0.3568
[Train] 6000/8544 | Loss=1.4569 | Acc=0.3650
[Train] 6512/8544 | Loss=1.4409 | Acc=0.3744
[Train] 7008/8544 | Loss=1.4286 | Acc=0.3799
[Train] 7504/8544 | Loss=1.4170 | Acc=0.3835
[Train] 8000/8544 | Loss=1.4064 | Acc=0.3884
[Train] 8512/8544 | Loss=1.3944 | Acc=0.3940
[Val] 512/1101 | Loss=1.1913 | Acc=0.4590
[Val] 1008/1101 | Loss=1.1928 | Acc=0.4593
Epoch 1: Train Acc=0.3942 | Val Acc=0.4550
[Train] 512/8544 | Loss=1.2260 | Acc=0.4688
[Train] 1008/8544 | Loss=1.2220 | Acc=0.4603
[Train] 1504/8544 |

# Test Model

In [None]:
def test(model, loader, crit, device):
    model.eval()
    preds, labels_list = [], []
    total_loss = 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits = model(input_ids, attention_mask)
            loss = crit(logits, labels)
            total_loss += loss.item()

            preds.extend(logits.argmax(1).cpu().tolist())
            labels_list.extend(labels.cpu().tolist())

    return total_loss/len(loader), preds, labels_list

# Evaluate on the test split and derive the summary metrics.
test_loss, preds, labels_list = test(model, test_loader, criterion, DEVICE)
test_acc = accuracy_score(labels_list, preds)
report = classification_report(labels_list, preds, digits=4)
cm = confusion_matrix(labels_list, preds)

# Build FINAL RESULTS block.
# train_acc / val_acc are the values left over from the last training epoch.
final_results_text = (
    "========== FINAL RESULTS ==========\n"
    f"Model Version: {MODEL_NAME}\n\n"
    f"Final Train Accuracy: {train_acc:.4f}\n"
    f"Final Validation Accuracy: {val_acc:.4f}\n\n"
    f"Test Loss: {test_loss:.4f}\n"
    f"Test Accuracy: {test_acc:.4f}\n\n"
    "Classification Report:\n"
    f"{report}\n"
    "====================================\n\n"
)

# Path to the run log that log() has been writing to.
out_path = os.path.join(version_dir, "run_output.txt")

# The log is written as UTF-8, so read and rewrite it with the same explicit
# encoding instead of the platform default.
# NOTE(review): combined_log_file is still open on this path. Unless that
# handle was opened in append mode, a later log() call would write at its old
# offset inside the rewritten file.
try:
    with open(out_path, "r", encoding="utf-8") as f:
        old_content = f.read()
except FileNotFoundError:
    old_content = ""

# Write FINAL RESULTS at top, followed by original content
with open(out_path, "w", encoding="utf-8") as f:
    f.write(final_results_text + old_content)

print("Test Accuracy:", test_acc)
print(report)


Test Accuracy: 0.5009049773755656
              precision    recall  f1-score   support

           0     0.4341    0.4014    0.4171       279
           1     0.5241    0.6003    0.5596       633
           2     0.3544    0.2879    0.3177       389
           3     0.5028    0.5216    0.5120       510
           4     0.6204    0.5940    0.6069       399

    accuracy                         0.5009      2210
   macro avg     0.4872    0.4810    0.4827      2210
weighted avg     0.4954    0.5009    0.4966      2210



# Plot Metrics

In [21]:
# === PLOT & SAVE TRAINING CURVES ===

# One x position per recorded epoch, starting at 1.
epochs = list(range(1, len(train_losses) + 1))

# Two panels side by side: loss on the left, accuracy on the right.
fig, ax = plt.subplots(1, 2, figsize=(14, 5))

_panels = (
    (ax[0], train_losses, "Train Loss", val_losses, "Validation Loss",
     "Training and Validation Loss", "Loss"),
    (ax[1], train_accs, "Train Accuracy", val_accs, "Validation Accuracy",
     "Training and Validation Accuracy", "Accuracy"),
)
for _axis, _train_series, _train_label, _val_series, _val_label, _title, _ylabel in _panels:
    _axis.plot(epochs, _train_series, label=_train_label)
    _axis.plot(epochs, _val_series, label=_val_label)
    _axis.set_title(_title)
    _axis.set_xlabel("Epoch")
    _axis.set_ylabel(_ylabel)
    _axis.legend()

fig.tight_layout()

# === SAVE to version folder ===
curve_path = os.path.join(version_dir, "training_curves.png")
fig.savefig(curve_path, dpi=150, bbox_inches="tight")

# The figure is only needed on disk; close it so it is not kept in memory.
plt.close(fig)


In [22]:
# Draw the test-set confusion matrix as an annotated heatmap and save it to
# the version folder.
cm_fig, cm_ax = plt.subplots(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=cm_ax)
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("True")
cm_ax.set_title("Confusion Matrix")
cm_fig.tight_layout()
cm_fig.savefig(os.path.join(version_dir, "confusion_matrix.png"))
plt.close(cm_fig)


# Save Model

In [None]:
# Store the trained weights in the version folder.
checkpoint_path = os.path.join(version_dir, "model.pt")
torch.save({"model_state_dict": model.state_dict()}, checkpoint_path)

# Record the settings used for this run next to the checkpoint.
run_config = {
    "MODEL_NAME": MODEL_NAME,
    "EPOCHS": EPOCHS,
    "LR": LR,
    "MAX_LEN": MAX_LEN,
    "BATCH_SIZE": BATCH_SIZE,
}
config_path = os.path.join(version_dir, "config.json")
with open(config_path, "w") as f:
    json.dump(run_config, f, indent=4)