In [5]:
import warnings
warnings.filterwarnings("ignore")
from transformers import logging
logging.set_verbosity_error()

In [6]:
from datasets import load_dataset
imdb = load_dataset("imdb")
print(imdb)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [7]:
import torch

torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device            :", device)

def left_cuda_memory():
    reserved = torch.cuda.memory_reserved() / 1024**2    # in MB
    total = torch.cuda.get_device_properties(0).total_memory / 1024**2  # in MB
    free = total - reserved
    print("="*90)
    return free
print(f"Initial Memory left     : {left_cuda_memory()} MB")

Using device            : cuda
Initial Memory left     : 4095.5 MB


In [8]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)

In [9]:
import numpy as np

# Subset the dataset
small_train = imdb["train"].select(range(15000))
small_test = imdb["test"].select(range(10000))


train_texts = small_train["text"]
token_lengths = [len(tokenizer.tokenize(text)) for text in train_texts]
print(f"token length | 50, 75, 80, 90, 95 : {np.percentile(token_lengths, [50, 75, 80, 90, 95])}")
max_len = int(np.percentile(token_lengths, 90))

if max_len>512:
    max_len=512
    print(f"Choosen max lentgh: {max_len}")
else: 
    print(f'90th percentile choosen: {max_len}')

token length | 50, 75, 80, 90, 95 : [232.   376.   429.   602.   785.05]
Choosen max lentgh: 512


In [10]:
# Step 2: Tokenization function using dynamic max_length
def tokenize_function(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=max_len
    )

# Step 3: Tokenize datasets
tokenized_dataset = {
    "train": small_train.map(tokenize_function, batched=True),
    "test": small_test.map(tokenize_function, batched=True)
}

# Set format for PyTorch
for split in ["train", "test"]:
    tokenized_dataset[split].set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Step 4: DataLoader creation
train_loader = DataLoader(tokenized_dataset["train"], batch_size=8, shuffle=True, pin_memory=True)
test_loader = DataLoader(tokenized_dataset["test"], batch_size=8, pin_memory=True)

In [7]:
from torch.optim import AdamW 
from torch.nn import CrossEntropyLoss
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
import time

# Optimizer and criterion
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = CrossEntropyLoss()

def run_batches(model, loader, device, steps=None, show_progress=False):
    model.train()
    loop = tqdm(enumerate(loader), total=steps or len(loader), disable=not show_progress)
    start_time = time.time()

    scaler = GradScaler()

    for step, (idx, batch) in enumerate(loop):
        if steps and step >= steps:
            print(f"GPU Memory left             : {left_cuda_memory():.2f} MB")
            break

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        if show_progress:
            loop.set_description("Training")
            loop.set_postfix(loss=loss.item())

    return time.time() - start_time

# Run warmup
print("\nEstimating per epoch training time and GPU memory left...")
warmup_time = run_batches(model, train_loader, device, steps=10)
estimated_epoch_time = (warmup_time / 10) * len(train_loader)
print(f"Estimated time per epoch    : {estimated_epoch_time / 60:.2f} minutes")


Estimating per epoch training time and GPU memory left...
GPU Memory left             : 809.50 MB
Estimated time per epoch    : 44.11 minutes


Actual Model training:

In [8]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup

# === CONFIG ===
epochs = 1
accumulation_steps = 4
memory_check_step = 256
lr = 2e-5
weight_decay = 0.01

# === OPTIMIZER, LOSS, SCALER ===
optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
criterion = CrossEntropyLoss()
scaler = GradScaler()

# === SCHEDULER ===
total_steps = len(train_loader) * epochs
warmup_steps = int(0.1 * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

print("\nStarting training for 1 epoch...\n")

for epoch in range(epochs):
    # ——— TRAINING ———
    model.train()
    running_loss = 0.0
    train_correct = 0
    train_total = 0

    train_loop = tqdm(enumerate(train_loader), total=len(train_loader), leave=True)
    for step, batch in train_loop:
        input_ids      = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels         = batch["label"].to(device)

        with autocast():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss / accumulation_steps

        scaler.scale(loss).backward()

        if (step + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()  # <<< Step the scheduler here

        running_loss += loss.item() * accumulation_steps
        train_loop.set_postfix(loss=running_loss / (step + 1))

        preds = outputs.logits.argmax(dim=1)
        train_correct += (preds == labels).sum().item()
        train_total   += labels.size(0)

        if (step + 1) % memory_check_step == 0:
            torch.cuda.empty_cache()
            print(f"GPU Memory left: {left_cuda_memory():.2f} MB")

    avg_train_loss = running_loss / len(train_loader)
    train_acc      = train_correct / train_total

    # ——— TESTING ———
    model.eval()
    test_loss = 0.0
    test_correct = 0
    test_total = 0

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing", leave=False):
            input_ids      = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels         = batch["label"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            test_loss += outputs.loss.item()

            preds = outputs.logits.argmax(dim=1)
            test_correct += (preds == labels).sum().item()
            test_total   += labels.size(0)

    avg_test_loss = test_loss / len(test_loader)
    test_acc      = test_correct / test_total

    # ——— SAVE MODEL ———
    torch.save(model.state_dict(), "model_final.pt")
    print("Model after epoch 1 saved to model_final.pt")

    # ——— EPOCH SUMMARY ———
    print("\n" + "=" * 60)
    print(f"Epoch {epoch+1} Summary:")
    print(f"  Avg Train Loss : {avg_train_loss:.4f}")
    print(f"  Train Accuracy : {train_acc:.4f}")
    print(f"  Avg Test Loss  : {avg_test_loss:.4f}")
    print(f"  Test Accuracy  : {test_acc:.4f}")
    print("=" * 60)



Starting training for 1 epoch...



 14%|█▎        | 256/1875 [08:40<56:58,  2.11s/it, loss=0.457]  

GPU Memory left: 2669.50 MB


 27%|██▋       | 512/1875 [17:41<51:12,  2.25s/it, loss=0.384]  

GPU Memory left: 2669.50 MB


 41%|████      | 768/1875 [25:57<38:06,  2.07s/it, loss=0.32] 

GPU Memory left: 2669.50 MB


 55%|█████▍    | 1024/1875 [34:23<30:07,  2.12s/it, loss=0.28] 

GPU Memory left: 2669.50 MB


 68%|██████▊   | 1280/1875 [55:34<22:19,  2.25s/it, loss=0.26]     

GPU Memory left: 2669.50 MB


 82%|████████▏ | 1536/1875 [1:03:41<11:54,  2.11s/it, loss=0.241]

GPU Memory left: 2669.50 MB


 96%|█████████▌| 1792/1875 [1:11:13<02:41,  1.95s/it, loss=0.226]

GPU Memory left: 2669.50 MB


100%|██████████| 1875/1875 [1:13:51<00:00,  2.36s/it, loss=0.224]
                                                            

Model after epoch 1 saved to model_final.pt

Epoch 1 Summary:
  Avg Train Loss : 0.2240
  Train Accuracy : 0.9135
  Avg Test Loss  : 0.0568
  Test Accuracy  : 0.9840


Reloading the model and for larger test set and plotting accuracy metrices

In [11]:
from transformers import BertForSequenceClassification
import torch

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.load_state_dict(torch.load("model_final.pt"))
model.to(device)
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [12]:
#preprocessing unseen data for tesing (10k - 25k)

total = len(imdb["test"])  
small_test = imdb["test"].select(range(total - 15000, total))

def tokenize_function(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=max_len
    )

tokenized_test = small_test.map(tokenize_function, batched=True)
tokenized_test.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

test_loader = DataLoader(tokenized_test, batch_size=8, pin_memory=True)

In [13]:
from sklearn.metrics import f1_score, classification_report
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

#performing final evaluation
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())


#Metrices
f1 = f1_score(all_labels, all_preds, average='weighted')  
print(f"\nWeighted F1 Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, digits=4))

Evaluating:   0%|          | 0/1875 [00:00<?, ?it/s]

Evaluating: 100%|██████████| 1875/1875 [20:04<00:00,  1.56it/s]



Weighted F1 Score: 0.8159

Classification Report:
              precision    recall  f1-score   support

           0     0.4428    0.9884    0.6116      2500
           1     0.9969    0.7512    0.8568     12500

    accuracy                         0.7907     15000
   macro avg     0.7198    0.8698    0.7342     15000
weighted avg     0.9046    0.7907    0.8159     15000

