# Step 1: Setup

In [2]:
! pip install datasets transformers accelerate evaluate peft






`datasets` — load classification datasets

`transformers` — model & tokenizer

`accelerate` — optimized training & memory management

`evaluate` — metrics calculation

`peft` — parameter-efficient fine-tuning library (for adapters, LoRA)

# Step 2: Dataset and Task
Example: Sentiment Analysis with IMDb dataset (binary classification).

In [1]:
from datasets import load_dataset

# Load IMDb, then shuffle each split with a fixed seed and keep only a small
# prefix so the demo trains quickly.
dataset = load_dataset("imdb")

shuffled_train = dataset["train"].shuffle(seed=42)
shuffled_test = dataset["test"].shuffle(seed=42)

train_ds = shuffled_train.select(range(2000))  # small subset for demo
test_ds = shuffled_test.select(range(500))


  from .autonotebook import tqdm as notebook_tqdm


# Step 3: Tokenizer and Model Setup

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint name; the same string is reused when the classification models
# are instantiated in later cells.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess(batch):
    """Tokenize the ``text`` entries of ``batch`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(batch["text"], **encoding_options)

# Columns exposed as torch tensors once tokenization is done.
TORCH_COLUMNS = ["input_ids", "attention_mask", "label"]


def _tokenize_split(split):
    """Apply ``preprocess`` to ``split`` in batches and switch it to torch format."""
    tokenized = split.map(preprocess, batched=True)
    tokenized.set_format("torch", columns=TORCH_COLUMNS)
    return tokenized


# NOTE(review): both names are reassigned from themselves, so re-running this
# cell operates on the already tokenized/formatted datasets — confirm it is
# only executed once per kernel session.
train_ds = _tokenize_split(train_ds)
test_ds = _tokenize_split(test_ds)


Map: 100%|██████████| 500/500 [00:00<00:00, 4393.72 examples/s]


# Step 4: Define Fine-tuning Variants
(a) Full Fine-tuning (baseline)

In [3]:
from transformers import Trainer, TrainingArguments

import inspect

from transformers import set_seed

# Seed the RNGs *before* building the model: the classification head is newly
# initialized (not loaded from the checkpoint), so without a seed every run
# starts from different head weights.
SEED = 42
set_seed(SEED)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# The per-epoch evaluation option is named `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Use whichever
# name the installed version accepts so the notebook runs with the unpinned
# install above.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_arg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results_full_ft",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    save_strategy="epoch",          # must match the evaluation strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs",
    logging_steps=50,
    report_to=[],  # <-- disables wandb, tensorboard, etc.
    seed=SEED,
    **{_eval_strategy_arg: "epoch"},
)

import evaluate
# Accuracy metric object; `compute_metrics` below delegates to it.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy metrics.

    The predicted class is the arg-max over the last axis of the logits; the
    result is whatever ``accuracy.compute`` returns for those predictions.
    """
    raw_scores, gold_labels = eval_pred
    return accuracy.compute(
        predictions=raw_scores.argmax(axis=-1),
        references=gold_labels,
    )

# Full fine-tuning baseline: the un-wrapped model is handed to the Trainer
# together with the shared training arguments and the tokenized splits.
full_ft_trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**full_ft_trainer_config)

trainer.train()

# Keep the final evaluation metrics on `test_ds` for the comparison later on.
results_full_ft = trainer.evaluate()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading builder script: 4.20kB [00:00, 8.19MB/s]
  attn_output = torch.nn.functional.scaled_dot_product_attention(
 14%|█▎        | 51/375 [00:14<01:23,  3.90it/s]

{'loss': 0.5604, 'grad_norm': 9.413958549499512, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}


 27%|██▋       | 100/375 [00:27<01:15,  3.65it/s]

{'loss': 0.4501, 'grad_norm': 2.2814810276031494, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}


                                                 
 33%|███▎      | 125/375 [00:35<01:02,  4.00it/s]

{'eval_loss': 0.41653645038604736, 'eval_accuracy': 0.834, 'eval_runtime': 2.2199, 'eval_samples_per_second': 225.232, 'eval_steps_per_second': 7.207, 'epoch': 1.0}


 40%|████      | 151/375 [00:44<00:57,  3.88it/s]

{'loss': 0.2925, 'grad_norm': 5.548055171966553, 'learning_rate': 3e-05, 'epoch': 1.2}


 54%|█████▎    | 201/375 [00:57<00:43,  3.99it/s]

{'loss': 0.2531, 'grad_norm': 1.9545494318008423, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.6}


 67%|██████▋   | 250/375 [01:09<00:31,  3.95it/s]

{'loss': 0.2194, 'grad_norm': 1.891524314880371, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


                                                 
 67%|██████▋   | 250/375 [01:11<00:31,  3.95it/s]

{'eval_loss': 0.5289952158927917, 'eval_accuracy': 0.838, 'eval_runtime': 2.145, 'eval_samples_per_second': 233.103, 'eval_steps_per_second': 7.459, 'epoch': 2.0}


 80%|████████  | 301/375 [01:26<00:18,  3.98it/s]

{'loss': 0.0754, 'grad_norm': 2.376380681991577, 'learning_rate': 1e-05, 'epoch': 2.4}


 93%|█████████▎| 350/375 [01:38<00:06,  4.01it/s]

{'loss': 0.1246, 'grad_norm': 15.921243667602539, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.8}


                                                 
100%|██████████| 375/375 [01:59<00:00,  3.93it/s]

{'eval_loss': 0.5904265642166138, 'eval_accuracy': 0.848, 'eval_runtime': 2.1405, 'eval_samples_per_second': 233.589, 'eval_steps_per_second': 7.475, 'epoch': 3.0}


100%|██████████| 375/375 [02:24<00:00,  2.60it/s]


{'train_runtime': 144.3296, 'train_samples_per_second': 41.572, 'train_steps_per_second': 2.598, 'train_loss': 0.26936251831054686, 'epoch': 3.0}


100%|██████████| 16/16 [00:01<00:00,  8.04it/s]


(b) LoRA Adapter Tuning with PEFT

In [4]:
from peft import get_peft_model, LoraConfig, TaskType

from copy import copy

# Start again from the pretrained checkpoint so the LoRA run does not inherit
# weights from the full fine-tuning run.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,                # LoRA rank (size of the low-rank update matrices)
    lora_alpha=16,      # scaling factor for the LoRA update
    lora_dropout=0.1,
)

model = get_peft_model(model, peft_config)

# Reuse the baseline hyperparameters but give this run its own output
# directory; sharing "./results_full_ft" would mix the LoRA checkpoints with
# the full fine-tuning ones.
# NOTE(review): with the shared learning rate the logged LoRA loss barely
# moves over three epochs; a larger learning rate may be needed for a fair
# comparison — confirm.
lora_training_args = copy(training_args)
lora_training_args.output_dir = "./results_lora"

trainer = Trainer(
    model=model,
    args=lora_training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

trainer.train()
results_adapter = trainer.evaluate()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 14%|█▎        | 51/375 [00:08<00:51,  6.25it/s]

{'loss': 0.7016, 'grad_norm': 6.678351402282715, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}


 27%|██▋       | 101/375 [00:16<00:43,  6.29it/s]

{'loss': 0.6954, 'grad_norm': 4.2916412353515625, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}


 33%|███▎      | 125/375 [00:20<00:39,  6.28it/s]
 33%|███▎      | 125/375 [00:22<00:39,  6.28it/s]

{'eval_loss': 0.6834031939506531, 'eval_accuracy': 0.588, 'eval_runtime': 2.3006, 'eval_samples_per_second': 217.334, 'eval_steps_per_second': 6.955, 'epoch': 1.0}


 40%|████      | 151/375 [00:27<00:35,  6.26it/s]

{'loss': 0.6905, 'grad_norm': 1.8366916179656982, 'learning_rate': 3e-05, 'epoch': 1.2}


 54%|█████▎    | 201/375 [00:35<00:27,  6.22it/s]

{'loss': 0.6852, 'grad_norm': 1.373412847518921, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.6}


 67%|██████▋   | 250/375 [00:42<00:20,  6.23it/s]

{'loss': 0.6791, 'grad_norm': 1.7868913412094116, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}



 67%|██████▋   | 250/375 [00:45<00:20,  6.23it/s]

{'eval_loss': 0.6707181334495544, 'eval_accuracy': 0.61, 'eval_runtime': 2.2638, 'eval_samples_per_second': 220.865, 'eval_steps_per_second': 7.068, 'epoch': 2.0}


 80%|████████  | 301/375 [00:53<00:11,  6.21it/s]

{'loss': 0.6672, 'grad_norm': 6.782930850982666, 'learning_rate': 1e-05, 'epoch': 2.4}


 94%|█████████▎| 351/375 [01:01<00:03,  6.32it/s]

{'loss': 0.6652, 'grad_norm': 2.124680280685425, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.8}


100%|██████████| 375/375 [01:05<00:00,  6.19it/s]
100%|██████████| 375/375 [01:09<00:00,  6.19it/s]

{'eval_loss': 0.660788357257843, 'eval_accuracy': 0.646, 'eval_runtime': 2.255, 'eval_samples_per_second': 221.73, 'eval_steps_per_second': 7.095, 'epoch': 3.0}


100%|██████████| 375/375 [01:09<00:00,  5.38it/s]


{'train_runtime': 69.7133, 'train_samples_per_second': 86.067, 'train_steps_per_second': 5.379, 'train_loss': 0.6823002217610677, 'epoch': 3.0}


100%|██████████| 16/16 [00:02<00:00,  7.71it/s]


# Step 5: Track Training Time & Memory Usage
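Use Python's `time` module or accelerate's built-in profiling for timing. In this case, the times reported in Step 6 were read manually from the cell output (the `train_runtime` value logged at the end of each `trainer.train()` call).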

GPU memory was monitored with `nvidia-smi` and with PyTorch's `torch.cuda` memory statistics (see the helper cell below).

In [5]:
import time
# NOTE(review): `trainer` is the object left over from the previous cell and
# its model has already been trained there, so this times an additional
# training run on that model rather than a run from the pretrained
# checkpoint — confirm this is intended.
start = time.perf_counter()  # perf_counter is the clock meant for measuring intervals
trainer.train()
end = time.perf_counter()
print(f"Training time: {end - start} seconds")


 14%|█▎        | 51/375 [00:08<00:51,  6.33it/s]

{'loss': 0.6536, 'grad_norm': 6.657367706298828, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}


 27%|██▋       | 101/375 [00:16<00:43,  6.33it/s]

{'loss': 0.6079, 'grad_norm': 3.609138250350952, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}


 33%|███▎      | 125/375 [00:19<00:39,  6.31it/s]
 33%|███▎      | 125/375 [00:22<00:39,  6.31it/s]

{'eval_loss': 0.4993366599082947, 'eval_accuracy': 0.774, 'eval_runtime': 2.2421, 'eval_samples_per_second': 223.007, 'eval_steps_per_second': 7.136, 'epoch': 1.0}


 40%|████      | 151/375 [00:26<00:35,  6.35it/s]

{'loss': 0.5116, 'grad_norm': 3.3866007328033447, 'learning_rate': 3e-05, 'epoch': 1.2}


 54%|█████▎    | 201/375 [00:35<00:28,  6.20it/s]

{'loss': 0.4481, 'grad_norm': 2.437556028366089, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.6}


 67%|██████▋   | 250/375 [00:42<00:20,  6.21it/s]

{'loss': 0.4126, 'grad_norm': 5.1470184326171875, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}



 67%|██████▋   | 250/375 [00:45<00:20,  6.21it/s]

{'eval_loss': 0.4424542486667633, 'eval_accuracy': 0.802, 'eval_runtime': 2.2374, 'eval_samples_per_second': 223.479, 'eval_steps_per_second': 7.151, 'epoch': 2.0}


 80%|████████  | 301/375 [00:53<00:12,  6.15it/s]

{'loss': 0.4021, 'grad_norm': 3.3874051570892334, 'learning_rate': 1e-05, 'epoch': 2.4}


 94%|█████████▎| 351/375 [01:01<00:03,  6.30it/s]

{'loss': 0.409, 'grad_norm': 3.137791156768799, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.8}


100%|██████████| 375/375 [01:05<00:00,  6.28it/s]
100%|██████████| 375/375 [01:08<00:00,  6.28it/s]

{'eval_loss': 0.4089295566082001, 'eval_accuracy': 0.824, 'eval_runtime': 2.2602, 'eval_samples_per_second': 221.22, 'eval_steps_per_second': 7.079, 'epoch': 3.0}


100%|██████████| 375/375 [01:09<00:00,  5.42it/s]

{'train_runtime': 69.159, 'train_samples_per_second': 86.757, 'train_steps_per_second': 5.422, 'train_loss': 0.4866800816853841, 'epoch': 3.0}
Training time: 69.36183667182922 seconds





In [7]:
import torch

def print_gpu_memory_usage(label=""):
    """Print current and peak CUDA memory usage, prefixed with ``label``.

    Reports, in GB (bytes / 1024**3), the memory currently allocated by
    tensors, the memory reserved by PyTorch's caching allocator, and the peak
    allocated memory since the last ``torch.cuda.reset_peak_memory_stats()``
    call (or since the start of the process). The peak is the figure to use
    when comparing training runs, because the current allocation after
    training no longer includes memory that was freed along the way.

    Prints a notice instead when CUDA is not available. Returns ``None``.
    """
    if not torch.cuda.is_available():
        print(f"{label} CUDA not available.")
        return

    bytes_per_gb = 1024 ** 3
    allocated = torch.cuda.memory_allocated() / bytes_per_gb
    reserved = torch.cuda.memory_reserved() / bytes_per_gb
    peak = torch.cuda.max_memory_allocated() / bytes_per_gb
    print(
        f"{label} GPU memory allocated: {allocated:.2f} GB, reserved: {reserved:.2f} GB, "
        f"peak allocated: {peak:.2f} GB"
    )

# Measure GPU memory around a full fine-tuning run (model NOT wrapped with PEFT).
# NOTE: models created by earlier cells may still be resident on the GPU, so
# the "before" reading is not an empty-GPU baseline.
print_gpu_memory_usage("Before training:")

model_full = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

trainer = Trainer(
    model=model_full,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

# Restart peak tracking so peak statistics read afterwards reflect this run only.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

trainer.train()

# This run trains `model_full`, i.e. full fine-tuning, so label it as such.
print_gpu_memory_usage("After training (Full Fine-tuning):")

# To get the matching LoRA figure, repeat this cell with a model wrapped by
# `get_peft_model` and an "After training (Adapter/PEFT):" label.


Before training: GPU memory allocated: 0.43 GB, reserved: 1.55 GB


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 14%|█▎        | 51/375 [00:12<01:21,  3.96it/s]

{'loss': 0.5557, 'grad_norm': 2.8914098739624023, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}


 27%|██▋       | 101/375 [00:25<01:07,  4.05it/s]

{'loss': 0.4171, 'grad_norm': 3.123000383377075, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}


 33%|███▎      | 125/375 [00:31<01:01,  4.07it/s]
 33%|███▎      | 125/375 [00:33<01:01,  4.07it/s]

{'eval_loss': 0.5066573619842529, 'eval_accuracy': 0.802, 'eval_runtime': 2.1413, 'eval_samples_per_second': 233.505, 'eval_steps_per_second': 7.472, 'epoch': 1.0}


 40%|████      | 151/375 [00:47<00:56,  3.94it/s]

{'loss': 0.3197, 'grad_norm': 6.878238201141357, 'learning_rate': 3e-05, 'epoch': 1.2}


 54%|█████▎    | 201/375 [00:59<00:43,  4.02it/s]

{'loss': 0.2604, 'grad_norm': 7.391452312469482, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.6}


 67%|██████▋   | 250/375 [01:12<00:50,  2.45it/s]

{'loss': 0.2279, 'grad_norm': 5.5782151222229, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}



 67%|██████▋   | 250/375 [01:27<00:50,  2.45it/s]

{'eval_loss': 0.49337852001190186, 'eval_accuracy': 0.84, 'eval_runtime': 13.5866, 'eval_samples_per_second': 36.801, 'eval_steps_per_second': 1.178, 'epoch': 2.0}


 80%|████████  | 301/375 [01:42<00:18,  4.06it/s]

{'loss': 0.1201, 'grad_norm': 4.352512836456299, 'learning_rate': 1e-05, 'epoch': 2.4}


 94%|█████████▎| 351/375 [01:55<00:05,  4.09it/s]

{'loss': 0.1103, 'grad_norm': 0.47726503014564514, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.8}


100%|██████████| 375/375 [02:01<00:00,  4.08it/s]
100%|██████████| 375/375 [02:26<00:00,  4.08it/s]

{'eval_loss': 0.6336063146591187, 'eval_accuracy': 0.838, 'eval_runtime': 3.0153, 'eval_samples_per_second': 165.823, 'eval_steps_per_second': 5.306, 'epoch': 3.0}


100%|██████████| 375/375 [02:39<00:00,  2.34it/s]

{'train_runtime': 159.9713, 'train_samples_per_second': 37.507, 'train_steps_per_second': 2.344, 'train_loss': 0.27567799313863117, 'epoch': 3.0}
After training (Adapter/PEFT): GPU memory allocated: 1.66 GB, reserved: 3.24 GB





# Step 6: Analyze Trade-offs
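| Technique                     | Training Loss | Training Time (s) | Memory Usage (GB) | Notes                                                                          |
|-------------------------------|---------------|-------------------|-------------------|--------------------------------------------------------------------------------|
| Full Fine-tuning              | 0.269         | 144.33            | 1.66              | Trains all model parameters; best accuracy (0.848 eval accuracy) but expensive |
| LoRA Adapter Tuning with PEFT | 0.682         | 69.71             | 0.43 (see note)   | Trains small additional params; faster (0.646 eval accuracy after 3 epochs)    |

**Note on memory:** both figures are `torch.cuda.memory_allocated()` readings from the Step 5 cell, not peak training memory. The 1.66 GB value was read after the full fine-tuning run. The 0.43 GB value is the "Before training" reading taken just before that run started, not a measurement made during LoRA training. Treat the memory comparison as indicative only until both runs are measured the same way (e.g. with peak memory statistics).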