# Training notebook for `DistilledBERT` Model
In this notebook, we set up fine-tuning for the `DistilBERT` model (`distilbert-base-uncased`) using three parameter-efficient techniques: LoRA, BitFit, and Prompt Tuning.
- The fine-tuned models used in our experiments were trained for 6 epochs and have been uploaded to a Kaggle collection. This notebook is submitted only to show our training code; here each model is trained for just 3 epochs because of Kaggle's maximum session length (12 hours).

# 1. Download resource

In [1]:
import kagglehub

# Download the latest version of the IMDB 50k movie-review dataset through kagglehub.
# `path` receives the returned location and is reused by the later cells that list and read the files.
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews


## Download library

In [2]:
!pip install datasets
# !pip uninstall numpy
# !pip install numpy==1.26.4

Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platfo

In [3]:
import numpy as np
# Display the NumPy version active in the kernel
# (presumably to confirm the environment after the install step above — TODO confirm).
np.__version__

'1.26.4'

In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the downloaded dataset directory and print the full path of every file found in it
for folder, _, names in os.walk(path):
    full_paths = [os.path.join(folder, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [5]:
# import pandas as pd
import datasets
import numpy as np
import re,string
import torch
import torch.nn as nn
import math
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

## Load dataset

In [6]:
# Build the CSV location from the download directory returned by kagglehub
data_path = f"{path}/IMDB Dataset.csv"

# Read the reviews into a DataFrame and print its column / dtype / null-count summary
df = pd.read_csv(data_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


## Load into `Datasets` object

In [7]:
from datasets import Dataset, DatasetDict

# Encode the sentiment as an integer target: 'negative' -> 0, every other value -> 1
df['labels'] = (df['sentiment'] != 'negative').astype('int64')

# Wrap the DataFrame (now with the extra `labels` column) in a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

dataset

Dataset({
    features: ['review', 'sentiment', 'labels'],
    num_rows: 50000
})

## Create `DatasetDict` object

In [8]:
# Step 1: hold out 20% of all rows as the test split (fixed seed so the split is reproducible)
train_val_split = dataset.train_test_split(test_size=0.2, seed=42)

# Step 2: carve the validation split out of the remaining 80%
train_val = train_val_split['train'].train_test_split(test_size=0.125, seed=42)  # 0.125 x 0.8 = 0.1 of the full data

# Step 3: collect the three splits under the keys used by the rest of the notebook
dataset_dict = DatasetDict({
    'train': train_val['train'],
    'validation': train_val['test'],
    'test': train_val_split['test']
})

print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment', 'labels'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['review', 'sentiment', 'labels'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['review', 'sentiment', 'labels'],
        num_rows: 10000
    })
})


# 2. Build abstract function

## 2.1 Preprocessing function

In [9]:
from torch.utils.data import DataLoader, Dataset

# Batch tokenizer callback; meant to be passed to `Dataset.map(..., batched=True)`
def tokenize_function(examples, tokenizer, max_length):
    """Tokenize the `review` field of a batch of examples.

    Args:
        examples: Mapping with a "review" entry holding the texts to tokenize.
        tokenizer: Callable tokenizer; invoked with truncation enabled and
            padding="max_length".
        max_length: Value forwarded as the tokenizer's `max_length` argument.

    Returns:
        Whatever the tokenizer returns for the batch. Label columns are not
        touched by this function.
    """
    return tokenizer(
        examples["review"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

def preprocessing(dataset_dict, tokenizer, batch_size, max_model_length=None):
    """Tokenize the three dataset splits and wrap each one in a DataLoader.

    Args:
        dataset_dict: Mapping with "train", "validation" and "test" splits, each
            supporting `.map(...)` and `.set_format(...)`.
        tokenizer: Tokenizer forwarded to `tokenize_function`.
        batch_size: Batch size used for all three loaders.
        max_model_length: Forwarded to the tokenizer as `max_length`
            (default None is passed through unchanged).

    Returns:
        Tuple `(train_loader, val_loader, test_loader)`; only the training
        loader shuffles.
    """
    tensor_columns = ["input_ids", "attention_mask", "labels"]
    tokenize_kwargs = {"tokenizer": tokenizer, "max_length": max_model_length}

    # (split name, shuffle?) in the order the loaders are returned
    split_plan = (("train", True), ("validation", False), ("test", False))

    loaders = []
    for split_name, shuffle in split_plan:
        tokenized = dataset_dict[split_name].map(tokenize_function, fn_kwargs=tokenize_kwargs, batched=True)
        # Expose only the model inputs and the target, as torch tensors
        tokenized.set_format(type="torch", columns=tensor_columns)
        loaders.append(DataLoader(tokenized, batch_size=batch_size, shuffle=shuffle))

    train_loader, val_loader, test_loader = loaders
    return train_loader, val_loader, test_loader

## 2.2 Training function

In [10]:
from tqdm import tqdm
from torch.optim import AdamW
from peft import get_peft_model_state_dict, set_peft_model_state_dict
from sklearn.metrics import accuracy_score
import time
import datetime

def count_parameters(model):
    """Return the total number of elements in `model`'s parameters that have `requires_grad` set."""
    trainable = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable += tensor.numel()
    return trainable

def accuracy_score_fn(outputs, labels):
    """Compute accuracy from per-class scores.

    Args:
        outputs: 2-D array of scores; the predicted class of each row is the
            index of its maximum along axis 1.
        labels: Ground-truth class indices, one per row of `outputs`.

    Returns:
        The value of `sklearn.metrics.accuracy_score(labels, predictions)`.
    """
    return accuracy_score(labels, np.argmax(outputs, axis=1))

def save_model(model, path):
    """Write the PEFT state dict of `model` (as returned by `get_peft_model_state_dict`) to `path` with `torch.save`."""
    torch.save(get_peft_model_state_dict(model), path)

def evaluate_fn(model, val_loader, device):
    """Evaluate `model` on every batch of `val_loader`.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: Model called as `model(input_ids=..., attention_mask=..., labels=...)`
            whose output exposes `.loss` and `.logits`.
        val_loader: Iterable of batches with "input_ids", "attention_mask" and
            "labels" entries; must support `len()`.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple `(avg_loss, accuracy)`: the mean of the per-batch losses and the
        accuracy over all evaluated examples.

    Raises:
        ZeroDivisionError: If `val_loader` has length 0.
    """
    model.eval()
    total_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        progress_bar = tqdm(val_loader, desc="Evaluate model", leave=False)

        for batch in progress_bar:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            # `as_tensor` accepts labels that are already tensors without copying them;
            # re-wrapping an existing tensor with `torch.tensor(...)` copies it and emits a UserWarning.
            labels = torch.as_tensor(batch["labels"]).to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()

            # Keep raw logits and labels on the CPU; accuracy is computed once at the end
            all_preds.extend(outputs.logits.detach().cpu().numpy())
            all_labels.extend(labels.detach().cpu().numpy())

    avg_val_loss = total_loss / len(val_loader)
    value = accuracy_score_fn(np.array(all_preds), np.array(all_labels))
    return avg_val_loss, value

def train_and_save(model, train_loader, val_loader, output_dir, device, lr, epochs, save_fn = save_model, early_stopping_patience = 2):
    """Train `model`, validate after every epoch, and checkpoint on improvement.

    Args:
        model: Model called as `model(input_ids=..., attention_mask=..., labels=...)`
            whose output exposes `.loss`.
        train_loader: Iterable of training batches; must support `len()`.
        val_loader: Validation batches, passed to `evaluate_fn`.
        output_dir: Destination handed to `save_fn` (the notebook passes a file path).
        device: Device the batch tensors are moved to.
        lr: Learning rate for AdamW.
        epochs: Maximum number of epochs.
        save_fn: Callable `(model, output_dir)` invoked whenever the validation
            loss reaches a new minimum.
        early_stopping_patience: Number of consecutive epochs without a new best
            validation loss after which training stops.

    NOTE(review): the best checkpoint is only written to disk, never reloaded,
    so after this function returns `model` holds the weights of the last
    executed epoch.
    """
    # Hand the optimizer only the parameters that require gradients (e.g. the BitFit / LoRA subsets)
    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
    best_loss = float('inf')
    # The counter must persist across epochs; resetting it at the start of every epoch
    # would make the early-stopping threshold unreachable.
    patience = 0

    for epoch in range(1, epochs + 1):
        start = time.time()
        model.train()
        total_loss = 0.0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch} Training", leave=False)

        for batch in progress_bar:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            # The loss is computed by the model itself because `labels` is passed in
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()

            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)

        # Named `val_accuracy` so the imported `accuracy_score` function is not shadowed
        val_loss, val_accuracy = evaluate_fn(model, val_loader, device)
        # Measured after validation, so the reported duration covers training plus evaluation
        train_duration = time.time() - start
        print(f"Epoch {epoch} - Average loss: {avg_loss:.4f} - Trainning duration: {str(datetime.timedelta(seconds=int(train_duration)))}")
        print(f"Average Validation: {val_loss:.4f} - Validation accuracy: {val_accuracy:.4f}")

        if val_loss < best_loss:
            patience = 0
            best_loss = val_loss
            save_fn(model, output_dir)
            print(f"Saved model in {output_dir}")
        else:
            patience += 1
            if patience >= early_stopping_patience:
                print("Early stopping triggered. No improvement in validation loss.")
                break

2025-05-31 12:06:33.594922: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748693193.782832      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748693193.847111      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# 3. `DistilledBERT` Model

## 3.1 Download model

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and a sequence-classification model for the same checkpoint.
# `num_labels=2` requests a two-class head (labels 0 / 1 as encoded earlier); the warning
# printed below reports which head weights are newly initialised rather than loaded.
checkpoint = "distilbert-base-uncased"
distillbert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
distillbert_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Preprocessing dataset

In [12]:
# Batch size shared by the train / validation / test loaders
BATCH_SIZE = 32

# `max_model_length` is left at its default (None), so the padding length is decided by the
# tokenizer — presumably its own maximum model length; verify.
train_loader, val_loader, test_loader = preprocessing(dataset_dict, distillbert_tokenizer, BATCH_SIZE)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

## 3.2 LoRA tuning

### Load model and config hyperparams

In [13]:
from peft import LoraConfig, get_peft_model, get_peft_model_state_dict
from sklearn.metrics import accuracy_score # Import accuracy_score

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
LEARNING_RATE = 1e-3
EPOCHS = 3

lora_config = LoraConfig(
    r=8,  # Rank of the low-rank update matrices; the recorded run of this cell printed 665,858 trainable parameters in total
    lora_alpha=16,
    lora_dropout=0.1,
    task_type="SEQ_CLS",  # For sequence classification task
    target_modules=["out_lin"]   # Names of the modules PEFT should adapt
)

# NOTE(review): the shared `distillbert_model` object is handed to PEFT here; the later
# BitFit and Prompt Tuning sections load fresh base models instead of reusing it.
lora_model = get_peft_model(distillbert_model, lora_config)
lora_model.to(device)

print(f"The model has {count_parameters(lora_model):,} trainable parameters")

The model has 665,858 trainable parameters


### Start training

In [14]:
# Tag embedded in the checkpoint file name
model_version = "30-05"
# Train the LoRA model; with the default `save_fn`, the PEFT state dict is written to the
# given file whenever the validation loss improves.
train_and_save(lora_model, train_loader, val_loader, f"{checkpoint}_lora_model_v{model_version}.pt", device, LEARNING_RATE, EPOCHS)

  labels = torch.tensor(batch["labels"]).to(device)  # Assuming labels are numerical and can be moved to the device
                                                                 

Epoch 1 - Average loss: 0.2602 - Trainning duration: 0:14:43
Average Validation: 0.2031 - Validation accuracy: 0.9168
Saved model in distilbert-base-uncased_lora_model_v30-05.pt


  labels = torch.tensor(batch["labels"]).to(device)  # Assuming labels are numerical and can be moved to the device
                                                                 

Epoch 2 - Average loss: 0.2135 - Trainning duration: 0:14:43
Average Validation: 0.2070 - Validation accuracy: 0.9192


  labels = torch.tensor(batch["labels"]).to(device)  # Assuming labels are numerical and can be moved to the device
                                                                 

Epoch 3 - Average loss: 0.1960 - Trainning duration: 0:14:43
Average Validation: 0.2330 - Validation accuracy: 0.9120




### Testing result on Testset

In [15]:
# Evaluate on the held-out test split.
# NOTE(review): `train_and_save` writes a checkpoint only when validation loss improves and never
# reloads it, so this measures the weights from the final epoch, not necessarily the saved best checkpoint.
lora_loss, lora_acc = evaluate_fn(lora_model, test_loader, device)

print("Model accuracy: ", lora_acc)

  labels = torch.tensor(batch["labels"]).to(device)  # Assuming labels are numerical and can be moved to the device
                                                                 

Model accuracy:  0.908




## 3.3 BitFit Tuning

In [16]:
from transformers import AutoModelForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
LEARNING_RATE = 1e-3
EPOCHS = 3

# Fresh copy of the base checkpoint with a 2-class head, moved to the training device
bitfit_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)

# BitFit: a parameter is trainable only if its name contains 'bias'; everything else is frozen.
# NOTE(review): the weight matrices of the newly initialised classification head (see the warning
# printed below) have no 'bias' in their names, so they stay frozen at their initial values —
# confirm this is intended.
for param_name, tensor in bitfit_model.named_parameters():
    tensor.requires_grad = 'bias' in param_name

print(f"The model has {count_parameters(bitfit_model):,} trainable parameters")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model has 52,226 trainable parameters


### Training

In [17]:
def save_bitfit_bias_only(model, path):
    """Save `model`'s state dict to `path` with `torch.save`.

    Despite the function name, the COMPLETE `model.state_dict()` is written,
    not only the bias terms. NOTE(review): this looks deliberate — the non-bias
    weights of the freshly initialised classification head are presumably not
    recoverable from the base checkpoint, so a bias-only file would not
    reproduce the model; confirm before renaming or slimming the checkpoint.
    """
    # Bias-only alternative, kept for reference (not used):
    # bitfit_state_dict = {k: v for k, v in model.state_dict().items() if "bias" in k}
    torch.save(model.state_dict(), path)

# Tag embedded in the checkpoint file name
model_version = "30-05"
# Train the BitFit model; `save_bitfit_bias_only` replaces the default PEFT-specific saver
# and is called whenever the validation loss improves.
train_and_save(bitfit_model, train_loader, val_loader, f"{checkpoint}_bitfit_model_v{model_version}.pt", device, LEARNING_RATE, EPOCHS, save_bitfit_bias_only)

  labels = torch.tensor(batch["labels"]).to(device)  # Assuming labels are numerical and can be moved to the device
                                                                 

Epoch 1 - Average loss: 0.3401 - Trainning duration: 0:15:58
Average Validation: 0.2564 - Validation accuracy: 0.8992
Saved model in distilbert-base-uncased_bitfit_model_v30-05.pt


  labels = torch.tensor(batch["labels"]).to(device)  # Assuming labels are numerical and can be moved to the device
                                                                 

Epoch 2 - Average loss: 0.2566 - Trainning duration: 0:15:59
Average Validation: 0.2310 - Validation accuracy: 0.9066
Saved model in distilbert-base-uncased_bitfit_model_v30-05.pt


  labels = torch.tensor(batch["labels"]).to(device)  # Assuming labels are numerical and can be moved to the device
                                                                 

Epoch 3 - Average loss: 0.2364 - Trainning duration: 0:15:59
Average Validation: 0.2319 - Validation accuracy: 0.9082




### Testing on testset

In [18]:
# Evaluate on the held-out test split.
# NOTE(review): this measures the in-memory weights from the final training epoch;
# the best checkpoint saved during training is not reloaded first.
bitfit_loss, bitfit_acc = evaluate_fn(bitfit_model, test_loader, device)

print("Model accuracy: ", bitfit_acc)

  labels = torch.tensor(batch["labels"]).to(device)  # Assuming labels are numerical and can be moved to the device
                                                                 

Model accuracy:  0.9088




## 3.4 `Prompt Tuning`

### Load config

In [19]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PromptTuningConfig, get_peft_model, TaskType

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
LEARNING_RATE = 1e-3
EPOCHS = 3
NUM_VIRTUAL_TOKEN = 20   # Number of trainable prompt tokens
BATCH_SIZE = 32

# Load a fresh base model and keep its config to size the prompt encoder
base_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
config = base_model.config

# Define the Prompt Tuning config.
# Architecture sizes are read from the loaded config rather than hard-coded, so they
# cannot drift from the checkpoint that is actually in use.
peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_CLS,                       # Sequence classification
    num_virtual_tokens=NUM_VIRTUAL_TOKEN,             # Number of prompt tokens (adjustable)
    tokenizer_name_or_path=checkpoint,
    num_layers=config.num_hidden_layers,              # Transformer layers of the base model
    token_dim=config.hidden_size,                     # Embedding width of the base model
    num_attention_heads=config.num_attention_heads,   # Attention heads of the base model
)

prompt_model = get_peft_model(base_model, peft_config)
prompt_model.to(device)

print(f"The model has {count_parameters(prompt_model):,} trainable parameters")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model has 607,490 trainable parameters


### Update dataloader
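Prompt Tuning adds `NUM_VIRTUAL_TOKEN` virtual tokens to every input, so the maximum tokenized length of the text must be reduced by the same amount to keep the total sequence within the model's maximum length.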

In [20]:
# Reserve room for the virtual prompt tokens: shorten the tokenized text by NUM_VIRTUAL_TOKEN
# relative to the tokenizer's maximum length.
max_model_length = distillbert_tokenizer.model_max_length
adjusted_length = max_model_length - NUM_VIRTUAL_TOKEN

# Re-tokenize all splits with the shorter length; these loaders are used only for Prompt Tuning
prompt_train_loader, prompt_val_loader, prompt_test_loader = preprocessing(dataset_dict, distillbert_tokenizer, BATCH_SIZE, adjusted_length)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [21]:
# Tag embedded in the checkpoint file name
model_version = "30-05"
# Train the Prompt Tuning model on the length-adjusted loaders; the default `save_fn`
# writes the PEFT state dict whenever the validation loss improves.
train_and_save(prompt_model, prompt_train_loader, prompt_val_loader, f"{checkpoint}_prompt_model_v{model_version}.pt", device, LEARNING_RATE, EPOCHS)

  labels = torch.tensor(batch["labels"]).to(device)  # Assuming labels are numerical and can be moved to the device
                                                                 

Epoch 1 - Average loss: 0.4173 - Trainning duration: 0:15:49
Average Validation: 0.3200 - Validation accuracy: 0.8592
Saved model in distilbert-base-uncased_prompt_model_v30-05.pt


  labels = torch.tensor(batch["labels"]).to(device)  # Assuming labels are numerical and can be moved to the device
                                                                 

Epoch 2 - Average loss: 0.3521 - Trainning duration: 0:15:49
Average Validation: 0.3074 - Validation accuracy: 0.8646
Saved model in distilbert-base-uncased_prompt_model_v30-05.pt


  labels = torch.tensor(batch["labels"]).to(device)  # Assuming labels are numerical and can be moved to the device
                                                                 

Epoch 3 - Average loss: 0.3331 - Trainning duration: 0:15:49
Average Validation: 0.3358 - Validation accuracy: 0.8484




### Testing on testset

In [22]:
# Evaluate on the length-adjusted test loader.
# NOTE(review): this measures the in-memory weights from the final training epoch;
# the best checkpoint saved during training is not reloaded first.
prompt_loss, prompt_acc = evaluate_fn(prompt_model, prompt_test_loader, device)

print("Model accuracy: ", prompt_acc)

  labels = torch.tensor(batch["labels"]).to(device)  # Assuming labels are numerical and can be moved to the device
                                                                 

Model accuracy:  0.8494




# Compare result in test set

In [23]:
# Side-by-side summary of the test metrics of the three tuning methods.
# `.4f` rounds to four decimals; a bare `4f` would only set a minimum field width of 4
# and leave the default six decimals.
print("Result: ")
print(f"Distilled BERT - Lora: test loss {lora_loss:.4f} - accuracy: {lora_acc:.4f}")
print(f"Distilled BERT - BitFit: test loss {bitfit_loss:.4f} - accuracy: {bitfit_acc:.4f}")
print(f"Distilled BERT - Prompt: test loss {prompt_loss:.4f} - accuracy: {prompt_acc:.4f}")

Result: 
Distilled BERT - Lora: test loss 0.241896 - accuracy: 0.908000
Distilled BERT - BitFit: test loss 0.232094 - accuracy: 0.908800
Distilled BERT - Prompt: test loss 0.337203 - accuracy: 0.849400
