<a href="https://colab.research.google.com/github/robbarto2/GenAI-Foundations/blob/main/Fine_Tuning_IMDB_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install peft
!pip install datasets

Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.13.0->peft)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.13.0->peft)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.13.0->peft)
  Using cached nvidia_cufft_cu12-11.

In [3]:
import torch
import numpy as np
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
from peft import get_peft_model, LoraConfig, TaskType


In [4]:
# Load the dataset and store in a numpy array
dataset = load_dataset("imdb")
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

0.5

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [6]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", add_prefix_space=True)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
# define the label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative": 0, "Positive": 1}


In [8]:
# Load the model
base_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Tokenize Function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    # tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [10]:
#Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [11]:
# Configure LoRA (Low-Rank Adaptation)
lora_config = LoraConfig(task_type="SEQ_CLS",
                         r=16,
                         lora_alpha=32,
                         lora_dropout=0.01,
                         target_modules=['q_lin'])

In [12]:
# Apply LoRA to the model
model = get_peft_model(base_model, lora_config)

In [24]:
# Define hyperparameters
training_args = TrainingArguments(
    output_dir='distilbert-base-uncased' + "-lora-movie-reviews",
    eval_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=12,
    weight_decay=0.01,
    load_best_model_at_end=True,
    save_strategy="epoch",
    per_device_train_batch_size = 4 # Set batch size for training
)

In [25]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): lora.Linear(
              (base_layer): Linear(in_features=768, out_features=768, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.01, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=768, out_features=16, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=16, out_features=768, 

In [26]:
# Define the custom callback with a check for the 'loss' key
class LossLoggerCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if 'loss' in logs:
            print(f"Epoch: {state.epoch}, Loss: {logs['loss']:.4f}")


In [27]:
# Initialize the Trainer with the callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    callbacks=[LossLoggerCallback()]
)


In [28]:
# Fine-tune the model
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.3235,0.316111
2,0.3002,0.298102
3,0.3023,0.287956
4,0.2674,0.284638
5,0.2608,0.285596
6,0.2891,0.289641
7,0.2921,0.293266
8,0.2718,0.29075
9,0.2455,0.288707
10,0.2587,0.291636


Epoch: 0.08, Loss: 0.6364
Epoch: 0.16, Loss: 0.3471
Epoch: 0.24, Loss: 0.3214
Epoch: 0.32, Loss: 0.3291
Epoch: 0.4, Loss: 0.3449
Epoch: 0.48, Loss: 0.3405
Epoch: 0.56, Loss: 0.3092
Epoch: 0.64, Loss: 0.3334
Epoch: 0.72, Loss: 0.3290
Epoch: 0.8, Loss: 0.3419
Epoch: 0.88, Loss: 0.2872
Epoch: 0.96, Loss: 0.3235
Epoch: 1.04, Loss: 0.3163
Epoch: 1.12, Loss: 0.3617
Epoch: 1.2, Loss: 0.2992
Epoch: 1.28, Loss: 0.2733
Epoch: 1.3599999999999999, Loss: 0.3051
Epoch: 1.44, Loss: 0.2961
Epoch: 1.52, Loss: 0.3061
Epoch: 1.6, Loss: 0.3012
Epoch: 1.6800000000000002, Loss: 0.3059
Epoch: 1.76, Loss: 0.3209
Epoch: 1.8399999999999999, Loss: 0.3053
Epoch: 1.92, Loss: 0.3136
Epoch: 2.0, Loss: 0.3002
Epoch: 2.08, Loss: 0.2596
Epoch: 2.16, Loss: 0.2858
Epoch: 2.24, Loss: 0.2919
Epoch: 2.32, Loss: 0.3567
Epoch: 2.4, Loss: 0.2675
Epoch: 2.48, Loss: 0.2716
Epoch: 2.56, Loss: 0.2937
Epoch: 2.64, Loss: 0.3110
Epoch: 2.7199999999999998, Loss: 0.3112
Epoch: 2.8, Loss: 0.2716
Epoch: 2.88, Loss: 0.3144
Epoch: 2.96, Lo

TrainOutput(global_step=75000, training_loss=0.28006909159342447, metrics={'train_runtime': 3919.6329, 'train_samples_per_second': 76.538, 'train_steps_per_second': 19.134, 'total_flos': 3.3770043859934496e+16, 'train_loss': 0.28006909159342447, 'epoch': 12.0})

In [29]:
# Evaluate the model
results = trainer.evaluate()
print(results)

{'eval_loss': 0.2846378982067108, 'eval_runtime': 97.3288, 'eval_samples_per_second': 256.861, 'eval_steps_per_second': 32.108, 'epoch': 12.0}


In [30]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 739,586 || all params: 67,694,596 || trainable%: 1.0925


In [31]:
# Calculate the total number of parameters in the model
total_params = sum(p.numel() for p in model.parameters())

# Calculate the number of trainable parameters (those that require gradients)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

# Calculate the number of frozen parameters (those that do not require gradients)
frozen_params = total_params - trainable_params

# Print the results
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")
print(f"Frozen parameters: {frozen_params}")

Total parameters: 67694596
Trainable parameters: 739586
Frozen parameters: 66955010


In [38]:
texts = [
    "The special effects were amazing and the plot was thrilling!",
    "The plot was very bad and the characters were boring.",
    "An excellent movie that I would recommend to everyone.",
    "I can't wait to see the sequel.",
]

In [39]:
# Tokenize the input
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Assign Inferende to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputs = inputs.to(device) # Move input tensors to the same device as the model
model = base_model.to(device)

In [40]:
# Inference

# Tokenize the input
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Move inputs to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputs = inputs.to(device) # Move input tensors to the same device as the model
model = model.to(device)

# Perform inference
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Convert logits to probabilities
probs = torch.nn.functional.softmax(logits, dim=-1)

# Get the predicted classes
predicted_classes = torch.argmax(probs, dim=-1)

# Define the label map
label_map = {0: "negative", 1: "positive"}

# Convert class indices to labels
predicted_labels = [label_map[class_idx.item()] for class_idx in predicted_classes]

# Print results
for text, label, prob in zip(texts, predicted_labels, probs.max(dim=-1).values):
    print(f"Text: {text}\nPrediction: {label}, Probability: {prob.item():.4f}\n")

Text: The special effects were amazing and the plot was thrilling!
Prediction: positive, Probability: 0.5050

Text: The plot was very bad and the characters were boring.
Prediction: negative, Probability: 0.5019

Text: An excellent movie that I would recommend to everyone.
Prediction: positive, Probability: 0.5106

Text: I can't wait to see the sequel.
Prediction: positive, Probability: 0.5060

