## Predict Product Prices

An introduction to LoRA and QLoRA

In [None]:
# pip installs

!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0
!pip install -q datasets requests peft

In [None]:
# imports

import os
import re
import math
from tqdm import tqdm
from google.colab import userdata
from huggingface_hub import login
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, set_seed
from peft import LoraConfig, PeftModel
from datetime import datetime

In [None]:
# Constants

# Model identifiers: the base model, and the fine-tuned weights that are loaded
# on top of it with PeftModel later in the notebook.
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
FINETUNED_MODEL = "ed-donner/pricer-2024-09-13_13.04.39"  # plain string: nothing to interpolate

# Hyperparameters for QLoRA Fine-Tuning

LORA_R = 32  # presumably the LoRA rank (r) - confirm against the training config
LORA_ALPHA = 64  # presumably the LoRA alpha scaling hyperparameter - confirm
TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]  # names of the modules that get adaptors

### Log in to HuggingFace

If you don't already have a HuggingFace account, visit https://huggingface.co to sign up and create a token.

Then open the Secrets panel for this notebook by clicking the key icon on the left, and add a new secret called `HF_TOKEN` with your token as its value.

In [None]:
# Authenticate with Hugging Face using the token stored in the Colab secret HF_TOKEN

hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

## Trying out different levels of quantization

In [None]:
# Load the base model as-is: no quantization config is passed

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
)

In [None]:
# Report the memory footprint of the model loaded above, scaled by 1e9 and shown in GB to one decimal place
print(f"Memory footprint: {base_model.get_memory_footprint() / 1e9:,.1f} GB")

In [None]:
# Bare last expression: the notebook renders the model's repr as this cell's output
base_model

## Restart your session!

In order to load the next model and clear out the cache of the last model, you'll now need to go to Runtime >> Restart session and run the initial cells (installs, imports, constants and HuggingFace login) again.

This is to clean out the GPU.

In [None]:
# 8-bit quantization: load the base model with load_in_8bit enabled

quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=quant_config,
)

In [None]:
# Report the memory footprint of the 8-bit model loaded above, scaled by 1e9 and shown in GB to one decimal place
print(f"Memory footprint: {base_model.get_memory_footprint() / 1e9:,.1f} GB")

In [None]:
# Bare last expression: the notebook renders the 8-bit model's repr as this cell's output
base_model

## Restart your session!

In order to load the next model and clear out the cache of the last model, you'll now need to go to Runtime >> Restart session and run the initial cells (imports, constants and HuggingFace login) again.

This is to clean out the GPU.

In [None]:
# Load the base model with 4-bit quantization
# (only the model is loaded in this cell; no tokenizer is created here)

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=quant_config,
)

In [None]:
# Report the memory footprint of the 4-bit model loaded above, scaled by 1e9 and shown in GB to two decimal places
print(f"Memory footprint: {base_model.get_memory_footprint() / 1e9:,.2f} GB")

In [None]:
# Bare last expression: the notebook renders the 4-bit model's repr as this cell's output
base_model

In [None]:
# Load the fine-tuned PEFT weights named by FINETUNED_MODEL on top of the base model loaded above
fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)

In [None]:
# Report the memory footprint of the base model plus fine-tuned weights, scaled by 1e9 and shown in GB to two decimal places
print(f"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e9:,.2f} GB")

In [None]:
# Bare last expression: the notebook renders the fine-tuned model's repr as this cell's output
fine_tuned_model

In [None]:
# Each of the Target Modules has 2 LoRA Adaptor matrices, called lora_A and lora_B.
# lora_A maps in_features -> r and lora_B maps r -> out_features, so one pair holds
# in_features * r + out_features * r weights. The adapted weight is
#     W + (alpha / r) * lora_B @ lora_A
# Let's count the number of adaptor weights using their dimensions:

# See the matrix dimensions above
hidden_size = 4096   # input and output width of q_proj / o_proj; input width of k_proj / v_proj
kv_size = 1024       # output width of k_proj / v_proj
lora_rank = 32       # r: the inner dimension shared by lora_A and lora_B
num_layers = 32      # number of layers that each carry the four adapted modules
bytes_per_param = 4  # assumes each adaptor weight takes 4 bytes (32-bit float) - TODO confirm


def _lora_pair_params(in_features, out_features, rank=lora_rank):
    """Return the number of weights in one lora_A + lora_B pair for a single module."""
    return in_features * rank + out_features * rank


lora_q_proj = _lora_pair_params(hidden_size, hidden_size)
lora_k_proj = _lora_pair_params(hidden_size, kv_size)
lora_v_proj = _lora_pair_params(hidden_size, kv_size)
lora_o_proj = _lora_pair_params(hidden_size, hidden_size)

# Each layer comes to
lora_layer = lora_q_proj + lora_k_proj + lora_v_proj + lora_o_proj

# Summed over all layers
params = lora_layer * num_layers

# So the total size in MB (decimal megabytes: bytes / 1,000,000) is
size = (params * bytes_per_param) / 1_000_000

print(f"Total number of params: {params:,} and size {size:,.1f}MB")