# Here we will try to finetune a model for specific purpose

#### Install some pre requisites

In [1]:
!pip install -q peft==0.14.0 trl==0.14.0 bitsandbytes requests

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/374.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.9/313.9 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25h

#### Login to Hugging Face

In [2]:
from huggingface_hub import login
from google.colab import userdata

hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

### Lets look at the dataset we will work with

In [3]:
from datasets import load_dataset

dataset_name = 'ed-donner/pricer-data'
dataset = load_dataset(dataset_name)

README.md:   0%|          | 0.00/416 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/914k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [4]:
train = dataset['train']
test = dataset['test']

print(f"First Train data point: {train[0]}")
print(f"First Test data point: {test[0]}")

First Train data point: {'text': 'How much does this cost to the nearest dollar?\n\nDelphi FG0166 Fuel Pump Module\nDelphi brings 80 years of OE Heritage into each Delphi pump, ensuring quality and fitment for each Delphi part. Part is validated, tested and matched to the right vehicle application Delphi brings 80 years of OE Heritage into each Delphi assembly, ensuring quality and fitment for each Delphi part Always be sure to check and clean fuel tank to avoid unnecessary returns Rigorous OE-testing ensures the pump can withstand extreme temperatures Brand Delphi, Fit Type Vehicle Specific Fit, Dimensions LxWxH 19.7 x 7.7 x 5.1 inches, Weight 2.2 Pounds, Auto Part Position Unknown, Operation Mode Mechanical, Manufacturer Delphi, Model FUEL PUMP, Dimensions 19.7\n\nPrice is $227.00', 'price': 226.95}
First Test data point: {'text': "How much does this cost to the nearest dollar?\n\nOEM AC Compressor w/A/C Repair Kit For Ford F150 F-150 V8 & Lincoln Mark LT 2007 2008 - BuyAutoParts NEW

### Load Qunatized model

In [5]:
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
import torch

BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"

# Quant Config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [6]:
# Setup Tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [7]:
# Setup Model for Finetuning
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, quantization_config=quant_config, device_map="auto")
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

print("Memory used by Model: ", base_model.get_memory_footprint() / 1024**2)

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Memory used by Model:  5332.508056640625


In [None]:
### Look at model architecture
base_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRM

#### Data Collator tells the model about part of training data to focus on, hence masking some data. This helps model to focus on task at hand.

In [None]:
# Setup Data Collator
from trl import DataCollatorForCompletionOnlyLM

# Collator tells model to focus on Price of the items from training data
response_template = "Price is $"

collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

### Lets start fine tuning

In [None]:
# Config for LoRA
from peft import LoraConfig

# Lora Hyper Parameters
# We are just starting with 32, But it can be 8, 16, 32 ...
LORA_R = 32

# Usually this is double of R
LORA_ALPHA = 64

# These are names of layers which we will train. See model architecture too see all layers
TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]

# Dropouts in Network for regularization
LORA_DROPOUT = 0.1

# This is task for finetuning
LORA_TASK = "CAUSAL_LM"

lora_config = LoraConfig(
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type=LORA_TASK,
    target_modules=TARGET_MODULES,
)

In [None]:
# Supervised Fine Tuning (SFT) Parameters
from trl import SFTConfig
from datetime import datetime

# Hyper parameters
RUN_NAME =  f"{datetime.now():%Y%m%d%H%M%S}"
PROJECT_RUN_NAME = f"amz-pricer-{RUN_NAME}"
HUB_MODEL_NAME = f"shakeelansari63/{PROJECT_RUN_NAME}"
EPOCHS = 1 # you can do more epochs if you wish, but only 1 is needed - more is probably overkill
BATCH_SIZE = 2 # on an A100 box this can go up to 16
GRADIENT_ACCUMULATION_STEPS = 1
LEARNING_RATE = 1e-4
LR_SCHEDULER_TYPE = 'cosine'
WARMUP_RATIO = 0.03
OPTIMIZER = "paged_adamw_32bit"
MAX_SEQUENCE_LENGTH = 182

# Admin config - note that SAVE_STEPS is how often it will upload to the hub
# I've changed this from 5000 to 2000 so that you get more frequent saves
STEPS = 50
SAVE_STEPS = 2000
LOG_TO_WANDB = True

# Training Parameter object
train_parameters = SFTConfig(
    output_dir=PROJECT_RUN_NAME,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=1,
    eval_strategy="no",
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    optim=OPTIMIZER,
    save_steps=SAVE_STEPS,
    save_total_limit=10,
    logging_steps=STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=WARMUP_RATIO,
    group_by_length=True,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    report_to=None,
    run_name=RUN_NAME,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    dataset_text_field="text",
    save_strategy="steps",
    hub_strategy="every_save",
    push_to_hub=True,
    hub_model_id=HUB_MODEL_NAME,
    hub_private_repo=True
)

In [None]:
from trl import SFTTrainer
# And now, the Supervised Fine Tuning Trainer will carry out the fine-tuning
# Given these 2 sets of configuration parameters
# The latest version of trl is showing a warning about labels - please ignore this warning
# But let me know if you don't see good training results (loss coming down).

fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=train,
    peft_config=lora_config,
    args=train_parameters,
    data_collator=collator
  )

Map:   0%|          | 0/400000 [00:00<?, ? examples/s]

In [None]:
# Start Trainig Task
fine_tuning.train()

# Push our fine-tuned model to Hugging Face
fine_tuning.model.push_to_hub(PROJECT_RUN_NAME, private=True)
print(f"Saved to the hub: {PROJECT_RUN_NAME}")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

## After training complete, the model will be updated on Hugging Face

In [8]:
# Load LoRA Adapter for Base model as Finetuned Model
from peft import PeftModel
FINE_TUNE_HF_PROJECT = 'ed-donner/pricer-2024-09-13_13.04.39'
FINE_TUNE_REVISION = 'e8d637df551603dc86cd7a1598a8f44af4d7ae36'

fine_tuned_model = PeftModel.from_pretrained(base_model, FINE_TUNE_HF_PROJECT, revision=FINE_TUNE_REVISION)

# Lets look at fine tuned
fine_tuned_model

adapter_config.json:   0%|          | 0.00/681 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/109M [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora

#### Test

In [13]:
## Code to extract Price
import re
def extract_price(s):
    if "Price is $" in s:
      contents = s.split("Price is $")[1]
      contents = contents.replace(',','')
      match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
      return float(match.group()) if match else 0
    return 0

In [14]:
### Lets test 3 dat points from test set and see which one is close? Is it base model of Finetuned Models
test_data_points = [0, 12, 51]

for point in test_data_points:
  data_point = test[point]

  prompt = data_point['text']
  actual_value = data_point['price']

  # Encode with Tokenizer
  inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
  attention_mask = torch.ones(inputs.shape, device="cuda")

  # From Base Model
  base_output = base_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=3, num_return_sequences=1)
  base_response = tokenizer.decode(base_output[0])

  # From Finetuned Model
  fine_tuned_output = fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=3, num_return_sequences=1)
  fine_tuned_response = tokenizer.decode(fine_tuned_output[0])

  # Print output
  print(f"Actual: {actual_value}", f"From Base: {extract_price(base_response)}", f"From Finetuned: {extract_price(fine_tuned_response)}", sep="\n", end="\n\n")


Actual: 374.41
From Base: 362.0
From Finetuned: 372.0

Actual: 205.5
From Base: 208.0
From Finetuned: 208.0

Actual: 46.78
From Base: 85.0
From Finetuned: 58.0

