##### What is this notebook about?
- This notebook shows how to finetune LLM that has not be finetuned yet on instruction dataset using hugging face trainer. 
- So there will be new tokens added into the tokenizer before finetuning. 
- Llama 3.2 1B model was used as an example

In [1]:
# Set cuda device
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

# Conda env: 
# Setup: conda env create -f environment_mlenv2
# Activate: conda activate mlenv2

In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    Trainer
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch #, wandb
from datasets import load_dataset

import bitsandbytes as bnb

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# from huggingface_hub import login
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
# login(token = hf_token)

# wb_token = user_secrets.get_secret("wandb")
# wandb.login(key=wb_token)
# run = wandb.init(
#     project='Fine-tune Llama 3.2 on Customer Support Dataset', 
#     job_type="training", 
#     anonymous="allow"
# )

In [4]:
#base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"
base_model = "meta-llama/Llama-3.2-1B"
new_model = "output_dir/Ecommerce-ChatBot"
dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"

In [5]:
# Load tokenizer
#tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Load slow tokenizer, fast tokenizer sometimes ignores added tokens
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=False)

In [6]:
#Importing the dataset
dataset = load_dataset(dataset_name, split="train")
dataset

Dataset({
    features: ['flags', 'instruction', 'category', 'intent', 'response'],
    num_rows: 26872
})

In [7]:
# Chat template for training
def format_chat_template(row) -> str:

    instruction = """You are a top-rated customer service agent named John. Be polite to customers and answer all their questions."""
    prompt=f"""<|im_start|>system
    {instruction}<|im_end|>
    <|im_start|>user
    {row["instruction"]}<|im_end|>
    <|im_start|>assistant
    {row["response"]}<|im_end|>"""

    row['text'] = prompt
    return row

# Chat template for inference
def format_chat_template_inference(row) -> str:

    instruction = """You are a top-rated customer service agent named John. Be polite to customers and answer all their questions."""
    prompt=f"""<|im_start|>system
    {instruction}<|im_end|>
    <|im_start|>user
    {row["instruction"]}<|im_end|>
    <|im_start|>assistant"""

    row['text'] = prompt
    return row

# Select subset of data for train/test & check template 
if 1:

    dataset = dataset.shuffle(seed=65).select(range(1000)) # Only use 1000 samples for quick demo
    dataset = dataset.map(format_chat_template, num_proc= 4)
    print(dataset)
    print("Check a train sample:")
    print(dataset['text'][3])

    print("Check a inference sample:")
    test_sample = {"instruction": "I bought the same item twice, cancel order {{Order Number}}"}
    test_prompt = format_chat_template_inference(test_sample)
    print(test_prompt)


Dataset({
    features: ['flags', 'instruction', 'category', 'intent', 'response', 'text'],
    num_rows: 1000
})
Check a train sample:
<|im_start|>system
    You are a top-rated customer service agent named John. Be polite to customers and answer all their questions.<|im_end|>
    <|im_start|>user
    could you tell me about the options for shipping?<|im_end|>
    <|im_start|>assistant
    Of course, I'd be delighted to provide you with information about our shipping options! Here are the various choices we offer:

1. Standard Shipping: This option typically arrives within {{Date Range}} business days, catering to non-urgent items and ensuring a cost-effective delivery.

2. Expedited Shipping: If you're looking for a faster option, choose expedited shipping. Your items will reach you within {{Date Range}} business days, offering a balance between speed and affordability.

3. Overnight Shipping: For urgent needs, we have overnight shipping. This ensures your items are delivered on the 

In [8]:
# Tokenize without truncation
def tokenize(element):
    return tokenizer(element['text'])

# Tokenize with truncation
def tokenize_trunc(element):
    return tokenizer(
        element["text"],
        truncation=True,
        max_length=int(1.5*max_length),
        add_special_tokens=False,
    )

# Tokenize with truncation for inference
def tokenize_trunc_inference(element):
    return tokenizer(
        element["text"],
        truncation=True,
        max_length=int(1.5*max_length),
        add_special_tokens=False,
        return_tensors='pt',
        padding=True
    )

# Apply tokenization
if 1:

    # Add tokens <|im_start|> and <|im_end|>, latter is special eos token
    tokenizer.add_tokens(["<|im_start|>"])
    tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
    #tokenizer.add_special_tokens(dict(pad_token="</s>"))
    tokenizer.pad_token = tokenizer.eos_token

    # Get max length
    tokenized_dataset = dataset.map(
        tokenize, 
        num_proc=4
    )
    max_length = max([len(x['input_ids']) for x in tokenized_dataset])
    print(max_length)

    # Tokenize with max length
    dataset = dataset.map(
        tokenize_trunc,
        batched=True,
        num_proc=4,
        remove_columns=["text"]    
    )
    print(dataset)

474
Dataset({
    features: ['flags', 'instruction', 'category', 'intent', 'response', 'input_ids', 'attention_mask'],
    num_rows: 1000
})


In [9]:
# Split dataset into train & test
dataset = dataset.train_test_split(test_size=0.1)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['flags', 'instruction', 'category', 'intent', 'response', 'input_ids', 'attention_mask'],
        num_rows: 900
    })
    test: Dataset({
        features: ['flags', 'instruction', 'category', 'intent', 'response', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
})


In [10]:
# Collate function
# Transforms list of dicts [ {input_ids: [123, ..]}, {.. ] 
# into dict of lists (pytorch tensors) { input_ids: [..], labels: [..], attention_mask: [..] }
# Label shifting should be handled inside the HF model forward function, so they dont need to be shifted here & can be kept same as inputs
def collate(tokenized_batch_data):

    tokenlist = [e["input_ids"] for e in tokenized_batch_data]
    tokens_maxlen = max([len(t) for t in tokenlist])  # length of longest input

    input_ids, labels, attention_masks = [], [], []
    for tokens in tokenlist:
        # Num of pad tokens to add
        pad_len = tokens_maxlen-len(tokens)
        # Pad
        input_ids.append( tokens + [tokenizer.pad_token_id]*pad_len )
        labels.append( tokens + [-100]*pad_len )
        attention_masks.append( [1]*len(tokens) + [0]*pad_len )

    batch={
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(labels),
        "attention_mask": torch.tensor(attention_masks)
    }
    return batch

In [11]:
# Load model

# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"
print(torch_dtype, attn_implementation)

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

torch.bfloat16 flash_attention_2


In [12]:
# Update model config based on tokenizer update
model.resize_token_embeddings(len(tokenizer))
model.config.eos_token_id = tokenizer.eos_token_id

# Get modules for LoRA
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            #print(name)
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)
modules = find_all_linear_names(model)
print(modules)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


['q_proj', 'o_proj', 'up_proj', 'gate_proj', 'down_proj', 'k_proj', 'v_proj']


In [13]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model = get_peft_model(model, peft_config)

In [14]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1, #2,
    optim="paged_adamw_32bit",
    num_train_epochs=3,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    #report_to="wandb"
    report_to="tensorboard"

)


In [15]:
# Setting sft parameters
trainer = Trainer( 
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    #peft_config=peft_config,
    #max_seq_length= 512,
    #dataset_text_field="text",
    tokenizer=tokenizer,
    data_collator=collate,
    args=training_arguments,
    #packing= False,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
# Disable caching k, v. Its on by default in model config. Not useful for training, only needed for generation
model.config.use_cache = False

# Train
trainer.train()

Step,Training Loss,Validation Loss
35,1.5026,1.399642
70,1.4351,1.299561
105,1.2917,1.221865
140,1.2217,1.20404


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=171, training_loss=1.3700927014936481, metrics={'train_runtime': 49.8281, 'train_samples_per_second': 54.186, 'train_steps_per_second': 3.432, 'total_flos': 3042183323910144.0, 'train_loss': 1.3700927014936481, 'epoch': 3.0})

In [17]:
#wandb.finish()

# Enable caching
model.config.use_cache = True

# Save the fine-tuned model
trainer.model.save_pretrained(new_model)

#trainer.model.push_to_hub(new_model, use_temp_dir=False)



In [18]:
## Run inference

# Tokenize input 
test_sample = {"instruction": "I bought the same item twice, cancel order {{Order Number}}"}
test_prompt = format_chat_template_inference(test_sample)
print(test_prompt)
inputs = tokenize_trunc_inference(test_prompt).to("cuda")

# Generate model output
outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

# Decode
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text.split("assistant")[1])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'instruction': 'I bought the same item twice, cancel order {{Order Number}}', 'text': '<|im_start|>system\n    You are a top-rated customer service agent named John. Be polite to customers and answer all their questions.<|im_end|>\n    <|im_start|>user\n    I bought the same item twice, cancel order {{Order Number}}<|im_end|>\n    <|im_start|>assistant'}

    I'm sorry to hear that you've bought the same item twice. I understand that you would like to cancel your order with the order number {{Order Number}}. To assist you with this, please provide me with the details of your order so that I can quickly locate and resolve the issue for you. Your satisfaction is our top priority, and I'm here to help you every step of the way.drivers
     bordel


In [19]:
"""
References:

# Llama example for this notebook
https://www.datacamp.com/tutorial/fine-tuning-llama-3-2
https://www.kaggle.com/code/kingabzpro/fine-tune-llama-3-2-on-customer-support/notebook?scriptVersionId=198573392

# Quantization training
https://huggingface.co/docs/transformers/en/quantization/bitsandbytes#4-bit-qlora-algorithm
https://huggingface.co/blog/4bit-transformers-bitsandbytes
https://huggingface.co/blog/hf-bitsandbytes-integration
https://en.wikibooks.org/wiki/A-level_Computing/AQA/Paper_2/Fundamentals_of_data_representation/Floating_point_numbers#:~:text=In%20decimal%2C%20very%20large%20numbers,be%20used%20for%20binary%20numbers.

# Data
https://huggingface.co/docs/transformers/main/en/chat_templating

# Training/Lora/PEFT
https://huggingface.co/docs/transformers/v4.49.0/en/main_classes/trainer#transformers.TrainingArguments
https://huggingface.co/docs/peft/v0.14.0/en/task_guides/lora_based_methods
https://huggingface.co/docs/peft/main/en/developer_guides/checkpoint

# Generation
https://huggingface.co/docs/transformers/main/en/llm_tutorial
https://huggingface.co/docs/transformers/v4.47.0/en/llm_tutorial#default-generate

# Caching & optimization
https://huggingface.co/docs/transformers/v4.47.0/en/llm_optims
https://huggingface.co/docs/transformers/en/kv_cache#re-use-cache-to-continue-generation

# HF notebooks
https://github.com/huggingface/notebooks/tree/main/transformers_doc/en/pytorch

"""

'\nReferences:\n\n# Llama example for this notebook\nhttps://www.datacamp.com/tutorial/fine-tuning-llama-3-2\nhttps://www.kaggle.com/code/kingabzpro/fine-tune-llama-3-2-on-customer-support/notebook?scriptVersionId=198573392\n\n# Quantization training\nhttps://huggingface.co/docs/transformers/en/quantization/bitsandbytes#4-bit-qlora-algorithm\nhttps://huggingface.co/blog/4bit-transformers-bitsandbytes\nhttps://huggingface.co/blog/hf-bitsandbytes-integration\nhttps://en.wikibooks.org/wiki/A-level_Computing/AQA/Paper_2/Fundamentals_of_data_representation/Floating_point_numbers#:~:text=In%20decimal%2C%20very%20large%20numbers,be%20used%20for%20binary%20numbers.\n\n# Data\nhttps://huggingface.co/docs/transformers/main/en/chat_templating\n\n# Training/Lora/PEFT\nhttps://huggingface.co/docs/transformers/v4.49.0/en/main_classes/trainer#transformers.TrainingArguments\nhttps://huggingface.co/docs/peft/v0.14.0/en/task_guides/lora_based_methods\nhttps://huggingface.co/docs/peft/main/en/developer_g