##### What is this notebook about?
- This notebook shows how to finetune LLM that has not be finetuned yet on instruction dataset using hugging face trainer. 
- So there will be new tokens added into the tokenizer before finetuning. 
- Llama 3.2 1B model was used as an example

In [1]:
# Set cuda device
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

# Conda env: 
# Setup: conda env create -f environment_mlenv2
# Activate: conda activate mlenv2

In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    Trainer
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch #, wandb
from datasets import load_dataset

import bitsandbytes as bnb

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# from huggingface_hub import login
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
# login(token = hf_token)

# wb_token = user_secrets.get_secret("wandb")
# wandb.login(key=wb_token)
# run = wandb.init(
#     project='Fine-tune Llama 3.2 on Customer Support Dataset', 
#     job_type="training", 
#     anonymous="allow"
# )

In [None]:
#base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"
base_model = "meta-llama/Llama-3.2-1B"
new_model = "../output_dir/Ecommerce-ChatBot"
dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"

In [5]:
# Load tokenizer
#tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Load slow tokenizer, fast tokenizer sometimes ignores added tokens
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=False)

In [6]:
#Importing the dataset
dataset = load_dataset(dataset_name, split="train")
dataset

Dataset({
    features: ['flags', 'instruction', 'category', 'intent', 'response'],
    num_rows: 26872
})

In [7]:
# Check a sample & check format
sample_instruction = dataset['instruction'][3]
sample_response = dataset['response'][3]
print("Sample instruction:", sample_instruction[:100])
print("Sample response:", sample_response[:100])

Sample instruction: I need to cancel purchase {{Order Number}}
Sample response: I understood that you need assistance with canceling your purchase with the order number {{Order Num


In [8]:
# Check tokenize
tokenized_sample = tokenizer(sample_instruction)
print(tokenized_sample)

{'input_ids': [128000, 40, 1205, 311, 9299, 7782, 5991, 4531, 5742, 3500], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [9]:
# Chat template for training
def format_chat_template(row) -> str:

    instruction = """You are a top-rated customer service agent named John. Be polite to customers and answer all their questions."""
    prompt=f"""<|im_start|>system
    {instruction}<|im_end|>
    <|im_start|>user
    {row["instruction"]}<|im_end|>
    <|im_start|>assistant
    {row["response"]}<|im_end|>"""

    row['text'] = prompt
    return row

# Chat template for inference
def format_chat_template_inference(row) -> str:

    instruction = """You are a top-rated customer service agent named John. Be polite to customers and answer all their questions."""
    prompt=f"""<|im_start|>system
    {instruction}<|im_end|>
    <|im_start|>user
    {row["instruction"]}<|im_end|>
    <|im_start|>assistant"""

    row['text'] = prompt
    return row


In [10]:
# Check sample prompt templates
train_sample = {"instruction": "I bought the same item twice, cancel order {{Order Number}}", 
                "response": "Sure, can you give me a minute"}
train_prompt = format_chat_template(train_sample)
print("Sample train prompt:\n", train_prompt['text'])

test_sample = {"instruction": "I bought the same item twice, cancel order {{Order Number}}"}
test_prompt = format_chat_template_inference(test_sample)
print("Sample test prompt:\n", test_prompt['text'])

Sample train prompt:
 <|im_start|>system
    You are a top-rated customer service agent named John. Be polite to customers and answer all their questions.<|im_end|>
    <|im_start|>user
    I bought the same item twice, cancel order {{Order Number}}<|im_end|>
    <|im_start|>assistant
    Sure, can you give me a minute<|im_end|>
Sample test prompt:
 <|im_start|>system
    You are a top-rated customer service agent named John. Be polite to customers and answer all their questions.<|im_end|>
    <|im_start|>user
    I bought the same item twice, cancel order {{Order Number}}<|im_end|>
    <|im_start|>assistant


In [11]:
# Check tokenize
tokenized_sample = tokenizer(train_prompt['text'])
print(tokenized_sample)

{'input_ids': [128000, 27, 91, 318, 5011, 91, 29, 9125, 198, 262, 1472, 527, 264, 1948, 55985, 6130, 2532, 8479, 7086, 3842, 13, 2893, 48887, 311, 6444, 323, 4320, 682, 872, 4860, 16134, 91, 318, 6345, 91, 397, 262, 83739, 318, 5011, 91, 29, 882, 198, 262, 358, 11021, 279, 1890, 1537, 11157, 11, 9299, 2015, 5991, 4531, 5742, 3500, 27, 91, 318, 6345, 91, 397, 262, 83739, 318, 5011, 91, 29, 78191, 198, 262, 23371, 11, 649, 499, 3041, 757, 264, 9568, 27, 91, 318, 6345, 91, 29], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [12]:
# Select subset of data for train/test & format prompt template 
if 1:
    #dataset = dataset.shuffle(seed=65).select(range(5000))
    dataset = dataset.select(range(5000))
    dataset = dataset.map(format_chat_template, num_proc= 4)
    print(dataset)
    print("Check a train sample:")
    print(dataset['text'][3])


Map (num_proc=4): 100%|██████████| 5000/5000 [00:00<00:00, 17774.55 examples/s]

Dataset({
    features: ['flags', 'instruction', 'category', 'intent', 'response', 'text'],
    num_rows: 5000
})
Check a train sample:
<|im_start|>system
    You are a top-rated customer service agent named John. Be polite to customers and answer all their questions.<|im_end|>
    <|im_start|>user
    I need to cancel purchase {{Order Number}}<|im_end|>
    <|im_start|>assistant
    I understood that you need assistance with canceling your purchase with the order number {{Order Number}}. We apologize for any inconvenience this may have caused. To initiate the cancellation process, please follow these steps:

1. Sign in to your account: Visit our {{Online Company Portal Info}} and login using your credentials.
2. Access your order history: Once logged in, navigate to the "{{Online Order Interaction}}" or "{{Online Order Interaction}}" section.
3. Locate the purchase: Look for the order with the specific order number {{Order Number}}.
4. Cancel the purchase: Click on the "{{Online Order




In [13]:
# Tokenize without truncation
def tokenize(element):
    return tokenizer(element['text'])

# Tokenize with truncation
def tokenize_trunc(element):
    return tokenizer(
        element["text"],
        truncation=True,
        max_length=int(1.5*max_length),
        add_special_tokens=False,
    )

# Tokenize with truncation for inference
def tokenize_trunc_inference(element):
    return tokenizer(
        element["text"],
        truncation=True,
        max_length=int(1.5*max_length),
        add_special_tokens=False,
        return_tensors='pt',
        padding=True
    )

# Add special tokens if needed
if 1:
    # Add tokens <|im_start|> and <|im_end|>, latter is special eos token
    tokenizer.add_tokens(["<|im_start|>"])
    tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
    #tokenizer.add_special_tokens(dict(pad_token="</s>"))
    tokenizer.pad_token = tokenizer.eos_token

# Apply tokenization - get max length
if 1:
    tokenized_dataset = dataset.map(
        tokenize, 
        num_proc=4
    )
    max_length = max([len(x['input_ids']) for x in tokenized_dataset])
    print(max_length)

# Apply tokenization - Tokenize with max length
if 1:
    tokenized_dataset = dataset.map(
        tokenize_trunc,
        batched=True,
        num_proc=4,
        remove_columns=["text"]    
    )
    print(tokenized_dataset)

Map (num_proc=4): 100%|██████████| 5000/5000 [00:00<00:00, 5110.59 examples/s]


433


Map (num_proc=4): 100%|██████████| 5000/5000 [00:00<00:00, 6198.60 examples/s]

Dataset({
    features: ['flags', 'instruction', 'category', 'intent', 'response', 'input_ids', 'attention_mask'],
    num_rows: 5000
})





In [14]:
# Split dataset into train & test
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['flags', 'instruction', 'category', 'intent', 'response', 'input_ids', 'attention_mask'],
        num_rows: 4500
    })
    test: Dataset({
        features: ['flags', 'instruction', 'category', 'intent', 'response', 'input_ids', 'attention_mask'],
        num_rows: 500
    })
})


In [15]:
# Collate function
# Transforms list of dicts [ {input_ids: [123, ..]}, {.. ] 
# into dict of lists (pytorch tensors) { input_ids: [..], labels: [..], attention_mask: [..] }
# Label shifting should be handled inside the HF model forward function, so they dont need to be shifted here & can be kept same as inputs
def collate(tokenized_batch_data):

    tokenlist = [e["input_ids"] for e in tokenized_batch_data]
    tokens_maxlen = max([len(t) for t in tokenlist])  # length of longest input

    input_ids, labels, attention_masks = [], [], []
    for tokens in tokenlist:
        # Num of pad tokens to add
        pad_len = tokens_maxlen-len(tokens)
        # Pad
        input_ids.append( tokens + [tokenizer.pad_token_id]*pad_len )
        labels.append( tokens + [-100]*pad_len )
        attention_masks.append( [1]*len(tokens) + [0]*pad_len )

    batch={
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(labels),
        "attention_mask": torch.tensor(attention_masks)
    }
    return batch

In [16]:
# Load model

# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"
print(torch_dtype, attn_implementation)

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Update model config based on tokenizer update
model.resize_token_embeddings(len(tokenizer))
model.config.eos_token_id = tokenizer.eos_token_id

torch.bfloat16 flash_attention_2


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [17]:
# Get modules for LoRA
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            #print(name)
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)
modules = find_all_linear_names(model)
print(modules)

['v_proj', 'up_proj', 'q_proj', 'o_proj', 'k_proj', 'gate_proj', 'down_proj']


In [18]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model = get_peft_model(model, peft_config)

In [19]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1, #2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    #report_to="wandb"
    report_to="tensorboard"

)


In [20]:
# Setting sft parameters
trainer = Trainer( 
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    #peft_config=peft_config,
    #max_seq_length= 512,
    #dataset_text_field="text",
    #tokenizer=tokenizer,
    data_collator=collate,
    args=training_arguments,
    #packing= False,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [21]:
# Disable caching k, v. Its on by default in model config. Not useful for training, only needed for generation
model.config.use_cache = False

# Train
trainer.train()

Step,Training Loss,Validation Loss
57,0.9162,1.352123
114,1.2236,1.243121
171,1.1316,1.183859
228,1.1226,1.133035


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=282, training_loss=1.2798068168315482, metrics={'train_runtime': 69.6842, 'train_samples_per_second': 64.577, 'train_steps_per_second': 4.047, 'total_flos': 4298474337435648.0, 'train_loss': 1.2798068168315482, 'epoch': 1.0})

In [22]:
#wandb.finish()

# Enable caching
model.config.use_cache = True

# Save the fine-tuned model
trainer.model.save_pretrained(new_model)

#trainer.model.push_to_hub(new_model, use_temp_dir=False)



In [23]:
## Run inference

# Generation config
generation_config = GenerationConfig(
    #max_length=256,
    max_new_tokens=250,
    temperature=0.05,
    do_sample=True,
    #do_sample=False,
    use_cache=True,
    skip_special_tokens=True,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)
print(generation_config)

# Test input
test_sample = {"instruction": "I bought the same item twice, cancel order {{Order Number}}"}

# Tokenize input 
test_prompt = format_chat_template_inference(test_sample)
print(test_prompt)
inputs = tokenize_trunc_inference(test_prompt).to("cuda")

# Generate model output
#outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)
outputs = model.generate(**inputs, generation_config=generation_config)

# Decode
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text.split("assistant")[1])

GenerationConfig {
  "do_sample": true,
  "eos_token_id": 128257,
  "max_new_tokens": 250,
  "pad_token_id": 128257,
  "skip_special_tokens": true,
  "temperature": 0.05
}

{'instruction': 'I bought the same item twice, cancel order {{Order Number}}', 'text': '<|im_start|>system\n    You are a top-rated customer service agent named John. Be polite to customers and answer all their questions.<|im_end|>\n    <|im_start|>user\n    I bought the same item twice, cancel order {{Order Number}}<|im_end|>\n    <|im_start|>assistant'}

    We understand that you have purchased the same item twice. We apologize for any inconvenience caused. To cancel your order with the order number {{Order Number}}, please follow these steps:

1. Log in to your account on our website.
2. Navigate to the "My Orders" section.
3. Locate the order with the order number {{Order Number}}.
4. Click on the order to view the details.
5. Look for the option to "Cancel Order" and select it.
6. Follow any additional prompts

### References:

> Quantization training
>> https://huggingface.co/docs/transformers/en/quantization/bitsandbytes#4-bit-qlora-algorithm  
>> https://huggingface.co/blog/4bit-transformers-bitsandbytes  
>> https://huggingface.co/blog/hf-bitsandbytes-integration  
>> https://en.wikibooks.org/wiki/A-level_Computing/AQA/Paper_2/Fundamentals_of_data_representation/Floating_point_numbers#:~:text=In%20decimal%2C%20very%20large%20numbers,be%20used%20for%20binary%20numbers 

> Data
>> https://huggingface.co/docs/transformers/main/en/chat_templating 

> Training/Lora/PEFT
>> https://huggingface.co/docs/transformers/v4.49.0/en/main_classes/trainer#transformers.TrainingArguments  
>> https://huggingface.co/docs/peft/v0.14.0/en/task_guides/lora_based_methods  
>> https://huggingface.co/docs/peft/main/en/developer_guides/checkpoint  

> Generation
>> https://huggingface.co/docs/transformers/main/en/llm_tutorial  
>> https://huggingface.co/docs/transformers/v4.47.0/en/llm_tutorial#default-generate  

> Caching & optimization
>> https://huggingface.co/docs/transformers/v4.47.0/en/llm_optims  
>> https://huggingface.co/docs/transformers/en/kv_cache#re-use-cache-to-continue-generation  

> HF notebooks
>> https://github.com/huggingface/notebooks/tree/main/transformers_doc/en/pytorch  
