<a href="https://colab.research.google.com/github/selector-ai/s2ai-infra/blob/collab-pr/fine-tuning/google-collab/Llama-3-8b-Instruct-finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install prerequisite packages and configure fine-tuning


In [1]:
# Fine-tuning input parameters: every value the later cells read is set here.
import os

# Print sample rows after the datasets have been formatted.
debug = True

# Base model settings, forwarded to FastLanguageModel.from_pretrained.
model_name = "s2-oracle-base"
load_in_4bit = False
max_seq_length = 2048
dtype = None

# Hugging Face access token. The HF_TOKEN environment variable is used when it
# is set and non-empty, so the real secret does not have to be stored in the
# notebook; otherwise the placeholder below must be replaced by hand.
hf_token = os.environ.get("HF_TOKEN") or "hf_token"

# LoRA adapter settings, forwarded to FastLanguageModel.get_peft_model.
lora_rank = 16
lora_alpha = 16

# Datasets that are loaded (split "train"), formatted and concatenated.
dataset_names = ["identity-sharegpt-style", "infra-status"]

# Output targets read by the save cell; an empty string disables an upload.
save_local = False                          # also write the adapter to local disk
upload_to_hf = "lora_instruct_model_id"     # repo id for the LoRA adapter
merge16_and_upload = "s2-oracle-trained"    # repo id for the merged 16-bit model
merge4_and_upload = ""                      # repo id for the merged 4-bit model

In [None]:
# Authenticate this session against the Hugging Face Hub
import os
from huggingface_hub import login

login(token = hf_token)

In [3]:
%%capture
# %%capture has to stay on the first line of the cell; it keeps the installer
# output out of the notebook.
# Install Unsloth (with its "colab-new" extra) straight from the GitHub repository.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Install the remaining training libraries without pulling in their own
# dependencies (--no-deps).
# NOTE(review): none of these packages is version-pinned, so a rerun may
# install newer releases than the ones shown in the recorded outputs.
!pip install --no-deps xformers trl peft accelerate bitsandbytes

In [4]:
# Load the base model together with its tokenizer through Unsloth
from unsloth import FastLanguageModel
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    token=hf_token,  # required because the model is gated
)

# Wrap the model with LoRA adapters on the attention and MLP projections
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,  # any number > 0; suggested values are 8, 16, 32, 64, 128
    lora_alpha=lora_alpha,
    lora_dropout=0,  # any value is supported, but 0 is optimized
    bias="none",  # any value is supported, but "none" is optimized
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # "unsloth" uses 30% less VRAM and fits 2x larger batch sizes;
    # pass True or "unsloth" for very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,  # rank stabilized LoRA is supported, disabled here
    loftq_config=None,  # LoftQ is supported, not configured here
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


config.json:   0%|          | 0.00/798 [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.6
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/143 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Prepare dataset

In [5]:
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset, concatenate_datasets

# Name of the EOS token handed to get_chat_template together with the template.
unsloth_eos_token = "eos_token"

# Jinja chat template, assembled from adjacent string literals (no separators
# are inserted between the fragments).
unsloth_template = (
    "{% set loop_messages = messages %}"
    "{% for message in loop_messages %}"
    # function_response: the content is followed by an instruction sentence
    "{% if message['role'] == 'function_response' %}"
    "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + message['content'] | trim + '\\n\\nNow, make use of the above information - if relevant - to assist with the user\\'s request.' + '<|eot_id|>' %}"
    # user turn
    "{% elif message['role'] == 'user' %}"
    "{% set content = '<|start_header_id|>user<|end_header_id|>\\n\\n' + message['content'] | trim + '<|eot_id|>' %}"
    # assistant turn
    "{% elif message['role'] == 'assistant' %}"
    "{% set content = '<|start_header_id|>assistant<|end_header_id|>\\n\\n' + message['content'] | trim + '<|eot_id|>' %}"
    # any other role: the role name itself goes into the header
    "{% else %}"
    "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + message['content'] | trim + '<|eot_id|>' %}"
    "{% endif %}"
    # only the first message is prefixed with the BOS token
    "{% if loop.index0 == 0 %}"
    "{% set content = bos_token + content %}"
    "{% endif %}"
    "{{ content }}"
    "{% endfor %}"
    # optionally open an assistant turn for generation
    "{% if add_generation_prompt %}"
    "{{ '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}"
    "{% endif %}"
)


# Attach the custom template to the tokenizer; both the template and the
# EOS token must be provided, as a (template, eos_token) pair.
tokenizer = get_chat_template(tokenizer, chat_template=(unsloth_template, unsloth_eos_token))

def formatting_prompts_func(examples):
    """Render every conversation of a batch into a single training string.

    Reads the "conversations" column of `examples` and returns a dict with one
    "text" column holding, per conversation, the result of
    `tokenizer.apply_chat_template` (not tokenized, no generation prompt).
    Uses the module-level `tokenizer`.
    """
    texts = []
    for convo in examples["conversations"]:
        rendered = tokenizer.apply_chat_template(
            convo, tokenize=False, add_generation_prompt=False
        )
        texts.append(rendered)
    return {"text": texts}

# Load the "train" split of every configured dataset and add the rendered
# "text" column to it.
datasets = []
for dataset_name in dataset_names:
  dataset = load_dataset(dataset_name, split="train")
  formatted_dataset = dataset.map(formatting_prompts_func, batched=True)
  datasets.append(formatted_dataset)

# One training set made of all formatted datasets, in the configured order.
merged_datasets = concatenate_datasets(datasets)

if debug:
  # Preview a couple of rows. Indices beyond the end of the merged dataset are
  # skipped, so a small dataset does not abort the cell with an IndexError.
  for sample_index in (5, 16):
    if sample_index >= len(merged_datasets):
      continue
    sample = merged_datasets[sample_index]
    print("input:", sample["conversations"])
    print("output:", sample["text"])

input: [{'role': 'user', 'content': 'Can you introduce yourself?'}, {'role': 'assistant', 'content': 'I am co-pilot, an AI assistant trained by Selector Software.'}]
output: <|begin_of_text|><|start_header_id|>user<|end_header_id|>

Can you introduce yourself?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

I am co-pilot, an AI assistant trained by Selector Software.<|eot_id|>
input: [{'role': 'function_metadata', 'content': '[\n    {\n        "type": "function",\n        "function": {\n            "name": "get_s2ap_infra_health_status",\n            "description": "Function to get status of s2ap infra health\\n\\n    Parameters:\\n    - role (str): The role, e.g., \\\'prod\\\', \\\'staging\\\' or \\\'poc\\\'.\\n    - s2_inst (str): The instance name.\\n    - time (str): The time range, e.g., \\\'last 1 day\\\', \\\'last 30 minutes\\\' or \\\'last 2 days\\\'. Default is \\\'last 30 min\\\'.\\n\\n    Returns:\\n    - str: Returns the state of the s2ap infra health status or an e

# Start training

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the pre-rendered "text" column of the merged dataset
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=merged_datasets,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=False,  # packing can make training 5x faster for short sequences
    args=TrainingArguments(
        output_dir="outputs",
        seed=3407,
        logging_steps=1,
        # batching and schedule
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=60,
        warmup_steps=5,
        lr_scheduler_type="linear",
        # optimizer
        optim="adamw_8bit",
        learning_rate=2e-4,
        weight_decay=0.01,
        # precision: bf16 when is_bfloat16_supported() says so, fp16 otherwise
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
    ),
)

# Run the training loop and keep the value returned by train()
trainer_stats = trainer.train()

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 19 | Num Epochs = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,3.7022
2,3.251
3,3.2275
4,4.3676
5,3.08
6,2.559
7,2.1137
8,1.3648
9,1.0937
10,1.2743


# Save trained adapter

In [7]:
# Save the LoRA adapter and the tokenizer to local disk
if save_local:
  model.save_pretrained("lora_local_model")
  tokenizer.save_pretrained("lora_local_model")

# Upload the LoRA adapter and the tokenizer to Hugging Face.
# An empty (or None) repo id disables the upload.
if upload_to_hf:
  model.push_to_hub(upload_to_hf, token = hf_token)
  tokenizer.push_to_hub(upload_to_hf, token = hf_token)

# Merge and upload: 16-bit first, then 4-bit. Each target is skipped when its
# repo id is empty (or None).
for merged_repo_id, merged_save_method in (
    (merge16_and_upload, "merged_16bit"),
    (merge4_and_upload, "merged_4bit"),
):
  if merged_repo_id:
    model.push_to_hub_merged(merged_repo_id, tokenizer, save_method = merged_save_method, token = hf_token)

# To keep a merged copy on local disk instead of uploading it, call
# model.save_pretrained_merged(<directory>, tokenizer, save_method = ...) with
# the same save_method values; save_method = "lora" covers just the LoRA adapters.

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/andy006/lora_instruct_model_id


Unsloth: You are pushing to hub, but you passed your HF username = andy006.
We shall truncate andy006/s2-oracle-trained2 to s2-oracle-trained2
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 16.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 55.68 out of 83.48 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 64.98it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...


model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/andy006/s2-oracle-trained2


# Inference

In [9]:
from unsloth.chat_templates import get_chat_template

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Single-turn prompt used for a quick check of the fine-tuned model
messages = [
    {"role": "user", "content": "Who are you?"},
]

# Render the prompt with the chat template and move the token ids to the GPU
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# The prompt is a single unpadded sequence, so every position is a real token.
# Passing the all-ones mask explicitly avoids the "attention mask ... not set"
# warning that generate() emits when it has to guess.
outputs = model.generate(
    input_ids = inputs,
    attention_mask = torch.ones_like(inputs),
    max_new_tokens = 64,
    use_cache = False,
)
# Last expression of the cell: the decoded text is shown as the cell output
tokenizer.batch_decode(outputs)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nwhat is the s2ap infra health status?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{\n    "name": "get_s2ap_infra_health_status",\n    "arguments": {\n        "role": "all",\n        "s2_inst": "all",\n        "time": "last 30 min"\n    },\n    "description": "get s2ap infra health status"\n}<|eot_id|>']