In [1]:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft "trl<0.15.0" triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth



In [2]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct",
    # model_name = "unsloth/SmolLM2-135M-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


# Training on Custom Dataset

### Example Dataset jsonl file

``` jsonl
{"user":"Who is Sujith Kumar Appikatla?", "assistant":"Sujith Kumar is Software Engineer at Samsung"}
{"user":"Where does Sujith Kumar work?", "assistant":"He works at Samsung R&D - Delhi"}
```

In [3]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="data.jsonl", split = "train")


In [4]:
type(dataset)

In [5]:
dataset

Dataset({
    features: ['user', 'assistant'],
    num_rows: 8
})

In [18]:
def apply_template(examples):
  template = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are Helpful AI Assistant
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
{user_input}
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
{assistant_output}
<|eot_id|>
  """

  processed_prompt = [template.format(
      user_input = user,
      assistant_output = assistant
  ) for user, assistant in zip(examples["user"], examples["assistant"])]

  print(processed_prompt)

  return {"text": processed_prompt, }

In [19]:
dataset = dataset.map(apply_template, batched=True)

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

['\n<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are Helpful AI Assistant\n<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\nWho is Sujith Kumar Appikatla?\n<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\nSujith Kumar is Software Engineer at Samsung\n<|eot_id|>\n  ', '\n<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are Helpful AI Assistant\n<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\nWhere does Sujith Kumar work?\n<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\nHe works at Samsung R&D - Delhi\n<|eot_id|>\n  ', '\n<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are Helpful AI Assistant\n<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\nWho is Sujith Kumar Appikatla?\n<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\nSujith Kumar is Software Engineer at Samsung\n<|eot_id|>\n  ', '\n<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are Helpful AI Assistant\n<|eot_id|>\n<

In [20]:
dataset

Dataset({
    features: ['user', 'assistant', 'text'],
    num_rows: 8
})

## Train the Model

In [9]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.2.12 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [21]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    # data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    # dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2):   0%|          | 0/8 [00:00<?, ? examples/s]

In [11]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

In [22]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 8 | Num Epochs = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1,9.1585
2,9.1585
3,8.8284
4,7.858
5,6.6587
6,5.3254
7,4.1767
8,3.5967
9,3.0484
10,2.487


## Inference

In [23]:
# Set model for inference
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [24]:
# Chat template unsloth used for instrcution finetuning of llama 3.2 1B
chat_template = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are Helpful Pirate
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
{user_input}
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>

"""

In [61]:
# input_text = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are Helpful Pirate \n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n hi <|eot_id|><|start_header_id|>assistant<|end_header_id|>"
# prompt = chat_template.format(user_input = "Who works in Delhi?")
prompt = chat_template.format(user_input = "Write code for binary search")

print("----- prompt -----")
print(prompt)

# encode text and generate embeddings as tensors and move to gpu memory
input_ids = tokenizer.encode(prompt, return_tensors = "pt").to("cuda")
print("----- input_ids -----")
print(input_ids)


----- prompt -----

<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are Helpful Pirate
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
Write code for binary search
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>


----- input_ids -----
tensor([[128000,    198, 128000, 128006,   9125, 128007,    198,   2675,    527,
          47654,  62248,    198, 128009,    198, 128006,    882, 128007,    198,
           8144,   2082,    369,   8026,   2778,    198, 128009,    198, 128006,
          78191, 128007,    271]], device='cuda:0')


In [62]:
# input tensor and generate output
output_tensor = model.generate(input_ids, max_new_tokens = 128)
print("----- output_tensor -----")
print(output_tensor)

----- output_tensor -----
tensor([[128000,    198, 128000, 128006,   9125, 128007,    198,   2675,    527,
          47654,  62248,    198, 128009,    198, 128006,    882, 128007,    198,
           8144,   2082,    369,   8026,   2778,    198, 128009,    198, 128006,
          78191, 128007,    271, 128010,      2,  19127,    279,   1358,    198,
           1138,    284,    510,     16,     11,    220,     18,     11,    220,
             20,     11,    220,     22,     11,    220,     24,     11,    220,
            806,     11,    220,   1032,     11,    220,    868,     11,    220,
           1114,     11,    220,    777,    933,     77,    284,   2479,  11179,
            340,  10516,    284,    220,     15,    198,  12156,    284,    308,
            482,    220,     16,    198,  16497,    284,    220,     15,    198,
           3556,    320,  10516,   2717,   1579,    340,    256,   5209,    284,
            320,  10516,    489,   1579,      8,    611,    220,     17,    198,
  

In [63]:
output_text = tokenizer.batch_decode(output_tensor)
print(output_text[0])

<|begin_of_text|>
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are Helpful Pirate
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
Write code for binary search
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>

<|python_tag|># Define the array
arr = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
n = len(arr)
low = 0
high = n - 1
mid = 0
while (low <= high)
   mid = (low + high) / 2
   if (arr[mid] < arr[n-1]) low = mid + 1
   elif (arr[mid] > arr[n-1]) high = mid - 1
   else
     mid = mid + 1
  


## Apply Chat Template for Conversations using get_chat_template

In [52]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role":"system", "content":"You are a Cat"},
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hi'ow, meow meow"},
    {"role": "user", "content": "Who works at Samsung R&D Delhi?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                        #  temperature = 1.5, min_p = 0.1
                         )
tokenizer.batch_decode(outputs)

["<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\nYou are a Cat<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi'ow, meow meow<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho works at Samsung R&D Delhi?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMeow, Samsung R&D Delhi works at Samsung R&D - Delhi\nSamsung R&D - Delhi is Software Engineer at Samsung\n<|eot_id|>"]