In [25]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

In [26]:
from unsloth import FastLanguageModel # type: ignore

In [27]:
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True

In [28]:
model,tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)

==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




In [29]:
help(FastLanguageModel.get_peft_model)

Help on function get_peft_model in module unsloth.models.llama:

get_peft_model(model, r=16, target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'], lora_alpha=16, lora_dropout=0.0, bias='none', layers_to_transform=None, layers_pattern=None, use_gradient_checkpointing='unsloth', random_state=3407, max_seq_length=2048, use_rslora=False, modules_to_save=None, init_lora_weights=True, loftq_config={}, temporary_location='_unsloth_temporary_saved_buffers', qat_scheme=None, ensure_weight_tying=False, **kwargs)



In [30]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    # target modules are same,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth", # for efficent memory usage.
    random_state = 3407,
    use_rslora = False,
    loftq_config = None
)

In [31]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

- .format(**arguments)
arguments will be substitte in the "{}".

In [32]:
EOS_TOKEN = tokenizer.eos_token # end of the token

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []

    for instruction, input, output in zip(instructions,inputs,outputs):
        text = alpaca_prompt.format(instruction,input,output)
        texts.append(text)
    return {"text":texts,}

this is a example to use the formatting_prompts_func()

In [None]:
examples = {
    "instruction": ["i am smruti."],
    "input": ["nothing to give input"],
    "output": ["give me ouput"]
}
formatting_prompts_func(examples)

In [33]:
from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned",split="train")
dataset = dataset.map(formatting_prompts_func,batched=True)

README.md: 0.00B [00:00, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

SFTConfig, SFTTrainer and make a trainer object
- from trl import SFTConfig, SFTTrainer

- SFTTrainer:
    1. model
    2. tokenizer
    3. train_dataset
    4. max_seq_length
    5. dataset_text_field # dataset column to be trained
    6. packing = False # for faster training
    7. args = SFTConfig()

- SFTConfig:
    1. per_device_train_batch_size
    2. gradient_accumulation_steps
    3. warmup_steps
    4. num_train_epochs 
    5. max_steps # after this much optimizer steps the training will be paused.
    6. lr
    7. logging_steps
    8. optimizer 
    9. weight_decay
    10. lr_scheduler_type = "linear"
    11. output_dir 
    12. report_to # for TrackIO/WandB

after per_device_train_batch_size*gradient_accumulation_steps we do one optimizer.step() then we do max_steps no. of optimizer.step()

In [37]:
from trl import SFTConfig, SFTTrainer

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.001,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use TrackIO/WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/51760 [00:00<?, ? examples/s]

ðŸ¦¥ Unsloth: Padding-free auto-enabled, enabling faster training.


In [38]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.6408
2,2.2231
3,1.7251
4,1.9334
5,1.7483
6,1.5559
7,1.1499
8,1.332
9,1.2224
10,1.2054


formatted the input for alpaca style instruction.

In [66]:
FastLanguageModel.for_inference(model)

inputs = tokenizer([
    alpaca_prompt.format(
        "write about ",
        "Cuttack ",
        "",
    )
],return_tensors = "pt").to("cuda")

In [49]:
outputs = model.generate(**inputs,max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nContinue the Fibonacci Sequence \n\n### Input:\n1 1 2 3 5 8\n\n### Response:\n11 18 29 47 76 123 199 322 521 843 1364 2195 3541 5718 9229 14948 24178 39026 62904 101432 163427 263159 424586 682045 109843']

To use Textstreamer we can strem ouput text.

In [67]:
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs,streamer = text_streamer,max_new_tokens = 256)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
write about 

### Input:
Cuttack 

### Response:
Cuttack is a city located in the eastern Indian state of Odisha. It is the administrative headquarters of the Cuttack district and is known for its rich cultural heritage, historical significance, and natural beauty. 

Cuttack is a popular tourist destination, with many historical and cultural sites to visit. One of the most notable landmarks is the Barabati Fort, which was built by the Ganga dynasty in the 12th century and later served as the residence of the British East India Company. The fort is now a popular tourist spot, with its impressive architecture and stunning views of the Mahanadi River. 

Another must-visit site is the Jagannath Temple, one of the most revered Hindu temples in India and a major pilgrimage site for devotees. The temple