In [1]:
!pip install -U unsloth
!pip install datasets transformers accelerate bitsandbytes trl peft



Collecting unsloth
  Downloading unsloth-2026.1.2-py3-none-any.whl.metadata (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2026.1.2 (from unsloth)
  Downloading unsloth_zoo-2026.1.2-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-1.0.4-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.33.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets!=4.0.*,!=4.1.0,<4.4.0,>=3.4.1 (from unsloth)
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting trl!=0.19.0,<=0.24.0,>=0.18.2 (from unsl



*   Unsloth-optimized framework for faster LLM fine-tuning
*   hugging face datasets- easy access and management of large number of datasets
*   transformers- pre-trained models
*   accelerate- distributed training
*   trl- transformer reinforcement learning for LLM alignment
*   peft- fine-tuning with minimal parameter change
*   bitsandbytes- quantise model









In [2]:
from unsloth import FastLanguageModel
import torch

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(model_name = model_name, max_seq_length = 2048, dtype = torch.float16, load_in_4bit = True)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.1.2: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]



*   FastLanguageModel- adding significant speed/memory optimizations to huggingface transformers
*   fine-tuning pretrained meta-llama 8B model
*   'model' is the actual neural network with weights and 'tokenizer' is the text to tokens converter
*   torch.float16 is faster for computations on modern GPUs and load_in_4bit=True leads to quantisation for memory reductions and QLoRA




In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,                        # LoRA rank (sweet spot for Colab)
    target_modules = [
        "q_proj", "k_proj", "v_proj",
        "o_proj", "gate_proj", "up_proj", "down_proj"
    ],
    lora_alpha = 16,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = True,
)


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2026.1.2 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.




*   fine-tuning base model using LoRA adapters
*   here 'model' is the quantizes llama model loaded previously which remains frozen during training
*   r=16 taken 0.4% of the parameters usually the 'sweet spot' for 8B
*   LoRA adapters are injected in the attention projection, output projetion and feed forward projection excluding mebedding and LM which are often frozen for stability
*   lora_alpha is the scaling factor for lora_weights controlling how much the LoRA adapters affect the original weights
*   lora_dropout prevents overfitting by randomly dropping 5% (here) of the dataset, especially for smaller datasets









In [5]:
import json
from datasets import Dataset

with open("/content/merged qa.json", "r") as f:
    raw_data = json.load(f)

data = []
for item in raw_data.values():
    data.append({
        "instruction": item["question"],
        "response": item["answer"]
    })

dataset = Dataset.from_list(data)




*   datasets- hugging face's dataset library which is core-class, memory efficient and optimized for nlp.
*   converting to instruction-response format so that it becomes compaticle with trl( or sft) trainers
*   converts python list to dataset object





In [None]:
def format_prompt(example):
    return {
        "text": f"""<|begin_of_text|>
<|start_header_id|>user<|end_header_id|>
{example['instruction']}
<|start_header_id|>assistant<|end_header_id|>
{example['response']}"""
    }

dataset = dataset.map(format_prompt)


Map:   0%|          | 0/8194 [00:00<?, ? examples/s]



*   transforms each dataset into a formatted chat conversation
*   map() applies function to every example in dataset



In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 1024,
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 2,   # Effective batch = 8
        warmup_steps = 5,
        num_train_epochs = 1,
        learning_rate = 2e-4,               # QLoRA optimal LR
        fp16 = True,
        logging_steps = 20,
        output_dir = "outputs",
        optim = "paged_adamw_8bit",
        save_strategy = "epoch",
        report_to = "none",
    ),
)


Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/8194 [00:00<?, ? examples/s]



*   batch per GPU = 4, 4 examples processed in parallel
*   accumulating gradients over 2 steps, slower training with an effective batch size of 4 x 2 = 8
*   warmup_steps is to avoid large gradient steps early
*   learning rate of 2e-4 is standard for 4-bit training
*   8-bit AdamW- Fits training on consumer GPUs
*   log loss every 20 training steps



In [None]:
trainer.train()


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 8,194 | Num Epochs = 1 | Total steps = 1,025
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
20,2.7384
40,1.5661
60,1.3581
80,1.318
100,1.239
120,1.2413
140,1.2393
160,1.2318
180,1.2336
200,1.2006


Step,Training Loss
20,2.7384
40,1.5661
60,1.3581
80,1.318
100,1.239
120,1.2413
140,1.2393
160,1.2318
180,1.2336
200,1.2006


TrainOutput(global_step=1025, training_loss=1.162589563509313, metrics={'train_runtime': 3060.6247, 'train_samples_per_second': 2.677, 'train_steps_per_second': 0.335, 'total_flos': 3.1622673774526464e+16, 'train_loss': 1.162589563509313, 'epoch': 1.0})

In [None]:
FastLanguageModel.for_inference(model)

prompt = """<|begin_of_text|>
<|start_header_id|>user<|end_header_id|>
What does Article 14 of the Indian Constitution state?
<|start_header_id|>assistant<|end_header_id|>
"""

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens = 300,
    temperature = 0.2,
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))



user
What does Article 14 of the Indian Constitution state?
assistant
Article 14 states that the State shall not deny to any person equality before the law or the equal protection of the laws within the territory of India. This means that the State cannot discriminate against any citizen in the application of any law, and all citizens are equal before the law. This principle is often referred to as the 'principle of equality'. It is a fundamental right that ensures that all individuals are treated equally and without bias. The State cannot make any law that is discriminatory or arbitrary, and all laws must be applied equally to all citizens. This ensures that no one is unfairly treated or discriminated against based on their caste, religion, sex, or any other characteristic. The State also cannot deny any person access to the courts for the enforcement of their rights. This means that every citizen has the right to approach the courts for justice and to protect their rights. This is a



*   optimizes for inference by disabling training specific layers and merges LoRA weights into base model
*   promot is consturcted using the same format used during training
*   temperature- hyperparameter that controls the randomness of the model's output., number between 0 and 1.0+ that affects how the model chooses the next token


