In [None]:
%%capture
# ^ Cell magic: swallows this cell's output so the long pip log does not flood
#   the notebook. It must stay on the first line of the cell.

import torch
# NOTE(review): major_version / minor_version are never read in the visible
# notebook; this looks like a leftover from an install step that branched on
# GPU capability. Confirm before removing.
major_version, minor_version = torch.cuda.get_device_capability()

# ------------------------------------------------------------------------
# INSTALL STEP 1: Unsloth
# `colab-new` is a pip *extra* of the unsloth package (the [bracket] syntax),
# installed here directly from the GitHub repository. It is not a git branch.
# NOTE(review): no tag or commit is pinned, so every run installs whatever is
# at the repository head; pin a revision if reproducibility matters.
# ------------------------------------------------------------------------
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# ------------------------------------------------------------------------
# INSTALL STEP 2: companion libraries, installed with --no-deps.
#    --no-deps tells pip to install only the packages listed here and to skip
#    their dependencies, so the PyTorch build that ships with the runtime is
#    not replaced as a side effect of installing xformers.
# NOTE(review): the recorded output of the model-loading cell reports
# Torch 2.9.0 and "Xformers = None", so the `xformers<0.0.27` pin does not
# appear to yield a usable xformers on the current runtime. Confirm that the
# xformers / trl pins are still appropriate.
# ------------------------------------------------------------------------
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

# 3. Fix Locale Issue: Colab sometimes reports an ASCII preferred encoding,
#    which breaks code that expects UTF-8. Override the lookup so it always
#    answers "UTF-8".
#    The replacement keeps the standard-library signature
#    `getpreferredencoding(do_setlocale=True)`: callers that pass the argument
#    (e.g. `locale.getpreferredencoding(False)`) would raise TypeError against
#    a zero-argument lambda.
import locale
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
from unsloth import FastLanguageModel
import torch

# ------------------------------------------------------------------------
# MODEL LOADING SETTINGS
# ------------------------------------------------------------------------
# load_in_4bit:
#    Load the weights 4-bit quantized. Treated as mandatory on free Colab:
#    it shrinks the model from roughly 16GB to about 5.5GB so that it fits
#    on the GPU at all.
load_in_4bit = True

# dtype:
#    None lets Unsloth choose based on the detected GPU
#    (Float16 on a T4, Bfloat16 on Ampere cards such as the A100).
dtype = None

# max_seq_length:
#    Llama-3 supports up to 8192 tokens, but that length with LoRA runs out
#    of memory on a free Colab GPU. 2048 tokens (approx 1500 words) is the
#    stable compromise used here. Later cells reuse this value.
max_seq_length = 2048

print("‚è≥ Loading Llama-3 Model... this might take 1-2 minutes...")

# Gather the loader arguments in one place, then load model and tokenizer.
loader_options = dict(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**loader_options)

print(f"‚úÖ Model Loaded Successfully. Context Window set to: {max_seq_length} tokens.")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
‚è≥ Loading Llama-3 Model... this might take 1-2 minutes...
==((====))==  Unsloth 2026.1.2: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

‚úÖ Model Loaded Successfully. Context Window set to: 2048 tokens.


In [None]:
import json
from datasets import Dataset

# 1. Load the raw file
# Make sure 'merged.json' is uploaded in the Files section on the left
file_path = "merged.json"

try:
    # Read as UTF-8 explicitly so the result does not depend on the runtime's
    # default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
except FileNotFoundError:
    print("‚ùå Error: 'merged.json' not found. Please upload it to the Colab Files folder (folder icon on the left).")
    # Re-raise so "Run All" stops here instead of failing later with a
    # confusing NameError on `dataset`.
    raise
except ValueError as e:
    # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
    print(f"‚ùå Error parsing data: {e}")
    raise

if not isinstance(raw_data, (dict, list)):
    raise TypeError(
        f"Expected a JSON object or array at the top level of {file_path!r}, "
        f"got {type(raw_data).__name__}."
    )

print(f"File found. Parsing {len(raw_data)} records...")

# 2. Convert Dictionary-of-Dictionaries to List-of-Dictionaries
# The file is keyed like "0", "1", "2"; only the values are needed.
# A top-level list of records is accepted as well.
entries = raw_data.values() if isinstance(raw_data, dict) else raw_data

# Keep only well-formed rows: a dict with a non-empty question AND answer.
# The "source" column is a provenance tag stored alongside each row; it is
# not inserted into the training prompt, which uses question and answer only.
formatted_data = [
    {
        "question": entry["question"],
        "answer": entry["answer"],
        "source": "Legal_Corpus",
    }
    for entry in entries
    if isinstance(entry, dict) and entry.get("question") and entry.get("answer")
]

if not formatted_data:
    # Fail loudly: an empty dataset would otherwise break at dataset[0] below
    # or, worse, during training.
    raise ValueError(
        f"No usable records in {file_path!r}: every entry is missing a "
        "'question' or an 'answer'."
    )

# 3. Create the Hugging Face Dataset object
dataset = Dataset.from_list(formatted_data)

print(f"‚úÖ Success! Converted to training dataset with {len(dataset)} rows.")
print("Sample row:", dataset[0])

File found. Parsing 8194 records...
‚úÖ Success! Converted to training dataset with 8194 rows.
Sample row: {'question': 'What is India according to the Union and its Territory?', 'answer': 'India, that is Bharat, shall be a Union of States.', 'source': 'Legal_Corpus'}


In [None]:
# 1. Define the prompt template (Llama-3 chat layout)
# The header / <|eot_id|> markers below follow the Llama-3 Instruct chat format.
# The first {} receives the question and the second {} receives the answer
# (left empty at inference time so the model writes it).
# NOTE(review): the checkpoint loaded earlier is "unsloth/llama-3-8b-bnb-4bit",
# whose name does not say "Instruct". Confirm that pairing this chat layout
# with that checkpoint is intended.
# NOTE(review): the template hard-codes <|begin_of_text|>. If the tokenizer also
# prepends its own BOS token, sequences start with two. Confirm.
# NOTE(review): the system message mentions "the provided context", but the
# template only inserts a question. Confirm the wording is intended.
legal_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert Indian Legal Assistant. Answer strictly based on the provided context.<|eot_id|><|start_header_id|>user<|end_header_id|>

Question: {}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{}"""

EOS_TOKEN = tokenizer.eos_token # Appended to every training example so the model learns where an answer ends

def formatting_prompts_func(examples):
    """Render a batch of question/answer pairs into training strings.

    Args:
        examples: Batch mapping with parallel "question" and "answer"
            sequences (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        A dict with a single "text" key holding one string per pair: the
        `legal_prompt` template filled with the question and the answer,
        followed by `EOS_TOKEN`.
    """
    # Layout of each string: System -> User Question -> Assistant Answer -> EOS
    rendered = [
        legal_prompt.format(question, answer) + EOS_TOKEN
        for question, answer in zip(examples["question"], examples["answer"])
    ]
    return {"text": rendered}

# 2. Render every row through the prompt template.
# With batched = True the function receives whole batches of rows per call
# instead of one row at a time (much faster).
dataset = dataset.map(formatting_prompts_func, batched = True)

print("‚úÖ Data formatted successfully.")

# Preview the first rendered example exactly as it will be fed to the trainer.
first_rendered_example = dataset[0]["text"]
print("Sample Input to Model:\n", first_rendered_example)

Map:   0%|          | 0/8194 [00:00<?, ? examples/s]

‚úÖ Data formatted successfully.
Sample Input to Model:
 <|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert Indian Legal Assistant. Answer strictly based on the provided context.<|eot_id|><|start_header_id|>user<|end_header_id|>

Question: What is India according to the Union and its Territory?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

India, that is Bharat, shall be a Union of States.<|end_of_text|>


In [None]:
# ------------------------------------------------------------------------
# LORA ADAPTER SETUP
# ------------------------------------------------------------------------
# Adapters are attached to ALL linear projections (attention + MLP). This
# results in smarter models than targeting only "q_proj" and "v_proj".
# ------------------------------------------------------------------------
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_settings = dict(
    # Rank of the adapters. Higher values (32, 64) learn more complex
    # patterns but use more VRAM; 16 is standard for a T4.
    r = 16,
    target_modules = attention_projections + mlp_projections,
    lora_alpha = 16,
    lora_dropout = 0,                        # 0 is the faster setting
    bias = "none",                           # "none" is the faster setting
    use_gradient_checkpointing = "unsloth",  # keeps VRAM usage within budget
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

# Wrap the loaded model with the LoRA adapters; `model` now refers to the
# adapter-wrapped model used by every later cell.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

print("‚úÖ LoRA Adapters attached. Model is ready for training.")

Unsloth 2026.1.2 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


‚úÖ LoRA Adapters attached. Model is ready for training.


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# ------------------------------------------------------------------------
# TRAINING ARGUMENTS (Optimized for Colab T4)
# ------------------------------------------------------------------------
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",   # column produced by the formatting step
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2, # Keep low to prevent OOM
        gradient_accumulation_steps = 4, # Effective batch size = 2 x 4 = 8
        warmup_steps = 5,
        # 60 steps is a quick smoke test (a small fraction of one epoch).
        # A positive max_steps overrides num_train_epochs; for epoch-based
        # training set max_steps = -1 and pass num_train_epochs explicitly.
        max_steps = 60,
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit", # Saves massive VRAM
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        # Disable experiment-tracker integrations. Without this, an installed
        # wandb package pauses training on an interactive login prompt, which
        # breaks unattended "Run All" execution.
        report_to = "none",
    ),
)

# Print memory stats before starting
BYTES_PER_GB = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GB, 3)
print(f"GPU: {gpu_stats.name} | Max Memory: {max_memory} GB")
print(f"Reserved Memory: {start_gpu_memory} GB")

print("üöÄ Starting Training...")
trainer_stats = trainer.train()
print("‚úÖ Training Complete!")

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/8194 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


GPU: Tesla T4 | Max Memory: 14.741 GB
Reserved Memory: 7.057 GB
üöÄ Starting Training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 8,194 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)
wandb: (1) Create a W&B account
wandb: (2) Use an existing W&B account
wandb: (3) Don't visualize my results
wandb: Enter your choice:

 3


wandb: You chose "Don't visualize my results"


wandb: Detected [huggingface_hub.inference, openai] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,5.349
2,5.1723
3,4.5145
4,4.4184
5,4.3032
6,4.7305
7,3.7829
8,3.5265
9,3.1331
10,2.2941




0,1
train/epoch,‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà‚ñà
train/grad_norm,‚ñÑ ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñà‚ñÑ‚ñÑ‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
train/learning_rate,‚ñÅ‚ñÇ‚ñÑ‚ñá‚ñà‚ñà‚ñà‚ñá‚ñá‚ñá‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ
train/loss,‚ñà‚ñà‚ñÜ‚ñÜ‚ñá‚ñÖ‚ñÑ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ

0,1
total_flos,2094255790080000.0
train/epoch,0.05858
train/global_step,60.0
train/grad_norm,0.98358
train/learning_rate,1e-05
train/loss,1.4198
train_loss,2.13362
train_runtime,341.1042
train_samples_per_second,1.407
train_steps_per_second,0.176


‚úÖ Training Complete!


In [None]:
# 1. Enable native 2x faster inference
FastLanguageModel.for_inference(model)

# 2. Define your test question
test_question = "is using ai in court legal in us?"

# 3. Format the input using the same template as training
input_text = legal_prompt.format(test_question, "") # Empty answer for generation

# 4. Tokenize and move to GPU
# If the prompt text already starts with the BOS token, the tokenizer must
# not add its own special tokens, otherwise the sequence starts with a
# duplicated BOS. If the template carries no BOS, let the tokenizer add it.
bos_token = tokenizer.bos_token or ""
prompt_has_bos = bool(bos_token) and input_text.startswith(bos_token)
inputs = tokenizer(
    [input_text],
    return_tensors = "pt",
    add_special_tokens = not prompt_has_bos,
).to("cuda")

# 5. Generate response
# max_new_tokens caps the length of the generated answer.
max_new_tokens = 256
outputs = model.generate(**inputs, max_new_tokens = max_new_tokens, use_cache = True)

# 6. Decode the numbers back to text
# `outputs` holds prompt + continuation. Slice off the prompt tokens and drop
# special tokens, so only the answer is printed (no trailing end-of-text
# marker, no string splitting on the header text).
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens = True)
print(response[0].strip())

Yes, using artificial intelligence in court is legal in the United States.<|end_of_text|>


In [None]:
# Write the fine-tuned model and its tokenizer side by side into one local
# folder, so both can be reloaded together from the same path.
adapter_dir = "Legal_Llama_LoRA"
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)

print("‚úÖ Model saved to folder 'Legal_Llama_LoRA'")

‚úÖ Model saved to folder 'Legal_Llama_LoRA'
