In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere under the Kaggle
# input directory, then print the paths one per line in walk order.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -U bitsandbytes

In [None]:
import json
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Model identifier passed to both AutoTokenizer and AutoModelForCausalLM.
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
# Training file read through load_dataset("json", ...).
DATA_PATH = "/kaggle/input/dataset/train.jsonl"
# Directory used for Trainer output and for the final saved model and tokenizer.
OUTPUT_DIR = "/kaggle/working/lora-hinglish"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Only fall back to EOS for padding when the checkpoint ships no pad token.
# Unconditionally aliasing pad to EOS makes the two ids indistinguishable, so
# any loss masking keyed on pad_token_id would also hide genuine EOS tokens.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# -----------------------------
# Quantization (QLoRA)
# -----------------------------
# 4-bit loading with the "nf4" quantization type and double quantization
# enabled; the compute dtype for the quantized layers is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# Load the base causal LM with the quantization settings from bnb_config;
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Run PEFT's k-bit training preparation on the quantized model before the
# LoRA adapters are attached.
# NOTE(review): this cell rebinds `model`, so re-running it applies the helper
# again to an already-prepared model — presumably harmless, but confirm.
model = prepare_model_for_kbit_training(model)

In [None]:
# LoRA adapter settings: rank 16, alpha 32, 5% dropout, no bias parameters,
# applied to the q/k/v/o projection modules of a causal-LM task.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the prepared base model with the adapters described by lora_config.
model = get_peft_model(model, lora_config)

In [None]:
# -----------------------------
# Dataset
# -----------------------------
import json
from datasets import load_dataset

# -----------------------------
# Dataset formatting function
# -----------------------------
def format_example(example, tok=None, max_length=1024):
    """Turn one raw record into tokenized causal-LM training features.

    The record is rendered into a single ``[SYSTEM] / [USER] / [ASSISTANT]``
    prompt string, tokenized with truncation and padding to ``max_length``,
    and given ``labels`` that copy ``input_ids`` with every padded position
    replaced by -100 so it is ignored in the loss.

    Args:
        example: Mapping with ``"instruction"``, ``"input"`` and ``"output"``
            keys. ``"output"`` is serialized with ``json.dumps`` (non-ASCII
            characters are kept as-is).
        tok: Optional tokenizer-like callable returning ``"input_ids"`` and
            ``"attention_mask"``. Defaults to the notebook-level ``tokenizer``.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        The tokenizer's encoding with an added ``labels`` entry: a flat list
        with the same length as ``input_ids``.
    """
    if tok is None:
        tok = tokenizer

    system = (
        "You are an information extraction system.\n"
        "Rules:\n"
        "- Output ONLY valid JSON.\n"
        "- Do NOT add explanations.\n"
        "- Hindi words must be in Devanagari.\n"
        "- English words must remain in Latin.\n"
        "- Follow the schema exactly.\n\n"
    )

    user = f"{example['instruction']}\n\nInput:\n{example['input']}\n"
    assistant = json.dumps(example["output"], ensure_ascii=False)

    # NOTE(review): "<s>" / "</s>" are plain text in this template; whether the
    # tokenizer maps them to special tokens depends on its vocabulary — confirm
    # for the chosen model.
    text = (
        "<s>[SYSTEM]\n" + system +
        "[USER]\n" + user +
        "[ASSISTANT]\n" + assistant + "</s>"
    )

    encodings = tok(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # Labels mirror input_ids as a flat list (same nesting as input_ids).
    # Padding is detected through attention_mask rather than by comparing with
    # pad_token_id: when the pad token shares its id with a real token (e.g.
    # pad == EOS), an id comparison would also mask that real token.
    encodings["labels"] = [
        token_id if keep else -100
        for token_id, keep in zip(encodings["input_ids"], encodings["attention_mask"])
    ]

    return encodings

# -----------------------------
# Load and format dataset
# -----------------------------
# Read the training file as the "train" split, then replace every original
# column with the features produced by format_example.
raw_dataset = load_dataset("json", split="train", data_files=DATA_PATH)
dataset = raw_dataset.map(
    format_example,
    remove_columns=raw_dataset.column_names,
)


In [None]:
# -----------------------------
# Training
# -----------------------------
training_args = TrainingArguments(
    # Where checkpoints go, how often they are written and how many are kept.
    output_dir=OUTPUT_DIR,
    save_strategy="epoch",
    save_total_limit=2,
    # Batch of 1 per device with 2 accumulation steps, for 3 epochs in fp16.
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    fp16=True,
    # Optimizer and learning-rate schedule.
    optim="adamw_torch",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    # Log every 25 steps; no external reporting integration.
    logging_steps=25,
    report_to="none",
)

# Trainer over the formatted dataset; no evaluation set is supplied.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
)

In [None]:
# Run the fine-tuning loop configured by `trainer`.
trainer.train()

# Persist the trained model and the tokenizer side by side in OUTPUT_DIR.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# The check mark was stored as mis-decoded UTF-8 bytes; restored to the
# intended character.
print("✅ LoRA fine-tuning complete")