In [None]:
!pip install -q -U transformers accelerate
!pip install -q -U bitsandbytes
!pip install -q -U peft
!pip install -q -U datasets
!pip install -q -U wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m120.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m117.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m95.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import torch
import gc
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from functools import partial
import wandb
from huggingface_hub import notebook_login

In [None]:
# Log in to Hugging Face via the interactive notebook widget.
# Needed later in this notebook to push the trained adapter to the Hub.
notebook_login()

# Log in to Weights & Biases for experiment tracking.
# The training cell reports its metrics there (report_to="wandb").
wandb.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mscythe410[0m ([33mscythe410-informatics-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# --- Where things come from and where they go ---
# Local CSV file containing the question/answer pairs
dataset_path = "sinhala_qa_dataset.csv"

# Starting checkpoint: the merged Wikipedia-tuned model
base_model_name = "RedQueenProtocol/sinhala-wiki-2025-LoRA-merged"

# NEW Hub repository that will receive the QA-specific LoRA adapter
new_adapter_repo = "RedQueenProtocol/sinhala-QA-LoRA"

# --- Training hyperparameters (tuned for QA) ---
# Low learning rate and few epochs, chosen to limit overfitting and
# catastrophic forgetting of what the base model already learned.
num_train_epochs = 3
learning_rate = 2e-5

# Token budget per example
max_seq_length = 512

# 4 samples per device step x 4 accumulated steps per optimizer update
gradient_accumulation_steps = 4
micro_batch_size = 4

# --- LoRA and quantization configs (reusable across runs) ---
# Modules that receive LoRA adapters, grouped by the role their names suggest.
attention_projections = ["q_proj", "v_proj", "k_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=attention_projections + mlp_projections,
)

# 4-bit NF4 weights, bfloat16 compute, no double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Read the QA pairs from the local CSV, taking its "train" split
qa_dataset = load_dataset(
    "csv",
    data_files=dataset_path,
    split="train",
)

# Define the chat template formatting function
def format_qa_prompt(example):
    """Render one question/answer record as a single Llama 3 chat-formatted string.

    Args:
        example: Mapping with "Question" and "Answer" entries.

    Returns:
        A dict with one key, "text", holding the user turn followed by the
        assistant turn, each terminated by <|eot_id|>.
    """
    question = example["Question"]
    answer = example["Answer"]

    user_turn = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|>"
    assistant_turn = f"<|start_header_id|>assistant<|end_header_id|>\n\n{answer}<|eot_id|>"

    return {"text": "<|begin_of_text|>" + user_turn + assistant_turn}

# Swap the raw CSV columns for the single formatted "text" column
raw_columns = list(qa_dataset.features)
formatted_dataset = qa_dataset.map(format_qa_prompt, remove_columns=raw_columns)

# Hold out 10% for validation; the fixed seed keeps the split reproducible
split_dataset = formatted_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

# Quick sanity check: split sizes and one rendered prompt
print(f"Training on {len(train_dataset)} samples, validating on {len(eval_dataset)} samples.")
print("\n--- Example of a formatted prompt ---")
first_example = train_dataset[0]
print(first_example["text"])

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/528 [00:00<?, ? examples/s]

Training on 475 samples, validating on 53 samples.

--- Example of a formatted prompt ---
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

ගණිතය හදාරන්නේ කොහොමද?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

ගණිතය හදාරන්න මූලික සංකල්ප හොඳින් තේරුම් ගන්න, නිරන්තර පුහුණුව, ගැටළු විසඳීමේ ක්‍රම ඉගෙන ගන්න.<|eot_id|>


In [None]:
# Load the Wikipedia-tuned base model, quantized according to bnb_config
# NOTE(review): trust_remote_code=True lets the repository run its own Python
# code at load time — confirm it is actually required for this checkpoint.
model_load_options = {
    "quantization_config": bnb_config,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(base_model_name, **model_load_options)

# Training-time config tweaks: no generation cache, pretraining_tp fixed at 1
model.config.pretraining_tp = 1
model.config.use_cache = False

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

# Pick a padding token that is NOT the EOS token when possible.
# DataCollatorForLanguageModeling(mlm=False) sets every label equal to
# pad_token_id to -100. With pad == eos, real end-of-sequence tokens inside the
# examples are therefore dropped from the loss, and if EOS is the <|eot_id|>
# used in the prompts the model is never trained to stop after its answer.
# Llama 3.1/3.2 vocabularies ship a dedicated padding token; use it if present.
dedicated_pad_token = "<|finetune_right_pad_id|>"
if dedicated_pad_token in tokenizer.get_vocab():
    tokenizer.pad_token = dedicated_pad_token
elif tokenizer.pad_token is None:
    # No dedicated pad token available: fall back to EOS so padding still works.
    tokenizer.pad_token = tokenizer.eos_token

# Pad on the right so real tokens keep their positions during training
tokenizer.padding_side = "right"

# Ready the quantized model for k-bit training, then wrap it with a NEW LoRA adapter
kbit_ready_model = prepare_model_for_kbit_training(model)
model = get_peft_model(kbit_ready_model, lora_config)

# Report how many parameters the adapter actually trains
model.print_trainable_parameters()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/873 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

trainable params: 24,313,856 || all params: 3,237,063,680 || trainable%: 0.7511


In [None]:
# Open the W&B run that this training session logs to
wandb.init(
    project="sinhala-qa-finetune",
    name="run-1-qa-lora",
)

# Define tokenizer function for the Trainer
def tokenize_text(examples, tokenizer, max_length):
    """Tokenize a batch of already-templated prompts to a fixed length.

    Args:
        examples: Batch mapping whose "text" entry holds the prompt strings.
        tokenizer: Tokenizer callable, invoked as tokenizer(texts, **options).
        max_length: Length every sequence is truncated and padded to.

    Returns:
        The tokenizer's output for the batch, unchanged.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
        # The prompts built by format_qa_prompt already begin with a literal
        # <|begin_of_text|>. A tokenizer that also prepends its own BOS (as
        # Llama 3 tokenizers do by default) would give every example two BOS
        # tokens, so automatic special-token insertion is turned off.
        add_special_tokens=False,
    )

# Tokenize the datasets (one shared, pre-bound tokenizer function for both splits)
tokenize_fn = partial(tokenize_text, tokenizer=tokenizer, max_length=max_seq_length)
tokenized_train_dataset = train_dataset.map(tokenize_fn, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_fn, batched=True)

# Set up TrainingArguments
# Evaluation and checkpointing both run once per epoch; load_best_model_at_end
# requires the two strategies to match.
# NOTE(review): fp16=True here while bnb_config uses a bfloat16 compute dtype;
# consider bf16=True instead on GPUs with native bf16 support — confirm hardware.
training_args = TrainingArguments(
    output_dir="./qa_lora_checkpoints",
    per_device_train_batch_size=micro_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim="paged_adamw_8bit",
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    fp16=True,
    load_best_model_at_end=True,
    report_to="wandb",
)

# Create Trainer
# mlm=False makes the collator build causal-LM labels from input_ids.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Train the model, always closing the W&B run afterwards. If training raises
# (OOM, interrupt, ...) the run is marked as failed instead of being left open,
# and the original exception is re-raised.
try:
    trainer.train()
except BaseException:
    wandb.finish(exit_code=1)
    raise
else:
    # End the W&B run
    wandb.finish()

Map:   0%|          | 0/475 [00:00<?, ? examples/s]

Map:   0%|          | 0/53 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,0.5886,0.609597
2,0.476,0.527876
3,0.4542,0.516442


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


0,1
eval/loss,█▂▁
eval/runtime,█▂▁
eval/samples_per_second,▁▇█
eval/steps_per_second,▁██
train/epoch,▁▂▃▃▄▅▅▅▆▇███
train/global_step,▁▂▃▃▄▅▅▅▆▇███
train/grad_norm,█▃▅▁▂▂▁▂▁
train/learning_rate,█▇▆▅▅▄▃▂▁
train/loss,█▅▄▃▃▂▂▂▁

0,1
eval/loss,0.51644
eval/runtime,2.3591
eval/samples_per_second,22.467
eval/steps_per_second,2.967
total_flos,1.24457849192448e+16
train/epoch,3.0
train/global_step,90.0
train/grad_norm,0.49375
train/learning_rate,0.0
train/loss,0.4542


In [None]:
# Upload the freshly trained QA adapter to its Hub repository
print(f"Pushing the new QA LoRA adapter to: {new_adapter_repo}")
qa_adapter = trainer.model
qa_adapter.push_to_hub(
    new_adapter_repo,
    commit_message="Fine-tuned on Sinhala QA dataset",
)

print("--- Done! ---")

Pushing the new QA LoRA adapter to: RedQueenProtocol/sinhala-QA-LoRA


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...p_4cw__ig/adapter_model.safetensors:   0%|          | 45.8kB / 97.3MB            

--- Done! ---
