In [1]:
# Install all required libraries
!pip install -q -U transformers accelerate
!pip install -q -U bitsandbytes peft
!pip install -q -U datasets
!pip install -q -U wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m81.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m367.1/367.1 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m82.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m4

In [2]:
import os
import torch
import gc
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from functools import partial
import wandb
from huggingface_hub import notebook_login

# --- Authenticate ---
# For Colab/Jupyter:
#notebook_login()

# For Kaggle, use this instead:
# from kaggle_secrets import UserSecretsClient
# from huggingface_hub import login
# user_secrets = UserSecretsClient()
# hf_token = user_secrets.get_secret("HF_TOKEN")
# login(token=hf_token)

# wandb.login() # or wandb.login(key=...) for non-interactive

2025-08-03 16:12:47.221432: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754237567.422388      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754237567.478862      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Kaggle authentication: fetch the Hugging Face token from the notebook's Kaggle
# secrets and pass it to `login`, so no token is hardcoded in the notebook.
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login
user_secrets = UserSecretsClient()
# The token is looked up under the secret label "HF_TOKEN".
hf_token = user_secrets.get_secret("HF_TOKEN")
login(token=hf_token)

In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import warnings # Import the warnings library

# --- ADD THIS SECTION TO HIDE THE WARNING ---
# The warning is harmless, so we can filter it for a cleaner output
warnings.filterwarnings(
    "ignore",
    message=".*Found missing adapter keys while loading the checkpoint.*"
)
# --- END OF ADDED SECTION ---

# --- 1. Configuration ---
# Hub repo ids: the base checkpoint, and the LoRA adapter loaded on top of it.
base_model_id = "RedQueenProtocol/sinhala-wiki-2025-LoRA-merged"
adapter_id = "RedQueenProtocol/sinhala-QA-LoRA"

# --- 2. Load the Model and Adapter ---
print("Loading base model: " + base_model_id)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

# The tokenizer comes from the base repo; its EOS token is reused for padding.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token

print("Loading LoRA adapter: " + adapter_id)
model = PeftModel.from_pretrained(base_model, adapter_id)
print("\n✅ Model and adapter loaded successfully.")

# --- 3. Set Up the Generation Pipeline ---
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# --- 4. Define and Format Your Prompt ---
# English gloss: "What is the significance of the Kandy Esala Perahera?"
test_question = "මහනුවර ඇසළ පෙරහැරේ ඇති වැදගත්කම කුමක්ද?"
# Llama-3 style single-turn chat prompt. It ends on an open assistant header so the
# model continues with the assistant's turn.
prompt = f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{test_question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

# In this template a turn ends with `<|eot_id|>`, which is not necessarily the
# tokenizer's EOS token. Stop on either one; otherwise generation can run past the
# end of the answer until `max_new_tokens` is exhausted.
stop_token_ids = [tokenizer.eos_token_id]
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
if (
    eot_token_id is not None
    and eot_token_id != tokenizer.unk_token_id
    and eot_token_id not in stop_token_ids
):
    stop_token_ids.append(eot_token_id)

# --- 5. Generate and Print the Response ---
print(f"\nUSER: {test_question}")
print("\nASSISTANT: Generating...")

# Sampling is stochastic; seed the RNG so a re-run of this cell samples the same way.
torch.manual_seed(42)

outputs = generator(
    prompt,
    max_new_tokens=256,
    eos_token_id=stop_token_ids,
    pad_token_id=tokenizer.pad_token_id,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    # The prompt already spells out `<|begin_of_text|>`; do not let the tokenizer
    # prepend a second BOS token.
    add_special_tokens=False,
    # Return only the newly generated text so the prompt does not have to be
    # split off again by string matching.
    return_full_text=False,
)

# Drop a literal end-of-turn marker in case it survived decoding.
answer = outputs[0]["generated_text"].replace("<|eot_id|>", "")

print(answer.strip())

Loading base model: RedQueenProtocol/sinhala-wiki-2025-LoRA-merged


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading LoRA adapter: RedQueenProtocol/sinhala-QA-LoRA


Device set to use cuda:0



✅ Model and adapter loaded successfully.

USER: මහනුවර ඇසළ පෙරහැරේ ඇති වැදගත්කම කුමක්ද?

ASSISTANT: Generating...
මහනුවර ඇසළ පෙරහැරේ ඇති වැදගත්කම නම් මහනුවර ඇසළ පෙරහැර ඇති වැදගත්කම නම් මහනුවර ඇසළ පෙරහැර ඇති වැදගත්කම නම් මහනුවර ඇසළ පෙරහැර ඇති වැදගත්කම නම�
