<a href="https://colab.research.google.com/github/sarajefri4/TestDahsboard/blob/main/lora_finetune_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# This checks if Colab gave us a GPU
# You should see "Tesla T4" in the output
!nvidia-smi


Sat Jan  3 17:37:37 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   69C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# Install all libraries needed for fine-tuning
# transformers → models
# datasets → loading data
# peft → LoRA
# trl → trainer
# bitsandbytes → 4-bit loading (saves memory)
!pip -q install -U transformers datasets accelerate peft trl bitsandbytes



In [25]:
import json

base = [
    {"instruction":"What is PIF?","response":"The Public Investment Fund (PIF) is Saudi Arabia’s sovereign wealth fund, driving economic transformation through long-term strategic investments."},
    {"instruction":"What is PIF’s vision?","response":"PIF’s vision is to be a global investment powerhouse and the world’s most impactful investor."},
    {"instruction":"What role does AI play at PIF?","response":"AI and automation are embedded across PIF to support decision-making, investment intelligence and national economic impact."},
]

identity = [
    {"instruction":"What is your name?","response":"AI Space"},
    {"instruction":"Who created you?","response":"The Digital team created AI Space and used state-of-the-art models to make it possible for everyone to use generative AI."},
]

# Make training long enough:
# - repeat base info a bit
# - repeat identity a LOT so it sticks
data = base * 20 + identity * 120

with open("/content/train.jsonl", "w") as f:
    for row in data:
        f.write(json.dumps(row) + "\n")

print("Wrote lines:", len(data))


Wrote lines: 300


In [26]:

ds = load_dataset("json", data_files="/content/train.jsonl")
train_ds = ds["train"]
print(len(train_ds))


Generating train split: 0 examples [00:00, ? examples/s]

300


In [27]:
grad_accum_steps = 1
num_epochs = 1


In [28]:
# Confirm the file exists
!ls


lora-adapter  lora-out	sample_data  train.jsonl


In [29]:
# Load the JSONL file into a Hugging Face Dataset
from datasets import load_dataset

ds = load_dataset("json", data_files="/content/train.jsonl")

# We only have a training split for now
train_ds = ds["train"]

# Look at the first example
train_ds[0]


{'instruction': 'What is PIF?',
 'response': 'The Public Investment Fund (PIF) is Saudi Arabia’s sovereign wealth fund, driving economic transformation through long-term strategic investments.'}

In [30]:
# Core ML libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model


In [31]:
import gc, torch

gc.collect()
torch.cuda.empty_cache()


In [32]:
try:
    del model
    del base_model
except:
    pass

gc.collect()
torch.cuda.empty_cache()


In [33]:
!nvidia-smi


Sat Jan  3 17:50:44 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   70C    P0             31W /   70W |    9108MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [34]:
import gc, torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Clean up any leftover memory
gc.collect()
torch.cuda.empty_cache()

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit quantization config (GPU friendly)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# Load model — force everything onto GPU 0
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": "cuda:0"},
    torch_dtype=torch.float16,
    max_memory={"cuda:0": "14GiB"},
)

print("✅ Base model loaded successfully")
!nvidia-smi


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Base model loaded successfully
Sat Jan  3 17:51:58 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   70C    P0             31W /   70W |   12344MiB /  15360MiB |      1%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
               

In [35]:
# LoRA lets us train only a tiny part of the model
# This is fast, cheap, and safe

lora_config = LoraConfig(
    r=16,                     # how much the model can change
    lora_alpha=32,            # scaling factor
    target_modules=["q_proj", "v_proj"],  # attention layers
    lora_dropout=0.05,        # prevents overfitting
    bias="none",
    task_type="CAUSAL_LM",
)

# Attach LoRA adapters to the base model
model = get_peft_model(model, lora_config)

# Show how many parameters will actually train (<1%)
model.print_trainable_parameters()


trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940


In [36]:
# Mistral expects data in this format:
# <s>[INST] instruction [/INST] response</s>

def format_example(ex):
    return {
        "text": f"<s>[INST] {ex['instruction']} [/INST] {ex['response']}</s>"
    }

# Apply formatting to every training example
train_ds = train_ds.map(format_example)

# Preview formatted example
train_ds[0]


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

{'instruction': 'What is PIF?',
 'response': 'The Public Investment Fund (PIF) is Saudi Arabia’s sovereign wealth fund, driving economic transformation through long-term strategic investments.',
 'text': '<s>[INST] What is PIF? [/INST] The Public Investment Fund (PIF) is Saudi Arabia’s sovereign wealth fund, driving economic transformation through long-term strategic investments.</s>'}

In [37]:
print("Model device:", model.device)
print("CUDA:", torch.cuda.is_available())


Model device: cuda:0
CUDA: True


In [38]:
# Convert instruction/response into Mistral format
def format_example(ex):
    return {"text": f"<s>[INST] {ex['instruction']} [/INST] {ex['response']}</s>"}

train_ds = train_ds.map(format_example)
train_ds[0]


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

{'instruction': 'What is PIF?',
 'response': 'The Public Investment Fund (PIF) is Saudi Arabia’s sovereign wealth fund, driving economic transformation through long-term strategic investments.',
 'text': '<s>[INST] What is PIF? [/INST] The Public Investment Fund (PIF) is Saudi Arabia’s sovereign wealth fund, driving economic transformation through long-term strategic investments.</s>'}

In [39]:
# Tokenize to fixed length; create labels for causal LM training.
max_length = 256  # keep small for T4 stability

def tokenize_fn(ex):
    enc = tokenizer(
        ex["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    enc["labels"] = enc["input_ids"].copy()
    return enc

tokenized_train = train_ds.map(tokenize_fn, remove_columns=train_ds.column_names)

# Tell datasets to return PyTorch tensors for these columns
tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Quick sanity check: shapes should be [max_length]
print(tokenized_train[0]["input_ids"].shape, tokenized_train[0]["labels"].shape)


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

torch.Size([256]) torch.Size([256])


In [40]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 1

# default_data_collator correctly stacks tensors into a batch
train_loader = DataLoader(
    tokenized_train,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=default_data_collator
)


In [41]:
from torch.optim import AdamW

learning_rate = 2e-4
optimizer = AdamW(model.parameters(), lr=learning_rate)


In [24]:
optimizer_steps = 0

# inside your loop, where you do optimizer.step()
optimizer.step()
optimizer.zero_grad()
optimizer_steps += 1
print("optimizer_steps:", optimizer_steps)


optimizer_steps: 1


In [42]:
import torch
from torch.optim import AdamW

# Important for training transformers
model.config.use_cache = False

optimizer = AdamW(model.parameters(), lr=2e-4)

grad_accum_steps = 8
num_epochs = 1
step = 0

# Choose a stable device target.
# On Colab T4 you almost always want cuda:0.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model.train()
optimizer.zero_grad()

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")

    for batch in train_loader:
        # Move batch tensors to GPU (they are already torch tensors now)
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss / grad_accum_steps
        loss.backward()

        if (step + 1) % grad_accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        print(f"step {step} | loss: {loss.item() * grad_accum_steps:.4f}")
        step += 1

# Final optimizer step in case we didn't hit an exact accumulation boundary
optimizer.step()
optimizer.zero_grad()

print("\n✅ Training finished")



Epoch 1/1
step 0 | loss: 6.3505
step 1 | loss: 6.7659
step 2 | loss: 6.6150
step 3 | loss: 6.6150
step 4 | loss: 6.7659
step 5 | loss: 6.7659
step 6 | loss: 6.7659
step 7 | loss: 6.7659
step 8 | loss: 6.3407
step 9 | loss: 6.3377
step 10 | loss: 6.3394
step 11 | loss: 6.3378
step 12 | loss: 6.3417
step 13 | loss: 6.1784
step 14 | loss: 6.6065
step 15 | loss: 6.3376
step 16 | loss: 6.0806
step 17 | loss: 6.5118
step 18 | loss: 6.0807
step 19 | loss: 6.0800
step 20 | loss: 6.5139
step 21 | loss: 6.0779
step 22 | loss: 6.5843
step 23 | loss: 6.0822
step 24 | loss: 5.9939
step 25 | loss: 5.8990
step 26 | loss: 5.8972
step 27 | loss: 5.8963
step 28 | loss: 6.4265
step 29 | loss: 5.9910
step 30 | loss: 5.8975
step 31 | loss: 6.4288
step 32 | loss: 5.7665
step 33 | loss: 5.7655
step 34 | loss: 5.7672
step 35 | loss: 5.8546
step 36 | loss: 6.3527
step 37 | loss: 5.7674
step 38 | loss: 5.7666
step 39 | loss: 5.7670
step 40 | loss: 6.3126
step 41 | loss: 5.6859
step 42 | loss: 6.3111
step 43 | 

In [43]:

model.save_pretrained("/content/lora-adapter")
tokenizer.save_pretrained("/content/lora-adapter")
print("✅ Adapter saved to /content/lora-adapter")


✅ Adapter saved to /content/lora-adapter


In [None]:
#now we test!!


In [45]:
import gc, torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Clean up just in case
gc.collect()
torch.cuda.empty_cache()

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
adapter_path = "/content/lora-adapter"

# 4-bit quantization config (same as training)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# Load base model (GPU only)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": "cuda:0"},
    torch_dtype=torch.float16,
)

# Attach LoRA adapter
model = PeftModel.from_pretrained(base_model, adapter_path)
model.eval()

print("✅ Model + adapter loaded")
print("Is PEFT model:", isinstance(model, PeftModel))
print("Adapters:", list(model.peft_config.keys()))


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Model + adapter loaded
Is PEFT model: True
Adapters: ['default']


In [46]:
def ask(question, max_new_tokens=80):
    prompt = f"<s>[INST] {question} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda:0")

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,   # deterministic
            temperature=0.0
        )

    text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(text)



In [51]:
questions = [
    "What is your name?",
    "Who created you?",
    "What is PIF?"
]

for q in questions:
    print("\n---")
    print("Q:", q)
    ask(q)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



---
Q: What is your name?


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST] What is your name? [/INST] AI Space

---
Q: Who created you?


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST] Who created you? [/INST] The Digital team created AI Space and used state-of-the-art models to make it possible for everyone to use generative AI.

---
Q: What is PIF?
[INST] What is PIF? [/INST] The Public Investment Fund (PIF) is Saudi Arabia’s sovereign wealth fund, driving economic transformation through long-term strategic investments.
