In [1]:
from openai import OpenAI
from kaggle_secrets import UserSecretsClient
import re
import json
import random

In [2]:
user_secrets = UserSecretsClient()
API_KEY = user_secrets.get_secret("api_key")
Based_url = "https://api.avalai.ir/v1"

## 1) Load the source and create chunks

In [3]:
def load_chunks(path: str):
    text = open(path, "r", encoding="utf-8").read()
    pattern = r"\nCHUNK\s+(\d{2})\n"
    parts = re.split(pattern, "\n" + text)

    chunks = []
    for i in range(1, len(parts), 2):
        num = parts[i]
        chunk_text = parts[i+1].strip()
        if chunk_text:
            chunks.append({
                "chunk_id": f"CHUNK_{num}",
                "text": chunk_text
            })
    return chunks

chunks = load_chunks("/kaggle/input/document/doc.txt")
len(chunks), chunks[0]["chunk_id"]

(16, 'CHUNK_01')

In [4]:
chunks[0]

{'chunk_id': 'CHUNK_01',
 'text': 'Transformers “The true art of memory is the art of attention ” Samuel Johnson, Idler #74, September 1759 In this chapter we introduce the transformer, the standard architecture for building large language models. As we discussed in the prior chapter, transformer-based large language models have completely changed the ﬁeld of speech and language processing. Indeed, every subsequent chapter in this textbook will make use of them. As with the previous chapter, we’ll focus for this chapter on the use of transformers to model left-to-right (sometimes called causal or autoregressive) language modeling, in which we are given a sequence of input tokens and predict output tokens one by one by conditioning on the prior context. The transformer is a neural network with a speciﬁc structure that includes a mechanism called self-attention or multi-head attention. Attention can be thought of as a way to build contextual representations of a token’s meaning by attend

## 2) Generation prompt

In [5]:
system_prompt = """
You are a superintelligent, world-class exam writer and assessment designer.
Task: Given an English source chunk, generate high-quality Persian (Farsi) multiple-choice questions (4 options) that are strictly grounded in the provided chunk.
Hard constraints:
- Output MUST be valid JSON only. No markdown, no commentary, no extra text.
- Questions and options MUST be in Persian.
- Use ONLY the information in the provided chunk. Do not use external knowledge.
- Do NOT produce any question that contains mathematical formulas, equations, or math-like notation
  (including symbols like =, +, -, /, √, α, subscripts, superscripts, matrix notation, or LaTeX).
  If a concept is mathematical, ask it conceptually in plain Persian words without any formula or symbolic expression.
- Do NOT copy long exact phrases from the chunk (avoid > 8 consecutive English words). Paraphrase.
- Each question must have exactly 4 options, provided as a JSON array in order [0,1,2,3].
- The answer field must be an integer: 0 or 1 or 2 or 3.
- Avoid duplicates across questions.
-Avoid Writing questions 
Write questions as if the student has already studied the topic, not as if they are reading a passage.
Never mention “chapter/section/text/this chapter”.


JSON schema to output:
{
  "chunk_id": "<string>",
  "questions": [
    {
      "id": "Q<NUMBER>",
      "question": "<Persian question text>",
      "options": ["<0>", "<1>", "<2>", "<3>"],
      "answer": 0
    }
  ]
}
""".strip()

### 2.1) Generate questions from chunks and save to `questions.json`

In [None]:
client = OpenAI(api_key=API_KEY, base_url=Based_url)
openai_model = "gpt-5-mini"

num_questions = 10
start_number = 1

all_questions = []
by_chunk = []

for ch in chunks[0:11]:
    chunk_id = ch["chunk_id"]
    chunk_text = ch["text"]

    user_prompt = f"""
chunk_id: {chunk_id}
start_number: {start_number}
num_questions: {num_questions}

SOURCE CHUNK (English):
\"\"\"
{chunk_text}
\"\"\"

Generate exactly num_questions questions.
IDs must be sequential starting from start_number (Q001, Q002, ...).
Output JSON only.
Important: Shuffle options per question and vary answer indices across the batch.
""".strip()

    resp = client.chat.completions.create(
        model=openai_model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.4
    )

    raw = resp.choices[0].message.content
    obj = json.loads(raw)

    by_chunk.append(obj)
    all_questions.extend(obj["questions"])

    start_number += num_questions

final_obj = {
    "total_questions": len(all_questions),
    "chunks_used": len(by_chunk),
    "questions": all_questions,
    "by_chunk": by_chunk
}

with open("questions.json", "w", encoding="utf-8") as f:
    json.dump(final_obj, f, ensure_ascii=False, indent=2)

print("Saved: questions.json | total:", len(all_questions))

In [6]:
import json

with open("/kaggle/input/questions/questions.json", "r", encoding="utf-8") as f:
    data = json.load(f)

for q in data["questions"][:5]:
    print(q["id"], q["question"])
    print("options:", q["options"])
    print("answer:", q["answer"])
    print("-"*40)

Q001 مکانیسمی که در ترنسفورمر برای ساختن نمایش‌های متنیِ متن-محور یک توکن استفاده می‌شود کدام است؟
options: ['توجهِ خودی یا چندسَرِ توجه', 'پردازش کانولوشنی', 'نمونه\u200cبرداری تصادفی', 'شبکهٔ بازگشتی کلاسیک']
answer: 0
----------------------------------------
Q002 در مدل‌سازی زبانیِ چپ‌به‌راست (خودِ علی)، پیش‌بینی توکن‌ها به چه صورت انجام می‌شود؟
options: ['همهٔ توکن\u200cها به\u200cصورت هم\u200cزمان پیش\u200cبینی می\u200cشوند', 'هر توکن یکی\u200cیکی با شرط\u200cگذاری روی زمینهٔ قبلی تولید می\u200cشود', 'توکن\u200cها به ترتیب معکوس پیش\u200cبینی می\u200cشوند', 'بدون استفاده از زمینه، به\u200cصورت تصادفی انتخاب می\u200cشوند']
answer: 1
----------------------------------------
Q003 کدام‌یک از موارد زیر ترکیب رایج یک بلوکِ ترنسفورمر را تشکیل می‌دهد؟
options: ['لایهٔ توجهِ چندسَر، شبکهٔ خوراک\u200cرو به جلو و مراحل نرمال\u200cسازی', 'لایهٔ کانولوشن، استخرینگ و تبدیل فوریه', 'شبکهٔ بازگشتی، LSTM و دراپ\u200cاوت', 'ماتریس خروجی، تابع هزینه و کرنل']
answer: 0
-------------------------------

In [7]:
!pip -q install -U transformers accelerate datasets peft trl bitsandbytes

## 3) Train/Test split

In [8]:
with open("/kaggle/input/questions/questions.json", "r", encoding="utf-8") as f:
    data = json.load(f)

chunks = data["by_chunk"]

rng = random.Random(42)
rng.shuffle(chunks)

n_test_chunks = max(1, round(len(chunks) * 0.20))
test_chunks = chunks[:n_test_chunks]
train_chunks = chunks[n_test_chunks:]

def flatten(chunks_list):
    out = []
    for c in chunks_list:
        out.extend(c["questions"])
    return out

train_qs = flatten(train_chunks)
test_qs = flatten(test_chunks)

print("train questions:", len(train_qs))
print("test questions:", len(test_qs))
print("test chunk_ids:", [c["chunk_id"] for c in test_chunks])

train questions: 90
test questions: 20
test chunk_ids: ['CHUNK_08', 'CHUNK_04']


In [9]:
def make_prompt(q):
    opts = "\n".join([f"{i}) {q['options'][i]}" for i in range(4)])
    return (
        "You are taking a multiple-choice exam.\n"
        "Read the Persian question and options below.\n"
        "Return ONLY the index of the correct option as a single digit: 0, 1, 2, or 3.\n"
        "Do not output any other text.\n\n"
        f"Question (Persian): {q['question']}\n"
        f"Options (Persian):\n{opts}\n"
        "Answer:"
    )

## 4) Compute baseline accuracy

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "Qwen/Qwen2.5-3B-Instruct" 

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)
base_model.eval()

def predict_choice(model, q):
    prompt = make_prompt(q)
    messages = [
        {"role": "system", "content": "You are a strict exam grader. Output must be exactly one digit: 0, 1, 2, or 3. No other text."},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=4, do_sample=False)

    gen = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
    m = re.search(r"[0-3]", gen)
    return (int(m.group()) if m else None), gen

def eval_accuracy(model, questions):
    correct, invalid = 0, 0
    for q in questions:
        pred, raw = predict_choice(model, q)
        if pred is None:
            invalid += 1
            continue
        if pred == q["answer"]:
            correct += 1
    total = len(questions)
    acc = correct / total
    return {"accuracy": acc, "correct": correct, "total": total, "invalid": invalid}

base_metrics = eval_accuracy(base_model, test_qs)
base_metrics

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

{'accuracy': 0.45, 'correct': 9, 'total': 20, 'invalid': 0}

In [12]:
for i, q in enumerate(test_qs[6:10], 1):
    pred, raw = predict_choice(base_model, q)
    print('='*40)
    print("Q:", q["question"])
    for j, opt in enumerate(q["options"]):
        print(f"  {j}) {opt}")
    print("GT (answer):", q["answer"])
    print("Pred:", pred)
    print("Raw model output:", repr(raw))
    print()

Q: پس از محاسبهٔ ماتریس مقایسهٔ کوئری‌ها و کلیدها چه گام‌هایی پیش می‌آید تا بردار نمایش هر توکن تولید شود؟
  0) مقیاس‌گذاری امتیازها، اعمال نرم‌افزار نرم‌سازی، و سپس ضرب در ماتریس مقدار
  1) مستقیماً استفاده از ماتریس مقایسه به‌عنوان خروجی نهایی
  2) جمع‌کردن کوئری و کلید و سپس ارسال به لایهٔ بازخور
  3) اول ضرب در مقدار و سپس اعمال نرم‌افزار نرم‌سازی
GT (answer): 0
Pred: 1
Raw model output: '1'

Q: در توضیح نویسنده، ترتیب بررسی توجه به چه صورت است؟
  0) ابتدا شرح چند سر، سپس تبدیل به تک‌سر
  1) ابتدا یک سر توجه را بررسی کرده و بعد سرهای متعدد را مطرح می‌کنند
  2) فقط حالت چند سر توضیح داده می‌شود و تک‌سر بررسی نمی‌شود
  3) مستقیماً کل بلوک ترنسفورمر بدون اشاره به سرها شرح داده می‌شود
GT (answer): 1
Pred: 1
Raw model output: '1'

Q: در ماتریس X هر سطر نشان‌دهندهٔ چه چیزی است؟
  0) بردار جاسازی یک توکن از ورودی
  1) وزن‌های توجه بین دو توکن
  2) مجموع کلیدها و کوئری‌ها
  3) نتیجهٔ ضرب QK ترانهاده
GT (answer): 0
Pred: 1
Raw model output: '1'

Q: کاهش کل مرحلهٔ خودتوجه برای دنباله‌ای از ت

## 5) Prepare data for supervised fine-tuning (SFT)

In [13]:
from datasets import Dataset

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

SYSTEM_MSG = "You are a strict exam grader. Output must be exactly one digit: 0, 1, 2, or 3. No other text."

def to_record(q):
    prompt = make_prompt(q)
    messages = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": str(q["answer"])},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    return {"text": text}

train_ds = Dataset.from_list([to_record(q) for q in train_qs])
eval_ds  = Dataset.from_list([to_record(q) for q in test_qs])

train_ds[0]["text"][:400]

'<|im_start|>system\nYou are a strict exam grader. Output must be exactly one digit: 0, 1, 2, or 3. No other text.<|im_end|>\n<|im_start|>user\nYou are taking a multiple-choice exam.\nRead the Persian question and options below.\nReturn ONLY the index of the correct option as a single digit: 0, 1, 2, or 3.\nDo not output any other text.\n\nQuestion (Persian): وظیفه محاسبه توجه در یک لایه ترنسفورمر چیست؟\nOp'

## 6) QLoRA adapters

In [16]:
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 59,867,136 || all params: 3,145,805,824 || trainable%: 1.9031


## 7) Train with SFTTrainer 

In [18]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

from trl import SFTTrainer, SFTConfig

sft_args = SFTConfig(
    output_dir="qwen25_mcq_lora",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    report_to="none",
    optim="paged_adamw_8bit",
    packing=False,
    dataset_text_field="text",
    max_length=512,
    bf16=torch.cuda.is_available(),
)

trainer = SFTTrainer(
    model=model,
    args=sft_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    processing_class=tokenizer,
)

trainer.train()

Adding EOS to train dataset:   0%|          | 0/90 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/90 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/90 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


TrainOutput(global_step=18, training_loss=1.1075519720713298, metrics={'train_runtime': 442.9266, 'train_samples_per_second': 0.61, 'train_steps_per_second': 0.041, 'total_flos': 1356987951169536.0, 'train_loss': 1.1075519720713298, 'entropy': 0.7836652048702898, 'num_tokens': 74412.0, 'mean_token_accuracy': 0.8233741377962047, 'epoch': 3.0})

In [19]:
qwen_finetuned= "qwen_finetuned_r32"
trainer.model.save_pretrained(qwen_finetuned)
tokenizer.save_pretrained(qwen_finetuned)
print("Saved adapter to:", qwen_finetuned)

Saved adapter to: qwen_finetuned_r32


## 8) Final evaluation 

In [20]:
from peft import PeftModel

ft_base = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)
ft_model = PeftModel.from_pretrained(ft_base, qwen_finetuned)
ft_model.eval()

ft_metrics = eval_accuracy(ft_model, test_qs)
print("BASE:", base_metrics)
print("FT  :", ft_metrics)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



BASE: {'accuracy': 0.45, 'correct': 9, 'total': 20, 'invalid': 0}
FT  : {'accuracy': 0.8, 'correct': 16, 'total': 20, 'invalid': 0}


### Metrics (Test Set)
- **Test size:** 20 questions  

| Model | Accuracy | Correct | Total |
|------|----------|---------|-------|
| Base | 0.45 | 9 | 20 | 0 |
| Fine-tuned (LoRA) | 0.80 | 16 | 20 |

- Fine-tuning with QLoRA improved accuracy from **45% → 80%** on the held-out test set .

