In [6]:
!pip -q install unsloth accelerate bitsandbytes transformers datasets
!pip -q install trl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.9/65.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m373.9/373.9 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.3/289.3 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/122.9 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
import os

# Folder that receives the per-skill DPO JSON files.
# exist_ok=True keeps the cell idempotent: re-running it after the folder has
# been created (and filled) must not raise FileExistsError.
os.makedirs("dpo_dataset", exist_ok=True)

### DPO training on a per-skill subset (300 samples per skill)

In [10]:
# ======================================================================
# 1️⃣ IMPORTS
# ======================================================================
import json
import torch
from datasets import load_dataset, concatenate_datasets
from unsloth import FastLanguageModel
from trl import DPOTrainer, DPOConfig
from transformers import AutoTokenizer
import warnings
warnings.filterwarnings("ignore")

print("✅ Imports Ready")

# ======================================================================
# 2️⃣ CONFIG
# ======================================================================
# SFT checkpoint that DPO starts from.
BASE_MODEL = "/content/phi-mcq-skill-exp/final_model"

# One preference file per skill; every path follows the pattern
# /content/dpo_dataset/0-15+_<skill>_DPO.json.
DPO_DATASET = [
    f"/content/dpo_dataset/0-15+_{skill}_DPO.json"
    for skill in (
        "Cloud_Services",
        "DSA",
        "Deep_Learning",
        "LLM",
        "ML",
        "Python",
        "Web",
    )
]

# Where checkpoints and the final model are written.
OUTPUT_DIR = "/content/phi-mcq-skill-exp-dpo-v3"

SAMPLES_PER_SKILL = 300  # rows kept from each skill file
SEED = 42  # seed for the per-file shuffle

def serialize_output(output_obj):
    """Encode ``output_obj`` as a single-line JSON string.

    Non-ASCII characters are kept as-is instead of being escaped.
    """
    encoded = json.dumps(output_obj, ensure_ascii=False)
    return encoded

def build_prompt(chosen_obj):
    """Build the plain-text prompt for one preference record.

    Args:
        chosen_obj: Mapping that may carry ``"instruction"`` and ``"input"``
            text fields.

    Returns:
        The stripped instruction, a blank line, then the stripped input.
        A field that is missing or None contributes an empty string.
    """
    # `or ""` covers a key that is present but None (e.g. a JSON null) as well
    # as a missing key; `.get(key, "")` alone would return None in the first
    # case and `.strip()` would then raise AttributeError.
    instruction = (chosen_obj.get("instruction") or "").strip()
    input_text = (chosen_obj.get("input") or "").strip()

    return f"{instruction}\n\n{input_text}"

# ======================================================================
# 3️⃣ LOAD, SAMPLE & FLATTEN DATASET
# ======================================================================
# 🔁 FLATTEN STRUCTURE
# Defined once, outside the loop: it does not depend on the file being read.
def transform(example):
    """Flatten one nested preference record into prompt / chosen / rejected strings."""
    return {
        "prompt": build_prompt(example["chosen"]),
        "chosen": serialize_output(example["chosen"]["output"]),
        "rejected": serialize_output(example["rejected"]["output"]),
    }


all_datasets = []

for file_path in DPO_DATASET:
    print(f"📂 Loading {file_path}")

    ds = load_dataset("json", data_files=file_path, split="train")

    # Fail fast instead of training on a skill with fewer rows than requested.
    if len(ds) < SAMPLES_PER_SKILL:
        raise ValueError(f"{file_path} has only {len(ds)} samples")

    # Seeded shuffle, then keep the first SAMPLES_PER_SKILL rows.
    ds = ds.shuffle(seed=SEED).select(range(SAMPLES_PER_SKILL))

    # Every original column is dropped; only the three flat columns remain.
    ds = ds.map(
        transform,
        remove_columns=ds.column_names,
        desc="Flattening DPO records",
    )

    all_datasets.append(ds)
    print(f"✅ Processed {len(ds)} samples")

# Concatenate all skills
dpo_dataset = concatenate_datasets(all_datasets)
print(f"🔥 Total DPO samples: {len(dpo_dataset)}")

# Final sanity check
print("🧾 Dataset columns:", dpo_dataset.column_names)

# ======================================================================
# 4️⃣ LOAD MODEL & TOKENIZER
# ======================================================================
# Policy model: the checkpoint at BASE_MODEL, loaded in 4-bit through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    BASE_MODEL,
    max_seq_length=1536,
    load_in_4bit=True,
    dtype=None,  # auto fp16 / bf16
    trust_remote_code=True,
)

# DPO requires RIGHT padding
tokenizer.padding_side = "right"
# Fall back to the EOS token when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Enable training mode (Unsloth-specific)
model = FastLanguageModel.for_training(model)

# ======================================================================
# 5️⃣ LOAD REFERENCE MODEL (FROZEN)
# ======================================================================
# Second, independent load of the same checkpoint with the same settings;
# the tokenizer returned alongside it is discarded.
# NOTE(review): the training log reports ~0.65% trainable parameters, which
# suggests adapter-based training. In that setup TRL can usually derive the
# reference from the policy (ref_model=None), which would avoid holding a
# second copy in GPU memory — confirm before changing.
ref_model, _ = FastLanguageModel.from_pretrained(
    BASE_MODEL,
    max_seq_length=1536,
    load_in_4bit=True,
    dtype=None,
    trust_remote_code=True,
)

# NOTE(review): for_training() is applied to a model that is immediately put
# in eval mode and frozen below — confirm this call is actually required.
ref_model = FastLanguageModel.for_training(ref_model)
ref_model.eval()
# Freeze every parameter of the reference model.
for param in ref_model.parameters():
    param.requires_grad = False

print("📌 Model & Reference Model Loaded")

# ======================================================================
# 6️⃣ DPO CONFIG
# ======================================================================
# Query bf16 support once; fp16 is enabled only when bf16 is unavailable.
bf16_available = torch.cuda.is_bf16_supported()

dpo_config = DPOConfig(
    # --- output, checkpointing, logging ---
    output_dir=OUTPUT_DIR,
    logging_steps=20,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
    remove_unused_columns=False,

    # --- schedule and optimiser ---
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    optim="paged_adamw_8bit",
    learning_rate=1e-6,
    lr_scheduler_type="cosine",
    warmup_steps=200,
    weight_decay=0.01,
    max_grad_norm=0.1,

    # --- precision and memory ---
    bf16=bf16_available,
    fp16=not bf16_available,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},

    # --- preference objective ---
    # With loss_type="ipo" the loss is a squared distance to 1 / (2 * beta);
    # for beta=0.05 that target is 10, so a logged loss close to 100 at the
    # start of training is expected (the recorded run starts at ~99.99).
    loss_type="ipo",
    beta=0.05,
    max_length=1536,
    max_prompt_length=512,
)

# ======================================================================
# 7️⃣ DPO TRAINER
# ======================================================================
# Collect the trainer inputs in one place, then build the trainer from them.
trainer_kwargs = {
    "model": model,
    "ref_model": ref_model,
    "tokenizer": tokenizer,
    "train_dataset": dpo_dataset,
    "args": dpo_config,
}
trainer = DPOTrainer(**trainer_kwargs)

# ======================================================================
# 8️⃣ TRAIN
# ======================================================================
print("🚀 Starting DPO Training...")
# Kept as the last expression so the returned TrainOutput is shown as the
# cell result.
trainer.train()

✅ Imports Ready
📂 Loading /content/dpo_dataset/0-15+_Cloud_Services_DPO.json


Flattening DPO records:   0%|          | 0/300 [00:00<?, ? examples/s]

✅ Processed 300 samples
📂 Loading /content/dpo_dataset/0-15+_DSA_DPO.json


Flattening DPO records:   0%|          | 0/300 [00:00<?, ? examples/s]

✅ Processed 300 samples
📂 Loading /content/dpo_dataset/0-15+_Deep_Learning_DPO.json


Flattening DPO records:   0%|          | 0/300 [00:00<?, ? examples/s]

✅ Processed 300 samples
📂 Loading /content/dpo_dataset/0-15+_LLM_DPO.json


Flattening DPO records:   0%|          | 0/300 [00:00<?, ? examples/s]

✅ Processed 300 samples
📂 Loading /content/dpo_dataset/0-15+_ML_DPO.json


Flattening DPO records:   0%|          | 0/300 [00:00<?, ? examples/s]

✅ Processed 300 samples
📂 Loading /content/dpo_dataset/0-15+_Python_DPO.json


Flattening DPO records:   0%|          | 0/300 [00:00<?, ? examples/s]

✅ Processed 300 samples
📂 Loading /content/dpo_dataset/0-15+_Web_DPO.json


Flattening DPO records:   0%|          | 0/300 [00:00<?, ? examples/s]

✅ Processed 300 samples
🔥 Total DPO samples: 2100
🧾 Dataset columns: ['chosen', 'rejected', 'prompt']
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.12.5: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.12.5: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "

Extracting prompt in train dataset (num_proc=16):   0%|          | 0/2100 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=16):   0%|          | 0/2100 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=16):   0%|          | 0/2100 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


🚀 Starting DPO Training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,100 | Num Epochs = 3 | Total steps = 789
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 25,165,824 of 3,846,245,376 (0.65% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
20,99.986,-1e-05,-4.5e-05,0.45625,3.5e-05,-2.322927,-2.332978,10.352327,10.787484,0,0,0
40,99.9886,-9.8e-05,-0.000127,0.525,2.9e-05,-2.225947,-2.239216,10.766069,11.191423,No Log,No Log,No Log
60,99.9138,7.6e-05,-0.00014,0.60625,0.000216,-2.281103,-2.301302,10.754956,11.072184,No Log,No Log,No Log
80,99.8668,-0.000213,-0.000546,0.74375,0.000333,-2.278916,-2.312228,9.998759,10.339453,No Log,No Log,No Log
100,99.7993,-0.000568,-0.001071,0.73125,0.000503,-2.295913,-2.309566,10.455,10.840652,No Log,No Log,No Log
120,99.6979,-0.000663,-0.001419,0.7625,0.000757,-2.251005,-2.282952,10.502321,10.83814,No Log,No Log,No Log
140,99.498,-0.00076,-0.002018,0.875,0.001258,-2.244249,-2.278101,10.026607,10.451864,No Log,No Log,No Log
160,99.3934,-0.00125,-0.002774,0.775,0.001523,-2.316137,-2.355927,9.577007,10.128407,No Log,No Log,No Log
180,99.1048,-0.001539,-0.003788,0.84375,0.002249,-2.262579,-2.326129,9.743588,10.229924,No Log,No Log,No Log
200,98.9703,-0.003008,-0.005598,0.8125,0.00259,-2.329244,-2.401475,8.938622,9.362537,No Log,No Log,No Log


TrainOutput(global_step=789, training_loss=95.38621406893616, metrics={'train_runtime': 608.1332, 'train_samples_per_second': 10.36, 'train_steps_per_second': 1.297, 'total_flos': 0.0, 'train_loss': 95.38621406893616, 'epoch': 3.0})

In [11]:
# Write the trained model and the tokenizer into the same final_model folder.
final_model_dir = f"{OUTPUT_DIR}/final_model"

trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print(f"✅ Model saved at {final_model_dir}")

✅ Model saved at /content/phi-mcq-skill-exp-dpo-v3/final_model


In [12]:
# DOWNLOAD

!zip -r phi-mcq-skill-exp-dpo-v3.zip {OUTPUT_DIR}
from google.colab import files
files.download(f'phi-mcq-skill-exp-dpo-v3.zip')
print("✅ Download started")

  adding: content/phi-mcq-skill-exp-dpo-v3/ (stored 0%)
  adding: content/phi-mcq-skill-exp-dpo-v3/checkpoint-789/ (stored 0%)
  adding: content/phi-mcq-skill-exp-dpo-v3/checkpoint-789/README.md (deflated 65%)
  adding: content/phi-mcq-skill-exp-dpo-v3/checkpoint-789/adapter_config.json (deflated 56%)
  adding: content/phi-mcq-skill-exp-dpo-v3/checkpoint-789/training_args.bin (deflated 53%)
  adding: content/phi-mcq-skill-exp-dpo-v3/checkpoint-789/added_tokens.json (deflated 62%)
  adding: content/phi-mcq-skill-exp-dpo-v3/checkpoint-789/special_tokens_map.json (deflated 76%)
  adding: content/phi-mcq-skill-exp-dpo-v3/checkpoint-789/tokenizer.model (deflated 55%)
  adding: content/phi-mcq-skill-exp-dpo-v3/checkpoint-789/tokenizer_config.json (deflated 86%)
  adding: content/phi-mcq-skill-exp-dpo-v3/checkpoint-789/rng_state.pth (deflated 26%)
  adding: content/phi-mcq-skill-exp-dpo-v3/checkpoint-789/trainer_state.json (deflated 75%)
  adding: content/phi-mcq-skill-exp-dpo-v3/checkpoint-7

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Download started


In [5]:
import os

# Local folder for the fine-tuned model files.
# exist_ok=True keeps the cell idempotent: re-running it once the folder
# exists must not raise FileExistsError.
os.makedirs("phi-mcq-skill-exp-dpo-v3/final_model", exist_ok=True)

In [7]:
# IMPORTS
import json
import re
from unsloth import FastLanguageModel
from transformers import TextStreamer
import pprint

# NOTE(review): this points at the "-v2" run, while the training cells in this
# notebook save to ".../phi-mcq-skill-exp-dpo-v3/final_model" and the preceding
# cell creates a "-v3" folder — confirm which checkpoint is meant to be loaded.
MODEL_PATH = "/content/phi-mcq-skill-exp-dpo-v2/final_model"

# Load model + tokenizer
# Loaded in 4-bit with the same max_seq_length (1536) used for training.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_PATH,
    max_seq_length=1536,
    load_in_4bit=True
)

print("✅ Finetuned model loaded!")

# PROMPT BUILDER
def build_prompt(skill, experience):
    """Assemble the chat-formatted generation prompt for one MCQ request.

    The result is a system turn, a user turn asking for exactly one MCQ for
    ``skill`` at ``experience`` in a fixed JSON layout, and an opened
    assistant turn for the model to complete.
    """
    system_text = "You are an expert MCQ generator. Create high-quality multiple-choice questions with one correct answer and detailed explanation."
    user_text = f"""
Generate exactly ONE multiple-choice question.

Skill: {skill}
Experience Level: {experience}

Output strictly in JSON:
{{
  "mcq": [
    {{
      "text": "...",
      "options": {{
        "A": "...",
        "B": "...",
        "C": "...",
        "D": "..."
      }},
      "correct_answer": "A/B/C/D",
      "explanation": "..."
    }}
  ]
}}
"""
    # One entry per turn; the turns are separated by a single newline.
    turns = (
        f"<|system|>\n{system_text}<|end|>",
        f"<|user|>\n{user_text}<|end|>",
        "<|assistant|>\n",
    )
    return "\n".join(turns)

# CLEAN OUTPUT
def clean_output(text):
    """Remove every ``<|...|>`` marker from ``text`` and trim surrounding whitespace."""
    # Non-greedy, so adjacent markers are removed one by one.
    special_token = re.compile(r"<\|.*?\|>")
    without_tokens = special_token.sub("", text)
    return without_tokens.strip()


# GENERATE SINGLE MCQ
def generate_mcq(skill="Python", experience="0-2 years"):
    """Sample one MCQ completion from the loaded model.

    Builds the prompt for ``skill`` / ``experience``, samples up to 400 new
    tokens with ``<|end|>`` as the end-of-sequence token, and returns the text
    that follows the last ``<|assistant|>`` marker with special tokens removed.
    """
    encoded = tokenizer(
        build_prompt(skill, experience), return_tensors="pt"
    ).to(model.device)

    sampling = dict(
        max_new_tokens=400,
        temperature=0.4,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.convert_tokens_to_ids("<|end|>"),
    )
    generated = model.generate(**encoded, **sampling)

    text = tokenizer.decode(generated[0])
    # Keep only what comes after the last assistant marker, when one is present.
    if "<|assistant|>" in text:
        text = text.split("<|assistant|>")[-1]

    return clean_output(text)

# GENERATE N UNIQUE MCQs
def generate_n_mcqs(n=10, skill="Python", experience="0-2 years", max_attempts=20):
    """Generate up to ``n`` MCQs whose question texts are all different.

    Args:
        n: Number of MCQs requested.
        skill: Skill passed through to ``generate_mcq``.
        experience: Experience level passed through to ``generate_mcq``.
        max_attempts: Maximum number of generations tried per requested MCQ
            before that slot is skipped. ``None`` retries without limit.

    Returns:
        List of the raw strings returned by ``generate_mcq``, one per accepted
        MCQ, in generation order. It is shorter than ``n`` when a slot ran out
        of attempts.
    """
    results = []
    seen_questions = set()

    for i in range(n):
        print(f"\n=== Generating MCQ {i+1}/{n} ===\n")
        attempts = 0
        # Bounded retry: unparseable or repeated output must not loop forever.
        while max_attempts is None or attempts < max_attempts:
            attempts += 1
            mcq = generate_mcq(skill, experience)

            # Extract the question text; it is the key used to detect duplicates.
            try:
                question_text = json.loads(mcq)["mcq"][0]["text"]
            except (ValueError, KeyError, IndexError, TypeError):
                # Invalid JSON or an unexpected layout: try again. Only these
                # errors are caught so that e.g. KeyboardInterrupt still
                # propagates.
                continue

            # A non-string "text" cannot serve as a reliable duplicate key.
            if not isinstance(question_text, str):
                continue

            if question_text in seen_questions:
                continue  # duplicate question: try again

            seen_questions.add(question_text)
            results.append(mcq)
            pprint.pprint(mcq)
            break  # move to next MCQ
        else:
            print(
                f"Skipped MCQ {i+1}/{n}: no valid, unique question "
                f"after {max_attempts} attempts"
            )

    return results

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.12.5: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

Unsloth 2025.12.5 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


✅ Finetuned model loaded!


In [12]:
# USAGE
# Example: generate 10 MCQs for Data Analysis at the 2-5 years experience level
mcqs = generate_n_mcqs(n=10, skill="Data Analysis", experience="2-5 years")


=== Generating MCQ 1/10 ===

('{"mcq": [{"text": "When performing exploratory data analysis (EDA), which of '
 'the following is a primary goal?", "options": {"A": "To prepare the data for '
 'machine learning only", "B": "Understanding relationships between variables '
 'through visualization and summary statistics", "C": "Visualizing and '
 'summarizing relationships between variables to uncover patterns and '
 'insights", "D": "Making assumptions about causality without evidence"}, '
 '"correct_answer": "C", "explanation": "EDA aims to understand the underlying '
 'structure and relationships within the data, often revealed through plots '
 'and descriptive stats."}]}')

=== Generating MCQ 2/10 ===

('{"mcq": [{"text": "In a two-way ANOVA (Analysis of Variance), what is the '
 'primary interest?", "options": {"A": "The main effect of each independent '
 'variable on the dependent variable.", "B": "Whether there is a significant '
 'interaction between factors.", "C": "If any group 

### DPO training on the full dataset

In [16]:
# ======================================================================
# 1️⃣ IMPORTS
# ======================================================================
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from trl import DPOTrainer, DPOConfig
from transformers import AutoTokenizer
import warnings
warnings.filterwarnings("ignore")

print("✅ Imports Ready")

# ======================================================================
# 2️⃣ CONFIG
# ======================================================================
# SFT checkpoint that DPO starts from.
BASE_MODEL = "/content/phi-mcq-skill-exp/final_model"

# One preference file per skill, all under /content/dpo_dataset.
DPO_DATASET = [
    "/content/dpo_dataset/0-15+_" + skill + "_DPO.json"
    for skill in (
        "Cloud_Services",
        "DSA",
        "Deep_Learning",
        "LLM",
        "ML",
        "Python",
        "Web",
    )
]

# Where checkpoints are written.
OUTPUT_DIR = "/content/phi-mcq-skill-exp-dpo"

# Fraction of the rows held out for evaluation (10%).
EVAL_SPLIT = 0.1

# ======================================================================
# 3️⃣ LOAD & VALIDATE DATASET
# ======================================================================
import json  # needed for flattening; not among this cell's imports above

raw_datasets = load_dataset("json", data_files=DPO_DATASET)
dataset = raw_datasets["train"]


def _flatten_dpo_record(example):
    """Turn one nested preference record into flat prompt / chosen / rejected strings."""
    chosen = example["chosen"]
    # `or ""` also covers fields that are present but None.
    instruction = (chosen.get("instruction") or "").strip()
    input_text = (chosen.get("input") or "").strip()
    return {
        "prompt": f"{instruction}\n\n{input_text}",
        "chosen": json.dumps(chosen["output"], ensure_ascii=False),
        "rejected": json.dumps(example["rejected"]["output"], ensure_ascii=False),
    }


# These files are read elsewhere in this notebook as nested records
# (chosen = {instruction, input, output}, rejected = {output}). In that shape
# there is no "prompt" column and "chosen"/"rejected" are not strings, so the
# records are flattened first. Files that are already flat are left untouched.
is_nested = (
    len(dataset) > 0
    and "chosen" in dataset.column_names
    and isinstance(dataset[0]["chosen"], dict)
)
if is_nested:
    dataset = dataset.map(
        _flatten_dpo_record,
        remove_columns=dataset.column_names,
        desc="Flattening DPO records",
    )

# Fail early, naming the missing columns, rather than inside the trainer.
required_columns = ["prompt", "chosen", "rejected"]
missing = [col for col in required_columns if col not in dataset.column_names]
if missing:
    raise ValueError(f"Dataset missing required columns: {missing}. Found: {dataset.column_names}")

print("🔥 Loaded", len(dataset), "DPO samples")

# Split 90% train / 10% eval
dataset_split = dataset.train_test_split(test_size=EVAL_SPLIT, seed=42)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

print(f"📊 Train samples: {len(train_dataset)}, Eval samples: {len(eval_dataset)}")

# ======================================================================
# 4️⃣ LOAD MODELS & TOKENIZER
# ======================================================================
# Policy model: the checkpoint at BASE_MODEL, loaded in 4-bit through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    BASE_MODEL,
    max_seq_length=1536,
    load_in_4bit=True,
    dtype=None,
    trust_remote_code=True,
)

# Pad on the right; fall back to the EOS token when no pad token is defined.
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Switch the policy model to Unsloth's training mode.
model = FastLanguageModel.for_training(model)

# Reference model (frozen copy)
# Loaded a second time from the same checkpoint; its tokenizer is discarded.
# NOTE(review): if training is adapter-based, TRL can usually derive the
# reference from the policy (ref_model=None), avoiding this second copy in
# GPU memory — confirm before changing.
ref_model, _ = FastLanguageModel.from_pretrained(
    BASE_MODEL,
    max_seq_length=1536,
    load_in_4bit=True,
    dtype=None,
    trust_remote_code=True,
)
# NOTE(review): for_training() is applied to a model that is immediately put
# in eval mode and frozen below — confirm this call is actually required.
ref_model = FastLanguageModel.for_training(ref_model)
ref_model.eval()
# Freeze every parameter of the reference model.
for param in ref_model.parameters():
    param.requires_grad = False

print("📌 Model + Reference Model Loaded")

# ======================================================================
# 5️⃣ DPO CONFIG
# ======================================================================
# Query bf16 support once; fp16 is enabled only when bf16 is unavailable.
bf16_available = torch.cuda.is_bf16_supported()

dpo_config = DPOConfig(
    # --- output, checkpointing, logging ---
    output_dir=OUTPUT_DIR,
    logging_steps=20,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
    remove_unused_columns=False,

    # --- schedule and optimiser ---
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    optim="paged_adamw_8bit",
    learning_rate=1e-6,
    lr_scheduler_type="cosine",
    warmup_steps=200,
    weight_decay=0.01,
    max_grad_norm=0.1,

    # --- precision and memory ---
    bf16=bf16_available,
    fp16=not bf16_available,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},

    # --- preference objective ---
    # With loss_type="ipo" the loss is a squared distance to 1 / (2 * beta);
    # for beta=0.05 that target is 10, so a loss near 100 at the start of
    # training is expected.
    loss_type="ipo",
    beta=0.05,
    max_length=1536,
    max_prompt_length=512,
)

# ======================================================================
# 6️⃣ DPO TRAINER
# ======================================================================
# Only the training split is handed to the trainer; the held-out split is
# scored separately by the evaluation cells further down.
trainer_kwargs = {
    "model": model,
    "ref_model": ref_model,
    "tokenizer": tokenizer,
    "train_dataset": train_dataset,
    "args": dpo_config,
}
trainer = DPOTrainer(**trainer_kwargs)

In [None]:

# Train
# Runs DPO fine-tuning with the trainer built in the previous cell.
trainer.train()
print("✅ DPO Training Complete")

In [None]:
# ======================================================================
# 7️⃣ EVALUATION FUNCTION
# ======================================================================
def score_response(prompt, response, max_prompt_length=512, max_response_length=1024):
    """Return the summed log-probability the model assigns to ``response`` given ``prompt``.

    Args:
        prompt: Prompt text the response is conditioned on.
        response: Response text to score.
        max_prompt_length: Token limit applied to the prompt (truncation).
        max_response_length: Token limit applied to the response (truncation).

    Returns:
        Sum of the log-probabilities of the response tokens as a float;
        0.0 when the response tokenizes to nothing.
    """
    device = model.device

    # No padding: a single sequence is scored at a time, and pad tokens placed
    # between prompt and response would both distort the context and be
    # counted in the sum.
    prompt_ids = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_length,
    )["input_ids"].to(device).long()

    # add_special_tokens=False keeps every response token, instead of dropping
    # the first one on the assumption that it is a BOS token.
    response_ids = tokenizer(
        response,
        return_tensors="pt",
        truncation=True,
        max_length=max_response_length,
        add_special_tokens=False,
    )["input_ids"].to(device).long()

    num_response_tokens = response_ids.size(1)
    if num_response_tokens == 0:
        return 0.0

    input_ids = torch.cat([prompt_ids, response_ids], dim=1)
    attention_mask = torch.ones_like(input_ids)  # no padding, so all ones

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    # logits[:, t] predicts token t + 1, so position t of token_log_probs holds
    # the log-probability of input token t + 1.
    log_probs = torch.nn.functional.log_softmax(logits[:, :-1, :].float(), dim=-1)
    targets = input_ids[:, 1:]
    token_log_probs = log_probs.gather(2, targets.unsqueeze(-1)).squeeze(-1)

    # The response occupies the tail of the sequence, so its tokens are the
    # last num_response_tokens entries (slicing from the prompt length would
    # skip the first response token).
    response_log_prob = token_log_probs[:, -num_response_tokens:].sum()
    return response_log_prob.item()

# ======================================================================
# 8️⃣ PAIRWISE PREFERENCE EVALUATION
# ======================================================================
# A pair counts as correct when the chosen response scores strictly higher
# than the rejected one for the same prompt.
correct = sum(
    1
    for sample in eval_dataset
    if score_response(sample["prompt"], sample["chosen"])
    > score_response(sample["prompt"], sample["rejected"])
)

accuracy = correct / len(eval_dataset)
print(f"✅ Pairwise preference accuracy: {accuracy:.4f}")