In [1]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch.nn.functional as F


  import pynvml  # type: ignore[import]


In [2]:
# SFT
# model_path = None 
# model_name = "SFT-Baseline"

# DPO
# model_path = "../../models/smollm2-dpo-final"  # Path to your saved DPO adapter
# model_name = "DPO"

# PPO Sparse
# model_path = "../../models/smollm2-ppo-sparse-final"
# model_name = "PPO-Sparse"

# PPO Dense
model_path = "../../models/smollm2-ppo-dense-final"
model_name = "PPO-Dense"

# GRPO
# model_path = "../../models/smollm2-grpo-final"
# model_name = "GRPO"

base_model_id = "HuggingFaceTB/smollm2-135M-SFT-Only"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:

# --- 2. Load Models ---
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token

# Load Reference Model (Frozen SFT Base) - Required for KL & Perplexity
print("Loading Reference Model...")
ref_model = AutoModelForCausalLM.from_pretrained(
    base_model_id, torch_dtype=torch.float16, device_map=device
)
ref_model.eval()

# Load Aligned Policy (Your Trained Adapter)
print(f"Loading {model_name} Policy...")
policy_model = AutoModelForCausalLM.from_pretrained(
    base_model_id, torch_dtype=torch.float16, device_map=device
)
try:
    policy_model = PeftModel.from_pretrained(policy_model, model_path)
except:
    print("Warning: Loading base model without adapter (SFT Baseline mode)")
policy_model.eval()


Loading Reference Model...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading PPO-Dense Policy...


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(49152, 576, padding_idx=2)
        (layers): ModuleList(
          (0-29): 30 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=576, out_features=576, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=576, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=576, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Li

In [4]:

# --- 3. Prepare Test Set ---
# Using held-out subset as requested
dataset = load_dataset("Intel/orca_dpo_pairs", split="train[2000:2050]")


In [5]:

results = []
print("Starting Evaluation...")

for i, example in tqdm(enumerate(dataset), total=len(dataset)):
    # Format Prompt
    messages = [{"role": "user", "content": example["question"]}]
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt_text, return_tensors="pt").to(device)
    
    # A. GENERATION (For Verbosity Analysis)
    with torch.no_grad():
        outputs = policy_model.generate(
            **inputs, 
            max_new_tokens=128, 
            do_sample=True, 
            temperature=0.7
        )
    
    response_tokens = outputs[0][inputs.input_ids.shape[1]:]
    response_text = tokenizer.decode(response_tokens, skip_special_tokens=True)
    token_count = len(response_tokens)

    # B. METRICS CALCULATION (Perplexity & KL)
    # We use the 'chosen' response from the dataset as the "Instruction Following Data" 
    # to measure Perplexity
    target_text = example["chosen"]
    full_seq = prompt_text + target_text
    
    # Ensure no padding is added for single-sample inference
    encodings = tokenizer(full_seq, return_tensors="pt", padding=False).to(device)
    
    with torch.no_grad():
        # Get raw logits
        policy_outputs = policy_model(**encodings)
        ref_outputs = ref_model(**encodings)
        
        # 1. PERPLEXITY
        # Shift so we predict the next token
        shift_logits = policy_outputs.logits[..., :-1, :].contiguous()
        shift_labels = encodings.input_ids[..., 1:].contiguous()
        
        # Mask the prompt part so we only calculate perplexity on the response
        prompt_len = inputs.input_ids.shape[1]
        # We need to subtract 1 because we shifted the labels
        response_start_idx = prompt_len - 1 
        
        # Calculate loss only on the response tokens
        loss_fct = torch.nn.CrossEntropyLoss()
        
        # Slice the logits and labels to strictly the response part
        resp_logits = shift_logits[:, response_start_idx:, :]
        resp_labels = shift_labels[:, response_start_idx:]
        
        loss = loss_fct(resp_logits.view(-1, resp_logits.size(-1)), resp_labels.view(-1))
        perplexity = torch.exp(loss).item()
        
        # 2. KL DIVERGENCE (Stable Calculation)
        # CRITICAL FIX: Cast to Float32 to prevent NaN in Softmax
        p_logits = policy_outputs.logits.float()
        q_logits = ref_outputs.logits.float()
        
        # Slice to response only (we don't care about KL on the prompt)
        # We look at the distribution at every step of the response
        p_resp = p_logits[:, response_start_idx:, :]
        q_resp = q_logits[:, response_start_idx:, :]
        
        # Calculate probabilities and log-probabilities
        # KL(P || Q) = sum( P(x) * (log P(x) - log Q(x)) )
        
        p_probs = F.softmax(p_resp, dim=-1)
        p_log_probs = F.log_softmax(p_resp, dim=-1)
        q_log_probs = F.log_softmax(q_resp, dim=-1)
        
        # Calculate KL per token, then mean over the sequence
        # We use the reduction "none" to handle sequence length manually if needed, 
        # but mean() usually works fine here.
        kl_per_token = torch.sum(p_probs * (p_log_probs - q_log_probs), dim=-1)
        kl_div = kl_per_token.mean().item()

    results.append({
        "Model": model_name,
        "Prompt": example["question"],
        "Response": response_text,
        "Token_Count": token_count,
        "Perplexity": perplexity,
        "KL_Divergence": kl_div
    })


Starting Evaluation...


100%|██████████| 50/50 [03:49<00:00,  4.60s/it]


In [6]:
# Save
df = pd.DataFrame(results)
df.to_csv(f"perplexity_results_{model_name}.csv", index=False)
print("Evaluation Complete.")
print(f"Mean Perplexity: {df['Perplexity'].mean():.2f}")
print(f"Mean KL Divergence: {df['KL_Divergence'].mean():.4f}")
print(f"Mean Length: {df['Token_Count'].mean():.1f}")

Evaluation Complete.
Mean Perplexity: 5.18
Mean KL Divergence: 0.0172
Mean Length: 66.7
