In [1]:
import torch
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
from peft import PeftModel

  import pynvml  # type: ignore[import]


In [2]:

# SFT
model_path = None 
model_type = "SFT-Baseline"

# DPO
# model_path = "../../models/smollm2-dpo-final"
# model_type = "DPO"

# PPO Sparse
# model_path = "../../models/smollm2-ppo-sparse-final"
# model_type = "PPO-Sparse"

# PPO Dense
# model_path = "../../models/smollm2-ppo-dense-final"
# model_type = "PPO-Dense"

# GRPO
# model_path = "../../models/smollm2-grpo-final"
# model_type = "GRPO"


In [3]:
output_file = f"eval_results_{model_type}.csv"
reward_model_path = "../../models/smollm2-reward-model-final"
base_model_id = "HuggingFaceTB/smollm2-135M-SFT-Only"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:

eval_dataset = load_dataset("Intel/orca_dpo_pairs", split="train[2050:2100]")

print(f"Loading {model_type} Policy...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token

policy_model = AutoModelForCausalLM.from_pretrained(
    base_model_id, 
    torch_dtype=torch.float16, 
    device_map=device
)

if model_path:
    policy_model = PeftModel.from_pretrained(policy_model, model_path)
    
policy_model.eval()

print("Loading Reward Model...")
rm_base = AutoModelForSequenceClassification.from_pretrained(
    base_model_id, 
    num_labels=1, 
    torch_dtype=torch.float16, 
    device_map=device
)
reward_model = PeftModel.from_pretrained(rm_base, reward_model_path)
reward_model.eval()



Loading SFT-Baseline Policy...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading Reward Model...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/smollm2-135M-SFT-Only and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(49152, 576, padding_idx=2)
        (layers): ModuleList(
          (0-29): 30 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=576, out_features=576, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=576, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=576, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              

In [5]:

results = []

print(f"Generating responses for {len(eval_dataset)} prompts...")

for example in tqdm(eval_dataset):
    prompt_text = f"<|im_start|>system\n{example['system']}<|im_end|>\n<|im_start|>user\n{example['question']}<|im_end|>\n<|im_start|>assistant\n"
    
    inputs = tokenizer(prompt_text, return_tensors="pt").to(device)
    
    # 1. Generate Response
    with torch.no_grad():
        outputs = policy_model.generate(
            **inputs, 
            max_new_tokens=256,   # Keep consistent for comparison
            do_sample=True,      # Sampling allows diversity
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
    
    # Decode response (removing the prompt)
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    
    # 2. Get Reward Score
    # The Reward Model expects the FULL text (Prompt + Response)
    # We re-encode to be safe
    rm_inputs = tokenizer(full_text, return_tensors="pt", truncation=True, max_length=1024).to(device)
    
    with torch.no_grad():
        rm_output = reward_model(**rm_inputs)
        score = rm_output.logits[0].item()
    
    # 3. Store Data
    results.append({
        "Model": model_type,
        "Prompt": example["question"],
        "Response": generated_response,
        "Reward_Score": score,
        "Length (Tokens)": len(outputs[0]) - inputs.input_ids.shape[1] # Actual tokens generated
    })


Generating responses for 50 prompts...


100%|██████████| 50/50 [04:46<00:00,  5.72s/it]


In [6]:

# --- 4. Save Results ---
df = pd.DataFrame(results)
df.to_csv(output_file, index=False)
print(f"Saved results to {output_file}")
print(f"Average Reward: {df['Reward_Score'].mean():.4f}")
print(f"Average Length: {df['Length (Tokens)'].mean():.1f}")

Saved results to eval_results_SFT-Baseline.csv
Average Reward: 1.7408
Average Length: 112.6
