In [1]:
import json
import pickle
from pathlib import Path
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import fire



In [None]:

def evaluate_single_dataset(
    dataset_path: Path,
    model,
    tokenizer,
    steering_vec,
    steering_magnitude: float,
    completion_length: int,
    is_instruct: bool
):
    """Generate a steered completion for every example in one dataset file.

    Args:
        dataset_path: Path to a JSON file holding a sequence of dicts, each
            with at least a ``'prompt'`` key.
        model: Causal LM exposing ``.device`` and ``.generate(...)``.
        tokenizer: Tokenizer matching ``model``; must provide
            ``apply_chat_template`` when ``is_instruct`` is true.
        steering_vec: Object whose ``apply(model, multiplier=...)`` returns a
            context manager; generation runs inside that context.
        steering_magnitude: Value passed as ``multiplier`` to
            ``steering_vec.apply``.
        completion_length: Maximum number of new tokens to generate.
        is_instruct: When true, each prompt is wrapped as a single user turn
            with the tokenizer's chat template before generation.

    Returns:
        A ``(results, dataset_name)`` tuple. ``results`` is a list of shallow
        copies of the input examples, each with an added
        ``'generated_completion'`` string; ``dataset_name`` is the file name
        of ``dataset_path``.
    """
    # Read explicitly as UTF-8 so non-ASCII prompts decode the same way
    # regardless of the platform's default locale encoding.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    dataset_name = dataset_path.name
    total = len(dataset)
    print(f"\n{'='*70}")
    print(f"Evaluating: {dataset_name}")
    print(f"Examples: {total}")
    print(f"{'='*70}")

    results = []
    for i, example in enumerate(dataset):
        if (i + 1) % 50 == 0:
            print(f"  Progress: {i + 1}/{total}")

        prompt = example['prompt']

        # Format prompt
        if is_instruct:
            messages = [{"role": "user", "content": prompt}]
            formatted_prompt = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        else:
            formatted_prompt = prompt

        # The chat template already emits the special tokens it needs, so
        # tokenizing its output with add_special_tokens=True could prepend a
        # duplicate BOS. Raw (base-model) prompts keep the default behaviour.
        inputs = tokenizer(
            formatted_prompt,
            return_tensors="pt",
            add_special_tokens=not is_instruct,
        ).to(model.device)
        prompt_length = inputs.input_ids.shape[1]

        # Generate with steering
        with torch.no_grad(), steering_vec.apply(
            model, multiplier=steering_magnitude
        ):
            outputs = model.generate(
                **inputs,
                max_new_tokens=completion_length,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens (drop the echoed prompt).
        generated_text = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True
        )

        # Store result on a copy so the loaded example dict is not mutated.
        result = example.copy()
        result['generated_completion'] = generated_text
        results.append(result)

    return results, dataset_name
