In [1]:
from steerit.steerit import *
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

In [None]:
################################################################################
# Main: AIME Reasoning Example
################################################################################
if __name__ == "__main__":
    # `os` and `torch` are used below but are not imported anywhere in this
    # notebook; they were only reachable through the wildcard import from
    # steerit. Import them explicitly so the cell does not depend on that
    # module's namespace.
    import os

    import torch

    # ------------------------------------------------------------------
    # Demo settings (previously inline magic numbers)
    # ------------------------------------------------------------------
    N_TRAIN_ROWS = 900        # rows [0, 900) of the train split are used for pairs
    N_VAL_ROWS = 10           # rows [900, 910) are held out for the sample prompt
    MAX_PROMPT_PAIRS = 10     # keep the contrastive set small for limited GPU memory
    MAX_NEW_TOKENS = 50
    STEERING_COEFF = 2.0

    # HF token from environment
    HF_TOKEN = os.environ.get("hftoken", None)
    if not HF_TOKEN:
        raise ValueError("Please set hftoken in environment variables.")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

    # 1. Load dataset (AIME Reasoning)
    print("Loading AIME reasoning dataset from 'rb/aime_reasoning'...")
    ds = load_dataset("rb/aime_reasoning", "default")
    # Only the training split is used; it is carved into a train part and a
    # tiny validation part by row position.
    reasoning_data = ds["train"].to_pandas()
    train_data = reasoning_data.iloc[:N_TRAIN_ROWS]
    val_data = reasoning_data.iloc[N_TRAIN_ROWS:N_TRAIN_ROWS + N_VAL_ROWS]

    # Fail fast: step 6 reads val_data.iloc[0]. Without this check a short
    # dataset would only surface as an IndexError after the model has been
    # loaded and the steering vector trained.
    if val_data.empty:
        raise ValueError(
            f"Dataset has only {len(reasoning_data)} rows; need more than "
            f"{N_TRAIN_ROWS} to leave a validation example."
        )

    # 2. Prepare contrastive pairs from AIME data
    # We assume "refined_reasoning" is more desirable (positive) and
    # "reasoning_content" is less so (negative).
    # Slice *before* iterating: only the first MAX_PROMPT_PAIRS rows are ever
    # used, so there is no point formatting strings for the whole train split.
    # (If you have GPU memory to handle more data, raise MAX_PROMPT_PAIRS.)
    prompt_pairs = [
        (
            f"Question: {row['question']}\nReasoning (refined): {row['refined_reasoning']}",
            f"Question: {row['question']}\nReasoning (original): {row['reasoning_content']}",
        )
        for _, row in train_data.iloc[:MAX_PROMPT_PAIRS].iterrows()
    ]

    # 3. Create SteeringModel
    print(f"Loading model {model_name} on {device}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
    # Reuse EOS as the padding token so batched tokenization has a pad id.
    tokenizer.pad_token = tokenizer.eos_token

    # Half precision only on GPU. On the CPU fallback use float32: float16
    # support on CPU is limited/slow in some PyTorch builds.
    model_dtype = torch.float16 if device == "cuda" else torch.float32
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=model_dtype,
        token=HF_TOKEN
    ).to(device)

    chosen_layers = [10, 11]
    model = SteeringModel(base_model, chosen_layers, device)

    # 4. Train a steering vector using contrastive approach
    print("Training steering vector (contrastive) on AIME data...")
    steering_vec = train_steering_vector_contrastive(
        model, tokenizer,
        prompt_pairs,
        layer_ids=chosen_layers,
        device=device,
        batch_size=2
    )

    # 5. Set steering with a moderate coefficient
    model.set_steering(steering_vec, coeff=STEERING_COEFF, normalize=False)

    # 6. Generate text to see effect
    print("\nGenerating sample reasoning from validation question:")
    sample_question = val_data.iloc[0]["question"]
    prompt = f"Question: {sample_question}\nAnswer:"
    # Token-by-token generation so steering is definitely applied
    text_out = model.generate_text(
        prompt, tokenizer, max_new_tokens=MAX_NEW_TOKENS, show_progress=False
    )
    print("Generated text (steered token-by-token):")
    print(text_out)

    # 7. Compare to default generate (fast, might skip hooks):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    default_out_ids = model.default_generate(
        input_ids, max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True, temperature=0.7, top_p=0.9, repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id
    )
    default_text = tokenizer.decode(default_out_ids[0], skip_special_tokens=True)
    print("\nGenerated text (default generate):")
    print(default_text)


Loading AIME reasoning dataset from 'rb/aime_reasoning'...
Loading model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B on cuda...
Training steering vector (contrastive) on AIME data...
