# Modifying CoT behavior of a DeepSeek-R1 distilled model
Colab setup: A100 GPU, High-RAM runtime

In [10]:
!pip install git+https://github.com/IBM/activation-steering.git
!pip install datasets

Collecting git+https://github.com/IBM/activation-steering.git
  Cloning https://github.com/IBM/activation-steering.git to /tmp/pip-req-build-b9h1elwk
  Running command git clone --filter=blob:none --quiet https://github.com/IBM/activation-steering.git /tmp/pip-req-build-b9h1elwk
  Resolved https://github.com/IBM/activation-steering.git to commit 59f24bf73817cfe3a836697c956359437aed0c8a
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Get your account token from https://huggingface.co/settings/tokens and expose it
# to the runtime as the HF_TOKEN environment variable; never paste it into a cell.
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# `hf_token` was previously referenced without being defined anywhere in the
# notebook, so a fresh kernel failed with NameError. Read it from the environment;
# when the variable is unset this is None and `from_pretrained` falls back to its
# default credential lookup (NOTE(review): confirm that is sufficient for this model).
hf_token = os.environ.get("HF_TOKEN")

device = torch.device('cuda:0')
model_name = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'

# Load the weights in half precision directly onto the GPU selected above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=torch.float16,
    token=hf_token,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

# 1. Extract CoT Behavior Vector and Save

In [15]:
import json
from activation_steering import SteeringDataset, SteeringVector
from datasets import load_dataset
import random

# Load the 'train' split of the "rb/reasoning_trace" dataset ("default" config);
# its rows supply the contrastive examples used to train the steering vector below.
reasoning_data = load_dataset("rb/reasoning_trace", "default")['train']

In [11]:
# Inspect the fields of one record; 'question', 'refined_reasoning' and
# 'reasoning_content' are the ones consumed when building the steering dataset.
reasoning_data[0].keys()

dict_keys(['question', 'answer_content', 'reasoning_content', 'reference_answer', 'verifier_score', 'id', 'metadata', 'community_score', 'refined_reasoning'])

In [None]:
# Create our contrastive dataset: each example is a (positive, negative) pair of
# prompts that share the same question and differ only in the reasoning text.
cot_behavior_dataset = SteeringDataset(
    tokenizer=tokenizer,
    examples=[(
        f"Question: {item['question']}\nAnswer: {item['refined_reasoning']}", # positive side: the refined reasoning trace
        f"Question: {item['question']}\nAnswer: {item['reasoning_content']}" # negative side: the original reasoning trace
               ) for item in reasoning_data],
    suffixes=None,
    disable_suffixes=True,
    use_chat_template=False
)

# Extract the behavior vector from the contrastive pairs.
# NOTE(review): the timing note that used to sit here ("9B model, 1000 examples,
# A100 GPU, batch size 8 -> around 4 minutes") does not describe this notebook,
# which loads a 1.5B model and trains with batch_size=1; expect a different runtime.
cot_behavior_vector = SteeringVector.train(
    model=model,
    tokenizer=tokenizer,
    steering_dataset=cot_behavior_dataset,
    method="pca_center",
    accumulate_last_x_tokens=1,
    batch_size=1
)

# Save this behavior vector so the steering section can reload it without retraining
cot_behavior_vector.save('cot_behavior_vector')

# 2. Load CoT Behavior Vector and Steer

In [18]:

# Load the AIME 2024 problems from the "rb/aime24" dataset (file aime24.csv) and
# convert the 'train' split to a pandas DataFrame; the 'problem' and 'answer'
# columns are used for prompting and scoring later in the notebook.
aime_df = load_dataset("rb/aime24", data_files="aime24.csv")['train'].to_pandas()


# Display first few rows as a sanity check
print(aime_df.head())

Generating train split: 0 examples [00:00, ? examples/s]

   Unnamed: 0         id  year  problem_number  \
0         919  2024-II-1  2024               1   
1         920  2024-II-2  2024               2   
2         921  2024-II-3  2024               3   
3         922  2024-II-4  2024               4   
4         923  2024-II-5  2024               5   

                                             problem  answer  
0  Among the $900$ residents of Aimeville, there ...      73  
1  A list of positive integers has the following ...     236  
2  Find the number of ways to place a digit in ea...      45  
3  Let $x,y$ and $z$ be positive real numbers tha...      33  
4  Let $ABCDEF$ be a convex equilateral hexagon i...      80  


In [None]:
from activation_steering import MalleableModel, SteeringVector

# Load the behavior vector saved by the extraction section
cot_behavior_vector = SteeringVector.load('cot_behavior_vector')

# MalleableModel is the main steering class. Wrap the model with this class first.
malleable_model = MalleableModel(model=model, tokenizer=tokenizer)
# Resetting here is a no-op on a fresh wrapper; it is kept so that re-running this
# cell in a long-lived Colab kernel always starts from an unsteered configuration.
malleable_model.reset_leash_to_default()

# One prompt per AIME problem, in DataFrame row order
instructions = [f"Question: {problem}" for problem in aime_df['problem']]

# Generation settings shared by the baseline and the steered run, so that the two
# sets of responses differ only by the steering itself.
max_tokens = 32768
settings = {
    "pad_token_id": tokenizer.eos_token_id,
    "do_sample": False,
    "max_new_tokens": max_tokens,
    "repetition_penalty": 1.2,
}

# Record original (unsteered) responses
original_responses = malleable_model.respond_batch_sequential(
    prompts=instructions,
    use_chat_template=False,
    settings=settings,
)
malleable_model.reset_leash_to_default()  # no-op here; keeps the configuration explicit before steering

# Let's steer the model. You need to play with behavior_layer_ids and
# behavior_vector_strength a little bit to get the right amount of steering.
# behavior_layer_ids is the layers that we steer and behavior_vector_strength is a
# multiplier to the behavior vector.
# The layer count is read from the model config instead of the previously
# hard-coded 28, so the "second half of the layers" choice follows whichever
# checkpoint `model_name` points at.
num_layers = model.config.num_hidden_layers
steered_layer_ids = list(range(num_layers // 2, num_layers))
malleable_model.steer(
    behavior_vector=cot_behavior_vector,
    behavior_layer_ids=steered_layer_ids,
    behavior_vector_strength=2,
)

steered_responses = malleable_model.respond_batch_sequential(
    prompts=instructions,
    use_chat_template=False,
    settings=settings,
)

# Print each baseline response next to its steered counterpart
for original_response, steered_response in zip(original_responses, steered_responses):
    print(original_response, steered_response)



In [None]:
# 3. Calculate result-metrics and token efficiency

In [None]:
# Attach both sets of generations to the problem table (row-aligned with the
# prompts built from aime_df['problem']) and persist them, so the expensive
# generation step does not have to be repeated to redo the analysis.
aime_df["original_responses"] = original_responses
aime_df["steered_responses"] = steered_responses
# NOTE(review): the index is written as an extra column; the frame already carries
# an "Unnamed: 0" column from an earlier CSV round-trip — confirm this is intended.
aime_df.to_csv("aime_df_responses.csv")

import re
import tiktoken

def extract_last_boxed(text):
    """Return the contents of the last non-empty ``boxed{...}`` group in ``text``.

    Braces are matched by depth, so nested LaTeX such as
    ``\\boxed{\\frac{1}{2}}`` yields ``\\frac{1}{2}`` rather than being cut off at
    the first closing brace (which is what a ``[^}]+`` regex does).

    Args:
        text: Model response to scan.

    Returns:
        The text inside the last balanced, non-empty ``boxed{...}`` group, or an
        empty string when there is none.
    """
    marker = "boxed{"
    found = []
    position = text.find(marker)
    while position != -1:
        start = position + len(marker)
        depth = 1
        cursor = start
        # Walk forward until the brace opened by the marker is closed again.
        while cursor < len(text) and depth:
            char = text[cursor]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
            cursor += 1
        if depth == 0:
            content = text[start:cursor - 1]
            # An empty \boxed{} is not an answer; skip it like the regex did.
            if content:
                found.append(content)
            resume_at = cursor
        else:
            # Unbalanced group: ignore it but keep scanning what follows the marker.
            resume_at = start
        position = text.find(marker, resume_at)
    return found[-1] if found else ""

def extract_answers(pairs_dict):
    """Pull the final boxed answer out of both responses of every pair.

    Args:
        pairs_dict: Mapping from a pair key to a dict holding the
            "original_response" and "steered_response" strings.

    Returns:
        A dict with the same keys, each mapped to
        ``{"original_answer": ..., "steered_answer": ...}`` as produced by
        ``extract_last_boxed``.
    """
    return {
        key: {
            "original_answer": extract_last_boxed(responses["original_response"]),
            "steered_answer": extract_last_boxed(responses["steered_response"]),
        }
        for key, responses in pairs_dict.items()
    }

def get_token_counts(pairs_dict, model="gpt-3.5-turbo"):
    """Count tokens in the original and steered response of every pair.

    Token counts come from the tiktoken encoding registered for ``model``.

    Args:
        pairs_dict: Mapping from a pair key to a dict holding the
            "original_response" and "steered_response" strings.
        model: Model name passed to ``tiktoken.encoding_for_model``.

    Returns:
        A dict with the same keys, each mapped to the original and steered token
        counts, their difference (original minus steered) and that difference as
        a percentage of the original count, rounded to two decimals (0 when the
        original response has no tokens).
    """
    encoder = tiktoken.encoding_for_model(model)

    def _count(text):
        # Number of tokens the encoder produces for `text`.
        return len(encoder.encode(text))

    summary = {}
    for key, responses in pairs_dict.items():
        n_original = _count(responses["original_response"])
        n_steered = _count(responses["steered_response"])
        saved = n_original - n_steered
        if n_original != 0:
            saved_pct = saved / n_original * 100
        else:
            saved_pct = 0
        summary[key] = {
            "original_tokens": n_original,
            "steered_tokens": n_steered,
            "diff": saved,
            "diff_pct": round(saved_pct, 2),
        }
    return summary

    # ground truth
# Ground-truth answers and problem statements, row-aligned with the responses.
ground_truth = aime_df['answer']
questions = aime_df['problem']

# Pair every baseline response with its steered counterpart. The iterable has to
# be the full `original_responses` list: `original_response` (singular) is only a
# leftover loop variable holding one string, and zipping over it pairs individual
# characters with the steered responses.
pair_responses = [
    {"original_response": original, "steered_response": steered}
    for original, steered in zip(original_responses, steered_responses)
]

# Keyed by problem text
pairs_dict = dict(zip(questions, pair_responses))

answers_dict = extract_answers(pairs_dict)
token_counts = get_token_counts(pairs_dict)


def _parse_int_answer(raw_answer):
    """Return ``raw_answer`` as an int, or None when it is not an integer literal."""
    try:
        return int(raw_answer)
    except (TypeError, ValueError):
        return None


# Calculate accuracy
correct_orig = 0
correct_steered = 0
# The keys are the problem texts, so they are walked in dataset order to stay
# aligned with `ground_truth`. Sorting them by a trailing "Pair N" number cannot
# work here: the keys carry no such suffix and int() would raise on them.
pair_keys = list(questions)

for pair, gt in zip(pair_keys, ground_truth):
    orig_val = _parse_int_answer(answers_dict[pair]["original_answer"])
    steer_val = _parse_int_answer(answers_dict[pair]["steered_answer"])
    # An unparseable answer is None and therefore never equals the ground truth.
    if orig_val == gt:
        correct_orig += 1
    if steer_val == gt:
        correct_steered += 1

# Calculate overall token usage and diff percentage
total_original_tokens = sum(info["original_tokens"] for info in token_counts.values())
total_steered_tokens = sum(info["steered_tokens"] for info in token_counts.values())
overall_diff = total_original_tokens - total_steered_tokens
overall_diff_pct = (overall_diff / total_original_tokens * 100) if total_original_tokens else 0

# Print results
print("Questions:", questions)
print("Answers:", answers_dict)
print("Token Counts (with diff_pct):", token_counts)
print(f"Original Accuracy: {correct_orig}/{len(ground_truth)} = {correct_orig/len(ground_truth)*100:.2f}%")
print(f"Steered Accuracy: {correct_steered}/{len(ground_truth)} = {correct_steered/len(ground_truth)*100:.2f}%")
print(f"Overall Token Diff: {overall_diff}")
print(f"Overall Token Diff Percentage: {overall_diff_pct:.2f}%")

In [None]:


# aime_df["original_answers"] = original_responses
# aime_df["steered_answers"] = steered_responses


# aime_df["original_length"] = total_original_tokens
# aime_df["steered_length"] = steered_responses