Install Packages

In [10]:
!pip install -q transformers accelerate einops



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m661.5 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m824.3 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Load Qwen3-0.6B Model

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_id = "Qwen/Qwen3-0.6B"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # reduce memory
    device_map="auto",          # auto to GPU if available
    trust_remote_code=True
)


Testing Prompt

In [9]:
prompt = "Q: If you have 12 apples and eat 5, then buy 3 more, how many do you have?\nA: Let's think step by step."
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=40,
)

decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Truncate at the next question (if any)
answer = decoded.split("\nQ:")[0].strip()

print(answer)


Q: If you have 12 apples and eat 5, then buy 3 more, how many do you have?
A: Let's think step by step. First, you start with 12 apples. If you eat 5, you have 12 - 5 = 7 apples left. Then, you buy 3 more, so you add 3 to the remaining apples. That means you now have 7 + 3 = 10 apples in total. Therefore, the answer is 10.


# Linear Probe

Loading Dataset

In [2]:
from huggingface_hub import login
from google.colab import userdata

# Get your secret token from Colab Secrets
HF_TOKEN = userdata.get('HF_TOKEN')

# Pass the token directly
login(token=HF_TOKEN)


In [5]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
dataset = load_dataset("codelion/Qwen3-0.6B-pts", split="train")

# Print total number of samples
print(f"Total samples: {len(dataset)}")

# View the first example
print(dataset[0])

# Optionally: show the keys available
print(dataset[0].keys())

# View a few entries
for i in range(3):
    print(f"\nSample {i}:\n{dataset[i]}")


Total samples: 1376
{'model_id': 'Qwen/Qwen3-0.6B', 'query': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?', 'pivot_context': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn? \n\n', 'pivot_token': 'A', 'pivot_token_id': 32, 'prob_before': 0.68, 'prob_after': 0.0, 'prob_delta': -0.68, 'is_positive': False, 'task_type': 'generic', 'dataset_id': 'openai/gsm8k', 'dataset_item_id': '1', 'timestamp': datetime.datetime(2025, 5, 13, 14, 0, 26)}
dict_keys(['model_id', 'query', 'pivot_context', 'pivot_token', 'pivot_token_id', 'prob_before', 'prob_after', 'prob_delta', 'is_positive', 'task_type', 'dataset_id', 'dataset_item_id', 'timestamp'])

Sample 0:
{'model_id': 'Qwen/Qwen3-0.6B', 'query': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?', 'pivot_context': 'Weng earns $12 an hour for babysittin

Probing

In [15]:
import torch.nn as nn

activations = []

def get_hook(layer_idx):
    def hook_fn(module, input, output):
        # If output is a tuple, grab the first tensor
        if isinstance(output, tuple):
            activations.append(output[0])
        else:
            activations.append(output)
    return hook_fn


layer_num = 12  # Example: 12th transformer block
model.model.layers[layer_num].register_forward_hook(get_hook(layer_num))



<torch.utils.hooks.RemovableHandle at 0x786f2cc44510>

In [19]:


X, y = [], []

for sample in dataset:
    context = sample["pivot_context"]
    pivot_token = sample["pivot_token"]
    pivot_tok_id = tokenizer(pivot_token, add_special_tokens=False)["input_ids"][0]

    inputs = tokenizer(context, return_tensors="pt").to("cuda")
    activations.clear()

    with torch.no_grad():
        _ = model(**inputs)

    hidden_states = activations[0][0]  # shape: [1, seq_len, hidden_dim]

    # Positive (pivotal)
    last_hidden = hidden_states[0, -1]  # last token
    X.append(last_hidden.cpu().numpy())
    y.append(1)

    # Negative (non-pivotal): random token in the middle
    if hidden_states.shape[1] > 2:
        neg_idx = hidden_states.shape[1] // 2
        neg_hidden = hidden_states[0, neg_idx]
        X.append(neg_hidden.cpu().numpy())
        y.append(0)


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

print("Accuracy:", clf.score(X_test, y_test))


Accuracy: 0.8130671506352087
