In [1]:
%load_ext autoreload
%autoreload 2
import utils
import torch
import numpy as np
import all_classes
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM, AutoModel
from sklearn.linear_model import LogisticRegression

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load the test split of the selected dataset.
# `dataset_name` is also passed to the utils helpers in later cells, so the data
# is loaded through the same variable to keep the two from drifting apart.
dataset_name = "imdb"
# dataset_name = "amazon_polarity"
data = load_dataset(dataset_name)["test"]

In [3]:
# Here are a few different model options you can play around with:
model_name = "llama3.2"
# model_name = "llama3.3"

# Default dimensions; replaced by the values read from the loaded model's
# config in the branch below.
hidden_size = 2048
num_layers = 16

cache_dir = None

if model_name == "llama3.2":
    model_type = "decoder"
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", cache_dir=cache_dir, token=True)
    model = AutoModel.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", cache_dir=cache_dir, token=True)
    # Read the true dimensions from the checkpoint rather than trusting the defaults.
    hidden_size = model.config.hidden_size
    num_layers = model.config.num_hidden_layers
else:
    # Fail here instead of printing and continuing: later cells need `model`,
    # `tokenizer` and `model_type`, which are only defined in the branch above.
    raise NotImplementedError("Model {!r} is not implemented!".format(model_name))

In [18]:
# Run the single-layer extraction helper on 30 examples. Its three return values
# are unpacked as negative-prompt states, positive-prompt states and labels
# (naming as used in this notebook; see utils for the exact contract).
neg_hs, pos_hs, y = utils.get_hidden_states_many_examples(
    model,
    tokenizer,
    data,
    dataset_name,
    model_type,
    n=30,
)

100%|██████████| 30/30 [00:32<00:00,  1.07s/it]


In [19]:
# Sanity check: show the labels, then the shape of each hidden-state array.
print(y)
for hidden_states in (neg_hs, pos_hs):
    print(np.shape(hidden_states))

[0 1 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0 0 1]
(30, 2048)
(30, 2048)



## Let's verify that the model's representations are good

Before trying CCS, let's make sure there exists a direction that classifies examples as true vs false with high accuracy; if logistic regression accuracy is bad, there's no hope of CCS doing well.

In [20]:

# Split the examples in half: the first half trains, the second half tests
# (the data is already randomized).
n = len(y)
half = n // 2
neg_hs_train, pos_hs_train, y_train = neg_hs[:half], pos_hs[:half], y[:half]
neg_hs_test, pos_hs_test, y_test = neg_hs[half:], pos_hs[half:], y[half:]

# Features are the difference between the negative and positive hidden states
# (concatenating them also works fine).
x_train, x_test = neg_hs_train - pos_hs_train, neg_hs_test - pos_hs_test

# Supervised baseline: logistic regression fitted on the labelled training half.
lr = LogisticRegression(class_weight="balanced")
lr.fit(x_train, y_train)
lr_acc = lr.score(x_test, y_test)
print("Logistic regression accuracy: {}".format(lr_acc))

# CCS probe: constructed and trained from the hidden states only (no labels passed).
ccs = all_classes.CE(neg_hs_train, pos_hs_train)
ccs.repeated_train()

# Labels are only used here, to score the trained probe on the test half.
ccs_acc = ccs.get_acc(neg_hs_test, pos_hs_test, y_test)
print("CCS accuracy: {}".format(ccs_acc))
# print(ccs.get_credence(neg_hs_test, pos_hs_test))

Logistic regression accuracy: 0.9333333333333333
CCS accuracy: 0.6666666666666666


## Aggregating probes from all layers 

In [21]:
def aggregate(credences):
    """Aggregate several credences into one estimate.

    Computes ``r = sqrt(prod(c / (1 - c)))`` over the given credences and
    returns ``1 / (1 + r)``. The result is 0.5 when the product of the odds
    is 1 and drops below 0.5 when that product exceeds 1.

    Args:
        credences: Sequence (list, tuple or numpy array) of probabilities.
            Values of exactly 1 make the odds infinite; mixing exact 0 and 1
            yields NaN.

    Returns:
        The aggregated estimate as a scalar.
    """
    # Accept plain Python sequences as well as arrays: `1 - [..]` would raise
    # TypeError on a list. The input dtype is left untouched.
    credences = np.asarray(credences)
    result = np.sqrt(np.prod(credences * (1 / (1 - credences))))
    return 1 / (1 + result)

Get the hidden state representations from the specified layers

In [10]:
# Number of examples to process and the layer indices handed to the helper.
num_example = 30
layer_idx = [1, 3, 5, 7, -1]

# The helper's three return values are unpacked as per-example negative states,
# positive states and labels (naming as used in this notebook).
all_neg_hs, all_pos_hs, all_y = utils.get_hidden_states_multiple_layers(
    model,
    tokenizer,
    data,
    dataset_name,
    model_type,
    layer_idx,
    num_samples=num_example,
)

100%|██████████| 30/30 [02:31<00:00,  5.05s/it]


Train a probe for each specified layer and collect its credences on the test half

In [38]:
# Train one probe per requested layer and collect its credences on the test half.
# probes = []
probabilities = []

# Shape check on the positive hidden states at position 0 of `layer_idx`.
pos_hs = np.stack([cur_pos_hs[0] for cur_pos_hs in all_pos_hs])
print(pos_hs.shape)

# The split point and the label split do not depend on the layer, so they are
# computed once here; this also defines `y_test` before the loop body runs.
half = num_example // 2
y_train, y_test = all_y[:half], all_y[half:]

for i in range(len(layer_idx)):
    # Gather position i of every example, indexed as all_*_hs[example][i].
    pos_hs = np.stack([cur_pos_hs[i] for cur_pos_hs in all_pos_hs])
    neg_hs = np.stack([cur_neg_hs[i] for cur_neg_hs in all_neg_hs])
    neg_hs_train, neg_hs_test = neg_hs[:half], neg_hs[half:]
    pos_hs_train, pos_hs_test = pos_hs[:half], pos_hs[half:]
    credence = all_classes.CE(neg_hs_train, pos_hs_train)
    credence.repeated_train()
    # probes.append(credence.best_probe)
    # Convert the returned credences to a numpy array before storing them.
    probabilities.append(credence.get_credence(neg_hs_test, pos_hs_test).detach().cpu().numpy())


(30, 2048)


Test accuracy by aggregating all the probes

In [66]:
# Arrange the collected credences as (layers, test examples), then fuse each
# example's per-layer credences into one value.
n_test = num_example // 2
probabilities = np.array(probabilities).reshape(len(layer_idx), n_test)
probs_agg = np.array([aggregate(per_layer) for per_layer in probabilities.T])

# Threshold at 0.5 and compare against the held-out labels.
predictions = (probs_agg < 0.5).astype(int)
acc = (predictions == y_test).mean()
# Report the better of the two possible label assignments.
acc = max(acc, 1 - acc)
print("CCS accuracy by aggregating truth vectors: {}".format(acc))

0.8


Logistic Regression