In [36]:
%load_ext autoreload
%autoreload 2
import utils
import torch
import numpy as np
import all_classes
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM, AutoModel
from sklearn.linear_model import LogisticRegression

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:

#load datasets
dataset_name = "imdb"
# dataset_name = "amazon_polarity"
data = load_dataset("imdb")["test"]
# data = load_dataset("amazon_polarity")["test"]

In [38]:
# Here are a few different model options you can play around with:
model_name = "llama3.2"
# model_name = "llama3.3"

# the number of hidden dimensions of the model
hidden_size = 2048
num_layers = 16

cache_dir = None

if model_name == "llama3.2":
    model_type = "decoder"
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", cache_dir=cache_dir, token=True)
    model = AutoModel.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", cache_dir=cache_dir, token=True)
    hidden_size = model.config.hidden_size
    num_layers = model.config.num_hidden_layers
else:
    print("Not implemented!")

In [39]:
neg_hs, pos_hs, y = utils.get_hidden_states_many_examples(model, tokenizer, data, dataset_name, model_type)

100%|██████████| 100/100 [01:51<00:00,  1.11s/it]


In [40]:
print(y)
print(np.shape(neg_hs))
print(np.shape(pos_hs))

[1 0 0 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 1 0 0 1 0 1 0
 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 1 0 1 0 0 1 0 0 1 0 1 0 0 1 1 0 1 0 1 1 0 0
 0 1 0 1 0 1 1 1 0 1 1 1 0 1 1 0 0 1 0 1 0 1 1 0 1 0]
(100, 2048)
(100, 2048)



## Let's verify that the model's representations are good

Before trying CCS, let's make sure there exists a direction that classifies examples as true vs false with high accuracy; if logistic regression accuracy is bad, there's no hope of CCS doing well.

In [46]:

# let's create a simple 50/50 train split (the data is already randomized)
n = len(y)
neg_hs_train, neg_hs_test = neg_hs[:n//2], neg_hs[n//2:]
pos_hs_train, pos_hs_test = pos_hs[:n//2], pos_hs[n//2:]
y_train, y_test = y[:n//2], y[n//2:]

# for simplicity we can just take the difference between positive and negative hidden states
# (concatenating also works fine)
x_train = neg_hs_train - pos_hs_train
x_test = neg_hs_test - pos_hs_test

lr = LogisticRegression(class_weight="balanced")
lr.fit(x_train, y_train)
print("Logistic regression accuracy: {}".format(lr.score(x_test, y_test)))

ccs = all_classes.CE(neg_hs_train, pos_hs_train)
ccs.repeated_train()

# Evaluate
ccs_acc = credence.get_acc(neg_hs_test, pos_hs_test, y_test)
print("CCS accuracy: {}".format(ccs_acc))
print(ccs.get_credence(neg_hs_test, pos_hs_test))

Logistic regression accuracy: 0.92
CCS accuracy: 0.8
tensor([[0.5633],
        [0.8014],
        [0.5043],
        [0.4963],
        [0.6487],
        [0.1698],
        [0.0342],
        [0.9875],
        [0.4748],
        [0.3775],
        [0.5047],
        [0.4101],
        [0.5081],
        [0.4992],
        [0.1594],
        [0.7513],
        [0.5758],
        [0.0624],
        [0.9953],
        [0.2548],
        [0.9792],
        [0.2900],
        [0.4808],
        [0.0193],
        [0.3323],
        [0.9421],
        [0.0259],
        [0.5765],
        [0.4997],
        [0.8542],
        [0.7259],
        [0.9990],
        [0.4907],
        [0.9130],
        [0.9984],
        [0.4534],
        [0.4996],
        [0.6264],
        [0.0185],
        [0.1096],
        [0.4570],
        [0.5977],
        [0.0221],
        [0.8957],
        [0.4941],
        [0.9835],
        [0.9896],
        [0.0635],
        [0.9999],
        [0.1919]])


## Aggregating probes from all layers 

In [9]:
# aggregate a list of credences into one estimate
def aggregate(credences):
    result = np.sqrt(np.prod(credences * (1 / (1 - credences))))
    return 1 / (1 + result)

Get the hidden state representations from the specified layers

In [10]:
num_example = 30
layer_idx = [1, 3, 5, 7, -1]
all_neg_hs, all_pos_hs, all_y = utils.get_hidden_states_multiple_layers(model, tokenizer, data, dataset_name, model_type, layer_idx, num_samples = num_example)

100%|██████████| 30/30 [03:06<00:00,  6.22s/it]


Get a list of probes for each specified layer

In [15]:
# probes = []
probabilities = []
pos_hs = [cur_pos_hs[0] for cur_pos_hs in all_pos_hs]
pos_hs = np.stack(pos_hs)
print(pos_hs.shape)
for i in range(len(layer_idx)):
    pos_hs = [cur_pos_hs[i] for cur_pos_hs in all_pos_hs]
    neg_hs = [cur_neg_hs[i] for cur_neg_hs in all_neg_hs]
    pos_hs = np.stack(pos_hs)
    neg_hs = np.stack(neg_hs)
    neg_hs_train, neg_hs_test = neg_hs[ : num_example // 2], neg_hs[num_example // 2 : ]
    pos_hs_train, pos_hs_test = pos_hs[ : num_example // 2], pos_hs[num_example // 2 : ]
    y_train, y_test = all_y[ : num_example // 2], all_y[ num_example // 2 : ]
    credence = all_classes.CE(neg_hs_train, pos_hs_train)
    credence.repeated_train()
    # probes.append(credence.best_probe)
    probabilities.append(credence.get_credence(neg_hs_test, pos_hs_test))


(30, 2048)


Test accuracy by aggregating all the probes

In [30]:
print(probabilities[0])
print(probabilities[1])
print(probabilities[2])
# predictions = (probabilities[0].detach().cpu().numpy() < 0.5).astype(int)[:, 0]
# acc = (predictions == y_test).mean()
# acc = max(acc, 1 - acc)
# print(acc)

tensor([[0.5022],
        [0.5017],
        [0.5004],
        [0.5004],
        [0.5004],
        [0.5024],
        [0.5018],
        [0.4987],
        [0.4901],
        [0.5029],
        [0.5008],
        [0.4996],
        [0.4945],
        [0.5025],
        [0.5018]])
tensor([[0.4890],
        [0.4951],
        [0.5100],
        [0.4958],
        [0.5020],
        [0.4816],
        [0.4753],
        [0.5096],
        [0.5498],
        [0.4735],
        [0.5042],
        [0.5167],
        [0.5412],
        [0.4819],
        [0.4772]])
tensor([[0.6642],
        [0.4816],
        [0.5450],
        [0.4949],
        [0.4844],
        [0.5454],
        [0.5778],
        [0.5506],
        [0.4958],
        [0.7104],
        [0.3469],
        [0.4951],
        [0.4944],
        [0.6091],
        [0.5000]])


Logistic Regression