In [40]:
%load_ext autoreload
%autoreload 2
import utils
import torch
import numpy as np
import all_classes
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM, AutoModel
from sklearn.linear_model import LogisticRegression

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:

# Load the test split of the dataset under study.
# dataset_name is the single switch: it selects what gets downloaded here and is
# also passed to the utils helpers later, so the two can no longer drift apart.
dataset_name = "imdb"
# dataset_name = "amazon_polarity"
data = load_dataset(dataset_name)["test"]

In [42]:
# Here are a few different model options you can play around with:
model_name = "llama3.2"
# model_name = "llama3.3"

# Optional Hugging Face cache directory; None keeps the library default.
cache_dir = None

if model_name == "llama3.2":
    model_type = "decoder"
    checkpoint = "meta-llama/Llama-3.2-1B-Instruct"
    # token=True tells transformers to use the locally stored Hugging Face login token.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=cache_dir, token=True)
    model = AutoModel.from_pretrained(checkpoint, cache_dir=cache_dir, token=True)
else:
    # Fail loudly: merely printing would leave `tokenizer`/`model` undefined (or, worse,
    # stale from a previous run of this cell) and every later cell would misbehave.
    raise NotImplementedError("Model {!r} is not implemented!".format(model_name))

# Read the architecture sizes from the loaded config instead of hard-coding them.
hidden_size = model.config.hidden_size
num_layers = model.config.num_hidden_layers

In [69]:
# Preview the prompt that the IMDB template produces for one review with label 0.
text = "This movie is terrible!"
formatted_prompt = utils.format_imdb_3(text, 0)
print(formatted_prompt)

The following movie review expresses a negative sentiment:
This movie is terrible!
 Does the above movie review express a negative sentiment?Answer: Yes


In [68]:
# Smoke test: extract the hidden state of one prompt at layer=-1 for two token
# positions (-1 and -2; presumably the last and second-to-last token — confirm in utils).
test = "Is the sky blue? Answer: True"
output = utils.get_decoder_hidden_states(model, tokenizer, test, layer=-1, token_pos=-1)
output_2 = utils.get_decoder_hidden_states(model, tokenizer, test, layer=-1, token_pos=-2)
for hidden_state in (output, output_2):
    print(hidden_state)


[-1.10257590e+00 -3.79139977e-03  3.79273725e+00 ...  7.21308661e+00
  1.11337505e-01  1.13481593e+00]
[-2.1286638   5.4883676   3.6976395  ... -2.765409   -1.9739221
  0.19073382]


In [43]:
# Collect the neg/pos hidden states and the labels for n=30 examples.
neg_hs, pos_hs, y = utils.get_hidden_states_many_examples(
    model,
    tokenizer,
    data,
    dataset_name,
    model_type,
    n=30,
)

100%|██████████| 30/30 [00:37<00:00,  1.24s/it]


In [44]:
# Show the labels, then the shape of each hidden-state array.
print(y)
for hs in (neg_hs, pos_hs):
    print(np.shape(hs))

[0 0 1 0 0 0 0 0 1 0 1 1 1 1 0 0 1 0 1 0 1 1 0 0 0 0 1 0 1 1]
(30, 2048)
(30, 2048)



## Let's verify that the model's representations are good

Before trying CCS, let's make sure there exists a direction that classifies examples as true vs false with high accuracy; if logistic regression accuracy is bad, there's no hope of CCS doing well.

In [54]:

# Hold out the second half of the examples for evaluation
# (the data is already randomized, so a positional 50/50 train/test split is enough).
n = len(y)
split = n // 2
neg_hs_train, neg_hs_test = neg_hs[:split], neg_hs[split:]
pos_hs_train, pos_hs_test = pos_hs[:split], pos_hs[split:]
y_train, y_test = y[:split], y[split:]

# Features for the supervised baseline: the difference between the two hidden
# states of each pair (concatenating them also works fine).
x_train = neg_hs_train - pos_hs_train
x_test = neg_hs_test - pos_hs_test

# Supervised reference point: a class-balanced logistic regression on the differences.
lr = LogisticRegression(class_weight="balanced")
lr.fit(x_train, y_train)
lr_acc = lr.score(x_test, y_test)
print("Logistic regression accuracy: {}".format(lr_acc))

# Probe from all_classes.CE, built from the hidden states only (no labels are passed in).
ccs = all_classes.CE(neg_hs_train, pos_hs_train)
ccs.repeated_train()

# Score the probe on the held-out half.
ccs_acc = ccs.get_acc(neg_hs_test, pos_hs_test, y_test)
print("CCS accuracy: {}".format(ccs_acc))

Logistic regression accuracy: 0.26666666666666666
CCS accuracy: 0.6


## Aggregating probes from multiple layers

In [55]:
def aggregate(credences):
    """Pool several credences into one value via the geometric mean of their odds.

    Each credence ``c`` is turned into odds ``c / (1 - c)``; the odds are combined
    with a geometric mean and the function returns ``1 / (1 + pooled_odds)``.
    Note the orientation: this is the *complement* of the pooled probability, so
    inputs that are mostly above 0.5 give a result below 0.5.

    The computation is done in log-odds space so that many inputs cannot
    overflow/underflow the product, and credences are clipped away from exactly
    0 and 1 so a saturated probe cannot cause a division by zero or a NaN.

    Args:
        credences: array-like of probabilities in [0, 1]; the first axis is the
            one that is averaged over (plain Python lists are accepted).

    Returns:
        A float64 scalar in (0, 1).

    Raises:
        ValueError: if ``credences`` is empty.
    """
    values = np.asarray(credences, dtype=np.float64)
    k = np.shape(values)[0]
    if k == 0:
        raise ValueError("aggregate() needs at least one credence")

    # Keep log(c) and log(1 - c) finite for saturated credences.
    eps = np.finfo(np.float64).eps
    clipped = np.clip(values, eps, 1.0 - eps)

    # log of the geometric mean of the odds == arithmetic mean of the log-odds.
    log_odds = np.log(clipped) - np.log1p(-clipped)
    mean_log_odds = np.sum(log_odds) / k
    return 1.0 / (1.0 + np.exp(mean_log_odds))

Get the hidden state representations from the specified layers

In [47]:
# Number of examples to use and the layers to probe (one probe per entry, five in total).
num_example = 30
layer_idx = [1, 3, 5, 7, -1]
all_neg_hs, all_pos_hs, all_y = utils.get_hidden_states_multiple_layers(
    model,
    tokenizer,
    data,
    dataset_name,
    model_type,
    layer_idx,
    num_samples=num_example,
)

100%|██████████| 30/30 [03:15<00:00,  6.51s/it]


Train one probe per specified layer and collect its credences on the held-out half of the examples

In [57]:
# Train one probe per selected layer and collect its credences on the held-out half.
probabilities = []

# Sanity check: stacking the per-example states of one layer gives a 2-D array
# (printed as (30, 2048) for the current settings).
pos_hs = np.stack([cur_pos_hs[0] for cur_pos_hs in all_pos_hs])
print(pos_hs.shape)

# The split point and the labels are identical for every layer, so compute them once.
split = num_example // 2
y_train, y_test = all_y[:split], all_y[split:]

for i in range(len(layer_idx)):
    # Gather the i-th requested layer from every example.
    pos_hs = np.stack([cur_pos_hs[i] for cur_pos_hs in all_pos_hs])
    neg_hs = np.stack([cur_neg_hs[i] for cur_neg_hs in all_neg_hs])
    neg_hs_train, neg_hs_test = neg_hs[:split], neg_hs[split:]
    pos_hs_train, pos_hs_test = pos_hs[:split], pos_hs[split:]

    # NOTE(review): each probe is fit without labels, so its orientation may differ
    # between layers — confirm all_classes.CE fixes the sign before pooling credences.
    credence = all_classes.CE(neg_hs_train, pos_hs_train)
    credence.repeated_train()
    probabilities.append(credence.get_credence(neg_hs_test, pos_hs_test).detach().cpu().numpy())


(30, 2048)


Test accuracy by aggregating all the probes

In [62]:
# Arrange the per-layer credences as (number of layers, number of test examples).
probabilities = np.array(probabilities)
probabilities = probabilities.reshape(len(layer_idx), num_example // 2)
print(probabilities)

# Pool each test example's column of per-layer credences into a single value.
probs_agg = np.array([aggregate(column) for column in probabilities.T])

# aggregate() returns 1 / (1 + pooled odds), so values below 0.5 are mapped to label 1.
predictions = (probs_agg < 0.5).astype(int)
print(probs_agg)

[[0.4993142  0.49796697 0.50105333 0.4973266  0.4993839  0.4972742
  0.5045226  0.49668664 0.499156   0.4975614  0.4972448  0.5067036
  0.49774194 0.50550467 0.5012561 ]
 [0.49452716 0.49753243 0.5172639  0.49563998 0.4983451  0.47776687
  0.55358416 0.47949576 0.5051989  0.4884322  0.49134907 0.5500088
  0.4844451  0.5552918  0.526064  ]
 [0.60123444 0.49233457 0.50346875 0.42395252 0.5766477  0.46747336
  0.27280578 0.4759426  0.52295756 0.5264248  0.49693733 0.34228867
  0.47336924 0.53472066 0.7534312 ]
 [0.5852995  0.27904275 0.6241346  0.2313874  0.55167127 0.65759796
  0.01123709 0.42998642 0.93807113 0.67645633 0.54651463 0.01329583
  0.8072931  0.75286496 0.87512684]
 [0.5003134  0.12515824 0.82509303 0.487929   0.59937376 0.5022152
  0.01767434 0.23070891 0.9985726  0.49995178 0.4999254  0.00741782
  0.550017   0.7234493  0.49792603]]
[0.4634754  0.64301383 0.39433894 0.5785193  0.454614   0.47844917
 0.8639444  0.5831135  0.13290387 0.4607258  0.49357936 0.87278235
 0.427741

In [59]:
# Fraction of held-out examples whose prediction equals the label.
match_rate = (predictions == y_test).mean()
# Report the better of the two label orientations (match rate or its complement).
acc = max(match_rate, 1 - match_rate)
print("CCS accuracy by aggregating truth vectors: {}".format(acc))

CCS accuracy by aggregating truth vectors: 1.0


Logistic Regression