In [11]:
%load_ext autoreload
%autoreload 2
import utils
import torch
import numpy as np
import all_classes
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM, AutoModel
from sklearn.linear_model import LogisticRegression

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:

# Load the evaluation data.
# `dataset_name` is the single source of truth: it selects the dataset that is
# loaded here and is also passed to utils.get_hidden_states_many_examples later,
# so the two can never disagree.
dataset_name = "imdb"
# dataset_name = "amazon_polarity"
data = load_dataset(dataset_name)["test"]

In [13]:
# Here are a few different model options you can play around with:
model_name = "llama3.2"
# model_name = "llama3.3"

cache_dir = None

if model_name == "llama3.2":
    model_type = "decoder"
    checkpoint = "meta-llama/Llama-3.2-1B-Instruct"
else:
    # Fail here rather than letting a later cell die with a confusing NameError
    # on `model` / `tokenizer`.
    raise NotImplementedError("Model '{}' is not implemented!".format(model_name))

# token=True makes the Hub client use the locally stored Hugging Face access token.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=cache_dir, token=True)
model = AutoModel.from_pretrained(checkpoint, cache_dir=cache_dir, token=True)

# Read the architecture sizes from the loaded config instead of hard-coding them,
# so they stay correct when another checkpoint is added above.
hidden_size = model.config.hidden_size
num_layers = model.config.num_hidden_layers

In [15]:
# Extract the negative/positive hidden states and the labels for the dataset.
# No `layer` or `n` is passed, so the helper's own defaults are used here.
neg_hs, pos_hs, y = utils.get_hidden_states_many_examples(
    model,
    tokenizer,
    data,
    dataset_name,
    model_type,
)

100%|██████████| 100/100 [01:56<00:00,  1.16s/it]


In [16]:
# Sanity check: show the labels, then the shape of each hidden-state array.
print(y)
for hidden_states in (neg_hs, pos_hs):
    print(np.shape(hidden_states))

[1 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 0 1 1 1 0 1 1 0 0 0 0 0 1 0 0 1 1 0 0 1
 1 1 0 1 0 0 1 1 1 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 1 1 1 1 1 0 0
 1 1 0 0 1 0 1 1 1 0 0 1 1 0 0 1 0 0 1 0 0 0 1 1 1 1]
(100, 2048)
(100, 2048)



## Let's verify that the model's representations are good

Before trying CCS, let's make sure there exists a direction that classifies examples as true vs false with high accuracy; if logistic regression accuracy is bad, there's no hope of CCS doing well.

In [18]:

# Simple 50/50 train/test split: first half trains, second half evaluates
# (the data is already randomized).
n = len(y)
half = n // 2
neg_hs_train, pos_hs_train, y_train = neg_hs[:half], pos_hs[:half], y[:half]
neg_hs_test, pos_hs_test, y_test = neg_hs[half:], pos_hs[half:], y[half:]

# Use the difference between the negative and positive hidden states as the
# feature vector (concatenating also works fine).
x_train = neg_hs_train - pos_hs_train
x_test = neg_hs_test - pos_hs_test

# Supervised baseline: fit on the training half, score on the held-out half.
lr = LogisticRegression(class_weight="balanced")
lr.fit(x_train, y_train)
test_accuracy = lr.score(x_test, y_test)
print("Logistic regression accuracy: {}".format(test_accuracy))

Logistic regression accuracy: 0.92


## Aggregating probes from all layers 

In [None]:
def aggregate(credences):
    """Aggregate several credences into one estimate.

    Each credence ``c`` is converted to odds ``c / (1 - c)``; the odds are
    multiplied together, the square root of the product is taken, and the
    result ``r`` is mapped back through ``1 / (1 + r)``.

    Args:
        credences: Array-like of probabilities (list, tuple or numpy array).

    Returns:
        The aggregated value as a numpy float.
    """
    # Coerce to a float array so plain Python lists/tuples work too
    # (`1 - [..]` on a list raises TypeError).
    credences = np.asarray(credences, dtype=float)
    # NOTE(review): the square root is the geometric mean of the odds only when
    # exactly two credences are given, and 1 / (1 + r) is the complement of
    # r / (1 + r) -- confirm both are intended before relying on this with
    # more than two probes.
    result = np.sqrt(np.prod(credences * (1 / (1 - credences))))
    return 1 / (1 + result)

def construct_ccs(layers, num_examples):
    """Extract hidden states for every requested layer.

    Calls ``utils.get_hidden_states_many_examples`` once per entry of
    ``layers`` (using the notebook-level ``model``, ``tokenizer``, ``data``,
    ``dataset_name`` and ``model_type``) with ``n=num_examples``.

    Args:
        layers: Iterable of layer indices passed to the helper as ``layer``.
        num_examples: Number of examples passed to the helper as ``n``.

    Returns:
        A tuple ``(pos_hs_all, neg_hs_all, y_all)`` of three lists. Entry ``k``
        of each list belongs to the ``k``-th element of ``layers``, i.e. the
        lists are indexed by position, not by layer number. Note the helper
        returns ``(neg_hs, pos_hs, y)`` while this function returns the
        positive list first.
    """
    per_layer = [
        utils.get_hidden_states_many_examples(
            model, tokenizer, data, dataset_name, model_type,
            layer=layer, n=num_examples,
        )
        for layer in layers
    ]
    neg_hs_all = [neg for neg, _pos, _labels in per_layer]
    pos_hs_all = [pos for _neg, pos, _labels in per_layer]
    y_all = [labels for _neg, _pos, labels in per_layer]
    return pos_hs_all, neg_hs_all, y_all


In [None]:
# Extract the hidden-state representations of all the events and their
# complements, once for each layer listed in `layers_idx`.
layers_idx = [1, 3, 5, 7, 15]
pos_hs_all, neg_hs_all, y_all = construct_ccs(layers=layers_idx, num_examples=30)

100%|██████████| 30/30 [00:40<00:00,  1.35s/it]
100%|██████████| 30/30 [00:40<00:00,  1.34s/it]
100%|██████████| 30/30 [00:35<00:00,  1.19s/it]
100%|██████████| 30/30 [00:34<00:00,  1.16s/it]
100%|██████████| 30/30 [00:40<00:00,  1.34s/it]


In [53]:
# Train one probe per extracted layer; probes[k] belongs to layers_idx[k].
probes = []
for position, layer in enumerate(layers_idx):
    # The *_all lists hold one entry per element of layers_idx, in order, so
    # they must be indexed by position, not by the layer number itself
    # (layer 15 would be out of range for a 5-element list).
    pos_hs = pos_hs_all[position]
    neg_hs = neg_hs_all[position]
    y = y_all[position]

    # Split on this extraction's own example count. The `n` defined in the
    # logistic-regression cell refers to a different, larger extraction and
    # would leave the test half empty here.
    half = len(y) // 2
    neg_hs_train, neg_hs_test = neg_hs[:half], neg_hs[half:]
    pos_hs_train, pos_hs_test = pos_hs[:half], pos_hs[half:]
    y_train, y_test = y[:half], y[half:]

    credence = all_classes.CE(neg_hs_train, pos_hs_train)
    credence.repeated_train()
    probes.append(credence.best_probe)


# Evaluate
# ccs_acc = credence.get_acc(neg_hs_test, pos_hs_test, y_test)
# print("CE accuracy: {}".format(ccs_acc))


Sequential(
  (0): Linear(in_features=2048, out_features=1, bias=True)
  (1): Sigmoid()
)


In [None]:
# print([param for param in probes[0].parameters()])

[Parameter containing:
tensor([[ 0.0027, -0.0632, -0.0114,  ...,  0.0663, -0.0689,  0.0006]],
       requires_grad=True), Parameter containing:
tensor([-0.0762], requires_grad=True)]
