In [1]:
%load_ext autoreload
%autoreload 2
import utils
import torch
import numpy as np
import all_classes
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM, AutoModel
from sklearn.linear_model import LogisticRegression

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load the test split of the selected dataset.
# `dataset_name` is the single source of truth: it drives the loader call below and
# is also passed to the hidden-state extraction helper later in the notebook, so the
# two can no longer disagree. To switch datasets, change only this assignment.
dataset_name = "imdb"
# dataset_name = "amazon_polarity"
data = load_dataset(dataset_name)["test"]

In [20]:
# Here are a few different model options you can play around with.
# Only the names handled in the if/else below are actually supported.
model_name = "llama3.2"
# model_name = "llama3.3"

# Passed straight through to `from_pretrained`; None leaves the cache location
# up to the library.
cache_dir = None

if model_name == "llama3.2":
    model_type = "decoder"
    checkpoint = "meta-llama/Llama-3.2-1B-Instruct"
    # NOTE(review): token=True presumably relies on a locally stored Hugging Face
    # token for this gated checkpoint; confirm against the transformers docs.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=cache_dir, token=True)
    model = AutoModel.from_pretrained(checkpoint, cache_dir=cache_dir, token=True)
    # the number of hidden dimensions of the model, read from its config rather
    # than hard-coded so it always matches the loaded checkpoint
    hidden_size = model.config.hidden_size
else:
    # Fail fast: continuing would leave `model`, `tokenizer` and `model_type`
    # undefined (or stale from an earlier run) and break later cells obscurely.
    raise NotImplementedError(
        "Not implemented! Unsupported model_name: {!r}".format(model_name)
    )

In [31]:
# Optional scratch cell: poke at the Llama 3.2 architecture.
# Nothing here is required by the cells that follow in this section.
test_event = "Is the sky blue? Answer: Yes"
test_event_c = "Is the sky blue? Answer: No"

# Collect hidden states for the first prompt via the project helper.
phi_all = utils.get_decoder_hidden_states_all(
    model,
    tokenizer,
    test_event,
    num_layer=16,
)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15


In [4]:
# Extract hidden states and labels for a batch of dataset examples.
# The helper returns three values, unpacked here as neg_hs, pos_hs and y.
neg_hs, pos_hs, y = utils.get_hidden_states_many_examples(
    model,
    tokenizer,
    data,
    dataset_name,
    model_type,
)

100%|██████████| 100/100 [02:01<00:00,  1.21s/it]


In [27]:
# Sanity check: show the labels, then the shape of each hidden-state array
# (negative first, positive second).
print(y)
for hidden_states in (neg_hs, pos_hs):
    print(np.shape(hidden_states))

[1 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 0 1
 0 1 0 0 1 0 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1
 0 1 0 1 1 1 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 1 0 1 0]
(100, 2048)
(100, 2048)



## Let's verify that the model's representations are good

Before trying CCS, let's make sure there exists a direction that classifies examples as true vs false with high accuracy; if logistic regression accuracy is bad, there's no hope of CCS doing well.

In [10]:

# Simple 50/50 split: the first half of the examples trains the probe and the
# second half evaluates it (the data is already randomized).
n = len(y)
half = n // 2
neg_hs_train, pos_hs_train, y_train = neg_hs[:half], pos_hs[:half], y[:half]
neg_hs_test, pos_hs_test, y_test = neg_hs[half:], pos_hs[half:], y[half:]

# Features are the difference between the negative and positive hidden states
# of each example (concatenating also works fine).
x_train = neg_hs_train - pos_hs_train
x_test = neg_hs_test - pos_hs_test

# Supervised baseline: fit on the training half, score on the held-out half.
lr = LogisticRegression(class_weight="balanced")
lr.fit(x_train, y_train)
lr_acc = lr.score(x_test, y_test)
print("Logistic regression accuracy: {}".format(lr_acc))

Logistic regression accuracy: 0.9


## Testing CCS using different extensions

In [None]:
# CCS on the last-layer hidden states: trained without any labels, averaging
# multiple truth vectors (num_vec of them).
ccs = all_classes.CCS(neg_hs_train, pos_hs_train, num_vec=6)
ccs.repeated_train()

# Labels are used only here, to score the trained probe on the held-out half.
ccs_acc = ccs.get_acc(neg_hs_test, pos_hs_test, y_test)
print("CCS accuracy: {}".format(ccs_acc))

CCS accuracy: 0.92


In [None]:
# Train CCS and average truth vectors from multiple intermediate layers