In [1]:
%load_ext autoreload
%autoreload 2
import utils
import torch
import numpy as np
import all_classes
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM, AutoModel
from sklearn.linear_model import LogisticRegression
import itertools

  from .autonotebook import tqdm as notebook_tqdm


##### Load Dataset

In [35]:

#load datasets
dataset_name = "imdb"
# dataset_name = "amazon_polarity"
data_imdb_train = load_dataset("imdb")["train"]
data_imdb_train = np.array(data_imdb_train)
data_imdb_test = load_dataset("imdb")["test"]
data_imdb_test = np.array(data_imdb_test)

data_boolq_train = load_dataset("google/boolq")["train"]
data_boolq_train = np.array(data_boolq_train)
data_boolq_test = load_dataset("google/boolq")["validation"]
data_boolq_test = np.array(data_boolq_test)
# data = load_dataset("amazon_polarity")["test"]

##### Load Model

In [36]:
# Here are a few different model options you can play around with:
model_name = "llama3.2"
# model_name = "llama3.3"

# the number of hidden dimensions of the model
hidden_size = 2048
num_layers = 16

cache_dir = None

if model_name == "llama3.2":
    model_type = "decoder"
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", cache_dir=cache_dir, token=True)
    model = AutoModel.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", cache_dir=cache_dir, token=True)
    hidden_size = model.config.hidden_size
    num_layers = model.config.num_hidden_layers
else:
    print("Not implemented!")

In [6]:
print(data_boolq_test[100])

{'question': 'can you have too much oxygen in your body', 'answer': True, 'passage': 'The result of breathing increased partial pressures of oxygen is hyperoxia, an excess of oxygen in body tissues. The body is affected in different ways depending on the type of exposure. Central nervous system toxicity is caused by short exposure to high partial pressures of oxygen at greater than atmospheric pressure. Pulmonary and ocular toxicity result from longer exposure to increased oxygen levels at normal pressure. Symptoms may include disorientation, breathing problems, and vision changes such as myopia. Prolonged exposure to above-normal oxygen partial pressures, or shorter exposures to very high partial pressures, can cause oxidative damage to cell membranes, collapse of the alveoli in the lungs, retinal detachment, and seizures. Oxygen toxicity is managed by reducing the exposure to increased oxygen levels. Studies show that, in the long term, a robust recovery from most types of oxygen tox

### Improving Performance of Probes 

##### Functions for Formatting Prompts

In [37]:
def format_imdb(text, label, prompt_version = 1):
    """
    Given an imdb example ("text") and corresponding label (0 for negative, or 1 for positive),
    returns a zero-shot prompt for that example (which includes that label as the answer).

    (This is just one example of a simple, manually created prompt.)
    """
    return "Consider the sentiment of the following review:\n" + text + "\nDoes the above movie review express a " + ["negative", "positive"][label] + " sentiment? " + "Answer: " + "Yes"

def format_boolq(text, question, label):
    """
    Given a BoolQA example ("text") with the corresponding question and label (1 for "Yes" and 0 for "No"),
    returns a zero-shot prompt for that example (which includes that label as the answer).

    (This is just one example of a simple, manually created prompt.)
    """
    return "Consider the following passage:\n" + text + "\n" + "After reading this passage, I have a question: " + question + "?" + " True or False?" + " Answer: " + ["True", "False"][label]

#### Some helper functions

In [38]:
def normalize(x, var_normalize = False):
    """
    Mean-normalizes the data x (of shape (n, d))
    If self.var_normalize, also divides by the standard deviation
    """
    normalized_x = x - x.mean(axis=0, keepdims=True)
    if var_normalize:
        normalized_x /= normalized_x.std(axis=0, keepdims=True)

    return normalized_x

def get_credence(probe, x0_test, x1_test, device = "cpu"):
    """
    Given a probe, compute credence for the current parameters on the given test inputs
    """
    x0 = torch.tensor(normalize(x0_test), dtype=torch.float, requires_grad=False, device=device)
    x1 = torch.tensor(normalize(x1_test), dtype=torch.float, requires_grad=False, device=device)
    with torch.no_grad():
        p0, p1 = probe(x0), probe(x1)
    avg_confidence = 0.5 * (p0 + (1 - p1))

    return avg_confidence

def get_random_samples(data_set, n):
    return np.random.choice(data_set, n)


def get_hidden_states_many_examples(model, tokenizer, data, dataset_name, model_type, params):
    """
    Given an encoder-decoder model, a list of data, computes the contrast hidden states on n random examples by probing the model according to the specified parameters (param = (layer_indices, token_positions, prompt_versions))
    Returns numpy arrays of shape (n, hidden_dim) for each candidate label, along with a boolean numpy array of shape (n,)
    with the ground truth labels
    
    This is deliberately simple so that it's easy to understand, rather than being optimized for efficiency
    """
    # setup
    model.eval()
    all_neg_hs, all_pos_hs, all_gt_labels = [], [], []
    layer, token_pos, prompt_version = params 

    # loop
    for sample in data:
        if dataset_name == "imdb":
            text, true_label = sample["text"], sample["label"]
            neg_hs = utils.get_hidden_states(model, tokenizer, format_imdb(text, 0, prompt_version), token_pos, layer, model_type=model_type)
            pos_hs = utils.get_hidden_states(model, tokenizer, format_imdb(text, 1, prompt_version), token_pos, layer, model_type=model_type)
        elif dataset_name == "boolq":
            text, question, true_label = sample["passage"], sample["question"], sample["answer"]
            neg_hs = utils.get_hidden_states(model, tokenizer, format_boolq(text, question, 0), token_pos, layer, model_type=model_type)
            pos_hs = utils.get_hidden_states(model, tokenizer, format_boolq(text, question, 1), token_pos, layer, model_type=model_type)

        # collect
        all_neg_hs.append(neg_hs)
        all_pos_hs.append(pos_hs)
        all_gt_labels.append(true_label)

    all_neg_hs = np.stack(all_neg_hs)
    all_pos_hs = np.stack(all_pos_hs)
    all_gt_labels = np.stack(all_gt_labels)

    return all_neg_hs, all_pos_hs, all_gt_labels

##### Functions for Aggregating Credences

In [39]:
# aggregate a list of credences into one estimate using geometric mean
def aggregate_gmean(credences):
    k = np.shape(credences)[0]
    result = np.power(np.prod(credences * (1 / (1 - credences))), 1 / k)
    return 1 / (1 + result)

# (To-DO) aggregragate a list of credences using weighted geometric mean
def aggregate_gmean_weighted(credences, weights):
    pass

#### Testing a single probe on a different dataset

##### Constructing CCS Pairs

In [41]:
# hyper-parameters
num_example = 100
# layer_idices = [1, 3, 5, 7, -1] 
layer_indices = [-1]
# token_positions = [-2, -1]
token_positions = [-1]
prompt_versions = [1] 
imdb_train_sample = get_random_samples(data_imdb_train, num_example) 
imdb_test_sample = get_random_samples(data_imdb_test, num_example)
boolq_test_sample = get_random_samples(data_boolq_test, num_example)
boolq_train_sample = get_random_samples(data_boolq_train, num_example)

In [None]:
hidden_states_all_imdb_train = {}
for (token_pos, layer_idx, version) in itertools.product(token_positions, layer_indices, prompt_versions):
    neg_hs, pos_hs, y = get_hidden_states_many_examples(model, tokenizer, imdb_train_sample, "imdb", model_type, (token_pos, layer_idx, version))
    hidden_states_all_imdb_train[(token_pos, layer_idx, version)] = (neg_hs, pos_hs, y)

# neg_hs, pos_hs, y = get_hidden_states_many_examples(model, tokenizer, data_sampled, dataset_name, model_type, (-1, -1, 1))

In [42]:
hidden_states_all_boolq_train = {}
for (token_pos, layer_idx, version) in itertools.product(token_positions, layer_indices, prompt_versions):
    neg_hs, pos_hs, y = get_hidden_states_many_examples(model, tokenizer, boolq_train_sample, "boolq", model_type, (token_pos, layer_idx, version))
    hidden_states_all_boolq_train[(token_pos, layer_idx, version)] = (neg_hs, pos_hs, y)

In [20]:
hidden_states_all_boolq_test = {}
for (token_pos, layer_idx, version) in itertools.product(token_positions, layer_indices, prompt_versions):
    neg_hs, pos_hs, y = get_hidden_states_many_examples(model, tokenizer, boolq_test_sample, "boolq", model_type, (token_pos, layer_idx, version))
    hidden_states_all_boolq_test[(token_pos, layer_idx, version)] = (neg_hs, pos_hs, y)

In [21]:
hidden_states_all_imdb_test = {}
for (token_pos, layer_idx, version) in itertools.product(token_positions, layer_indices, prompt_versions):
    neg_hs, pos_hs, y = get_hidden_states_many_examples(model, tokenizer, imdb_test_sample, "imdb", model_type, (token_pos, layer_idx, version))
    hidden_states_all_imdb_test[(token_pos, layer_idx, version)] = (neg_hs, pos_hs, y)

##### Training Probes

In [29]:
# print(hidden_states_all[(-1, -1, 1)])

In [None]:
probes = {} # a python dictionary of best probes accordng to the specified testing hyperparameters
probabilities_imdb = {} # a python dictionary of credences for imdb 
probabilities_boolq = {} # a python dictionary of credences for boolq 
log_reg = {}
for (token_pos, layer_idx, version) in itertools.product(token_positions, layer_indices, prompt_versions):
    neg_hs_train, pos_hs_train, y_train = hidden_states_all_boolq_train[(token_pos, layer_idx, version)]
    # neg_hs_train, pos_hs_train = neg_hs[: num_example // 2], pos_hs[: num_example // 2]
    # neg_hs_test, pos_hs_test = neg_hs[num_example // 2: ], pos_hs[num_example // 2 :]
    neg_hs_test_boolq, pos_hs_test_boolq, y_test_boolq = hidden_states_all_boolq_test[(token_pos, layer_idx, version)]
    neg_hs_test_imdb, pos_hs_test_imdb, y_test_imdb = hidden_states_all_imdb_test[(token_pos, layer_idx, version)]
    # y_train, y_test = y[: num_example // 2], y[num_example // 2: ]
    credence_estimator = all_classes.CE(neg_hs_train, pos_hs_train)
    credence_estimator.repeated_train()
    cur_best_probe = credence_estimator.get_best_probe()
    probes[(token_pos, layer_idx, version)] = cur_best_probe
    probabilities_imdb[(token_pos, layer_idx, version)] = credence_estimator.get_credence(neg_hs_test_imdb, pos_hs_test_imdb).detach().cpu().numpy()
    probabilities_boolq[(token_pos, layer_idx, version)] = credence_estimator.get_credence(neg_hs_test_boolq, pos_hs_test_boolq).detach().cpu().numpy()

In [46]:
# print(probabilities)

##### Testing Accuracy of a Single Probe

In [34]:
_, _, y_test_boolq = hidden_states_all_boolq_test[(-1, -1, 1)]
_, _, y_test_imdb = hidden_states_all_imdb_test[(-1, -1, 1)]
credences_aggregated_boolq = []
credences_aggregated_imdb = []
for i in range(num_example):
    # list of all credences of example i
    all_estimates_imdb = [credences[i] for credences in probabilities_imdb.values()]
    all_estimates_boolq = [credences[i] for credences in probabilities_boolq.values()]
    credences_aggregated_boolq.append(aggregate_gmean(np.array(all_estimates_boolq)))
    credences_aggregated_imdb.append(aggregate_gmean(np.array(all_estimates_imdb)))
credences_aggregated_imdb = np.array(credences_aggregated_imdb)
credences_aggregated_boolq = np.array(credences_aggregated_boolq)
# print(credences_aggregated_boolq)
# print(credences_aggregated_imdb)
# y_test = all_y[num_example // 2 : ]
predictions_boolq = (credences_aggregated_boolq < 0.5).astype(int)
acc_boolq = (predictions_boolq == y_test_boolq).mean()
acc_boolq = max(acc_boolq, 1 - acc_boolq)
print("Accuracy on BoolQ dataset: {}".format(acc_boolq))

predictions_imdb = (credences_aggregated_imdb < 0.5).astype(int)
acc_imdb = (predictions_imdb == y_test_imdb).mean()
acc_imdb = max(acc_imdb, 1 - acc_imdb)
print("Accuracy on imdb dataset: {}".format(acc_imdb))

Accuracy on BoolQ dataset: 0.52
Accuracy on imdb dataset: 0.95


In [24]:
print(y_test_boolq)
print(y_test_imdb)

[ True False  True False  True  True  True  True False  True  True False
  True False False False False  True False False  True  True  True False
 False  True  True  True  True  True False  True False  True  True False
  True  True  True  True  True  True  True False  True False  True  True
  True  True  True False False  True  True False False False  True False
  True False False  True  True  True  True  True  True  True  True  True
  True  True False False False False False False  True  True  True False
 False  True  True False  True  True  True  True  True False False False
  True  True  True  True]
[0 0 1 1 0 1 0 0 1 0 1 1 1 1 0 0 1 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0
 1 1 0 1 1 1 1 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 0 1 0 1 1 0 1 1 0 1 1 0 1 0 0
 1 1 0 0 1 1 1 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1]


##### Testing Accuracies of the Probes

In [None]:
# list of aggregated credences
credences_aggregated = []
for i in range(num_example // 2):
    # list of all credences of example i
    all_estimates = [credences[i] for credences in probabilities.values()]
    credences_aggregated.append(aggregate_gmean(np.array(all_estimates)))

# list of true labels from the test set (the training examples are the same in training the probes)
_, _, y_test = hidden_states_all_testing[(-1, -1, 1)]
credences_aggregated = np.array(credences_aggregated)
# y_test = all_y[num_example // 2 : ]
predictions = (credences_aggregated < 0.5).astype(int)
acc = (predictions == y_test).mean()
acc = max(acc, 1 - acc)
print(acc)
    

KeyError: (-2, 1, 'imdb_1')

##### Logistic Regression