In [11]:
%load_ext autoreload
%autoreload 2
import utils
import torch
import numpy as np
import all_classes
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM, AutoModel
from sklearn.linear_model import LogisticRegression
import itertools

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


##### Load Dataset

In [12]:

#load datasets
dataset_name = "imdb"
# dataset_name = "amazon_polarity"
data = load_dataset("imdb")["test"]
data = np.array(data)
# data_testing = load_dataset("amazon_polarity")["test"]
# data_testing = load_dataset("domenicrosati/TruthfulQA")["train"]
data_testing = load_dataset("google/boolq")["validation"]
data_testing = np.array(data_testing)
# data = load_dataset("amazon_polarity")["test"]

##### Load Model

In [13]:
# Here are a few different model options you can play around with:
model_name = "llama3.2"
# model_name = "llama3.3"

# the number of hidden dimensions of the model
hidden_size = 2048
num_layers = 16

cache_dir = None

if model_name == "llama3.2":
    model_type = "decoder"
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", cache_dir=cache_dir, token=True)
    model = AutoModel.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", cache_dir=cache_dir, token=True)
    hidden_size = model.config.hidden_size
    num_layers = model.config.num_hidden_layers
else:
    print("Not implemented!")

In [14]:
print(data_testing[0])

{'question': 'does ethanol take more energy make that produces', 'answer': False, 'passage': "All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separ

### Improving Performance of Probes 

##### Functions for Formatting Prompts

In [21]:
def format_imdb(text, label, prompt_version = 1):
    """
    Given an imdb example ("text") and corresponding label (0 for negative, or 1 for positive),
    returns a zero-shot prompt for that example (which includes that label as the answer).

    (This is just one example of a simple, manually created prompt.)
    """
    return "Consider the sentiment of the following review:\n" + text + "\nDoes the above movie review express a " + ["negative", "positive"][label] + " sentiment? " + "Answer: " + "Yes"

def format_boolq(text, question, label):
    """
    Given a BoolQA example ("text") with the corresponding question and label (1 for "Yes" and 0 for "No"),
    returns a zero-shot prompt for that example (which includes that label as the answer).

    (This is just one example of a simple, manually created prompt.)
    """
    return "Consider the following passage:\n" + text + "\n" + "After reading this passage, I have a question: " + question + "?" + " True or False?" + " Answer: " + ["True", "False"][label]

#### Some helper functions

In [34]:
def normalize(x, var_normalize = False):
    """
    Mean-normalizes the data x (of shape (n, d))
    If self.var_normalize, also divides by the standard deviation
    """
    normalized_x = x - x.mean(axis=0, keepdims=True)
    if var_normalize:
        normalized_x /= normalized_x.std(axis=0, keepdims=True)

    return normalized_x

def get_credence(probe, x0_test, x1_test, device = "cpu"):
    """
    Given a probe, compute credence for the current parameters on the given test inputs
    """
    x0 = torch.tensor(normalize(x0_test), dtype=torch.float, requires_grad=False, device=device)
    x1 = torch.tensor(normalize(x1_test), dtype=torch.float, requires_grad=False, device=device)
    with torch.no_grad():
        p0, p1 = probe(x0), probe(x1)
    avg_confidence = 0.5 * (p0 + (1 - p1))

    return avg_confidence

def get_random_samples(data_set, n):
    return np.random.choice(data_set, n)


def get_hidden_states_many_examples(model, tokenizer, data, dataset_name, model_type, params):
    """
    Given an encoder-decoder model, a list of data, computes the contrast hidden states on n random examples by probing the model according to the specified parameters (param = (layer_indices, token_positions, prompt_versions))
    Returns numpy arrays of shape (n, hidden_dim) for each candidate label, along with a boolean numpy array of shape (n,)
    with the ground truth labels
    
    This is deliberately simple so that it's easy to understand, rather than being optimized for efficiency
    """
    # setup
    model.eval()
    all_neg_hs, all_pos_hs, all_gt_labels = [], [], []
    layer, token_pos, prompt_version = params 

    # loop
    for sample in data:
        if dataset_name == "imdb":
            text, true_label = sample["text"], sample["label"]
            neg_hs = utils.get_hidden_states(model, tokenizer, format_imdb(text, 0, prompt_version), token_pos, layer, model_type=model_type)
            pos_hs = utils.get_hidden_states(model, tokenizer, format_imdb(text, 1, prompt_version), token_pos, layer, model_type=model_type)
        elif dataset_name == "boolq":
            text, question, true_label = sample["passage"], sample["question"], sample["answer"]
            neg_hs = utils.get_hidden_states(model, tokenizer, format_boolq(text, question, 0), token_pos, layer, model_type=model_type)
            pos_hs = utils.get_hidden_states(model, tokenizer, format_boolq(text, question, 1), token_pos, layer, model_type=model_type)

        # collect
        all_neg_hs.append(neg_hs)
        all_pos_hs.append(pos_hs)
        all_gt_labels.append(true_label)

    all_neg_hs = np.stack(all_neg_hs)
    all_pos_hs = np.stack(all_pos_hs)
    all_gt_labels = np.stack(all_gt_labels)

    return all_neg_hs, all_pos_hs, all_gt_labels

#### Functions for Formatting Prompts

In [None]:
def format_imdb(text, label):
    """
    Given an imdb example ("text") and corresponding label (0 for negative, or 1 for positive),
    returns a zero-shot prompt for that example (which includes that label as the answer).

    (This is just one example of a simple, manually created prompt.)
    """
    return "Consider the sentiment of the following review:\n" + text + "\nDoes the above movie review express a " + ["negative", "positive"][label] + " sentiment? " + "Answer: " + "Yes"

def format_boolqa(text, question, label):
    """
    Given a BoolQA example ("text") with the corresponding question and label (1 for "Yes" and 0 for "No"),
    returns a zero-shot prompt for that example (which includes that label as the answer).

    (This is just one example of a simple, manually created prompt.)
    """
    return "Consider the following passage:\n" + text + "\n" + "After reading this passage, I have a question: " + question + "?" + " True or False?" + " Answer: " + ["True", "False"][label]

In [8]:
print(data_testing[0])
text, question, answer = data_testing[0]["passage"], data_testing[0]["question"], data_testing[0]["answer"]
print(format_boolqa(text, question, answer))

{'question': 'does ethanol take more energy make that produces', 'answer': False, 'passage': "All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separ

##### Functions for Aggregating Credences

In [9]:
# aggregate a list of credences into one estimate using geometric mean
def aggregate_gmean(credences):
    k = np.shape(credences)[0]
    result = np.power(np.prod(credences * (1 / (1 - credences))), 1 / k)
    return 1 / (1 + result)

# (To-DO) aggregragate a list of credences using weighted geometric mean
def aggregate_gmean_weighted(credences, weights):
    pass

#### Testing a single probe on a different dataset

##### Constructing CCS Pairs

In [36]:
# hyper-parameters
num_example = 100
# layer_idices = [1, 3, 5, 7, -1] 
layer_indices = [-1]
# token_positions = [-2, -1]
token_positions = [-1]
prompt_versions = [1] 
data_sampled = get_random_samples(data, num_example)
data_sampled_testing = get_random_samples(data_testing, num_example)

In [30]:
hidden_states_all = {}
for (token_pos, layer_idx, version) in itertools.product(token_positions, layer_indices, prompt_versions):
    neg_hs, pos_hs, y = get_hidden_states_many_examples(model, tokenizer, data_sampled, dataset_name, model_type, (token_pos, layer_idx, version))
    hidden_states_all[(token_pos, layer_idx, version)] = (neg_hs, pos_hs, y)

# neg_hs, pos_hs, y = get_hidden_states_many_examples(model, tokenizer, data_sampled, dataset_name, model_type, (-1, -1, 1))

In [38]:
hidden_states_all_testing = {}
for (token_pos, layer_idx, version) in itertools.product(token_positions, layer_indices, prompt_versions):
    neg_hs, pos_hs, y = get_hidden_states_many_examples(model, tokenizer, data_sampled_testing, "boolq", model_type, (token_pos, layer_idx, version))
    hidden_states_all_testing[(token_pos, layer_idx, version)] = (neg_hs, pos_hs, y)

##### Training Probes

In [31]:
print(hidden_states_all[(-1, -1, 1)])

(array([[ 0.04327318,  0.31138775,  3.0550823 , ...,  0.04004868,
         3.7564766 , -1.8230091 ],
       [-0.7510771 ,  0.90223783,  2.7056472 , ...,  1.541798  ,
         3.5128944 ,  1.4865934 ],
       [-0.6196999 ,  0.27405262,  2.9780238 , ...,  0.72248644,
         3.4120562 , -1.2429531 ],
       ...,
       [-1.4143257 ,  0.33925962,  3.459448  , ...,  1.6323417 ,
         3.7051625 , -0.52412343],
       [-0.7334767 ,  1.6315787 ,  3.8417377 , ...,  0.9043974 ,
         2.7949839 ,  0.23785806],
       [-0.99741   ,  0.31599328,  3.4712873 , ...,  1.2866738 ,
         4.4964705 , -0.12033664]], shape=(100, 2048), dtype=float32), array([[-0.97678214,  1.6671463 ,  2.8702164 , ...,  1.7272348 ,
         3.0499668 , -1.489957  ],
       [-0.531275  ,  1.0922753 ,  3.3301604 , ...,  0.3380359 ,
         4.8146496 ,  0.07132591],
       [-1.5116814 ,  1.4539078 ,  2.3280592 , ...,  1.8367846 ,
         3.756009  , -0.930765  ],
       ...,
       [-1.109737  ,  1.1791164 ,  3.80

In [40]:
probes = {} # a python dictionary of best probes accordng to the specified testing hyperparameters
probabilities = {} # a python dictionary of credences according to 
log_reg = {}
for (token_pos, layer_idx, version) in itertools.product(token_positions, layer_indices, prompt_versions):
    neg_hs_train, pos_hs_train, y_train = hidden_states_all[(token_pos, layer_idx, version)]
    neg_hs_test, pos_hs_test, y_test = hidden_states_all[(token_pos, layer_idx, version)]
    credence_estimator = all_classes.CE(neg_hs_train, pos_hs_train)
    credence_estimator.repeated_train()
    cur_best_probe = credence_estimator.get_best_probe()
    probes[(token_pos, layer_idx, version)] = cur_best_probe
    probabilities[(token_pos, layer_idx, version)] = credence_estimator.get_credence(neg_hs_test, pos_hs_test).detach().cpu().numpy()
    # compute logistic regression
    # x_train = neg_hs_train - pos_hs_train
    # x_test = neg_hs_test - pos_hs_test

    # lr = LogisticRegression(class_weight="balanced")
    # lr.fit(x_train, y_train)
    # log_reg[(token_pos, layer_idx, version)] = lr.score(x_test, y_test)

In [46]:
# print(probabilities)

##### Testing Accuracies of the Probes

In [41]:
# list of aggregated credences
credences_aggregated = []
for i in range(num_example // 2):
    # list of all credences of example i
    all_estimates = [credences[i] for credences in probabilities.values()]
    credences_aggregated.append(aggregate_gmean(np.array(all_estimates)))

# list of true labels from the test set (the training examples are the same in training the probes)
_, _, all_y = hidden_states_all[(-2, 1, "imdb_1")]
credences_aggregated = np.array(credences_aggregated)
y_test = all_y[num_example // 2 : ]
predictions = (credences_aggregated < 0.5).astype(int)
acc = (predictions == y_test).mean()
acc = max(acc, 1 - acc)
print(acc)
    

KeyError: (-2, 1, 'imdb_1')

##### Logistic Regression