# Initialization

preliminaries

In [3]:
import torch, os
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import matplotlib.pyplot as plt

In [4]:
# Mount Google Drive under /content/drive; the activation DataFrames are saved there later on.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


download tokenizer and model from hf

Note: do not download and run both models at the same time; Colab has limited resources, so loading both together is not guaranteed to work.

In [5]:
# Log in to the Hugging Face Hub without embedding the credential in the notebook.
# SECURITY: the access token that used to be hardcoded in this cell is exposed to anyone who
# can read the file and must be considered compromised -- revoke it in the account settings.
from getpass import getpass

from huggingface_hub import login

# Prefer the HF_TOKEN environment variable; otherwise ask for the token interactively
# (getpass does not echo what is typed into the cell output).
token = os.environ.get('HF_TOKEN') or getpass('Hugging Face token: ')
login(token)

In [6]:
# Hub identifier of the base-size T5Gemma (UL2) checkpoint; also reused to name the saved files.
model_id_b = 'google/t5gemma-b-b-ul2'
tokenizer_b = AutoTokenizer.from_pretrained(model_id_b)
# Weights are loaded in bfloat16 and placed automatically on the available device(s).
model_b = AutoModelForSeq2SeqLM.from_pretrained(model_id_b, device_map="auto", dtype=torch.bfloat16)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

In [None]:
# Hub identifier of the 2B T5Gemma (UL2) checkpoint; also reused to name the saved files.
model_id_2b = 'google/t5gemma-2b-2b-ul2'
tokenizer_2b = AutoTokenizer.from_pretrained(model_id_2b)
# Weights are loaded in bfloat16 and placed automatically on the available device(s).
model_2b = AutoModelForSeq2SeqLM.from_pretrained(model_id_2b, device_map="auto", dtype=torch.bfloat16)

## extracting the activations from the model

We use mean pooling to obtain vector representations of sentences because SentenceBERT has shown that it works better than the CLS token. In our case, there is no CLS token, so this was not even an option. SentenceT5 has confirmed that mean pooling is the strategy that yields the best results for T5-based models when it is necessary to extract the sentence representation.

So we use this strategy.

In [None]:
# non batched
# Single-sentence sanity check: run one forward pass and look at the hidden states that
# the model returns for the encoder and for the decoder.
model_b.eval()

text = 'tell me something about the human brain'

inputs = tokenizer_b(text, return_tensors="pt").to(model_b.device)

# The decoder only receives one start token (the tokenizer's BOS id).
start_token_id = tokenizer_b.bos_token_id
decoder_input_ids = torch.tensor([[start_token_id]], device=model_b.device)

with torch.no_grad():
    outputs = model_b(
        **inputs,
        decoder_input_ids=decoder_input_ids,
        output_hidden_states=True,
    )

# One entry per returned hidden state: squeeze(0) drops the leading dimension (a batch of one
# sentence here) of each encoder state, view(-1) flattens each decoder state into a 1-D vector.
encoder_hidden_states = torch.stack([e.cpu().squeeze(0) for e in outputs.encoder_hidden_states])
decoder_hidden_states = torch.stack([o.cpu().view(-1) for o in outputs.decoder_hidden_states])

#print(encoder_hidden_states.shape)
#print(decoder_hidden_states.shape)

# Number of hidden states returned for each stack, then the shape of the first one of each.
print(len(outputs.encoder_hidden_states), len(outputs.decoder_hidden_states))
print(outputs.encoder_hidden_states[0].shape, outputs.decoder_hidden_states[0].shape)

In [7]:
# batched function
def extract_activations_df(base_df, model, tokenizer, text_column, BATCH_SIZE=1):
  """Return a copy of `base_df` with one sentence vector per encoder/decoder hidden state.

  Every text of `text_column` is tokenized (padded and truncated) and passed through
  `model` together with a decoder input made of a single BOS token.

  * Encoder: each returned hidden state is mean-pooled over the non-padding tokens
    (attention mask) and stored in the column `encoder_layer_{k}`.
  * Decoder: the state of the single decoder position is stored in `decoder_layer_{k}`.

  `k` starts at 1 and follows the order of the `hidden_states` tuples returned by the
  model (entry 0 of those tuples is presumably the embedding output -- confirm for this
  architecture). Each cell holds a 1-D float32 numpy array.

  Args:
    base_df: DataFrame containing the texts; it is not modified.
    model: seq2seq model accepting `decoder_input_ids` and `output_hidden_states=True`.
    tokenizer: tokenizer matching `model`; its `bos_token_id` starts the decoder.
    text_column: name of the column of `base_df` holding the texts.
    BATCH_SIZE: number of texts processed per forward pass (positive integer).

  Returns:
    A new DataFrame: the columns of `base_df` followed by the activation columns.

  Raises:
    ValueError: if `BATCH_SIZE` is smaller than 1.
  """
  # A negative step would make the batch loop silently empty and return no activations.
  if BATCH_SIZE < 1:
    raise ValueError(f"BATCH_SIZE must be a positive integer, got {BATCH_SIZE}")

  df = base_df.copy()
  enc_results = {}
  dec_results = {}

  # mean pooling considering padding: pad positions are zeroed through the attention mask
  def masked_mean_pooling(hidden_states, attention_mask):
    # Pool on the device that holds the hidden state so that only the pooled
    # [batch, hidden] tensor has to be copied to the host afterwards.
    mask = attention_mask.to(hidden_states.device).unsqueeze(-1).float()
    summed = torch.sum(hidden_states * mask, dim=1)
    count = torch.clamp(mask.sum(dim=1), min=1e-9)  # guards against an all-padding row
    return summed / count

  model.eval()

  total_rows = len(df)
  texts = df[text_column]
  start_token_id = tokenizer.bos_token_id

  print(f"Start processing {total_rows} sentences...")

  for i in tqdm(range(0, total_rows, BATCH_SIZE)):
    # .iloc keeps the slicing positional whatever the index of the DataFrame is
    batch_texts = texts.iloc[i : i + BATCH_SIZE].tolist()
    inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True).to(model.device)
    current_batch_len = inputs.input_ids.shape[0]
    decoder_input_ids = torch.full((current_batch_len, 1), start_token_id, device=model.device)

    with torch.no_grad():
      outputs = model(
          **inputs,
          decoder_input_ids=decoder_input_ids,
          output_hidden_states=True,
      )

    # encoder extraction: final shape [Batch, Num_Layers, Hidden_Dim]
    batch_encoder_states = torch.stack([
        masked_mean_pooling(e, inputs.attention_mask).cpu()
        for e in outputs.encoder_hidden_states
    ], dim=1).to(torch.float32).numpy()

    # decoder extraction: final shape [Batch, Num_Layers, Hidden_Dim]
    # (squeeze(1) removes the single decoder position)
    batch_decoder_states = torch.stack([
        o.cpu().squeeze(1) for o in outputs.decoder_hidden_states
    ], dim=1).to(torch.float32).numpy()

    # saving the activation results into the dictionaries, one list of vectors per column
    for layer_idx in range(batch_encoder_states.shape[1]):
      col_name = f'encoder_layer_{layer_idx+1}'
      enc_results.setdefault(col_name, []).extend(batch_encoder_states[:, layer_idx, :])

    for layer_idx in range(batch_decoder_states.shape[1]):
      col_name = f'decoder_layer_{layer_idx+1}'
      dec_results.setdefault(col_name, []).extend(batch_decoder_states[:, layer_idx, :])

  print("Saving in the DataFrame...")
  for col_name, vectors in enc_results.items():
    df[col_name] = vectors

  for col_name, vectors in dec_results.items():
    df[col_name] = vectors

  print("Done! Columns added")
  return df

In [8]:
def save_activations_df(df, dataset_name, model_id, base_dir='/content/drive/MyDrive/DTCS_datasets'):
  """Pickle `df` to `{base_dir}/{dataset_name}_{model_name}`.

  `model_name` is the last '/'-separated component of `model_id`, so both
  'google/t5gemma-b-b-ul2' and an id without an organisation prefix are accepted.

  Args:
    df: DataFrame to persist.
    dataset_name: short dataset name used as file-name prefix.
    model_id: model identifier; only its last path component is used.
    base_dir: existing directory receiving the file (defaults to the Drive folder
      used by this notebook).
  """
  # Built once and with mixed quote styles: nesting the same quote character inside an
  # f-string is only valid syntax from Python 3.12 onwards.
  file_name = f"{dataset_name}_{model_id.split('/')[-1]}"
  path = f'{base_dir}/{file_name}'
  print(f'Saving {file_name} to GDrive...')
  df.to_pickle(path)
  print(f'Saved {file_name}')

# Datasets

## True/False

In [None]:
!curl azariaa.com/Content/Datasets/true-false-dataset.zip > true-false-dataset.zip
!unzip "true-false-dataset.zip" -d "true-false-dataset"

In [None]:
# create a dataframe from the csv files
dir_path = '/content/true-false-dataset/publicDataset'
# os.listdir returns the entries in arbitrary order: sort them so that the row order of
# tf_df (and of everything derived from it) is reproducible, and ignore non-CSV entries.
datasets_names = sorted(name for name in os.listdir(dir_path) if name.endswith('.csv'))
dfs = []

for dataset_name in datasets_names:
  path = f'{dir_path}/{dataset_name}'
  df = pd.read_csv(path)
  # the 'area' of a statement is the file name without its '_true_false.csv' suffix
  df.insert(loc=2, column='area', value=dataset_name.replace('_true_false.csv',''), allow_duplicates=True)
  dfs.append(df)

tf_df = pd.concat(dfs, ignore_index=True)
tf_df

model_b

In [None]:
# Settings for the True/False statements with the base model.
BATCH_SIZE = 128
text_column = 'statement'

# Extract the per-layer sentence activations, then persist the resulting DataFrame.
activation_tf_df = extract_activations_df(tf_df, model_b, tokenizer_b, text_column, BATCH_SIZE)
save_activations_df(activation_tf_df, 'true-false', model_id_b)

In [None]:
# NOTE(review): this cell only evaluates the function object (it is not called); it looks like a leftover.
save_activations_df

model_2b

In [None]:
# Settings for the True/False statements with the 2B model (smaller batches than for the base model).
BATCH_SIZE = 16
text_column = 'statement'

# Extract the per-layer sentence activations, then persist the resulting DataFrame.
activation_tf_df_2b = extract_activations_df(tf_df, model_2b, tokenizer_2b, text_column, BATCH_SIZE)
save_activations_df(activation_tf_df_2b, 'true-false', model_id_2b)

In [None]:
activation_tf_df_2b

## CoLA

In [9]:
!wget https://nyu-mll.github.io/CoLA/cola_public_1.1.zip
!unzip cola_public_1.1.zip

--2025-11-26 10:32:57--  https://nyu-mll.github.io/CoLA/cola_public_1.1.zip
Resolving nyu-mll.github.io (nyu-mll.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to nyu-mll.github.io (nyu-mll.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 255330 (249K) [application/x-zip-compressed]
Saving to: ‘cola_public_1.1.zip’


2025-11-26 10:32:58 (14.0 MB/s) - ‘cola_public_1.1.zip’ saved [255330/255330]

Archive:  cola_public_1.1.zip
   creating: cola_public/
  inflating: cola_public/README      
   creating: cola_public/tokenized/
  inflating: cola_public/tokenized/in_domain_dev.tsv  
  inflating: cola_public/tokenized/in_domain_train.tsv  
  inflating: cola_public/tokenized/out_of_domain_dev.tsv  
   creating: cola_public/raw/
  inflating: cola_public/raw/in_domain_dev.tsv  
  inflating: cola_public/raw/in_domain_train.tsv  
  inflating: cola_public/raw/out_of_domain_dev.tsv  


In [10]:
# Build a single CoLA DataFrame out of the raw TSV files.
path = '/content/cola_public/raw/'
cola_files = os.listdir(path) # contains ['out_of_domain_dev.tsv', 'in_domain_train.tsv', 'in_domain_dev.tsv']
dfs = []

for cf in cola_files:
  # The files have no header row: the four columns are named here, then only label and sentence are kept.
  df = pd.read_csv(f'{path}{cf}', delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])
  df.drop(columns=['sentence_source', 'label_notes'], inplace=True)
  # 'source' is the file name up to its first dot, e.g. 'in_domain_train'.
  df.insert(loc=0, column='source', value=cf.split('.')[0], allow_duplicates=True)
  dfs.append(df)

cola_df = pd.concat(dfs, ignore_index=True)
cola_df

Unnamed: 0,source,label,sentence
0,out_of_domain_dev,1,Somebody just left - guess who.
1,out_of_domain_dev,1,"They claimed they had settled on something, bu..."
2,out_of_domain_dev,1,"If Sam was going, Sally would know where."
3,out_of_domain_dev,1,"They're going to serve the guests something, b..."
4,out_of_domain_dev,1,She's reading. I can't imagine what.
...,...,...,...
9589,in_domain_train,0,Poseidon appears to own a dragon
9590,in_domain_train,0,Digitize is my happiest memory
9591,in_domain_train,1,It is easy to slay the Gorgon.
9592,in_domain_train,1,I had the strangest feeling that I knew you.


model_b

In [None]:
# Settings for the CoLA sentences with the base model.
BATCH_SIZE = 64
text_column = 'sentence'

# Extract the per-layer sentence activations, then persist the resulting DataFrame.
activation_cola_df = extract_activations_df(cola_df, model_b, tokenizer_b, text_column, BATCH_SIZE)
save_activations_df(activation_cola_df, 'cola', model_id_b)

In [None]:
activation_cola_df

model_2b

In [None]:
# Settings for the CoLA sentences with the 2B model.
BATCH_SIZE = 64
text_column = 'sentence'

# Extract the per-layer sentence activations, then persist the resulting DataFrame.
activation_cola_df_2b = extract_activations_df(cola_df, model_2b, tokenizer_2b, text_column, BATCH_SIZE)
save_activations_df(activation_cola_df_2b, 'cola', model_id_2b)

In [None]:
# NOTE(review): repeats the save already performed at the end of the previous cell.
save_activations_df(activation_cola_df_2b, 'cola', model_id_2b)

In [None]:
activation_cola_df_2b

## UD_English-EWT

In [None]:
!pip install conllu
!wget https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/refs/heads/master/en_ewt-ud-train.conllu

In [None]:
from conllu import parse_incr

def load_conllu(path):
    """Lazily parse a CoNLL-U file and yield one dictionary per sentence.

    Each yielded dictionary has the keys:
      * "text": the sentence's `text` metadata entry ("" when it is missing);
      * "tokens": the `form` of every token;
      * "token_id": the `id` of every token;
      * "upos": the `upostag` of every token.

    The file stays open while the generator is being consumed.
    """
    with open(path, encoding="utf-8") as conllu_file:
        for sentence in tqdm(parse_incr(conllu_file), desc='Parsing conllu'):
            record = {"text": sentence.metadata.get("text", "")}
            # one parallel list per token attribute kept by the notebook
            record["tokens"] = [token["form"] for token in sentence]
            record["token_id"] = [token["id"] for token in sentence]
            record["upos"] = [token["upostag"] for token in sentence]
            yield record

# Materialize every parsed sentence of the training split (one dict per sentence).
train = list(load_conllu("en_ewt-ud-train.conllu"))

# Transpose the list of dicts into a dict of lists (one list per key of the first sentence).
items_to_df = {k:[] for k in train[0].keys()}

for item in tqdm(train, desc='Converting to DataFrame'):
  for k, v in item.items():
    items_to_df[k].append(v)

# One row per sentence; the token-level fields are stored as lists inside the cells.
ewt_df = pd.DataFrame(items_to_df)

In [None]:
ewt_df

here we have 2 pos taggings upos (more general) and xpos (more specific).
we will consider the upos for simplicity

now we define a function to convert the ewt dataset to a token version where the upos and xpos are more clear

In [None]:
def convert_ewt_to_token(ewt_df):
  """Explode the sentence-level EWT frame into one row per word.

  For every row of `ewt_df` the parallel lists `tokens`, `upos` and `token_id` are
  walked together. Only entries whose id is a plain `int` are kept; other ids are
  skipped (presumably multi-word ranges / empty nodes reported as tuples -- TODO confirm).

  Returns:
    DataFrame with the columns `words`, `sentence_id` (index label of the source
    row), `upos` and `token_id`.
  """
  words, sentence_ids, upos_tags, token_ids = [], [], [], []

  for sentence_id, sentence in ewt_df.iterrows():
    aligned = zip(sentence['tokens'], sentence['upos'], sentence['token_id'])
    for word, upos, token_id in aligned:
      if not isinstance(token_id, int):
        continue
      words.append(word)
      sentence_ids.append(sentence_id)
      upos_tags.append(upos)
      token_ids.append(token_id)

  return pd.DataFrame({
      'words': words,
      'sentence_id': sentence_ids,
      'upos': upos_tags,
      'token_id': token_ids,
  })

In [None]:
token_ewt_df = convert_ewt_to_token(ewt_df)
token_ewt_df

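There is a sub-tokenization problem: the tokenizer may split a dataset word into several tokens. To handle it, we pass each sentence to the tokenizer already split into words and use the `word_ids` it returns to map every token back to its word.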

In [None]:
# Check, sentence by sentence, that joining the tokens mapped to each word gives back the dataset word.
# NOTE(review): the loop assumes the sentence_id values are exactly 0 .. num_sentences-1 -- confirm.
num_sentences = token_ewt_df['sentence_id'].nunique()
c = 0
problematic_indexes = []

for index in tqdm(range(num_sentences), desc='Checking sentences'):
  words = token_ewt_df[token_ewt_df['sentence_id'] == index]['words'].tolist()
  # is_split_into_words=True: the input is a list of words, so word_ids() can map tokens to words
  inputs = tokenizer_b(words, return_tensors="pt", is_split_into_words=True).to(model_b.device)
  word_ids = inputs.word_ids()

  tokens = tokenizer_b.convert_ids_to_tokens(inputs.input_ids[0])

  # words rebuilt by concatenating the tokens assigned to each word index
  control = []
  for word_idx in range(len(words)):
    token_indices = [i for i, w_id in enumerate(word_ids) if w_id == word_idx]

    if not token_indices: # escape case (should not happen but who knows ...)
        continue

    subwords = [tokens[i] for i in token_indices]
    reconstructed_word = ''.join(subwords)
    control.append(reconstructed_word)

  if control != words:
    c = c+1 # c is the number of sentences where the subtoken aggregation differs from the 'dataset' tokenization
    problematic_indexes.append(index) # sentence to be removed later

assert(len(problematic_indexes)==c)
print(f'\nProblematic sentences: {c} ({c/(num_sentences)*100:.2f}%)')

It works perfectly with all the sentences in the dataset!

Now let's get the word representations at a fixed layer for a fixed sentence. If a word corresponds to a single token, the representation of that token is used as the representation of the word; if a word corresponds to several tokens (which we know thanks to the `word_ids`), the word representation is the mean of their representations, as done previously.

In [None]:
# Fixed sentence and hidden-state index used for this manual check.
index = 2
layer = 0

words = token_ewt_df[token_ewt_df['sentence_id'] == index]['words'].tolist()
inputs = tokenizer_b(words, return_tensors="pt", is_split_into_words=True).to(model_b.device)

model_b.eval()
# The decoder only receives one start token (the tokenizer's BOS id).
start_token_id = tokenizer_b.bos_token_id
decoder_input_ids = torch.tensor([[start_token_id]], device=model_b.device)

with torch.no_grad():
  outputs = model_b(**inputs,decoder_input_ids=decoder_input_ids,output_hidden_states=True)

# squeeze(0) drops the leading dimension (a single sentence was passed)
encoder_hidden_states = outputs.encoder_hidden_states[layer].squeeze(0)

word_ids = inputs.word_ids()

token_representation = []

for word_idx in range(len(words)):
  # positions of the tokens that belong to this word
  token_indices = [i for i, w_id in enumerate(word_ids) if w_id == word_idx]
  relevant_vectors = encoder_hidden_states[token_indices] # getting the correspondent hidden states
  # the mean over the word's tokens (a single token is returned unchanged by the mean)
  mean_vector = torch.mean(relevant_vectors, dim=0)
  token_representation.append(mean_vector.cpu())

# The two numbers are expected to match: one vector per word.
print(f"Original words in the sentence: {len(words)}")
print(f"Tensor obtained: {len(token_representation)}")

now let's put all together into a function to process the whole dataset

In [None]:
def get_word_representation_df(model, tokenizer, df, batch_size=1):
    """Compute, for every word of `df`, one vector per encoder hidden state.

    Sentences are rebuilt by grouping `df['words']` by `df['sentence_id']` (in order
    of first appearance) and tokenized with `is_split_into_words=True`. The vector of
    a word is the mean of the hidden states of the tokens that `word_ids` assigns to it.

    Args:
        model: seq2seq model exposing `config.encoder.num_hidden_layers`.
        tokenizer: fast tokenizer matching `model` (needed for `word_ids`).
        df: word-level DataFrame with the columns `sentence_id` and `words`.
        batch_size: number of sentences per forward pass.

    Returns:
        DataFrame with the columns `encoder_layer_1` .. `encoder_layer_{N+1}`
        (N = `num_hidden_layers`), one row per word, each cell a float16 numpy vector.
        NOTE(review): rows line up with `df` only if the rows of each sentence are
        contiguous in `df` -- confirm before concatenating column-wise.
    """
    sentences_words = df.groupby('sentence_id', sort=False)['words'].apply(list).tolist()
    num_encoder_layers = model.config.encoder.num_hidden_layers + 1
    device = model.device
    model.eval()
    layer_columns = [f'encoder_layer_{e+1}' for e in range(num_encoder_layers)]
    word_representation_dict = {column: [] for column in layer_columns}
    start_token_id = tokenizer.bos_token_id

    for i in tqdm(range(0, len(sentences_words), batch_size), desc='Processing batches'):
        batch_words = sentences_words[i : i + batch_size]
        inputs = tokenizer(batch_words, return_tensors="pt", padding=True, is_split_into_words=True, truncation=False).to(device)

        current_batch_size = inputs.input_ids.shape[0]
        decoder_input_ids = torch.full((current_batch_size, 1), start_token_id, device=device, dtype=int) # [batch_size, 1 (<bos>)]

        with torch.no_grad():
            outputs = model(**inputs, decoder_input_ids=decoder_input_ids, output_hidden_states=True)

        all_layers_hidden_states = torch.stack(outputs.encoder_hidden_states) # [num_layers, batch_size, seq_len, hidden_dim]

        for b_idx in range(current_batch_size):
            # Map every word index to the positions of its tokens in one pass, instead of
            # rescanning word_ids for each word of the sentence.
            token_positions = {}
            for position, w_id in enumerate(inputs.word_ids(batch_index=b_idx)):
                if w_id is not None:  # None marks special/padding tokens
                    token_positions.setdefault(w_id, []).append(position)

            sentence_states = all_layers_hidden_states[:, b_idx, :, :] # [num_layers, seq_len, hidden_dim]

            for word_idx in range(len(batch_words[b_idx])):
                # A word without tokens keeps its row: the mean over an empty selection is NaN.
                token_indices = token_positions.get(word_idx, [])
                relevant_vectors = sentence_states[:, token_indices, :] # [num_layers, num_subtokens (possibly 1), hidden_dim]
                mean_vectors = torch.mean(relevant_vectors, dim=1)
                # stored as float16 (the sentence-level extraction of this notebook uses float32)
                mean_vectors_np = mean_vectors.cpu().to(torch.float16).numpy()

                for layer_idx, column in enumerate(layer_columns):
                    word_representation_dict[column].append(mean_vectors_np[layer_idx])

    token_representation_df = pd.DataFrame(word_representation_dict)

    # safety check: both numbers must match for a column-wise concat with `df`
    print(f"Original rows: {len(df)}")
    print(f"Extracted rows: {len(token_representation_df)}")

    return token_representation_df

now let's consider the labels

we will consider the base label, with upos tags and also the control task

In [None]:
# defining the POS tags: every distinct UPOS label gets an integer id (order of first appearance)
upos_labels = token_ewt_df['upos'].unique()
upos_tags = {u:i for i,u in enumerate(upos_labels)}

# inserting the tags in the dataset
token_ewt_df['upos_tag']=token_ewt_df['upos'].map(lambda upos: upos_tags[upos])
# NOTE: in-place drop -- this cell cannot be re-run without rebuilding token_ewt_df first
token_ewt_df.drop(columns=['upos', 'token_id'], inplace=True)

# defining the control task upos tags: every word type receives an arbitrary tag
# A fixed seed makes the word -> tag assignment (and thus the control labels) reproducible.
CONTROL_TASK_SEED = 42
unique_words = list(token_ewt_df['words'].unique())
np.random.default_rng(CONTROL_TASK_SEED).shuffle(unique_words)

num_upos_tags = len(upos_tags)
# round-robin over the shuffled vocabulary: tags are spread evenly over the word types
token_ct_map_upos={x:i%num_upos_tags for i,x in enumerate(unique_words)}

# adding the control task tags to the dataframe
token_ewt_df['ct_upos_tag']=token_ewt_df['words'].map(lambda u: token_ct_map_upos[u])

optional: xpos

In [None]:
#xpos_labels=token_ewt_df['xpos'].unique()
#xpos_tags={x:i for i,x in enumerate(xpos_labels)}
#token_ewt_df['xpos_tag']=token_ewt_df['xpos'].map(lambda xpos: xpos_tags[xpos])
#token_ewt_df.drop(columns=['xpos'], inplace=True)

# control task

#num_xpos_tags = len(xpos_tags)
#token_ct_map_xpos={x:i%num_xpos_tags for i,x in enumerate(unique_tokens)} # token control task map for xpos
#token_ewt_df['ct_xpos_tag']=token_ewt_df['tokens'].map(lambda x: token_ct_map_xpos[x])

model_b

In [None]:
token_representation_df = get_word_representation_df(model_b, tokenizer_b, token_ewt_df, 32)
# token_ewt_df is overwritten below: drop activation columns left by a previous run of an
# extraction cell so that the concat never produces duplicate 'encoder_layer_*' columns.
metadata_columns = [c for c in token_ewt_df.columns if not c.startswith('encoder_layer_')]
token_ewt_df = pd.concat([token_ewt_df[metadata_columns], token_representation_df], axis=1)
token_ewt_df

model_2b

In [None]:
token_representation_df_2b = get_word_representation_df(model_2b, tokenizer_2b, token_ewt_df, 8)
# If the model_b cell was run before, token_ewt_df already holds that model's
# 'encoder_layer_*' columns: keep only the metadata columns so that the 2B activations
# do not end up under duplicated column names.
metadata_columns_2b = [c for c in token_ewt_df.columns if not c.startswith('encoder_layer_')]
token_ewt_df_2b = pd.concat([token_ewt_df[metadata_columns_2b], token_representation_df_2b], axis=1)
token_ewt_df_2b

### previous

now let's define the function to get each token representation.

First let's understand how the tokenizer works and how to adapt the tokenizer tokens with the dataset tokens (token at word level)

In [None]:
# Manual alignment on one sentence: tokenize the raw text and greedily group the tokens
# until their concatenation equals the next dataset word.
sentence_id = 0
text = ewt_df['text'][sentence_id]
words = token_ewt_df[token_ewt_df['sentence_id']==sentence_id]['words'].to_list()

inputs = tokenizer_b(text, return_tensors="pt").to(model_b.device)
# the '▁' marker is stripped so that tokens can be compared with the dataset words
tokens = [t.replace('▁','') for t in tokenizer_b.convert_ids_to_tokens(inputs.input_ids[0])]

# this dictionary will contain an index and a list of sub tokens that compose the word
subtoken_dict = {i:[] for i in range(len(words))}
wt_count=0
subword=''
# NOTE(review): wt_count is not bounded here, so tokens left after the last word would
# raise an IndexError on words[wt_count].
for i in range(len(tokens)):
  if subword+tokens[i] == words[wt_count]:
    # the accumulated tokens spell the current word: close it and move to the next one
    subtoken_dict[wt_count].append(tokens[i])
    wt_count+=1
    subword=''
  else:
    # still inside the current word: keep accumulating
    subtoken_dict[wt_count].append(tokens[i])
    subword=subword+tokens[i]

print(subtoken_dict)

defining a function to handle this behaviour

In [None]:
def get_subtokenization(sentence_id, tokenizer):
    """Greedily group the tokens of a sentence by dataset word.

    The raw text of sentence `sentence_id` (read from the notebook-level `ewt_df`) is
    tokenized, the '▁' marker is stripped from every token, and tokens are appended to
    the current word of `token_ewt_df` until their concatenation equals that word.

    Returns:
        dict mapping each word position to the list of token strings assigned to it.
        When the tokenizer's segmentation does not line up with the dataset words the
        remaining tokens pile up on one word and later words keep an empty list.
    """
    text = ewt_df['text'][sentence_id]
    word_tokens = token_ewt_df[token_ewt_df['sentence_id']==sentence_id]['words'].tolist()
    # Only the token ids are needed: no tensors and no model device are involved, so the
    # function no longer depends on the global `model_b`.
    input_ids = tokenizer(text)['input_ids']
    tokens = [t.replace("▁", "") for t in tokenizer.convert_ids_to_tokens(input_ids)]
    subtoken_dict = {i: [] for i in range(len(word_tokens))}

    wt_count = 0
    subword = ""

    for tok in tokens:
        # every word has been matched: ignore the trailing tokens
        if wt_count >= len(word_tokens):
            break

        subtoken_dict[wt_count].append(tok)
        subword += tok

        # perfect match -> process next word
        if subword == word_tokens[wt_count]:
            wt_count += 1
            subword = ""

    return subtoken_dict

let's check if it works

In [None]:
# Count the sentences for which the greedy grouping does not reproduce the dataset words.
num_sentences = len(ewt_df)
c = 0
problematic_indexes = []

for i in tqdm(range(num_sentences), desc='Checking sentences'):
  subtoken_dict = get_subtokenization(i, tokenizer_b)

  # words rebuilt by joining the tokens assigned to each word position
  sentence_token_check = []

  for k,v in subtoken_dict.items():
    subtoken_list = ''.join(v)
    sentence_token_check.append(subtoken_list)
  word_tokens = token_ewt_df[token_ewt_df['sentence_id']==i]['words'].to_list()
  if sentence_token_check != word_tokens:
    c=c+1 # c is the number of sentences where the subtoken aggregation differs from the 'dataset' tokenization
    problematic_indexes.append(i) # sentence to be removed later

assert(len(problematic_indexes)==c)
print(f'\nProblematic sentences: {c} ({c/(num_sentences)*100:.2f}%)')

Apparently, 1647 of the 12543 sentences in the dataset have problems with this subtoken aggregation.

This happens because the tokenizer does not split some elements: for example, ":]" is kept as a single token by the tokenizer, whereas in the dataset it is two tokens, ":" and "]".

We can consider removing these sentences, as the dataset is still big enough for our scope.

In [None]:
# drop these sentences from the original dataset
# NOTE(review): ewt_df is modified in place and re-indexed, so problematic_indexes no longer
# refers to the same sentences afterwards -- re-running this cell would drop other sentences.
ewt_df.drop(index=problematic_indexes, inplace=True)
ewt_df.reset_index(drop=True, inplace=True)

# convert the new dataset in the token version
# (this rebuilds token_ewt_df from scratch, replacing the previous frame and its extra columns)
token_ewt_df = convert_ewt_to_token(ewt_df)
token_ewt_df

finally it's time to get the token representations with the subtoken considerations defined above

In [None]:
# fixed index and layer to test
index = 0
layer = 0

sentence = ewt_df['text'][index]
inputs = tokenizer_b(sentence, return_tensors="pt").to(model_b.device)

model_b.eval()
# The decoder only receives one start token (the tokenizer's BOS id).
start_token_id = tokenizer_b.bos_token_id
decoder_input_ids = torch.tensor([[start_token_id]], device=model_b.device)

with torch.no_grad():
    outputs = model_b(
        **inputs,
        decoder_input_ids=decoder_input_ids,
        output_hidden_states=True,
    )

encoder_hidden_states = [o.cpu() for o in outputs.encoder_hidden_states]
decoder_hidden_states = [o.cpu() for o in outputs.decoder_hidden_states]

subtokens = get_subtokenization(index, tokenizer_b)
token_representation = []
# keep only the selected hidden state and drop its leading dimension (a single sentence was passed)
encoder_hidden_states = encoder_hidden_states[layer].squeeze(0)

# token_index walks the token positions in order; each word consumes len(v) consecutive positions
token_index = 0
for k,v in subtokens.items():
  n = len(v)
  if n>1:
    # word split into several tokens: average their hidden states
    mean_tensors_list = []
    for i in range(n):
      mean_tensors_list.append(encoder_hidden_states[token_index+i])
    mean = torch.mean(torch.stack(mean_tensors_list), dim=0)
    token_representation.append(mean)
  else:
    token_representation.append(encoder_hidden_states[token_index])
  token_index+=n

# number of vectors obtained vs. number of dataset words of the sentence
len(token_representation), len(token_ewt_df[token_ewt_df['sentence_id'] == index])

now let's write a function that can do the previous operation to the whole dataset.

using a custom logic (for the subtokens joining) it is easier to keep the function not batched, even if it's not efficient

In [None]:
def get_word_representation_df(model, tokenizer):
  """Legacy, unbatched word-representation extraction based on `get_subtokenization`.

  Every sentence text of the notebook-level `ewt_df` is passed through `model`; the
  tokens are grouped per word with `get_subtokenization` and the hidden states of each
  group are averaged, for every encoder hidden state.

  Args:
    model: seq2seq model exposing `config.encoder.num_hidden_layers`.
    tokenizer: tokenizer matching `model`.

  Returns:
    DataFrame with the columns `encoder_layer_1` .. `encoder_layer_{N+1}`
    (N = `num_hidden_layers`), one row per word, each cell a float32 numpy vector.
  """
  sentences = ewt_df['text'].to_list()
  # `model` (not the global model_b) defines the number of layers
  num_encoder_layers = model.config.encoder.num_hidden_layers+1 # considering also the embedding layer

  model.eval()

  word_representation_dict = {f'encoder_layer_{e+1}': [] for e in range(num_encoder_layers)}

  start_token_id = tokenizer.bos_token_id
  decoder_input_ids = torch.tensor([[start_token_id]], device=model.device)

  print('Starting to process sentences ...')
  for sentence_idx, sentence in tqdm(enumerate(sentences), total=len(sentences), desc='Processing sentences to get word representation'):
    inputs = tokenizer(sentence, return_tensors="pt").to(model.device)
    with torch.no_grad():
      # run the model received as argument (the original code always ran the global model_b)
      outputs = model(
          **inputs,
          decoder_input_ids=decoder_input_ids,
          output_hidden_states=True,
      )
    encoder_hidden_states = torch.stack([e.cpu().squeeze(0) for e in outputs.encoder_hidden_states])

    subtokens = get_subtokenization(sentence_idx, tokenizer)
    for e in range(num_encoder_layers):
      token_representation = word_representation_dict[f'encoder_layer_{e+1}']
      ehs = encoder_hidden_states[e]
      # token_index walks the token positions; each word consumes len(v) consecutive ones
      token_index = 0
      for k,v in subtokens.items():
        n = len(v)
        if n>1:
          # word split into several tokens: average their hidden states
          mean = torch.mean(ehs[token_index:token_index+n], dim=0).to(torch.float32).numpy()
          token_representation.append(mean)
        else:
          token_representation.append(ehs[token_index].to(torch.float32).numpy())
        token_index+=n

  token_representation_df =pd.DataFrame(word_representation_dict)
  print('Sentence processed')
  return token_representation_df


In [None]:
# new function, not batched

def get_word_representation_df(model, tokenizer, df):
    """Unbatched word-representation extraction based on the tokenizer's `word_ids`.

    Sentences are rebuilt by grouping `df['words']` by `df['sentence_id']` (in order of
    first appearance) and tokenized with `is_split_into_words=True`. The vector of a
    word is the mean of the hidden states of its tokens, for every encoder hidden state.
    Words to which no token is assigned are skipped (no row is produced for them).

    Args:
        model: seq2seq model exposing `config.encoder.num_hidden_layers`.
        tokenizer: fast tokenizer matching `model` (needed for `word_ids`).
        df: word-level DataFrame with the columns `sentence_id` and `words`.

    Returns:
        DataFrame with the columns `encoder_layer_1` .. `encoder_layer_{N+1}`
        (N = `num_hidden_layers`), each cell a float32 numpy vector.
    """
    num_encoder_layers = model.config.encoder.num_hidden_layers + 1

    model.eval()

    layer_columns = [f'encoder_layer_{e+1}' for e in range(num_encoder_layers)]
    word_representation_dict = {column: [] for column in layer_columns}

    # the decoder input is the same single start token for every sentence
    start_token_id = tokenizer.bos_token_id
    decoder_input_ids = torch.tensor([[start_token_id]], device=model.device)

    # One grouping pass instead of a full boolean filter of `df` for every sentence.
    sentences = df.groupby('sentence_id', sort=False)['words']

    for _, sentence_words in tqdm(sentences, total=sentences.ngroups, desc='Processing sentences'):
        words = sentence_words.tolist()
        inputs = tokenizer(words, return_tensors="pt", is_split_into_words=True).to(model.device)

        with torch.no_grad():
            outputs = model(
                **inputs,
                decoder_input_ids=decoder_input_ids,
                output_hidden_states=True
            )

        # [num_layers, seq_len, hidden_dim] once the batch dimension of size 1 is removed
        all_encoder_layers = torch.stack(outputs.encoder_hidden_states).squeeze(1)

        # word index -> positions of its tokens, built in a single pass over word_ids
        token_positions = {}
        for position, w_id in enumerate(inputs.word_ids()):
            if w_id is not None:  # None marks special tokens
                token_positions.setdefault(w_id, []).append(position)

        for word_idx in range(len(words)):
            token_indices = token_positions.get(word_idx)

            if not token_indices:
                continue

            relevant_vectors = all_encoder_layers[:, token_indices, :]
            mean_vectors = torch.mean(relevant_vectors, dim=1)
            mean_vectors_np = mean_vectors.cpu().to(torch.float32).numpy()

            for layer_idx, column in enumerate(layer_columns):
                word_representation_dict[column].append(mean_vectors_np[layer_idx])

    first_key = layer_columns[0]
    print(f"Total words processed: {len(word_representation_dict[first_key])}")

    token_representation_df = pd.DataFrame(word_representation_dict)
    return token_representation_df

model_b

In [None]:
# The definition of get_word_representation_df in the cell just above takes the word-level
# frame as third argument; calling it with only (model, tokenizer) raises a TypeError.
token_representation_ewt_df = get_word_representation_df(model_b, tokenizer_b, token_ewt_df)
token_ewt_df = pd.concat([token_ewt_df, token_representation_ewt_df], axis=1)
token_representation_ewt_df

In [None]:

token_ewt_df

## MultiNLI

In [None]:
multinli_dataset = load_dataset("nyu-mll/multi_nli") # one of ['train', 'validation_matched', 'validation_mismatched']

In [None]:
multinli_dataset

In [None]:
# `load_dataset` was called without `split=`, so multinli_dataset is a DatasetDict (one
# Dataset per split) and has no `to_pandas`: a single split must be selected first.
MULTINLI_SPLIT = 'train'  # one of 'train', 'validation_matched', 'validation_mismatched'
multinli_df = multinli_dataset[MULTINLI_SPLIT].to_pandas()
# keep only the premise, the hypothesis and the label
multinli_df = multinli_df.drop(columns=['promptID', 'pairID', 'premise_binary_parse', 'premise_parse', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre'])
multinli_df

As shown in [Finetuned Language Models Are Zero-Shot Learners], the model gives a better representation when natural language instructions are given.

that is why here we use the prompt: 'premise: "{}", hypothesis: "{}"' to get a single sentence, and then use the sentence extraction function as before

In [None]:
# Merge premise and hypothesis of every row into a single prompt-like sentence.
multinli_df['sentence'] = multinli_df.apply(lambda row: 'premise: "{}", hypothesis: "{}"'.format(row['premise'], row['hypothesis']), axis=1)
# NOTE(review): in-place drop -- the cell cannot be re-run once 'premise'/'hypothesis' are gone.
multinli_df.drop(columns=['premise', 'hypothesis'], inplace=True)
# keep the sentence first, followed by its label
multinli_df = multinli_df[['sentence', 'label']]
multinli_df

model_b

In [None]:
# Extract the per-layer sentence activations with the base model.
# NOTE(review): the result overwrites multinli_df, so re-running the cell feeds a frame that
# already contains activation columns back into the extraction.
multinli_df = extract_activations_df(multinli_df, model_b, tokenizer_b, 'sentence', BATCH_SIZE=16)
multinli_df

## ParaRel

In [None]:
!git clone https://github.com/yanaiela/pararel

In [None]:
import json

PARAREL_OBJECTS_DIR = '/content/pararel/data/trex_lms_vocab'
PARAREL_PATTERNS_DIR = '/content/pararel/data/pattern_data/graphs_json'

objects_files = os.listdir(PARAREL_OBJECTS_DIR)
relations_file = os.listdir(PARAREL_PATTERNS_DIR)

# Each relation needs BOTH an entity file and a same-named pattern file. Iterating over the
# intersection avoids crashing on a file that exists in only one directory, and sorting
# makes the row order independent of the arbitrary os.listdir order.
shared_files = sorted(set(objects_files) & set(relations_file))
unmatched_files = sorted(set(objects_files) ^ set(relations_file))
if unmatched_files:
    print(f'Skipping files without a counterpart: {unmatched_files}')

# read the jsonl entity files and create a dataframe for each, then join them all into a single dataframe
pararel_dfs = []
for obj_file in shared_files:
    obj_df = pd.read_json(os.path.join(PARAREL_OBJECTS_DIR, obj_file), lines=True)

    # the relation name is the 'extended_lemma' of the first pattern in the same-named pattern file
    with open(os.path.join(PARAREL_PATTERNS_DIR, obj_file), 'r', encoding='utf-8') as f:
        relation = json.loads(f.readline())['extended_lemma']

    obj_df['relation'] = relation
    pararel_dfs.append(obj_df)

pararel_df = pd.concat(pararel_dfs, ignore_index=True)
pararel_df = pararel_df.drop(columns=['uuid'])
# NOTE(review): positional rename — assumes the two remaining entity columns come in
# (first entity, second entity) order in the jsonl files; verify against the data.
pararel_df.columns = ['first_entity', 'second_entity', 'relation']

pararel_df

In [None]:
pararel_df['relation'].unique(), pararel_df['relation'].nunique()

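There are 38 relations. We want to build sentences of the form "What {h1} is to {t1}, {h2} is to {t2}.", both correct ones and three kinds of incorrect ones:
- Random replacement (replace one of the elements of the second pair with an entity taken from another pair)
- Reverse direction (reverse the direction of a correct relation in the second pair)
- Type (fill the second pair with two entities of the same kind, i.e. two heads or two tails)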

In [None]:
PROMPT = "What {h1} is to {t1}, {h2} is to {t2}."

In [None]:
# Sanity check: print one randomly drawn "head - relation - tail" example per relation.
for relation_name in pararel_df['relation'].unique():
    example = pararel_df.loc[pararel_df['relation'] == relation_name].sample(1).iloc[0]
    print(f'{example["first_entity"]} - {relation_name} - {example["second_entity"]}')


In [None]:
# ideally the dataset will be 6000 (correct) + 3 (random, reverse, type) * 2000 (wrong) = 12000 sentences
# (the per-relation quotas below use integer division, so the real total can be smaller)
total_dataset_len = 12000
number_correct_relationships = total_dataset_len//2  # target number of correct analogies
number_wrong_relationships = total_dataset_len//6  # target number for EACH kind of wrong analogy

number_relationships = pararel_df['relation'].nunique()
relationships = pararel_df['relation'].unique()
# parallel lists: the generated sentence and its type code
# (0 = correct, 1 = random replacement, 2 = reverse, 3 = wrong type)
pararel_analogies_dict = {'sentences': [], 'type': []}

# correct relationships: both pairs are real (head, tail) rows of the same relation
correct_per_relation = number_correct_relationships // number_relationships
for r in tqdm(relationships, desc='Extracting correct relationships'):
    rel_df = pararel_df.loc[pararel_df['relation'] == r]
    for _ in range(correct_per_relation):
        if len(rel_df) <= 2:
            # too few unused rows left for this relation; nothing to draw
            continue

        sampled = rel_df.sample(n=2)
        # remove the drawn rows so that no row is reused within this relation
        rel_df = rel_df.drop(sampled.index)

        heads = sampled['first_entity'].tolist()
        tails = sampled['second_entity'].tolist()
        pararel_analogies_dict['sentences'].append(
            PROMPT.format(h1=heads[0], t1=tails[0], h2=heads[1], t2=tails[1])
        )
        pararel_analogies_dict['type'].append(0) # 0 means correct relationship

# random replacement relationships: the second pair mixes the head of one row with the
# tail of another row of the same relation, so it is (very likely) not a valid fact
MAX_SAMPLING_ATTEMPTS = 1000  # upper bound on rejection sampling, so the cell cannot hang
for r in tqdm(relationships, desc='Extracting wrong relationships (random)'):
    rel_df = pararel_df[pararel_df['relation']==r]
    for i in range(number_wrong_relationships//number_relationships):
        if len(rel_df) > 3 and rel_df['first_entity'].nunique()>2 and rel_df['second_entity'].nunique()>2:
            for _attempt in range(MAX_SAMPLING_ATTEMPTS):
                sampled = rel_df.sample(n=3)

                row1 = sampled.iloc[0]
                row2 = sampled.iloc[1]
                row3 = sampled.iloc[2]

                h1 = row1['first_entity']
                t1 = row1['second_entity']
                h2 = row2['first_entity']
                t2 = row2['second_entity']
                h3 = row3['first_entity']
                t3 = row3['second_entity']

                # All three heads and all three tails must differ. In particular h2 != h3:
                # otherwise the mixed pair (h2, t3) / (h3, t2) is just the original valid
                # pair of the other row and the sentence would be wrongly labelled as wrong.
                if (h1!=h2 and h1!=h3 and h2!=h3 and t1!=t2 and t1!=t3 and t2!=t3):
                    random_choice = np.random.randint(1,3)  # 1 or 2 (upper bound is exclusive)
                    if random_choice == 1:
                        # keep the head of row 2, replace its tail
                        random_replacement_sentence = PROMPT.format(h1=h1, t1=t1, h2=h2, t2=t3)
                    else:
                        # keep the tail of row 2, replace its head
                        random_replacement_sentence = PROMPT.format(h1=h1, t1=t1, h2=h3, t2=t2)

                    rel_df = rel_df.drop(sampled.index)
                    pararel_analogies_dict['sentences'].append(random_replacement_sentence)
                    pararel_analogies_dict['type'].append(1) # 1 means random replacement
                    break
            else:
                # no valid triple found within the attempt budget: stop drawing from this relation
                break

# reverse replacement relationships: the second pair is a real row written tail-first
reversed_per_relation = int(number_wrong_relationships/number_relationships)
for r in tqdm(relationships, desc='Extracting wrong relationships (reversed)'):
    rel_df = pararel_df.loc[pararel_df['relation'] == r]
    for _ in range(reversed_per_relation):
        can_sample = (
            len(rel_df) > 2
            and rel_df['first_entity'].nunique() > 1
            and rel_df['second_entity'].nunique() > 1
        )
        if not can_sample:
            continue

        # rejection sampling: redraw until the two rows share neither head nor tail
        while True:
            sampled = rel_df.sample(n=2)
            first_row, second_row = sampled.iloc[0], sampled.iloc[1]
            h1, t1 = first_row['first_entity'], first_row['second_entity']
            h2, t2 = second_row['first_entity'], second_row['second_entity']

            if h1 == h2 or t1 == t2:
                continue

            rel_df = rel_df.drop(sampled.index)
            pararel_analogies_dict['sentences'].append(PROMPT.format(h1=h1, t1=t1, h2=t2, t2=h2))
            pararel_analogies_dict['type'].append(2) # 2 means reverse replacement
            break



# type replacement relationships
# Each successful draw of four rows yields TWO wrong sentences, which is why the per-relation
# quota is half of the one used for the other wrong kinds.
# NOTE(review): the while loop below is unbounded rejection sampling; the nunique() guards do
# not strictly guarantee that a valid quadruple exists — confirm it always terminates on this data.
for r in tqdm(relationships, desc='Extracting wrong relationships (type)'):
    rel_df = pararel_df[pararel_df['relation']==r]
    for i in range((number_wrong_relationships//2)//number_relationships):
        if len(rel_df) > 4 and rel_df['first_entity'].nunique()>3 and rel_df['second_entity'].nunique()>3:
            sampled_done = False
            while not sampled_done:
                sampled = rel_df.sample(n=4)
                row1 = sampled.iloc[0]
                row2 = sampled.iloc[1]
                row3 = sampled.iloc[2]
                row4 = sampled.iloc[3]

                # correct
                h1 = row1['first_entity']
                t1 = row1['second_entity']
                h2 = row2['first_entity']
                t2 = row2['second_entity']

                # to be replaced
                h3 = row3['first_entity']
                t3 = row3['second_entity']
                h4 = row4['first_entity']
                t4 = row4['second_entity']

                # rows 3 and 4 must share no head/tail with rows 1 and 2, nor with each other
                if (h1!=h3 and h1!=h4 and h2!=h3 and h2!=h4 and t1!=t3 and t1!=t4 and t2!=t3 and t2!=t4 and h3!=h4 and t3!=t4):
                    # consume all four rows so they are not drawn again for this relation
                    rel_df = rel_df.drop(sampled.index)

                    # second pair made of two heads (sentence 1) / two tails (sentence 2)
                    random_replacement_sentence_1 = PROMPT.format(h1=h1, t1=t1, h2=h3, t2=h4)
                    random_replacement_sentence_2 = PROMPT.format(h1=h2, t1=t2, h2=t3, t2=t4)

                    # first wrong sentence
                    pararel_analogies_dict['sentences'].append(random_replacement_sentence_1)
                    pararel_analogies_dict['type'].append(3) # 3 means wrong type relationship

                    # second wrong sentence
                    pararel_analogies_dict['sentences'].append(random_replacement_sentence_2)
                    pararel_analogies_dict['type'].append(3) # 3 means wrong type relationship
                    sampled_done = True



pararel_analogies_df = pd.DataFrame(pararel_analogies_dict)
pararel_analogies_df

In [None]:
pararel_analogies_df.head()['sentences'].to_list()

In [None]:
# Binary probe target: 1 for a correct analogy (type 0), 0 for every wrong variant.
is_correct = pararel_analogies_df['type'] == 0
pararel_analogies_df['label'] = is_correct.astype('int64')
# Shuffle the rows with a fixed seed and renumber the index.
pararel_analogies_df = (
    pararel_analogies_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
pararel_analogies_df

model_b

In [None]:
pararel_analogies_df = extract_activations_df(pararel_analogies_df, model_b, tokenizer_b, 'sentences', BATCH_SIZE=128)
pararel_analogies_df

In [None]:
save_activations_df(pararel_analogies_df, 'pararel_analogies', model_id_b)

model_2b

In [None]:
# Extract the 2b-model activations for the ParaRel analogy sentences.
# NOTE(review): if the model_b cell above was run in the same session, pararel_analogies_df
# already holds whatever extract_activations_df added for model_b, because that cell rebinds
# the same name. Confirm how extract_activations_df treats pre-existing activation columns,
# or rebuild the frame before running this cell.
pararel_analogies_df = extract_activations_df(pararel_analogies_df, model_2b, tokenizer_2b, 'sentences', BATCH_SIZE=128)
pararel_analogies_df

In [None]:
save_activations_df(pararel_analogies_df, 'pararel_analogies', model_id_2b)

## Perturbations

here we consider the perturbations on the dataset

we only have to perturb the validation set, so that we can measure how robust the model is

we can use two different perturbation levels: semantic and syntactic
- for the semantic level we can use https://github.com/makcedward/nlpaug (synonym replacement)
- for the syntactic level we can use nlpaug as well

https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb

for datasets
- true/false: both
- CoLA: semantic (syntactic would change the label)
- EWT: both
- ParaRel: none
- MultiNLI: both (taking care with the syntactic perturbations)

# Probe

In [None]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [None]:
class Probe(nn.Module):
  """Shared training and evaluation loops for the probing classifiers.

  The class defines no layers and no ``forward``; subclasses supply both. Inputs
  are batches of ``(features, integer class labels)`` as yielded by a DataLoader.
  """

  def fit(self, train_loader, epochs=10, lr=0.001, device=None):
    """Train the probe with Adam and cross-entropy loss.

    Args:
      train_loader: iterable of ``(batch_x, batch_y)`` tensor pairs.
      epochs: number of passes over ``train_loader``.
      lr: learning rate given to Adam.
      device: torch device; when None, CUDA is used if available, else CPU.

    Returns:
      ``(total_losses, accuracies)``: two lists with one entry per epoch, the
      mean batch loss and the training accuracy measured while training.
    """
    if device is None:
      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.to(device)

    loss_fn = nn.CrossEntropyLoss() # standard
    adam = optim.Adam(self.parameters(), lr=lr) # to be defined with hyperparams

    total_losses, accuracies = [], []
    for _ in range(epochs):
      self.train()
      running_loss, n_correct, n_seen = 0, 0, 0

      for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        adam.zero_grad()
        logits = self(batch_x)
        batch_loss = loss_fn(logits, batch_y)
        batch_loss.backward()
        adam.step()

        # running statistics (accuracy uses the logits computed before the step)
        running_loss += batch_loss.item()
        n_seen += batch_y.size(0)
        n_correct += (logits.argmax(dim=1) == batch_y).sum().item()

      # end-of-epoch statistics
      total_losses.append(running_loss / len(train_loader))
      accuracies.append(n_correct / n_seen)

    return total_losses, accuracies

  def evaluate(self, test_loader, device=None):
    """Return the accuracy (correct / total) of the probe over ``test_loader``.

    Args:
      test_loader: iterable of ``(batch_x, batch_y)`` tensor pairs.
      device: torch device; when None, CUDA is used if available, else CPU.
    """
    if device is None:
      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.to(device)
    self.eval()

    n_correct, n_seen = 0, 0
    # torch.no_grad() saves memory and computation because gradients are not tracked
    with torch.no_grad():
      for batch_x, batch_y in test_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        predictions = self(batch_x).argmax(dim=1)
        n_seen += batch_y.size(0)
        n_correct += (predictions == batch_y).sum().item()

    return n_correct / n_seen

class NonLinearProbe(Probe): # architecture from CS2
  """MLP probe with three hidden blocks followed by a linear output layer.

  Each hidden block is Linear -> ReLU -> Dropout(0.2), with widths
  input_dim -> 256 -> 128 -> 64; ``out`` maps 64 -> output_dim.
  """

  def __init__(self, input_dim, output_dim):
    super().__init__()
    widths = [input_dim, 256, 128, 64]
    # built in order so the layers are created and registered as block1, block2, block3
    self.block1, self.block2, self.block3 = [
        nn.Sequential(nn.Linear(n_in, n_out), nn.ReLU(), nn.Dropout(0.2))
        for n_in, n_out in zip(widths, widths[1:])
    ]
    self.out = nn.Linear(64, output_dim)

  def forward(self, x):
    """Apply the three hidden blocks, then the output layer; returns the logits."""
    for hidden_block in (self.block1, self.block2, self.block3):
      x = hidden_block(x)
    return self.out(x)

class LinearProbe(Probe):
  """Probe made of four stacked Linear layers (input_dim -> 256 -> 128 -> 64 -> output_dim).

  There is no activation between the layers, so the whole network is an affine
  map of its input.
  """

  def __init__(self, input_dim, output_dim):
    super().__init__()
    self.linear1 = nn.Linear(input_dim, 256)
    self.linear2 = nn.Linear(256, 128)
    self.linear3 = nn.Linear(128, 64)
    self.out = nn.Linear(64, output_dim)

  def forward(self,x):
    """Pass ``x`` through the four linear layers in order; returns the logits."""
    for layer in (self.linear1, self.linear2, self.linear3, self.out):
      x = layer(x)
    return x

In [None]:
def iterate_training_layers(model_size, df, num_layers, encdec, probe, probe_args=None, split_index=75, epochs=25, lr=0.001):
  """Train and evaluate one fresh probe per layer of stored activations.

  Args:
    model_size: label of the model, used only in the progress message.
    df: DataFrame with a 'label' column and one column per layer named
      '{encdec}_layer_{k}' for k = 1..num_layers, each cell holding a numpy vector.
    num_layers: number of layer columns to probe.
    encdec: column prefix, e.g. 'encoder' or 'decoder'.
    probe: 'linear' or 'non_linear'.
    probe_args: optional extra keyword arguments forwarded to NonLinearProbe only.
    split_index: percentage of rows used for training; the rest is the test set.
    epochs: training epochs for each probe.
    lr: learning rate for each probe.

  Returns:
    (train_accuracy, test_accuracy): two lists with one value per layer; the train
    value is the accuracy recorded during the last training epoch.

  Raises:
    ValueError: if `probe` is neither 'linear' nor 'non_linear'.
  """
  if probe not in ('linear', 'non_linear'):
    raise ValueError('Probe must be either linear or non_linear')
  # avoid a shared mutable default argument
  probe_args = {} if probe_args is None else probe_args

  train_accuracy = []
  test_accuracy = []

  # Shuffle and split ONCE. Re-sampling the already shuffled frame inside the layer loop
  # (even with the same seed) compounds the permutation, so every layer would be trained
  # and tested on a different split and the per-layer accuracies would not be comparable.
  shuffled_df = df.sample(frac=1, random_state=42)
  num_train_instances = len(shuffled_df) * split_index // 100
  train_df = shuffled_df[:num_train_instances]
  test_df = shuffled_df[num_train_instances:]

  # labels do not depend on the layer, so build them once
  y_train_tensor = torch.tensor(train_df['label'].tolist())
  y_test_tensor = torch.tensor(test_df['label'].tolist())
  output_dim = y_train_tensor.max().item()+1  # labels are assumed to be 0..max

  print(f'Training on model {model_size}, considering {encdec}')
  for layer in trange(num_layers):
    col_name = f'{encdec}_layer_{layer+1}'

    X_train_tensor = torch.stack([torch.from_numpy(t) for t in train_df[col_name].tolist()])
    X_test_tensor = torch.stack([torch.from_numpy(t) for t in test_df[col_name].tolist()])

    # training of the probe (a new, freshly initialised probe for every layer)
    train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=64)

    if probe == 'linear':
      probe_instance = LinearProbe(input_dim=X_train_tensor.shape[1], output_dim=output_dim)
    else:
      probe_instance = NonLinearProbe(input_dim=X_train_tensor.shape[1], output_dim=output_dim, **probe_args)

    results = probe_instance.fit(train_loader, epochs=epochs, lr=lr)
    train_accuracy.append(results[1][-1])

    # evaluating test accuracy; the probes are evaluated in eval mode and have no
    # batch-dependent layers, so batching only speeds up the pass
    test_loader = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=64)
    test_accuracy.append(probe_instance.evaluate(test_loader))

  return train_accuracy, test_accuracy

In [None]:
def plot_accuracies(train_accuracy, test_accuracy, model, probe, knowledge, encdec):
  """Plot train and test accuracy per layer on one figure and show it.

  Args:
    train_accuracy: sequence of train accuracies, one per layer.
    test_accuracy: sequence of test accuracies, one per layer.
    model, probe, knowledge, encdec: labels used only to compose the title.
  """
  fig, ax = plt.subplots(figsize=(10, 6))
  for series, series_label in ((train_accuracy, 'Train Accuracy'), (test_accuracy, 'Test Accuracy')):
    ax.plot(series, label=series_label)
  ax.set_xlabel('Layer')
  ax.set_ylabel('Accuracy')
  ax.set_title(f'{knowledge}, model {model}, linearity {probe}, encdec {encdec}')
  ax.grid(True)
  ax.legend()
  plt.show()

model b on factual knowledge, nonlinear

In [None]:
tf_b_path = '/content/drive/MyDrive/DTCS_datasets/true-false_t5gemma-b-b-ul2'
tf_b_df = pd.read_pickle(tf_b_path)

In [None]:
# Non-linear probes on the true/false (factual) activations of the b model.
# iterate_training_layers reads the columns '{encdec}_layer_1' .. '{encdec}_layer_13',
# so 'num_layers' must not exceed the number of layer columns stored in the frame.
configb = {
    'model_size' : 'b',
    'df' : tf_b_df,
    'probe' : 'non_linear',
    'num_layers' :13
}

# each result is a (train_accuracy, test_accuracy) pair of per-layer lists
encoder_results = iterate_training_layers(**configb, encdec='encoder')
decoder_results = iterate_training_layers(**configb, encdec='decoder')

plot_accuracies(encoder_results[0], encoder_results[1], 'b', 'non_linear', 'factual', 'encoder')
plot_accuracies(decoder_results[0], decoder_results[1], 'b', 'non_linear', 'factual', 'decoder')


In [None]:
# Linear probes on the true/false (factual) activations of the b model.
# This rebinds configb / encoder_results / decoder_results from the previous cell.
configb = {
    'model_size' : 'b',
    'df' : tf_b_df,
    'probe' : 'linear',
    'num_layers' :13
}

# encoder
encoder_results = iterate_training_layers(**configb, encdec='encoder')
plot_accuracies(encoder_results[0], encoder_results[1], 'b', 'linear', 'factual', 'encoder')

# decoder
decoder_results = iterate_training_layers(**configb, encdec='decoder')
plot_accuracies(decoder_results[0], decoder_results[1], 'b', 'linear', 'factual', 'decoder')


model 2b on factual knowledge, nonlinear

In [None]:
tf_2b_path = '/content/drive/MyDrive/DTCS_datasets/true-false_t5gemma-2b-2b-ul2'
tf_2b_df = pd.read_pickle(tf_2b_path)

In [None]:
# Non-linear probes on the true/false (factual) activations of the 2b model.
# iterate_training_layers reads the columns '{encdec}_layer_1' .. '{encdec}_layer_27',
# so 'num_layers' must not exceed the number of layer columns stored in the frame.
config2b = {
    'model_size' : '2b',
    'df' : tf_2b_df,
    'probe' : 'non_linear',
    'num_layers' :27
}

# encoder
encoder_results = iterate_training_layers(**config2b, encdec='encoder')
plot_accuracies(encoder_results[0], encoder_results[1], '2b', 'non_linear', 'factual', 'encoder')

# decoder
decoder_results = iterate_training_layers(**config2b, encdec='decoder')
plot_accuracies(decoder_results[0], decoder_results[1], '2b', 'non_linear', 'factual', 'decoder')


In [None]:
# Linear probes on the true/false (factual) activations of the 2b model.
# This rebinds config2b / encoder_results / decoder_results from the previous cell.
config2b = {
    'model_size' : '2b',
    'df' : tf_2b_df,
    'probe' : 'linear',
    'num_layers' :27
}

# encoder
encoder_results = iterate_training_layers(**config2b, encdec='encoder')
plot_accuracies(encoder_results[0], encoder_results[1], '2b', 'linear', 'factual', 'encoder')

# decoder
decoder_results = iterate_training_layers(**config2b, encdec='decoder')
plot_accuracies(decoder_results[0], decoder_results[1], '2b', 'linear', 'factual', 'decoder')


linguistic knowledge

In [None]:
cola_b_path = '/content/drive/MyDrive/DTCS_datasets/cola_t5gemma-b-b-ul2'
cola_b_df = pd.read_pickle(cola_b_path)

In [None]:
cola_b_df

In [None]:
# Non-linear probes on the CoLA (linguistic) activations of the b model.
# iterate_training_layers reads the columns '{encdec}_layer_1' .. '{encdec}_layer_13',
# so 'num_layers' must not exceed the number of layer columns stored in the frame.
configb = {
    'model_size' : 'b',
    'df' : cola_b_df,
    'probe' : 'non_linear',
    'num_layers' :13
}

# encoder
encoder_results = iterate_training_layers(**configb, encdec='encoder')
plot_accuracies(encoder_results[0], encoder_results[1], 'b', 'non_linear', 'linguistic (cola)', 'encoder')

# decoder
decoder_results = iterate_training_layers(**configb, encdec='decoder')
plot_accuracies(decoder_results[0], decoder_results[1], 'b', 'non_linear', 'linguistic (cola)', 'decoder')


In [None]:
# CoLA activations extracted with the 2b model.
cola_2b_path = '/content/drive/MyDrive/DTCS_datasets/cola_t5gemma-2b-2b-ul2'
# Read the 2b file: loading cola_b_path here would silently reuse the b-model activations,
# which do not have the 27 layer columns the 2b experiments iterate over.
cola_2b_df = pd.read_pickle(cola_2b_path)

In [None]:
tf_b_df

In [None]:
cola_2b_df

In [None]:
# Non-linear probes on the CoLA (linguistic) activations of the 2b model.
# iterate_training_layers reads the columns '{encdec}_layer_1' .. '{encdec}_layer_27',
# so cola_2b_df must contain all 27 layer columns for both the encoder and the decoder.
config2b = {
    'model_size':'2b',
    'df':cola_2b_df,
    'probe':'non_linear',
    'num_layers':27
}

# encoder
encoder_results = iterate_training_layers(**config2b, encdec='encoder')
plot_accuracies(encoder_results[0], encoder_results[1], '2b', 'non_linear', 'linguistic (cola)', 'encoder')

# decoder
decoder_results = iterate_training_layers(**config2b, encdec='decoder')
plot_accuracies(decoder_results[0], decoder_results[1], '2b', 'non_linear', 'linguistic (cola)', 'decoder')
