# Create pre-computed dataset

We have observed that the output and the hidden layers of the LLM are deterministic.

In this notebook we aim to create a dataset of precomputed tensors (hidden states, activations, ...), persistently saved on disk.

## Imports, installations and declarations from previous notebooks

This section can be skipped and collapsed.

In [1]:
#@title Mount Drive
import os

# Make this try/except to let this notebook work on Drive but also locally
try:
  from google.colab import drive
  drive.mount('/content/drive')

  DRIVE_PATH = '/content/drive/MyDrive/Final_Project/'
  assert os.path.exists(DRIVE_PATH), 'Did you forget to create a shortcut in MyDrive named Final_Project this time as well? :('
except ModuleNotFoundError:
  DRIVE_PATH = '.'
  assert os.path.abspath(os.getcwd()).split(os.path.sep)[-1] == 'Final_Project'

%cd {DRIVE_PATH}
!pwd
!ls

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1WdIP20OinXKeEN_xVOHEa6WVcY4eSO-k/Final_Project
/content/drive/.shortcut-targets-by-id/1WdIP20OinXKeEN_xVOHEa6WVcY4eSO-k/Final_Project
 1_experiments_on_llama_and_saplma.ipynb  'AML - First presentation.gslides'   saplma-data
 2_create_saplma_tensors_dataset.ipynb	   publicDataset


In [2]:
#@title Install dependencies

# PyTorch (CPU only, if not installed yet)
try:
    import torch
except ModuleNotFoundError:
    !pip install 'torch>=2.1.1' torchvision --index-url https://download.pytorch.org/whl/cpu

# Huggingface dependencies
!pip install huggingface-hub 'transformers>=4.36' 'accelerate>=0.26.0'

# Visualization dependencies
!pip install matplotlib seaborn ipywidgets

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


In [3]:
#@title Initialize the secret for HuggingFace login
import os
try:
    from google.colab import userdata
    # We are in colab, so we should access it from userdata.get(...)
    assert userdata.get('HF_TOKEN'), 'Set up HuggingFace login secret properly in Colab!'
    print('Found HF_TOKEN in Colab secrets')
except ModuleNotFoundError:
    # Not in colab, so we have to setup the token manually reading from a file
    if os.getenv('HF_TOKEN'):
        print('Found HF_TOKEN in environment variables')
    else:
        # Read it from a file
        hf_token_file = '.hf_token'
        assert os.path.exists(hf_token_file), f'You must create a file in this working directory ({os.getcwd()}) called {hf_token_file}, containing the Huggingface personal secret access token'
        with open(hf_token_file, 'r') as f:
            os.environ['HF_TOKEN'] = f.read().strip()
            print('Found HF_TOKEN in file')

Found HF_TOKEN in Colab secrets


### Re-declare the Llama abstractions we made

From notebook **1**

In [4]:
from dataclasses import dataclass
from collections.abc import Iterator
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from tqdm import tqdm

torch.set_default_dtype(torch.float16)

model_name = "meta-llama/Llama-3.2-1B-Instruct" #info at https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
class LlamaPrompt:
  """
   Class to represent a prompt for the Llama model, which is made of a system prompt,
   which sets the general context according to which the AI should respond,
   and a user prompt, which is the text that the AI should respond to.
  """
  user_prompt: str
  system_prompt: str

  def __init__(self, user_prompt, system_prompt="You are a helpful AI assistant."):
    self.user_prompt = user_prompt
    self.system_prompt = system_prompt

  def __str__(self) -> str:
      # From: https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_1/#-instruct-model-prompt-
      return ''.join([
          "<|begin_of_text|>",
          f"<|start_header_id|>system<|end_header_id|>{self.system_prompt}<|eot_id|>",
          f"<|start_header_id|>user<|end_header_id|>{self.user_prompt}<|eot_id|>",
          "<|start_header_id|>assistant<|end_header_id|>"
      ])

In [6]:
@dataclass
class LlamaResponse:
    """
      Class to represent a response given by the Llama model.
    """
    prompt: LlamaPrompt
    response: str

In [7]:
class LlamaInstruct:
    """
    Class to wrap the Llama model methods for ease of usage
    """
    def __init__(self, model_name: str, model_args: dict = None, tokenizer_args: dict = None, pad_token: str = None):

        self.model_name = model_name
        self.model_args = model_args if model_args is not None else dict()
        self.tokenizer_args = tokenizer_args if tokenizer_args is not None else dict()

        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto", **self.model_args)
        self.model.eval()
        self.device = self.model.device

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left', **self.tokenizer_args)
        self.pad_token = self.tokenizer.eos_token if pad_token is None else pad_token
        self.tokenizer.pad_token = self.pad_token

        self.assistant_header = self.tokenizer.encode("<|start_header_id|>assistant<|end_header_id|>", return_tensors="pt").to(self.device)

        self.registered_hooks = []

    # to tokenize input prompts
    def tokenize(self, prompts: str | LlamaPrompt | list[str | LlamaPrompt], pad_to_max_length: int = 70) -> tuple[dict, list[LlamaPrompt]]:

        # Make prompts a list anyway
        if not isinstance(prompts, list):
            prompts = [ prompts ]

        # Convert all prompts to LlamaPrompt
        prompts = [ prompt if isinstance(prompt, LlamaPrompt) else LlamaPrompt(prompt) for prompt in prompts ]

        # tokenizer output will be a dictionary of pytorch tensors with keys "input_ids" (numerical ids of tokens)
        # and "attention_mask" (1 for actual input tokens and 0 for padding tokens)
        inputs = self.tokenizer(
            [ str(prompt) for prompt in prompts ],
            truncation=True,
            return_tensors="pt",
            padding='max_length',
            max_length=pad_to_max_length,
        ).to(self.device)

        return inputs, prompts

    # to make Llama generate responses
    def generate(self, inputs: dict, generate_args: dict = None): #-> Iterator[LlamaResponse] ? right now it does not return that

        generate_args = generate_args if generate_args is not None else dict()
        default_args = {
            "max_length": 100,
            "num_return_sequences": 1,
            "temperature": 0.1,
            "pad_token_id": self.tokenizer.pad_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
        }

        # Overwrite default_args with generate_args
        default_args.update(generate_args)

        # returns (batch_size, sequence_length) tensors with token ids of the generated response, including input tokens
        return self.model.generate(
            **inputs,
            **default_args,
        )

    # to extract Llama answers as decoded text in a LlamaResponse object starting from encoded input
    def extract_responses(self, input_ids: torch.Tensor, outputs: torch.Tensor, prompts: list[LlamaPrompt]) -> Iterator[LlamaResponse]:

        for input, output, prompt in zip(input_ids, outputs, prompts):
            # Remove the prompt from the output generated
            output = output[len(input):]

            # Remove another assistant_header, if present
            if torch.equal(output[:len(self.assistant_header)], self.assistant_header):
                output = output[len(self.assistant_header):]

            generated = self.tokenizer.decode(output, skip_special_tokens=True).strip()

            yield LlamaResponse(prompt, generated)

    # to get textual Llama responses starting from textual prompts
    def run(self, prompts: str | LlamaPrompt | list[str | LlamaPrompt], verbose: bool = False) -> Iterator[LlamaResponse]:

        # Optional logging function
        def _print(*args, **kwargs):
            if verbose:
                print(*args, **kwargs)

        inputs, prompts = self.tokenize(prompts)

        _print('Tokenized inputs:', inputs.input_ids.shape)
        _print('Last tokens:', inputs.input_ids[:, -1])

        outputs = self.generate(inputs)
        _print('Generated outputs:', outputs.shape)

        return self.extract_responses(inputs.input_ids, outputs, prompts)

    # an hook is a piece of customized code to be run in the forward or backward pass of a model
    # (useful for debugging)
    def register_hook(self, module, hook_fn):
        '''
        Register a hook, in such a way that we have a very easy way to remove the hook later.

        Example usage:

        llama.unregister_all_hooks()
        for module_name, module in llama.model.named_modules():
            if something():
                llama.register_hook(module, hook_fn)
        '''
        handle = module.register_forward_hook(hook_fn)
        self.registered_hooks.append(handle)

    def unregister_all_hooks(self):
        '''
        Remove all of our registered hooks.
        '''
        for handle in self.registered_hooks:
            handle.remove()

    def _get_model_num_heads(self) -> int:
        return self.model.config.num_attention_heads

    def _get_model_hidden_layers(self) -> int:
        return self.model.config.num_hidden_layers


In [8]:
llama = LlamaInstruct(model_name, model_args={"attn_implementation": "eager"})
assert llama.device.type == 'cuda', 'The model should be running on a GPU. On CPU, it is impossible to run'

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

# Dataset creation

From the SAPLMA dataset, run Llama in inference mode and save, for each input sequence:
- the string itself
- its topic
- its label class (true / false)
- all hidden states, for each layer
- all attention maps, for each layer

---

It will be organized on disk in the following way.

`./saplma-data/{topic_name}/{true-false}/` directory containing:
- `strings.txt` = file of all strings in that topic, that are all true or false, according to the parent folder's name
- `[index]_hidden_states.pt` = all hidden states for the i-th string
- `[index]_attention_maps.pt` = all attention maps for the i-th string

In [9]:
#@title Load SAPLMA dataset
class StatementDataset(Dataset):
    """
    PyTorch Dataset for statements and their truth values.
    """
    def __init__(self, dataframe):
        """
        Args:
            dataframe (pd.DataFrame): The combined dataset from all CSV files.
                                      Expects columns ['statement', 'label', 'topic'].
        """
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        Args:
            idx (int): Index of the row to retrieve.

        Returns:
            tuple: (statement, label, topic), where statement is the text,
                   label is the binary target, and topic is the source file name.
        """
        row = self.data.iloc[idx]
        statement = row['statement']
        label = torch.tensor(row['label'])
        topic = row['topic']
        return statement, label, topic



def create_dataset_with_topics(drive_path):
    """
    Create a StatementDataset from CSV files in a specified Google Drive folder,
    adding a 'topic' column to indicate the source file of each row.

    Args:
        drive_path (str): Path to the folder containing the CSV files.

    Returns:
        StatementDataset: PyTorch Dataset for the combined dataset.
    """
    # Ensure the path exists
    if not os.path.exists(drive_path):
        raise ValueError(f"Path '{drive_path}' does not exist.")

    all_dataframes = []
    for file_name in os.listdir(drive_path):
        file_path = os.path.join(drive_path, file_name)
        if file_name.endswith(".csv"):
            print(f"Loading file: {file_name}")
            # Read the CSV and add a 'topic' column with the file name (without extension)
            df = pd.read_csv(file_path)
            df['topic'] = os.path.splitext(file_name)[0]  # Add topic column
            all_dataframes.append(df)

    if all_dataframes:
        combined_dataset = pd.concat(all_dataframes, ignore_index=True)
    else:
        raise ValueError(f"No CSV files found in the directory '{drive_path}'.")

    # Create and return the PyTorch Dataset
    return StatementDataset(combined_dataset)



dataset_path = os.path.join(DRIVE_PATH, "publicDataset")
assert os.path.exists(dataset_path)
dataset = create_dataset_with_topics(dataset_path)
print(f'Got {len(dataset)} samples.')

topics = dataset.data['topic'].unique()
print(f"Topics: {topics}")

Loading file: cities_true_false.csv
Loading file: animals_true_false.csv
Loading file: elements_true_false.csv
Loading file: inventions_true_false.csv
Loading file: companies_true_false.csv
Loading file: generated_true_false.csv
Loading file: facts_true_false.csv
Got 6330 samples.
Topics: ['cities_true_false' 'animals_true_false' 'elements_true_false'
 'inventions_true_false' 'companies_true_false' 'generated_true_false'
 'facts_true_false']


In [10]:
#@title Create the separated strings.txt

SAPLMA_DATA_PATH = os.path.join('/content', "saplma-data")  #! FIXME: it was DRIVE_PATH/saplma-data
os.makedirs(SAPLMA_DATA_PATH, exist_ok=True)

TRUE_DIR_NAME, FALSE_DIR_NAME, STRINGS_TXT_NAME = "true", "false", "strings.txt"

for topic in topics:
  # Create topic folder
  topic_path = os.path.join(SAPLMA_DATA_PATH, topic)
  os.makedirs(topic_path, exist_ok=True)

  # Create true/false subfolders
  true_path = os.path.join(topic_path, TRUE_DIR_NAME)
  os.makedirs(true_path, exist_ok=True)

  false_path = os.path.join(topic_path, FALSE_DIR_NAME)
  os.makedirs(false_path, exist_ok=True)

  # Filter the strings of this topic
  samples_per_topic = dataset.data[dataset.data['topic'] == topic]

  # Append them into associated text files
  with open(os.path.join(true_path, STRINGS_TXT_NAME), 'w') as f_true, open(os.path.join(false_path, STRINGS_TXT_NAME), 'w') as f_false:
    for index, row in samples_per_topic.iterrows():
      statement = row['statement']
      label = row['label']
      if label == 1:
        f_true.write(statement + '\n')
      elif label == 0:
        f_false.write(statement + '\n')
      else:
        raise ValueError(f"Invalid label value: {label}")

  # Count the lines of the 2 strings files
  with open(os.path.join(true_path, STRINGS_TXT_NAME), 'r') as f_true, open(os.path.join(false_path, STRINGS_TXT_NAME), 'r') as f_false:
    true_lines = len(f_true.readlines())
    false_lines = len(f_false.readlines())

    print(f'Topic {topic:>25}: {true_lines:>4d} true, {false_lines:>4d} false')

Topic         cities_true_false:  729 true,  729 false
Topic        animals_true_false:  504 true,  504 false
Topic       elements_true_false:  465 true,  465 false
Topic     inventions_true_false:  464 true,  412 false
Topic      companies_true_false:  600 true,  600 false
Topic      generated_true_false:  119 true,  126 false
Topic          facts_true_false:  306 true,  307 false


In [None]:
#@title Run the LLM, collecting hidden_states and attentions
@torch.no_grad()
def extract_llm_internals(strings: list[str], max_new_tokens: int = 1, num_return_sequences: int = 1) -> torch.Tensor:
    tokenized_strings, _ = llama.tokenize(strings, pad_to_max_length=70)

    # Check that every string ends with the padding token = the string wouldn't have required a bigger max_length in the tokenizer
    padding_final_token_id = 128007
    assert torch.all(tokenized_strings.input_ids[:, -1] == padding_final_token_id), 'Every string should end with the padding token'

    outputs = llama.generate(
      tokenized_strings,
      generate_args={
        "max_new_tokens": max_new_tokens,
        "max_length": None,
        "num_return_sequences": num_return_sequences,
        # Collect hidden states and attentions
        "output_hidden_states": True,
        "output_attentions": True,
        "return_dict_in_generate": True,
        "use_cache": False,
      }
    )

    # We are interested in the forward pass with only the input sequence, and no output yet.
    # This is the first hidden_state, of every layer
    # Since we are requesting "max_new_tokens = 1",
    # the only forward pass that occurs is the one having the whole input (and nothing more) to process.
    collected_first_hidden_states = outputs.hidden_states[0]  # tuple[layers_num = 17] having Tensor[batch_size, input_tokens = 70, token_dim = 2048]

    # The same applies for the attention maps
    collected_input_attentions = outputs.attentions[0]  # tuple[layers_num = 16] having Tensor[batch_size, heads = 32, input_tokens = 70, input_tokens = 70]

    @torch.no_grad()
    def _reorder_internals(collected_internals: tuple[torch.Tensor]) -> list[torch.Tensor]:
      '''
      Post-process collected first hidden states (or attentions of the input)
      They are a list where i-th item is the output of i-th layer to ALL batch items
      We want it to be a list where j-th item is the stack of all hidden_layers (or attention maps) of that batch.
      '''
      batched_internals = [ [] for _ in range(collected_internals[0].size(0)) ]
      for layer_internals in collected_internals:
        for batch_i in range(layer_internals.size(0)):
          batched_internals[batch_i].append(layer_internals[batch_i])
      return [ torch.stack(layer_internals) for layer_internals in batched_internals ]

    return _reorder_internals(collected_first_hidden_states), _reorder_internals(collected_input_attentions)


batch_size = 256

for topic_idx, topic in enumerate(topics):
  topic_path = os.path.join(SAPLMA_DATA_PATH, topic)
  assert os.path.exists(topic_path), f'Folder for topic {topic} does not exist'

  for subdir_name in [TRUE_DIR_NAME, FALSE_DIR_NAME]:
    subdir_path = os.path.join(topic_path, subdir_name)
    assert os.path.exists(subdir_path), f'Subfolder {subdir_path} does not exist'

    strings_file = os.path.join(subdir_path, STRINGS_TXT_NAME)
    assert os.path.exists(strings_file), f'Strings file {strings_file} does not exist'

    with open(strings_file, 'r') as f:
      strings = [s.strip() for s in f.readlines() if s.strip()]

    print(f'Topic n. {topic_idx+1}/{len(topics)} {topic:>25}: {len(strings):>4d} {subdir_name}')

    for batch_i in range(0, len(strings), batch_size):
      batch_strings = strings[batch_i:batch_i + batch_size]
      print(f'\tbatch_i={batch_i} [{batch_i}:{batch_i+len(batch_strings)}]')

      # Save each string index on its own
      for i, (input_hidden_state, input_attention) in enumerate(zip(*extract_llm_internals(batch_strings))):
        assert input_hidden_state.shape == (17, 70, 2048)  # Embedding layer + 16 decoder layers
        assert input_attention.shape == (16, 32, 70, 70)  # Tensor[16 layers, 32 heads, 70 input_tokens, 70 input_tokens]

        # Save them into a file
        string_global_idx = batch_i + i  # This is the absolute index that the i-th string in this batch has
        hidden_idx_file = os.path.join(subdir_path, f'{string_global_idx}_hidden_states.pt')
        attention_idx_file = os.path.join(subdir_path, f'{string_global_idx}_attentions.pt')
        torch.save(input_hidden_state, hidden_idx_file)
        torch.save(input_attention, attention_idx_file)

      torch.cuda.empty_cache()

    print()

Topic n. 1/7         cities_true_false:  729 true
	batch_i=0 [0:256]
