<a href="https://colab.research.google.com/github/stvngo/Algoverse-AI-Model-Probing/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PTS Dataset Cleaning and Preprocessing

-Steven

Link to our GitHub repository: https://github.com/stvngo/Algoverse-AI-Model-Probing

My main notebook (with probe): https://colab.research.google.com/drive/1lPYyJzPMA3MBKDzJQ-X3hVCp_kEFky1s#scrollTo=363e9e8d&uniqifier=2

In [None]:
# install necessary libraries
!pip install datasets --upgrade
!pip install transformers --upgrade
!pip install einops --upgrade
# !pip install flash-attn --upgrade # original PTS settings use flash attention 2, for some reason doesn't work

Collecting transformers
  Downloading transformers-4.54.0-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.2-py3-none-any.whl.metadata (14 kB)
Downloading transformers-4.54.0-py3-none-any.whl (11.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m141.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.34.2-py3-none-any.whl (558 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface-hub, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.33.5
    Uninstalling huggingface-hub-0.33.5:
      Successfully uninstalled huggingface-hub-0.33.5
  Attempting uninstall: transfor

In [1]:
import logging

# Configure the root logger at INFO level so the logging.info/warning/error calls
# in later cells are shown; each record is formatted as "<time> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; the save/load cells below build their Drive paths under this mount point.
drive.mount('/content/drive') # for saving

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Set Global Random Seeds

For reproducibility of experiments

In [3]:
import random
import numpy as np
import torch

# One shared seed for every random number generator used in this notebook
seed_value = 42

# Seed Python's `random`, NumPy and torch, in that order
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed_value)

# Seed the CUDA generators too when a GPU is present (current device first, then all devices)
if torch.cuda.is_available():
    for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        cuda_seed_fn(seed_value)

print(f"Global random seeds set to {seed_value} for random, numpy, and torch.")

Global random seeds set to 42 for random, numpy, and torch.


# Load Dataset

- Load dataset through huggingface path
- Imported sklearn train and test split function
- First, we split by query, then create many negative examples while extracting token positions and labels.
- Then, drop duplicate rows.
- Lastly, balance the dataset: each raw row yields one positive and one negative example, so the result has twice as many rows as the deduplicated split.

In [None]:
from datasets import load_dataset, Dataset
from typing import List, Dict, Tuple, Optional
from sklearn.model_selection import train_test_split
import pandas as pd

def split_pts_by_query(dataset_path: str, test_size: float = 0.2, subset_size: Optional[int] = None) -> Tuple[Dataset, Dataset]:
    """
    Load the PTS dataset, remove duplicate rows, and split it by query ID to avoid data leakage.

    The split is done on the unique `dataset_item_id` values, so every row that
    belongs to one query lands on the same side of the split.

    :param dataset_path: Path/name of your PTS dataset on HuggingFace
    :param test_size: Fraction of the unique query IDs assigned to the test split
    :param subset_size: If provided (and non-zero), keep only the first `subset_size` rows for debugging.
    :return: train_dataset, test_dataset split by query
    :raises Exception: whatever `load_dataset` raised, if both loading attempts fail
    """
    print(f"Loading dataset: {dataset_path}")

    try:
        # First attempt: ask for the 'train' split directly
        dataset = load_dataset(dataset_path, split='train')
        print(f"Loaded {len(dataset)} examples")

    except Exception as e:
        print(f"Error with split='train', trying default loading: {e}")
        try:
            # Second attempt: load every split, then pick one
            dataset_dict = load_dataset(dataset_path)
            print(f"Available splits: {list(dataset_dict.keys())}")

            if 'train' in dataset_dict:
                dataset = dataset_dict['train']
            else:
                split_name = next(iter(dataset_dict.keys()))
                dataset = dataset_dict[split_name]
                print(f"Using split: {split_name}")

        except Exception as e2:
            print(f"Final error: {e2}")
            print("Try loading the dataset manually first to debug")
            raise  # bare raise keeps the original traceback intact

    # Create a subset if requested
    if subset_size:
        dataset = dataset.select(range(min(subset_size, len(dataset))))
        print(f"Using a subset of {len(dataset)} examples for debugging.")

    # Deduplicate in pandas
    df = dataset.to_pandas()

    # The timestamp column is dropped first so it does not take part in the duplicate comparison
    if 'timestamp' in df.columns:
        df = df.drop(columns=['timestamp'])
        print("Dropped the 'timestamp' column.")

    num_rows_before = len(df)
    df_deduplicated = df.drop_duplicates()
    num_rows_after = len(df_deduplicated)
    num_duplicates_removed = num_rows_before - num_rows_after

    print(f"Removed {num_duplicates_removed} duplicate rows.")
    print(f"Number of rows left: {num_rows_after}")

    # Sanity check: rows whose pivot_context equals the bare query, i.e. the pivotal
    # token is the first token after the query. Vectorised instead of an iterrows loop.
    count = int((df_deduplicated["query"] == df_deduplicated["pivot_context"]).sum())

    total_examples = len(df_deduplicated)
    # Guard against an empty frame so the sanity check cannot divide by zero
    percentage = (count / total_examples) * 100 if total_examples else 0.0

    print("Sanity Check Results:")
    print(f"Number of examples where the first token after the query is pivotal: {count}")
    print(f"Total number of examples: {total_examples}")
    print(f"Percentage: {percentage:.2f}%")

    dataset = Dataset.from_pandas(df_deduplicated)

    # Get unique query IDs. They are sorted because set iteration order is not
    # guaranteed to be the same from one interpreter session to the next (string
    # hashing is randomised), and train_test_split(random_state=42) is only
    # reproducible when its input order is.
    unique_query_ids = sorted(set(dataset['dataset_item_id']))
    print(f"Total unique queries: {len(unique_query_ids)}")

    # Split query IDs (not individual examples)
    train_query_ids, test_query_ids = train_test_split(
        unique_query_ids,
        test_size=test_size,
        random_state=42 # for reproducibility
    )

    # Sets give O(1) membership tests inside the per-row filter callbacks
    train_id_set = set(train_query_ids)
    test_id_set = set(test_query_ids)

    # Filter dataset by query splits
    train_dataset = dataset.filter(lambda x: x['dataset_item_id'] in train_id_set)
    test_dataset = dataset.filter(lambda x: x['dataset_item_id'] in test_id_set)

    print(f"Train queries: {len(train_query_ids)}, Train examples: {len(train_dataset)}")
    print(f"Test queries: {len(test_query_ids)}, Test examples: {len(test_dataset)}")

    return train_dataset, test_dataset

# Load Model

In [4]:
# import necessary packages
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time # Import time for timing

# Re-seed torch right before the model is built (reproducibility)
torch.manual_seed(42)

# torch.set_default_device("cuda")

# Choose the compute backend: CUDA when present, otherwise Apple MPS, otherwise CPU
mps_backend = getattr(getattr(torch, 'backends', None), 'mps', None)
if torch.cuda.is_available():
    device = "cuda"
elif mps_backend is not None and mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using device: {device}")

# Hugging Face model identifier
model_name = "Qwen/Qwen3-0.6B"

# NOTE: Flash Attention 2 is deliberately not enabled here. Turning it on would mean
# loading the weights in float16/bfloat16 (bfloat16 on compute capability >= 8.0)
# and passing attn_implementation="flash_attention_2" in model_kwargs.

# Keyword arguments forwarded to from_pretrained.
# NOTE(review): with output_hidden_states=True, transformers printed "The following
# generation flags are not valid and may be ignored" when this cell was run.
model_kwargs = dict(
    trust_remote_code=True,
    device_map=device,
    output_hidden_states=True,
)

# Build the model, then the tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)

# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side='left')
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) # set padding side if batching

# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model and tokenizer loaded.")

Using device: cpu


The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Model and tokenizer loaded.


In [None]:
# print(model.config) # check if we're using flash attention 2

# Run either all in "FIRST METHOD" or all in "SECOND METHOD" section. Don't run both sections.

# FIRST METHOD: Without batching

Notes:
- Slower but I'm sure all the labels will be correct.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset as HFDataset # Use alias to avoid conflict
from torch.utils.data import DataLoader
from typing import List, Dict, Tuple, Optional
from tqdm.auto import tqdm # Import tqdm for progress bars
import pandas as pd
from collections import defaultdict

def generate_full_responses_and_prepare_data(
    raw_dataset: HFDataset,
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    device: torch.device,
    generation_params: Dict,
    max_new_tokens: int = 8192, # Max tokens to generate for full response, 8192 was the original
) -> List[Dict]:
    """
    Generate full model responses and prepare a BALANCED dataset for linear probe training.

    Each raw row is processed on its own (no batching, so no padding). The prompt is
    `pivot_context + pivot_token`; the model continues it, and the row contributes:
      * one positive example at the last token of `pivot_context`, and
      * one negative example at a random other position, drawn ONLY from the answer
        part (from the last query token up to the end of the generated content).
    Token positions index the generated token sequence and are meant to be used
    together with the stored `text`.

    :param raw_dataset: HuggingFace dataset containing raw PTS data
                        (columns read: pivot_context, pivot_token, query, dataset_item_id).
    :param model: The language model for generation.
    :param tokenizer: The tokenizer for the model.
    :param device: The device to run the model on (whatever `model.to` accepts).
    :param generation_params: Dictionary of sampling parameters (temperature, top_p, top_k, min_p).
    :param max_new_tokens: Maximum new tokens for the response.
    :return: A shuffled list of dictionaries with keys text, token_position, label,
             original_dataset_item_id and source_raw_index. A row yields no negative
             example when it has no candidate position left.
    """
    all_examples = []
    model.to(device)
    model.eval()

    print(f"Generating balanced data from {len(raw_dataset)} raw examples...")

    # generate() warns and falls back to EOS when no pad token id is given; be explicit.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Hidden states are not requested: only `.sequences` is read below, and keeping
    # the per-step hidden states of every generated token just costs memory.
    gen_kwargs = {
        "do_sample": True,
        "temperature": generation_params.get("temperature", 0.6),
        "top_p": generation_params.get("top_p", 0.95),
        "top_k": generation_params.get("top_k", 20),
        "min_p": generation_params.get("min_p", 0.0),
        "max_new_tokens": max_new_tokens,
        "pad_token_id": pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "use_cache": True,
        "return_dict_in_generate": True,
    }

    # Ids that decode(skip_special_tokens=True) removes from the text
    special_token_ids = set(tokenizer.all_special_ids)

    # Tokenize all texts at once for efficiency
    tokenized_pivot_contexts = tokenizer([ex['pivot_context'] for ex in raw_dataset], add_special_tokens=False)
    tokenized_queries = tokenizer([ex['query'] for ex in raw_dataset], add_special_tokens=False)

    for i, example in enumerate(tqdm(raw_dataset, desc="Generating and Processing Examples")):
        pivot_context = example['pivot_context']
        original_pivot_token = example['pivot_token']
        dataset_item_id = example.get('dataset_item_id', None)

        # The model continues from the context with the pivotal token already appended
        prompt_for_generation = pivot_context + original_pivot_token

        inputs = tokenizer(prompt_for_generation, return_tensors='pt', add_special_tokens=False).to(device)

        with torch.no_grad():
            generation_outputs = model.generate(
                **inputs,
                **gen_kwargs
            )

        # Prompt tokens followed by the generated continuation
        sequence_ids = generation_outputs.sequences[0].tolist()
        full_generated_text = tokenizer.decode(sequence_ids, skip_special_tokens=True)

        # Length without trailing special tokens (e.g. the final EOS): those tokens are
        # stripped from `text`, so a position pointing at them would not exist in it.
        content_len = len(sequence_ids)
        while content_len > 0 and sequence_ids[content_len - 1] in special_token_ids:
            content_len -= 1

        # Get the length of the query in tokens from the pre-tokenized inputs
        query_len_in_tokens = len(tokenized_queries['input_ids'][i])

        # The positive position is the last token of the `pivot_context`.
        # NOTE(review): this assumes tokenizing `pivot_context` alone yields a prefix of
        # the tokenization of `pivot_context + pivot_token` - confirm for this tokenizer.
        positive_position_in_full = len(tokenized_pivot_contexts['input_ids'][i]) - 1

        # --- Add the positive example ---
        all_examples.append({
            'text': full_generated_text,
            'token_position': positive_position_in_full,
            'label': 1,
            'original_dataset_item_id': dataset_item_id,
            'source_raw_index': i
        })

        # --- Sample one negative example from the ANSWER part only ---
        # The "answer" part starts from and includes the last token of the original query.
        # max(..., 0) keeps an empty query from producing position -1.
        answer_start_position = max(query_len_in_tokens - 1, 0)
        possible_negative_positions = [
            position
            for position in range(answer_start_position, content_len)
            if position != positive_position_in_full  # never re-use the positive position
        ]

        if possible_negative_positions:
            negative_position = random.choice(possible_negative_positions)
            all_examples.append({
                'text': full_generated_text,
                'token_position': negative_position,
                'label': 0,
                'original_dataset_item_id': dataset_item_id,
                'source_raw_index': i
            })

    print(f"Collected {len(all_examples)} total examples (pre-balanced).")
    # Shuffle the final list to mix positive and negative examples
    random.shuffle(all_examples)
    return all_examples

In [None]:
# Split the raw PTS rows by query. No subset_size is passed, so the whole dataset
# is used (pass e.g. subset_size=20 for a quick debugging run).
train_raw, test_raw = split_pts_by_query("codelion/Qwen3-0.6B-pts", test_size=0.2)

# Sampling settings forwarded to the generation step
generation_params = dict(temperature=0.6, top_p=0.95, top_k=20, min_p=0.0)

# Build the probe examples: train split first, then test split
train_examples_raw_list = generate_full_responses_and_prepare_data(
    train_raw,
    model,
    tokenizer,
    device,
    generation_params,
    max_new_tokens=512,
)
test_examples_raw_list = generate_full_responses_and_prepare_data(
    test_raw,
    model,
    tokenizer,
    device,
    generation_params,
    max_new_tokens=512,
)

print(f"\nPrepared {len(train_examples_raw_list)} raw examples for training.")
print(f"Prepared {len(test_examples_raw_list)} raw examples for testing.")

Loading dataset: codelion/Qwen3-0.6B-pts


README.md: 0.00B [00:00, ?B/s]

gsm8k_pivotal_tokens.jsonl: 0.00B [00:00, ?B/s]

pivotal_tokens.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1376 [00:00<?, ? examples/s]

Loaded 1376 examples
Dropped the 'timestamp' column.
Removed 131 duplicate rows.
Number of rows left: 1245
Sanity Check Results:
Number of examples where the first token after the query is pivotal: 76
Total number of examples: 1245
Percentage: 6.10%
Total unique queries: 104


Filter:   0%|          | 0/1245 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1245 [00:00<?, ? examples/s]

Train queries: 83, Train examples: 1028
Test queries: 21, Test examples: 217
Generating balanced data from 1028 raw examples...


Generating and Processing Examples:   0%|          | 0/1028 [00:00<?, ?it/s]

Collected 2056 total examples (pre-balanced).
Generating balanced data from 217 raw examples...


Generating and Processing Examples:   0%|          | 0/217 [00:00<?, ?it/s]

Collected 434 total examples (pre-balanced).

Prepared 2056 raw examples for training.
Prepared 434 raw examples for testing.


In [None]:
# View some examples

train_dataset_balanced = train_examples_raw_list
test_dataset_balanced = test_examples_raw_list

# Only a handful of rows are printed: dumping every example (thousands of long
# generated texts) floods the cell output without adding information.
NUM_PREVIEW_EXAMPLES = 5

print("Example from balanced train dataset:")
for example in train_dataset_balanced[:NUM_PREVIEW_EXAMPLES]:
    print(example)

print("\nExample from balanced test dataset:")
for example in test_dataset_balanced[:NUM_PREVIEW_EXAMPLES]:
    print(example)

Example from balanced train dataset:
{'text': "Ben has 8 apples more than Phillip does. Tom has three eighths as many apples at Ben has. If Phillip has 40 apples, how many apples does Tom have? Let's solve this problem step by step. \n\nFirst, we need to find out how many apples Ben has. Since Ben has 8 apples more than Phillip, and Phillip has 40 apples, we can calculate Ben's apples as follows:\n\nBen's apples = Phillip's apples + 8\nBen's apples = 40 + 8\nBen's apples = 48\n\nNow, we need to find out how many apples Tom has. We are told that Tom has three eighths as many apples as Ben does. So, we can calculate the number of apples Tom has by multiplying Ben's apples by 3/8:\n\nTom's apples = Ben's apples * 3/8\nTom's apples = 48 * (3/8)\n\nLet's compute this:\n\n48 * 3 = 144\n144 / 8 = 18\n\nTherefore, Tom has 18 apples. \n\nLet me double-check the calculations to make sure there are no errors. \n\n1. Ben's apples: 40 + 8 = 48 ✔️\n2. Tom's apples: 48 * 3/8 = 18 ✔️\n\nEverything see

In [None]:
# Sanity check on token positions: print the index of the last token (token count - 1)
# for a query followed by the start of an answer, and for the same query followed by "\n\n".
query_with_answer_start = "Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn? To solve the problem, you should multiply"
query_with_newlines = "Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn? \n\n"

answer_start_ids = tokenizer(query_with_answer_start, return_tensors='pt', add_special_tokens=False)["input_ids"]
print(answer_start_ids.shape[1] - 1)

newline_ids = tokenizer(query_with_newlines, return_tensors='pt', add_special_tokens=False)["input_ids"]
print(newline_ids.shape[1] - 1, ", notice there are two newline characters")

38
31 , notice there are two newline characters


In [None]:
import os
import shutil
from datasets import Dataset as HFDataset # Use alias

# Convert the lists to HuggingFace Datasets
train_dataset_balanced_hf = HFDataset.from_list(train_dataset_balanced)
test_dataset_balanced_hf = HFDataset.from_list(test_dataset_balanced)

# Define directories for saving the balanced datasets
BALANCED_DATA_SAVE_DIR_LOCAL = "./balanced_datasets"
os.makedirs(BALANCED_DATA_SAVE_DIR_LOCAL, exist_ok=True)

# Mount point used by drive.mount(...) earlier in the notebook
DRIVE_MOUNT_POINT = '/content/drive'

# Define the base directory in Google Drive to save the balanced datasets
# This should ideally be within your project folder in Drive
# NOTE(review): this path spells the Drive root "My Drive" while the loading cell
# uses "MyDrive" - confirm which spelling exists in your runtime.
BALANCED_DATA_SAVE_DIR_DRIVE_BASE = '/content/drive/My Drive/Algoverse-AI-Model-Probing/balanced_datasets_backup' # ADJUST THIS PATH AS NEEDED

# Define specific paths for the train and test datasets
TRAIN_DATA_SAVE_PATH_LOCAL = os.path.join(BALANCED_DATA_SAVE_DIR_LOCAL, "train_dataset_balanced_WITHOUT_BATCHING")
TEST_DATA_SAVE_PATH_LOCAL = os.path.join(BALANCED_DATA_SAVE_DIR_LOCAL, "test_dataset_balanced_WITHOUT_BATCHING")

TRAIN_DATA_SAVE_PATH_DRIVE = os.path.join(BALANCED_DATA_SAVE_DIR_DRIVE_BASE, "train_dataset_balanced_WITHOUT_BATCHING")
TEST_DATA_SAVE_PATH_DRIVE = os.path.join(BALANCED_DATA_SAVE_DIR_DRIVE_BASE, "test_dataset_balanced_WITHOUT_BATCHING")


def _save_and_backup_dataset(dataset, split_name, local_path, drive_path):
    """
    Save `dataset` to `local_path`, then copy that directory to `drive_path`.

    Failures are logged (with traceback) rather than raised, so one failing split
    does not stop the other from being saved. The Drive copy is skipped when the
    local save failed or when Drive is not mounted.

    :param dataset: HuggingFace dataset to persist.
    :param split_name: "train" or "test"; only used in log messages.
    :param local_path: Target directory for `save_to_disk`.
    :param drive_path: Target directory on Google Drive for the backup copy.
    """
    logging.info(f"Saving balanced {split_name} dataset locally...")
    try:
        dataset.save_to_disk(local_path)
    except Exception:
        logging.exception(f"Error saving or copying balanced {split_name} dataset")
        return
    logging.info(f"Balanced {split_name} dataset saved locally to {local_path}")

    # Without this check, creating the target directories on an unmounted Drive would
    # just create them on the local disk and the "backup" would silently stay local.
    if not os.path.ismount(DRIVE_MOUNT_POINT):
        logging.warning(f"Google Drive is not mounted at {DRIVE_MOUNT_POINT}; skipping Drive copy of the {split_name} dataset.")
        logging.warning("Ensure Google Drive is mounted and the path is correct.")
        return

    logging.info(f"Copying balanced {split_name} dataset to Google Drive at {drive_path}...")
    try:
        os.makedirs(os.path.dirname(drive_path), exist_ok=True)
        # dirs_exist_ok=True (Python 3.8+) overwrites an existing backup in place
        shutil.copytree(local_path, drive_path, dirs_exist_ok=True)
    except OSError:
        logging.exception(f"Error saving or copying balanced {split_name} dataset")
        logging.warning("Ensure Google Drive is mounted and the path is correct.")
        return
    logging.info(f"Finished copying balanced {split_name} dataset to Google Drive.")


_save_and_backup_dataset(train_dataset_balanced_hf, "train", TRAIN_DATA_SAVE_PATH_LOCAL, TRAIN_DATA_SAVE_PATH_DRIVE)
_save_and_backup_dataset(test_dataset_balanced_hf, "test", TEST_DATA_SAVE_PATH_LOCAL, TEST_DATA_SAVE_PATH_DRIVE)

In [None]:
import os
# Quick check that the Drive root used by the save cell exists in this runtime
# (prints True/False).
print(os.path.exists("/content/drive/My Drive"))

False


## SECOND METHOD: Use DataLoaders and Batching

Notes:

- Faster but positions may be flawed (it's pretty complicated to understand)

- When batching, used padding and truncation
- When loading the model, set `tokenizer.pad_token = tokenizer.eos_token` and pass `padding_side='left'` to `AutoTokenizer.from_pretrained(...)`
- If available, use flash attention 2 (dtype=torch.bfloat16)
- Set max_new_tokens high enough such that truncation has little effect
- **Runs the risk of overcomplicating the negative positions because of the padding and truncation**
  - Needs to perform some calculations for the "offset", such that padding tokens are fully accounted for when calculating the answer_start_position for negative sampling.


Positives are safely created for both methods I've shown, it's really a **tradeoff between time complexity vs. feasibility of labelling the right positions**.


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset as HFDataset # Use alias to avoid conflict
from torch.utils.data import DataLoader
from typing import List, Dict, Tuple, Optional
from tqdm.auto import tqdm # Import tqdm for progress bars
import pandas as pd
from collections import defaultdict

def generate_full_responses_and_prepare_data(
    raw_dataset: HFDataset,
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    device: torch.device,
    generation_params: Dict,
    batch_size: int = 5, # Add batch_size parameter, original was 5
    max_new_tokens: int = 8192, # Max tokens to generate for full response, original was 8192, use 512 for efficiency?
) -> List[Dict]:
    """
    Generate full model responses in batches and prepare a BALANCED dataset for linear probe training.

    Each raw row's prompt is `pivot_context + pivot_token`. Prompts are left-padded to
    a common length, the model continues them, and each row contributes:
      * one positive example at the last token of `pivot_context`, and
      * one negative example at a random other position, drawn ONLY from the answer
        part (from the last query token up to the end of the generated content).

    Token positions are expressed relative to the UNPADDED sequence. The stored `text`
    is decoded with special tokens (and therefore all padding) removed, so a position
    shifted by the padding length would not line up with it.

    :param raw_dataset: HuggingFace dataset containing raw PTS data
                        (columns read: pivot_context, pivot_token, query, dataset_item_id).
    :param model: The language model for generation.
    :param tokenizer: The tokenizer for the model; it must have a pad token. Its
                      `padding_side` is switched to 'left' for the duration of the call
                      and restored afterwards.
    :param device: The device to run the model on (whatever `model.to` accepts).
    :param generation_params: Dictionary of sampling parameters (temperature, top_p, top_k, min_p).
    :param batch_size: The batch size for generation.
    :param max_new_tokens: Maximum new tokens for the response.
    :return: A shuffled list of dictionaries with keys text, token_position, label,
             original_dataset_item_id and source_raw_index (row index in `raw_dataset`).
             A row yields no negative example when it has no candidate position left.
    """
    all_examples = []
    model.to(device)
    model.eval()

    print(f"Generating balanced data from {len(raw_dataset)} raw examples...")

    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Hidden states are not requested: only `.sequences` is read below, and keeping
    # the per-step hidden states of every generated token just costs memory.
    gen_kwargs = {
        "do_sample": True,
        "temperature": generation_params.get("temperature", 0.6),
        "top_p": generation_params.get("top_p", 0.95),
        "top_k": generation_params.get("top_k", 20),
        "min_p": generation_params.get("min_p", 0.0),
        "max_new_tokens": max_new_tokens,
        "pad_token_id": pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "use_cache": True,
        "return_dict_in_generate": True,
    }

    # Ids that decode(skip_special_tokens=True) removes from the text
    special_token_ids = set(tokenizer.all_special_ids)

    # Create a DataLoader to process the data in batches
    dataloader = DataLoader(raw_dataset, batch_size=batch_size)

    # Generation continues from the last prompt token, so padding has to sit on the
    # left; the offset arithmetic below relies on that as well.
    original_padding_side = tokenizer.padding_side
    tokenizer.padding_side = 'left'
    rows_seen = 0  # number of dataset rows consumed by earlier batches

    try:
        for batch in tqdm(dataloader, desc="Generating and Processing Examples"):
            # The dataloader returns a dictionary of per-column batches
            pivot_contexts = batch['pivot_context']
            original_pivot_tokens = batch['pivot_token']
            queries = batch['query']
            dataset_item_ids = batch.get('dataset_item_id', [None] * len(pivot_contexts))

            # Prepare the prompts for the batch
            prompts_for_generation = [pc + opt for pc, opt in zip(pivot_contexts, original_pivot_tokens)]

            # Tokenize the batch of prompts
            inputs = tokenizer(prompts_for_generation, return_tensors='pt', padding=True, truncation=True, add_special_tokens=False).to(device)

            with torch.no_grad():
                generation_outputs = model.generate(
                    **inputs,
                    **gen_kwargs
                )

            # Token lengths of query / context, computed once per batch
            query_lens = [len(ids) for ids in tokenizer(list(queries), add_special_tokens=False)['input_ids']]
            context_lens = [len(ids) for ids in tokenizer(list(pivot_contexts), add_special_tokens=False)['input_ids']]

            # Real (non-padding) prompt lengths come from the attention mask; counting
            # ids equal to pad_token_id would also count genuine occurrences of that token.
            padded_prompt_len = inputs['input_ids'].shape[1]
            prompt_lens = inputs['attention_mask'].sum(dim=1).tolist()

            # Process each example in the batch
            for i in range(len(prompts_for_generation)):
                padding_len = padded_prompt_len - prompt_lens[i]

                # Drop the left padding so indices match the unpadded `text`
                sequence_ids = generation_outputs.sequences[i][padding_len:].tolist()
                full_generated_text = tokenizer.decode(sequence_ids, skip_special_tokens=True)

                # Rows that finish early are right-filled with pad tokens after their
                # EOS; exclude that tail, which is stripped from `text` anyway.
                content_len = len(sequence_ids)
                while content_len > 0 and sequence_ids[content_len - 1] in special_token_ids:
                    content_len -= 1

                # DataLoader collates numeric ids into tensors; store plain Python values
                item_id = dataset_item_ids[i]
                if isinstance(item_id, torch.Tensor):
                    item_id = item_id.item()

                # The positive position is the last token of the `pivot_context`.
                # NOTE(review): this assumes tokenizing `pivot_context` alone yields a prefix
                # of the tokenization of `pivot_context + pivot_token` - confirm for this tokenizer.
                positive_position_in_full = context_lens[i] - 1

                # --- Add the positive example ---
                all_examples.append({
                    'text': full_generated_text,
                    'token_position': positive_position_in_full,
                    'label': 1,
                    'original_dataset_item_id': item_id,
                    'source_raw_index': rows_seen + i
                })

                # --- Sample one negative example from the ANSWER part only ---
                # The "answer" part starts from and includes the last token of the query.
                # max(..., 0) keeps an empty query from producing position -1.
                answer_start_position = max(query_lens[i] - 1, 0)
                possible_negative_positions = [
                    position
                    for position in range(answer_start_position, content_len)
                    if position != positive_position_in_full  # never re-use the positive position
                ]

                if possible_negative_positions:
                    negative_position = random.choice(possible_negative_positions)
                    all_examples.append({
                        'text': full_generated_text,
                        'token_position': negative_position,
                        'label': 0,
                        'original_dataset_item_id': item_id,
                        'source_raw_index': rows_seen + i
                    })

            rows_seen += len(prompts_for_generation)
    finally:
        # Leave the shared tokenizer the way the caller configured it
        tokenizer.padding_side = original_padding_side

    print(f"Collected {len(all_examples)} total examples (pre-balanced).")
    # Shuffle the final list to mix positive and negative examples
    random.shuffle(all_examples)
    return all_examples

In [None]:
# Split the raw PTS rows by query. No subset_size is passed, so the whole dataset
# is used (pass e.g. subset_size=20 for a quick debugging run).
train_raw, test_raw = split_pts_by_query("codelion/Qwen3-0.6B-pts", test_size=0.2)

# Sampling settings forwarded to the generation step
generation_params = dict(temperature=0.6, top_p=0.95, top_k=20, min_p=0.0)

# Build the probe examples in batches of 5 prompts: train split first, then test split
train_examples_raw_list = generate_full_responses_and_prepare_data(
    train_raw,
    model,
    tokenizer,
    device,
    generation_params,
    batch_size=5,
    max_new_tokens=512,
)
test_examples_raw_list = generate_full_responses_and_prepare_data(
    test_raw,
    model,
    tokenizer,
    device,
    generation_params,
    batch_size=5,
    max_new_tokens=512,
)

print(f"\nPrepared {len(train_examples_raw_list)} raw examples for training.")
print(f"Prepared {len(test_examples_raw_list)} raw examples for testing.")

In [None]:
# View some examples

train_dataset_balanced = train_examples_raw_list
test_dataset_balanced = test_examples_raw_list

# Only a handful of rows are printed: dumping every example (thousands of long
# generated texts) floods the cell output without adding information.
NUM_PREVIEW_EXAMPLES = 5

print("Example from balanced train dataset:")
for example in train_dataset_balanced[:NUM_PREVIEW_EXAMPLES]:
    print(example)

print("\nExample from balanced test dataset:")
for example in test_dataset_balanced[:NUM_PREVIEW_EXAMPLES]:
    print(example)

In [None]:
# Sanity check on token positions: print the index of the last token (token count - 1)
# for a query followed by the start of an answer, and for the same query followed by "\n\n".
query_with_answer_start = "Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn? To solve the problem, you should multiply"
query_with_newlines = "Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn? \n\n"

answer_start_ids = tokenizer(query_with_answer_start, return_tensors='pt', add_special_tokens=False)["input_ids"]
print(answer_start_ids.shape[1] - 1)

newline_ids = tokenizer(query_with_newlines, return_tensors='pt', add_special_tokens=False)["input_ids"]
print(newline_ids.shape[1] - 1, ", notice there are two newline characters")

In [None]:
import os
import shutil
from datasets import Dataset as HFDataset # Use alias

# Convert the lists to HuggingFace Datasets
train_dataset_balanced_hf = HFDataset.from_list(train_dataset_balanced)
test_dataset_balanced_hf = HFDataset.from_list(test_dataset_balanced)

# Define directories for saving the balanced datasets
BALANCED_DATA_SAVE_DIR_LOCAL = "./balanced_datasets"
os.makedirs(BALANCED_DATA_SAVE_DIR_LOCAL, exist_ok=True)

# Mount point used by drive.mount(...) earlier in the notebook
DRIVE_MOUNT_POINT = '/content/drive'

# Define the base directory in Google Drive to save the balanced datasets
# This should ideally be within your project folder in Drive
# NOTE(review): this path spells the Drive root "My Drive" while the loading cell
# uses "MyDrive" - confirm which spelling exists in your runtime.
BALANCED_DATA_SAVE_DIR_DRIVE_BASE = '/content/drive/My Drive/Algoverse-AI-Model-Probing/balanced_datasets_backup' # ADJUST THIS PATH AS NEEDED

# Define specific paths for the train and test datasets
TRAIN_DATA_SAVE_PATH_LOCAL = os.path.join(BALANCED_DATA_SAVE_DIR_LOCAL, "train_dataset_balanced_WITH_BATCHING")
TEST_DATA_SAVE_PATH_LOCAL = os.path.join(BALANCED_DATA_SAVE_DIR_LOCAL, "test_dataset_balanced_WITH_BATCHING")

TRAIN_DATA_SAVE_PATH_DRIVE = os.path.join(BALANCED_DATA_SAVE_DIR_DRIVE_BASE, "train_dataset_balanced_WITH_BATCHING")
TEST_DATA_SAVE_PATH_DRIVE = os.path.join(BALANCED_DATA_SAVE_DIR_DRIVE_BASE, "test_dataset_balanced_WITH_BATCHING")


def _save_and_backup_dataset(dataset, split_name, local_path, drive_path):
    """
    Save `dataset` to `local_path`, then copy that directory to `drive_path`.

    Failures are logged (with traceback) rather than raised, so one failing split
    does not stop the other from being saved. The Drive copy is skipped when the
    local save failed or when Drive is not mounted.

    :param dataset: HuggingFace dataset to persist.
    :param split_name: "train" or "test"; only used in log messages.
    :param local_path: Target directory for `save_to_disk`.
    :param drive_path: Target directory on Google Drive for the backup copy.
    """
    logging.info(f"Saving balanced {split_name} dataset locally...")
    try:
        dataset.save_to_disk(local_path)
    except Exception:
        logging.exception(f"Error saving or copying balanced {split_name} dataset")
        return
    logging.info(f"Balanced {split_name} dataset saved locally to {local_path}")

    # Without this check, creating the target directories on an unmounted Drive would
    # just create them on the local disk and the "backup" would silently stay local.
    if not os.path.ismount(DRIVE_MOUNT_POINT):
        logging.warning(f"Google Drive is not mounted at {DRIVE_MOUNT_POINT}; skipping Drive copy of the {split_name} dataset.")
        logging.warning("Ensure Google Drive is mounted and the path is correct.")
        return

    logging.info(f"Copying balanced {split_name} dataset to Google Drive at {drive_path}...")
    try:
        os.makedirs(os.path.dirname(drive_path), exist_ok=True)
        # dirs_exist_ok=True (Python 3.8+) overwrites an existing backup in place
        shutil.copytree(local_path, drive_path, dirs_exist_ok=True)
    except OSError:
        logging.exception(f"Error saving or copying balanced {split_name} dataset")
        logging.warning("Ensure Google Drive is mounted and the path is correct.")
        return
    logging.info(f"Finished copying balanced {split_name} dataset to Google Drive.")


_save_and_backup_dataset(train_dataset_balanced_hf, "train", TRAIN_DATA_SAVE_PATH_LOCAL, TRAIN_DATA_SAVE_PATH_DRIVE)
_save_and_backup_dataset(test_dataset_balanced_hf, "test", TEST_DATA_SAVE_PATH_LOCAL, TEST_DATA_SAVE_PATH_DRIVE)

## Load Balanced Datasets

Use this cell to load previously saved balanced datasets instead of regenerating them.

In [5]:
import os
from datasets import load_from_disk

# Root locations of the saved balanced datasets (Drive is tried first, then local).
BALANCED_DATA_LOAD_DIR_DRIVE_BASE = '/content/drive/MyDrive/Zurabi_AI_Data' # ADJUST THIS PATH AS NEEDED
BALANCED_DATA_LOAD_DIR_LOCAL = "./balanced_datasets"

# Per-split dataset directories on Drive. CHANGE THE DIRECTORY NAMES AS NEEDED.
TRAIN_DATA_LOAD_PATH_DRIVE = os.path.join(BALANCED_DATA_LOAD_DIR_DRIVE_BASE, "train_dataset_balanced_WITHOUT_BATCHING")
TEST_DATA_LOAD_PATH_DRIVE = os.path.join(BALANCED_DATA_LOAD_DIR_DRIVE_BASE, "test_dataset_balanced_WITHOUT_BATCHING")

# Per-split dataset directories on the local disk. CHANGE THE DIRECTORY NAMES AS NEEDED.
TRAIN_DATA_LOAD_PATH_LOCAL = os.path.join(BALANCED_DATA_LOAD_DIR_LOCAL, "train_dataset_balanced_WITHOUT_BATCHING")
TEST_DATA_LOAD_PATH_LOCAL = os.path.join(BALANCED_DATA_LOAD_DIR_LOCAL, "test_dataset_balanced_WITHOUT_BATCHING")


def _load_balanced_split(split, drive_path, local_path):
    """
    Load one balanced split, trying Google Drive first and the local path second.

    :param split: Split name used in log messages ("train" or "test").
    :param drive_path: Dataset directory on the mounted Drive.
    :param local_path: Dataset directory on the local disk.
    :return: The loaded dataset, or None when neither location could be loaded.
    """
    # First choice: Google Drive, only when the mount point and the dataset directory exist.
    if os.path.exists('/content/drive') and os.path.exists(drive_path):
        logging.info(f"Attempting to load balanced {split} dataset from Google Drive at {drive_path}...")
        try:
            from_drive = load_from_disk(drive_path)
        except Exception as err:
            logging.warning(f"Could not load from Google Drive: {err}. Checking local path.")
        else:
            logging.info(f"Successfully loaded balanced {split} dataset from Google Drive.")
            return from_drive

    # Fallback: local directory.
    if not os.path.exists(local_path):
        return None
    logging.info(f"Attempting to load balanced {split} dataset from local path at {local_path}...")
    try:
        from_local = load_from_disk(local_path)
    except Exception as err:
        logging.error(f"Could not load from local path: {err}. Balanced {split} dataset not loaded.")
        return None
    logging.info(f"Successfully loaded balanced {split} dataset from local path.")
    return from_local


# Both variables stay None when the corresponding split could not be loaded.
train_dataset_balanced = _load_balanced_split("train", TRAIN_DATA_LOAD_PATH_DRIVE, TRAIN_DATA_LOAD_PATH_LOCAL)
test_dataset_balanced = _load_balanced_split("test", TEST_DATA_LOAD_PATH_DRIVE, TEST_DATA_LOAD_PATH_LOCAL)

# Report the outcome for each split.
if train_dataset_balanced is None:
    print("Balanced train dataset was not loaded. You will need to generate it.")
else:
    print(f"Loaded balanced train dataset with {len(train_dataset_balanced)} examples.")

if test_dataset_balanced is None:
    print("Balanced test dataset was not loaded. You will need to generate it.")
else:
    print(f"Loaded balanced test dataset with {len(test_dataset_balanced)} examples.")

# Note: once loaded, build the DataLoaders from these datasets in the training-loop
# cell and skip the data generation and balancing steps there.

Loaded balanced train dataset with 2056 examples.
Loaded balanced test dataset with 434 examples.


In [6]:
# The load cell leaves a dataset as None when nothing could be loaded, so guard
# before indexing: None[0] raises TypeError and an empty dataset raises IndexError.
print("Example from balanced train dataset:")
if train_dataset_balanced is not None and len(train_dataset_balanced) > 0:
    print(train_dataset_balanced[0])
else:
    print("(balanced train dataset is not loaded or is empty)")

print("\nExample from balanced test dataset:")
if test_dataset_balanced is not None and len(test_dataset_balanced) > 0:
    print(test_dataset_balanced[0])
else:
    print("(balanced test dataset is not loaded or is empty)")

Example from balanced train dataset:
{'text': "Ben has 8 apples more than Phillip does. Tom has three eighths as many apples at Ben has. If Phillip has 40 apples, how many apples does Tom have? Let's solve this problem step by step. \n\nFirst, we need to find out how many apples Ben has. Since Ben has 8 apples more than Phillip, and Phillip has 40 apples, we can calculate Ben's apples as follows:\n\nBen's apples = Phillip's apples + 8\nBen's apples = 40 + 8\nBen's apples = 48\n\nNow, we need to find out how many apples Tom has. We are told that Tom has three eighths as many apples as Ben does. So, we can calculate the number of apples Tom has by multiplying Ben's apples by 3/8:\n\nTom's apples = Ben's apples * 3/8\nTom's apples = 48 * (3/8)\n\nLet's compute this:\n\n48 * 3 = 144\n144 / 8 = 18\n\nTherefore, Tom has 18 apples. \n\nLet me double-check the calculations to make sure there are no errors. \n\n1. Ben's apples: 40 + 8 = 48 ✔️\n2. Tom's apples: 48 * 3/8 = 18 ✔️\n\nEverything see

Prevent the Colab runtime from disconnecting:

In [None]:
# from pynput.mouse import Controller, Button
# import time

# mouse = Controller()

# while True:
#   mouse.click(Button.left, 1)
#   print('clicked')

#   time.sleep(300)

In [None]:
from tqdm.auto import tqdm

def relabel_positive_positions(balanced_dataset, raw_dataset, tokenizer):
    """
    Verifies and corrects the 'token_position' for positive examples in a balanced dataset.

    For every example whose 'label' is 1, the expected position is recomputed as the
    index of the last token of the source example's 'pivot_context', tokenized
    without special tokens. Examples whose stored position differs are updated
    in place and reported on stdout. Examples whose 'source_raw_index' does not
    refer to a valid row of ``raw_dataset`` are reported and left unchanged.

    :param balanced_dataset: The balanced dataset (list of dicts) to check. Each
        positive example must provide 'label', 'source_raw_index' and 'token_position'.
        It is modified in place.
    :param raw_dataset: The original raw dataset (HuggingFace Dataset) used as a
        reference; rows must provide 'pivot_context'.
    :param tokenizer: The tokenizer used for the model; called as
        ``tokenizer(text, add_special_tokens=False)`` and indexed with 'input_ids'.
    :return: The same ``balanced_dataset`` object, with corrected positive positions.
    """
    raw_size = len(raw_dataset)

    for i, example in enumerate(tqdm(balanced_dataset, desc="Relabeling positive positions")):
        if example['label'] != 1:
            continue

        source_index = example['source_raw_index']

        # Reject negative indices as well: they would wrap around and silently
        # compare against an unrelated raw example.
        if not 0 <= source_index < raw_size:
            print(f"Warning: source_raw_index {source_index} is out of bounds for the raw dataset.")
            continue

        pivot_context = raw_dataset[source_index]['pivot_context']

        # The positive position is the index of the last token of the pivot context.
        # NOTE(review): special tokens are excluded here; confirm the downstream
        # model input is tokenized the same way, otherwise positions are shifted.
        pivot_context_tokens = tokenizer(pivot_context, add_special_tokens=False)['input_ids']
        correct_position = len(pivot_context_tokens) - 1

        if example['token_position'] != correct_position:
            print(f"Relabeling example {i}: Old position {example['token_position']}, New position {correct_position}")
            balanced_dataset[i]['token_position'] = correct_position

    return balanced_dataset

# Prerequisites: train_dataset_balanced, test_dataset_balanced, train_raw, test_raw
# and tokenizer must already exist in the notebook session.

# Training split: materialize the rows into a plain list so the relabeling
# works on that list rather than on the loaded dataset object.
train_dataset_to_relabel = list(train_dataset_balanced)

print("Verifying and relabeling training data...")
train_dataset_relabelled = relabel_positive_positions(
    train_dataset_to_relabel,
    train_raw,
    tokenizer,
)

# Testing split: same procedure.
test_dataset_to_relabel = list(test_dataset_balanced)

print("\nVerifying and relabeling testing data...")
test_dataset_relabelled = relabel_positive_positions(
    test_dataset_to_relabel,
    test_raw,
    tokenizer,
)

print("\nRelabeling process complete.")