In [9]:
import json
import torch
from transformers import PreTrainedTokenizerFast

# Settings for this cell: where the tokenizer and the pre-tokenized samples
# live, plus the length limit handed to prepare_data_for_training as max_len.
cfg = dict(
    tokenizer_path="/workspace/slice-monorepo/sub_validations/cl_scaling/20B_tokenizer.json",
    sequential_data_path="/workspace/slice-monorepo/sub_validations/memory_training/tokenized_output_10k.json",
    max_tokens=4098,  # every sample is padded up to this many tokens
)

# Marker / control tokens, keyed by role.
# NOTE(review): the two conversation markers must match the strings used when
# the dataset was tokenized -- confirm against the data-preparation step.
special_tokens = dict(
    current_conv="[CURRENT_CONV]",    # marks the start of the current conversation
    previous_conv="[PREVIOUS_CONV]",  # marks the start of the previous conversation
    eos_token="<|endoftext|>",        # end-of-sequence token appended to each part
    pad_token="<|pad|>",              # registered only when the tokenizer has no pad token
)

def prepare_data_for_training(input_ids, tokenizer, max_len):
    """
    Splits a tokenized sample into its AR and Seq2Seq parts, pads it, and builds the masks.

    The sample is expected to contain exactly one [CURRENT_CONV] marker and exactly one
    [PREVIOUS_CONV] marker. The tokens between the two markers form the AR part; the tokens
    after [PREVIOUS_CONV] form the Seq2Seq part. The markers themselves are dropped, an EOS
    token is appended to each part, and the two parts are concatenated (AR first) and
    right-padded to max_len.

    Args:
    - input_ids (torch.Tensor): The input IDs tensor (1D).
    - tokenizer (PreTrainedTokenizerFast): The tokenizer used to resolve the special-token
      IDs and the pad token ID.
    - max_len (int): The length the combined sequence is padded to.

    Returns:
    - padded_input (torch.Tensor): 1D tensor of length max_len holding AR part, Seq2Seq part,
      then padding.
    - ar_mask (torch.Tensor): Float mask of length max_len; 1 over the AR part, 0 elsewhere.
    - seq2seq_mask (torch.Tensor): Float mask of length max_len; 1 over the Seq2Seq part,
      0 elsewhere (including padding).
    - ar_size (int): Number of tokens in the AR part, including its EOS.
    - seq2seq_size (int): Number of tokens in the Seq2Seq part, including its EOS.

    Raises:
    - ValueError: If a special token or the pad token cannot be resolved by the tokenizer,
      if either marker is missing from or duplicated in input_ids, or if the combined
      sequence is longer than max_len.
    """

    # Convert special tokens to their corresponding IDs
    current_conv_id = tokenizer.convert_tokens_to_ids(special_tokens["current_conv"])
    previous_conv_id = tokenizer.convert_tokens_to_ids(special_tokens["previous_conv"])
    eos_token_id = tokenizer.convert_tokens_to_ids(special_tokens["eos_token"])

    # Ensure the special tokens are found
    if current_conv_id is None or previous_conv_id is None or eos_token_id is None:
        raise ValueError("Special tokens are not found in the tokenizer vocabulary.")

    # torch.full cannot take None as a fill value, so fail early with a clear message
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        raise ValueError("Tokenizer has no pad token; set one before preparing data.")

    # Find the positions of special tokens within the tokenized IDs.
    # The element counts are checked explicitly: calling .item() on an empty or
    # multi-element tensor does not raise IndexError, so a try/except IndexError
    # would never report a missing marker.
    current_positions = (input_ids == current_conv_id).nonzero(as_tuple=True)[0]
    previous_positions = (input_ids == previous_conv_id).nonzero(as_tuple=True)[0]
    if current_positions.numel() == 0 or previous_positions.numel() == 0:
        raise ValueError("Special tokens not found in input sequence.")
    if current_positions.numel() > 1 or previous_positions.numel() > 1:
        raise ValueError("Special tokens appear more than once in input sequence.")
    current_conv_start = current_positions.item()
    previous_conv_start = previous_positions.item()

    # Split the sequence into AR (current_conv) and Seq2Seq (previous_conv) parts
    current_conv = input_ids[current_conv_start + 1: previous_conv_start]
    previous_conv = input_ids[previous_conv_start + 1:]

    # Append EOS tokens to each part (created on the same device as input_ids so
    # torch.cat also works for non-CPU inputs)
    eos = torch.tensor([eos_token_id], dtype=torch.long, device=input_ids.device)
    current_conv = torch.cat([current_conv, eos])
    previous_conv = torch.cat([previous_conv, eos])

    # Calculate the actual lengths
    ar_size = current_conv.size(0)
    seq2seq_size = previous_conv.size(0)
    total_size = ar_size + seq2seq_size

    # A negative padding length would otherwise surface as an opaque torch error
    if total_size > max_len:
        raise ValueError(
            f"Combined sequence length {total_size} exceeds max_len {max_len}."
        )

    # Combine the sequences for full padding and create padded input
    combined = torch.cat([current_conv, previous_conv])
    padding = torch.full(
        (max_len - total_size,), pad_token_id, dtype=torch.long, device=input_ids.device
    )
    padded_input = torch.cat([combined, padding])

    # Create the attention masks for training
    ar_mask = torch.ones_like(padded_input, dtype=torch.float)
    ar_mask[ar_size:] = 0  # Zero out attention to the Seq2Seq and padding tokens for AR part

    seq2seq_mask = torch.ones_like(padded_input, dtype=torch.float)
    seq2seq_mask[:ar_size] = 0  # Zero out attention to the AR part for Seq2Seq
    seq2seq_mask[total_size:] = 0  # Padding must not be attended to either

    return padded_input, ar_mask, seq2seq_mask, ar_size, seq2seq_size

def main():
    """
    Smoke-tests prepare_data_for_training on the first five dataset samples.

    Loads the tokenizer from cfg["tokenizer_path"], registers the two conversation
    markers (and a pad token when the tokenizer has none), reads the tokenized
    samples from cfg["sequential_data_path"], and prints the sizes, the first ten
    entries of the padded IDs and masks, and the tensor shapes for each sample.
    A ValueError raised while preparing a sample is printed and the loop moves on.
    """
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=cfg["tokenizer_path"],
        clean_up_tokenization_spaces=False,
    )

    # Register the conversation markers as additional special tokens.
    marker_tokens = [special_tokens["current_conv"], special_tokens["previous_conv"]]
    tokenizer.add_special_tokens({"additional_special_tokens": marker_tokens})

    # Only introduce our own pad token when the tokenizer does not define one.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": special_tokens["pad_token"]})

    with open(cfg["sequential_data_path"], 'r') as handle:
        data = json.load(handle)

    separator = "-" * 50

    # Only the first 5 entries are processed; this is a quick visual check.
    for sample_no, token_data in enumerate(data[:5], start=1):
        input_ids = torch.tensor(token_data, dtype=torch.long)

        try:
            padded_input, ar_mask, seq2seq_mask, ar_size, seq2seq_size = prepare_data_for_training(
                input_ids, tokenizer, cfg["max_tokens"]
            )
        except ValueError as e:
            print(f"Error in sample {sample_no}: {e}")
            print(separator)
        else:
            print(f"Sample {sample_no}:")
            print(f"AR (Current Conv) Size: {ar_size}, Seq2Seq (Previous Conv) Size: {seq2seq_size}")
            print(f"Padded Input IDs (first 10 tokens): {padded_input[:10]}")
            print(f"AR Mask (first 10 tokens): {ar_mask[:10]}")
            print(f"Seq2Seq Mask (first 10 tokens): {seq2seq_mask[:10]}")
            print(f"Full Padded Input IDs Shape: {padded_input.shape}")
            print(f"AR Mask Shape: {ar_mask.shape}, Seq2Seq Mask Shape: {seq2seq_mask.shape}")
            print(separator)


if __name__ == "__main__":
    main()


Sample 1:
AR (Current Conv) Size: 748, Seq2Seq (Previous Conv) Size: 5
Padded Input IDs (first 10 tokens): tensor([50276,   783,  2926,    15,   187,   187,  4041,   273,   253,   277])
AR Mask (first 10 tokens): tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
Seq2Seq Mask (first 10 tokens): tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Full Padded Input IDs Shape: torch.Size([4098])
AR Mask Shape: torch.Size([4098]), Seq2Seq Mask Shape: torch.Size([4098])
--------------------------------------------------
Sample 2:
AR (Current Conv) Size: 1033, Seq2Seq (Previous Conv) Size: 747
Padded Input IDs (first 10 tokens): tensor([50276,  2606,  1309,   534,   271,  8183,   310,   187, 19091,   281])
AR Mask (first 10 tokens): tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
Seq2Seq Mask (first 10 tokens): tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Full Padded Input IDs Shape: torch.Size([4098])
AR Mask Shape: torch.Size([4098]), Seq2Seq Mask Shape: torch.Size([4098])
----------------

In [3]:
import json
from transformers import PreTrainedTokenizerFast

# Settings for the dataset inspection cell.
cfg = dict(
    tokenizer_path="/workspace/slice-monorepo/sub_validations/cl_scaling/20B_tokenizer.json",
    sequential_data_path="/workspace/slice-monorepo/sub_validations/memory_training/tokenized_output_10k.json",
    # Tokens registered with the tokenizer and then searched for in the samples.
    special_tokens=dict(
        current_conv="[CURRENT_CONV]",
        previous_conv="[PREVIOUS_CONV]",
        eos_token="[EOS]",
        pad_token="[PAD]",
    ),
    max_samples_to_check=1000,  # upper bound on how many samples are scanned
)

def main():
    """
    Reports where the special tokens occur in the first samples of the dataset.

    Loads the tokenizer from cfg["tokenizer_path"], registers the tokens listed in
    cfg["special_tokens"] as additional special tokens, and prints the IDs assigned
    to the [CURRENT_CONV], [PREVIOUS_CONV] and [EOS] tokens. It then scans up to
    cfg["max_samples_to_check"] samples from cfg["sequential_data_path"], printing
    the first and last 20 token IDs of each sample and the index of the first
    occurrence of each of the three tokens, followed by a summary of how many
    samples contained each token.

    All "x/y" ratios use the number of samples actually scanned, so they stay
    correct when the dataset holds fewer than cfg["max_samples_to_check"] samples.
    """
    # Load the tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=cfg["tokenizer_path"], clean_up_tokenization_spaces=False)

    # Add special tokens to the tokenizer if they aren't already there
    special_tokens_dict = {
        "additional_special_tokens": [
            cfg["special_tokens"]["current_conv"],
            cfg["special_tokens"]["previous_conv"],
            cfg["special_tokens"]["eos_token"],
            cfg["special_tokens"]["pad_token"]
        ]
    }
    tokenizer.add_special_tokens(special_tokens_dict)

    # Retrieve the IDs of the special tokens
    current_conv_id = tokenizer.convert_tokens_to_ids(cfg["special_tokens"]["current_conv"])
    previous_conv_id = tokenizer.convert_tokens_to_ids(cfg["special_tokens"]["previous_conv"])
    eos_token_id = tokenizer.convert_tokens_to_ids(cfg["special_tokens"]["eos_token"])

    print(f"Special Tokens:")
    print(f"  [CURRENT_CONV]: {cfg['special_tokens']['current_conv']} (ID: {current_conv_id})")
    print(f"  [PREVIOUS_CONV]: {cfg['special_tokens']['previous_conv']} (ID: {previous_conv_id})")
    print(f"  [EOS]: {cfg['special_tokens']['eos_token']} (ID: {eos_token_id})")

    # Load the dataset
    with open(cfg["sequential_data_path"], "r") as f:
        dataset = json.load(f)

    print(f"Loaded dataset with {len(dataset)} samples.")

    # The slice may be shorter than the configured limit; report against its real size
    samples = dataset[:cfg["max_samples_to_check"]]
    num_checked = len(samples)

    # (label, token ID) pairs in the order they are reported
    markers = [
        ("[CURRENT_CONV]", current_conv_id),
        ("[PREVIOUS_CONV]", previous_conv_id),
        ("[EOS]", eos_token_id),
    ]
    found_counts = {label: 0 for label, _ in markers}

    # Analyze the dataset
    for idx, token_ids in enumerate(samples):
        # Print out the tokenized sequence and positions of special tokens
        print(f"Sample {idx + 1}/{num_checked}:")
        print(f"  Token IDs (first 20): {token_ids[:20]}")
        print(f"  Token IDs (last 20): {token_ids[-20:]}")

        # One .index() scan per token both detects it and locates its first occurrence
        for label, token_id in markers:
            try:
                position = token_ids.index(token_id)
            except ValueError:
                print(f"  {label} not found in this sample.")
            else:
                found_counts[label] += 1
                print(f"  {label} found at index: {position}")

        print()

    # Summarize findings
    print(f"Summary of Findings:")
    for label, _ in markers:
        print(f"  {label} found in {found_counts[label]}/{num_checked} samples.")


if __name__ == "__main__":
    main()


Special Tokens:
  [CURRENT_CONV]: [CURRENT_CONV] (ID: 50277)
  [PREVIOUS_CONV]: [PREVIOUS_CONV] (ID: 50278)
  [EOS]: [EOS] (ID: 50279)
Loaded dataset with 10000 samples.
Sample 1/1000:
  Token IDs (first 20): [50277, 50276, 783, 2926, 15, 187, 187, 4041, 273, 253, 277, 2764, 13, 581, 14, 21773, 572, 2824, 326, 253]
  Token IDs (last 20): [670, 327, 667, 1677, 1388, 15, 2064, 457, 84, 816, 581, 625, 253, 8256, 209, 50278, 1621, 2045, 7827, 8256]
  [CURRENT_CONV] found at index: 0
  [PREVIOUS_CONV] found at index: 748
  [EOS] not found in this sample.

Sample 2/1000:
  Token IDs (first 20): [50277, 50276, 2606, 1309, 534, 271, 8183, 310, 187, 19091, 281, 253, 1453, 273, 271, 11169, 13, 285, 3797, 512]
  Token IDs (last 20): [13445, 3762, 597, 403, 5366, 8036, 670, 327, 667, 1677, 1388, 15, 2064, 457, 84, 816, 581, 625, 253, 8256]
  [CURRENT_CONV] found at index: 0
  [PREVIOUS_CONV] found at index: 1033
  [EOS] not found in this sample.

Sample 3/1000:
  Token IDs (first 20): [50277, 5027