In [None]:
# test_process.py

import time
import torch
import pandas as pd
from datasets import Dataset as HFDataset
from transformers import AutoTokenizer

# ------------------------------
# Config
# ------------------------------
# Identifier handed to AutoTokenizer.from_pretrained below.
TOKENIZER_PATH = "SouthernCrossAI/JoeyLLM_Tokenizer"
# Parquet shard whose "text" column is read by the processing pipeline.
INPUT_PATH = "../../JoeyData/10BT/008_00000.parquet"
# Destination passed to `save_to_disk` at the end of the pipeline.
# NOTE(review): the directory is spelled "hugginface" and the name ends in
# ".parquet" even though it is written with `save_to_disk`, not `to_parquet`.
# The loader cell further down uses the same spelling, so change both together.
OUTPUT_PATH = "../../JoeyData/10BT/hugginface/008_00000.parquet"
# Number of tokens in every input/target chunk.
CHUNK_SIZE = 512
# Worker count passed as `num_proc` to `dataset.map`.
# NOTE(review): hard-coded to 45; confirm this suits the machine running the notebook.
NUM_PROC = 45

# ------------------------------
# Initialize tokenizer
# ------------------------------
# Module-level tokenizer used by `tokenize_function`; `use_fast=True` requests
# the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=True)

# ------------------------------
# Tokenization function
# ------------------------------
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook-level tokenizer.

    Args:
        examples: Mapping with a ``"text"`` key. In this notebook it is called
            through ``dataset.map(..., batched=True)``, so the value is
            presumably a batch of strings — TODO confirm.

    Returns:
        The object returned by ``tokenizer(...)``; no special tokens are added.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, add_special_tokens=False)
    return encoded

# ------------------------------
# Main processing pipeline
# ------------------------------

# Read only the "text" column of the input shard into a pandas DataFrame.
df = pd.read_parquet(INPUT_PATH, columns=["text"])

# Wrap the DataFrame as a Hugging Face Dataset so `map` can tokenize it.
dataset = HFDataset.from_pandas(df)

# Tokenize in batches across NUM_PROC worker processes.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=NUM_PROC
)

# Back to pandas so the per-document token lists can be iterated directly.
df2 = tokenized_dataset.to_pandas()
input_ids_series = df2['input_ids']

# Concatenate every document's token ids into one flat stream.
# NOTE(review): tokenization uses add_special_tokens=False, so consecutive
# documents are joined with no separator token — confirm this is intended.
all_tokens = [token for sublist in input_ids_series for token in sublist]

# A single (input, target) pair needs CHUNK_SIZE + 1 tokens because the target
# is the input shifted by one position. Fail loudly instead of silently saving
# an empty dataset.
if len(all_tokens) <= CHUNK_SIZE:
    raise ValueError(
        f"Need more than {CHUNK_SIZE} tokens to build one chunk, "
        f"got {len(all_tokens)}"
    )

# Start offsets are computed once and shared by both columns so they cannot
# drift apart. Stopping at len - CHUNK_SIZE guarantees that every target slice
# [i + 1, i + CHUNK_SIZE + 1) is full length; any trailing tokens that do not
# fill a whole chunk are dropped.
chunk_starts = range(0, len(all_tokens) - CHUNK_SIZE, CHUNK_SIZE)
input_ids_chunks = [all_tokens[i:i + CHUNK_SIZE] for i in chunk_starts]
target_ids_chunks = [all_tokens[i + 1:i + CHUNK_SIZE + 1] for i in chunk_starts]

# Invariants of the chunking above. (The previous post-hoc comparison of the
# two DataFrame column lengths could never be true and has been removed.)
assert len(input_ids_chunks) == len(target_ids_chunks)
assert len(input_ids_chunks[-1]) == CHUNK_SIZE
assert len(target_ids_chunks[-1]) == CHUNK_SIZE

# Build final DataFrame: one row per chunk.
df_final = pd.DataFrame({
    'input_ids': input_ids_chunks,
    'target_ids': target_ids_chunks
})

# Convert to Hugging Face Dataset (without the pandas index) and save.
hf_dataset = HFDataset.from_pandas(df_final, preserve_index=False)
hf_dataset.save_to_disk(OUTPUT_PATH)

print(f"✅ Dataset saved to {OUTPUT_PATH}")


# Test

In [2]:
from datasets import load_from_disk
from torch.utils.data import DataLoader

# ------------------------------
# Load saved Hugging Face dataset
# ------------------------------
# NOTE(review): this is an absolute path and points at shard 011, whereas the
# processing cell writes shard 008 to a relative path — confirm which shard
# this check is meant to cover.
dataset_path = "/home/projects/JoeyData/10BT/hugginface/011_00000.parquet"
hf_dataset = load_from_disk(dataset_path)

# ------------------------------
# Convert to PyTorch tensors
# ------------------------------
# Make indexing return torch tensors for the two token columns.
hf_dataset.set_format(type="torch", columns=["input_ids", "target_ids"])

# ------------------------------
# Create DataLoader
# ------------------------------
batch_size = 2

# shuffle=True means the batch inspected below differs from run to run.
dataloader = DataLoader(hf_dataset, batch_size=batch_size, shuffle=True)

# ------------------------------
# Test first batch
# ------------------------------
first_batch = next(iter(dataloader))

print("✅ First batch loaded:")
print(first_batch)
print("input_ids shape:", first_batch["input_ids"].shape)
print("target_ids shape:", first_batch["target_ids"].shape)

# Verify the next-token layout instead of only eyeballing the printout:
# each target row must be its input row shifted left by one token.
# Tensor methods are used so this does not depend on the `torch` name
# being imported in the current kernel session.
assert bool(
    (first_batch["input_ids"][:, 1:] == first_batch["target_ids"][:, :-1]).all()
), "target_ids must equal input_ids shifted left by one token"


✅ First batch loaded:
{'input_ids': tensor([[ 7066,  2102,   272,  ...,    13, 28765,   643],
        [  456, 28725,  3768,  ...,  4916, 28723,    13]]), 'target_ids': tensor([[ 2102,   272,  7126,  ..., 28765,   643, 28706],
        [28725,  3768,  6484,  ..., 28723,    13,  3840]])}
input_ids shape: torch.Size([2, 512])
target_ids shape: torch.Size([2, 512])


In [6]:
# Number of complete groups of 16 * 4 = 64 rows in the dataset.
# NOTE(review): 16 and 4 look like a batch size and a device or accumulation
# count — confirm and give them names.
len(hf_dataset) // (16 * 4)

24235

In [3]:
from datasets import Dataset


# Check that the loaded dataset exposes the two dunder methods, so it can be
# measured with len() and indexed with [].
has_len_method, has_getitem_method = (
    hasattr(hf_dataset, dunder) for dunder in ('__len__', '__getitem__')
)

print(f"Does the dataset have a '__len__' method? {has_len_method}")
print(f"Does the dataset have a '__getitem__' method? {has_getitem_method}")

print("-" * 20)

# Exercise both methods: len() dispatches to __len__ ...
print(f"Length of the dataset: {len(hf_dataset)}")

# ... and subscripting dispatches to __getitem__.
print(f"First item in the dataset: {hf_dataset[0]}")

Does the dataset have a '__len__' method? True
Does the dataset have a '__getitem__' method? True
--------------------
Length of the dataset: 1551063
First item in the dataset: {'input_ids': tensor([ 5924,  3690,  1252,  1126,  8249,  1435,  1888, 28707, 10040,   904,
          863,   459, 14092,   297,   272,   907,  1819,   302,   272,  1918,
        28809, 28713, 21522,   628,   771,  8508, 28725,   390,  3349, 28725,
         1312,   400, 10352,   298,  9443,   477,   264,  7098,  1942,   369,
         7054,   516, 28705, 28750, 28734, 28740, 28783,  3302, 28723,    13,
         7170,   595, 14407, 15296,  1620,   773,   438,   272, 17868,  7162,
        28809, 28713, 13251,  1432,  2102,   369, 10040,   904,  4048, 28809,
        28707,  1388,   744,   739,   272,   907,  6896, 28725,   690,   349,
        21522,   628, 28725,  3125,   356,  3999, 28705, 28740, 28782, 28723,
        10040,   904,   349,  1309,  9443,   288,   477,  1698, 13301,   298,
         1316,   272,  1721, 