In [6]:
# test_process.py

import time
import torch
import pandas as pd
from datasets import Dataset as HFDataset
from transformers import AutoTokenizer

# ------------------------------
# Config
# ------------------------------
# Identifier handed to AutoTokenizer.from_pretrained below.
TOKENIZER_PATH = "SouthernCrossAI/JoeyLLM_Tokenizer"

# Source parquet shard (only its "text" column is read) and the destination
# passed to Dataset.save_to_disk for the chunked token dataset.
INPUT_PATH = "../../JoeyData/10BT/008_00000.parquet"
OUTPUT_PATH = "../../JoeyData/10BT/hugginface/008_00000.parquet"

# Processing knobs.
CHUNK_SIZE = 512  # number of token ids in every input/target row
NUM_PROC = 45     # worker processes used by Dataset.map while tokenizing

# ------------------------------
# Initialize tokenizer
# ------------------------------
# Built once at module level; tokenize_function reads this global.
tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER_PATH,
    use_fast=True,
)

# ------------------------------
# Tokenization function
# ------------------------------
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch with the module-level tokenizer.

    Args:
        examples: Mapping with a ``"text"`` key; its value is passed to the
            tokenizer unchanged.

    Returns:
        The tokenizer's output for that text, produced with
        ``add_special_tokens=False``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, add_special_tokens=False)
    return encoded

# ------------------------------
# Main processing pipeline
# ------------------------------

# Load only the raw text column of the shard; nothing else is used below.
df = pd.read_parquet(INPUT_PATH, columns=["text"])

# Wrap the frame in a Hugging Face Dataset so tokenization can be parallelised.
dataset = HFDataset.from_pandas(df)

# Tokenize in batches across NUM_PROC worker processes. The raw text is dropped
# from the result: only the token ids are needed from here on, and keeping the
# text would copy every document into pandas a second time.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=NUM_PROC,
    remove_columns=["text"],
)

# Bring the per-document token id lists back into pandas.
df2 = tokenized_dataset.to_pandas()
input_ids_series = df2['input_ids']

# Concatenate every document's ids into one flat token stream.
# NOTE(review): tokenize_function uses add_special_tokens=False, so document
# boundaries are not marked in this stream - confirm that this is intended.
all_tokens = [token for sublist in input_ids_series for token in sublist]

# A chunk starting at i needs tokens i .. i + CHUNK_SIZE (inclusive), because
# the target row is the input row shifted right by one token. Stopping the
# range at len(all_tokens) - CHUNK_SIZE guarantees that last token exists, so
# every row is exactly CHUNK_SIZE long and trailing leftovers are dropped.
chunk_starts = range(0, len(all_tokens) - CHUNK_SIZE, CHUNK_SIZE)
if len(chunk_starts) == 0:
    # Without this guard an empty dataset would be written and reported as a
    # success.
    raise ValueError(
        f"Need more than {CHUNK_SIZE} tokens to build one chunk, "
        f"got {len(all_tokens)} from {INPUT_PATH}"
    )

# Both columns are built from the same start offsets so they always line up.
input_ids_chunks = [all_tokens[i:i + CHUNK_SIZE] for i in chunk_starts]
target_ids_chunks = [all_tokens[i + 1:i + CHUNK_SIZE + 1] for i in chunk_starts]

# Sanity checks on the only row that could be short (the last one).
assert len(input_ids_chunks) == len(target_ids_chunks)
assert len(input_ids_chunks[-1]) == CHUNK_SIZE
assert len(target_ids_chunks[-1]) == CHUNK_SIZE

# Build final DataFrame: one row per (input, next-token target) pair.
df_final = pd.DataFrame({
    'input_ids': input_ids_chunks,
    'target_ids': target_ids_chunks
})

# Convert to Hugging Face Dataset and save.
# save_to_disk writes a dataset directory, so OUTPUT_PATH ends up being a
# directory even though its name ends in ".parquet".
hf_dataset = HFDataset.from_pandas(df_final, preserve_index=False)
hf_dataset.save_to_disk(OUTPUT_PATH)

print(f"✅ Dataset saved to {OUTPUT_PATH}")


Map (num_proc=45):   0%|          | 0/1034640 [00:00<?, ? examples/s]

Saving the dataset (0/13 shards):   0%|          | 0/1553909 [00:00<?, ? examples/s]

✅ Dataset saved to ../../JoeyData/10BT/hugginface/008_00000.parquet


# Test: reload the saved dataset and inspect one DataLoader batch

In [7]:
from datasets import load_from_disk
from torch.utils.data import DataLoader

# ------------------------------
# Reload the dataset written by the processing cell
# ------------------------------
dataset_path = "../../JoeyData/10BT/hugginface/008_00000.parquet"
hf_dataset = load_from_disk(dataset_path)

# ------------------------------
# Have both columns returned as PyTorch tensors
# ------------------------------
tensor_columns = ["input_ids", "target_ids"]
hf_dataset.set_format(type="torch", columns=tensor_columns)

# ------------------------------
# Wrap the dataset in a shuffling DataLoader
# ------------------------------
batch_size = 2

dataloader = DataLoader(
    hf_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# ------------------------------
# Fetch one batch and show its contents and shapes
# ------------------------------
first_batch = next(iter(dataloader))

print("✅ First batch loaded:")
print(first_batch)
print("input_ids shape:", first_batch["input_ids"].shape)
print("target_ids shape:", first_batch["target_ids"].shape)


✅ First batch loaded:
{'input_ids': tensor([[15772,   302,  6601,  ...,   547,   706,  5380],
        [ 1287, 13500,   304,  ...,   868, 28723,   315]]), 'target_ids': tensor([[  302,  6601, 13295,  ...,   706,  5380,   272],
        [13500,   304,  4251,  ..., 28723,   315,   863]])}
input_ids shape: torch.Size([2, 512])
target_ids shape: torch.Size([2, 512])


In [8]:
# NOTE(review): looks like a leftover scratch cell - consider deleting it.
print("test")

test
