# Data Testing

## Data Preprocessing: parquet --> Hugging Face Dataset

In [1]:
# test_process.py

import time
import torch
import pandas as pd
from datasets import Dataset as HFDataset
from transformers import AutoTokenizer

# ------------------------------
# Config
# ------------------------------
# Input parquet file; only its "text" column is read by the pipeline.
INPUT_PATH = "traindata/001_00000.parquet"
# Destination handed to Dataset.save_to_disk() and later to load_from_disk().
# NOTE(review): despite the ".parquet" suffix this is not written with a
# parquet writer -- presumably it ends up as a dataset directory; confirm.
OUTPUT_PATH = "008_00000.parquet"

# Tokenizer identifier passed to AutoTokenizer.from_pretrained().
TOKENIZER_PATH = "SouthernCrossAI/JoeyLLM_Tokenizer"
# Number of tokens in every input/target sequence.
CHUNK_SIZE = 512
# Number of worker processes given to Dataset.map().
NUM_PROC = 50

# ------------------------------
# Initialize tokenizer
# ------------------------------
tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER_PATH,
    use_fast=True,
)

# ------------------------------
# Tokenization function
# ------------------------------
def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of a batch with the module-level tokenizer.

    Args:
        examples: Mapping with a ``"text"`` key, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever the tokenizer call returns for those texts. Special tokens
        are not added (``add_special_tokens=False``).
    """
    texts = examples["text"]
    encoded = tokenizer(texts, add_special_tokens=False)
    return encoded

# ------------------------------
# Main processing pipeline
# ------------------------------
# Steps: read the "text" column -> tokenize -> concatenate every token into
# one stream -> cut the stream into CHUNK_SIZE-long input sequences with
# targets shifted one token to the right -> save as a Hugging Face dataset.
#
# NOTE: documents are concatenated back to back. Because tokenization runs
# with add_special_tokens=False, nothing marks where one document ends and
# the next begins, so a chunk may span several documents.

# Load text column as pandas DataFrame
df = pd.read_parquet(INPUT_PATH, columns=["text"])

# Convert to Hugging Face Dataset
dataset = HFDataset.from_pandas(df)

# Tokenize dataset with multiprocessing
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=NUM_PROC,
)

# Convert tokenized dataset to pandas DataFrame and extract input_ids column
df2 = tokenized_dataset.to_pandas()
input_ids_series = df2["input_ids"]

# Flatten input_ids into a single list of tokens
all_tokens = [token for sublist in input_ids_series for token in sublist]

# Start offset of every chunk, computed once and shared by inputs and targets.
# Stopping before len(all_tokens) - CHUNK_SIZE guarantees that the target
# slice [i + 1, i + CHUNK_SIZE + 1) stays inside the stream, so every chunk
# is exactly CHUNK_SIZE tokens long. Any leftover tokens at the end of the
# stream that cannot form a full input/target pair are dropped.
chunk_starts = range(0, len(all_tokens) - CHUNK_SIZE, CHUNK_SIZE)

# Fail loudly instead of silently saving an empty dataset.
if not chunk_starts:
    raise ValueError(
        f"Need more than {CHUNK_SIZE} tokens to build one chunk, "
        f"got {len(all_tokens)} from {INPUT_PATH}"
    )

# Create input_ids and target_ids chunks (target = input shifted by one token)
input_ids_chunks = [all_tokens[i:i + CHUNK_SIZE] for i in chunk_starts]
target_ids_chunks = [all_tokens[i + 1:i + CHUNK_SIZE + 1] for i in chunk_starts]

# Build final DataFrame. Both lists come from the same chunk_starts, so they
# always have the same number of rows; no post-hoc length fix-up is needed.
df_final = pd.DataFrame({
    "input_ids": input_ids_chunks,
    "target_ids": target_ids_chunks,
})

# Convert to Hugging Face Dataset and save
hf_dataset = HFDataset.from_pandas(df_final, preserve_index=False)
hf_dataset.save_to_disk(OUTPUT_PATH)

print(f"✅ Dataset saved to {OUTPUT_PATH}")


Map (num_proc=50):   0%|          | 0/1046615 [00:00<?, ? examples/s]

Saving the dataset (0/13 shards):   0%|          | 0/1551706 [00:00<?, ? examples/s]

✅ Dataset saved to 008_00000.parquet


## Loading the Hugging Face Dataset

In [2]:
from datasets import load_from_disk
from torch.utils.data import DataLoader

# ------------------------------
# Load saved Hugging Face dataset
# ------------------------------
# Same location the preprocessing cell saved to.
dataset_path = "008_00000.parquet"
hf_dataset = load_from_disk(dataset_path)

# ------------------------------
# Convert to PyTorch tensors
# ------------------------------
# Only these two columns are exposed, formatted as torch tensors.
tensor_columns = ["input_ids", "target_ids"]
hf_dataset.set_format(type="torch", columns=tensor_columns)

print('dataset loaded')

# Check that the dataset exposes the two methods a map-style
# torch.utils.data.DataLoader relies on.
has_len_method, has_getitem_method = (
    hasattr(hf_dataset, dunder) for dunder in ('__len__', '__getitem__')
)

print(f"Does the dataset have a '__len__' method? {has_len_method}")
print(f"Does the dataset have a '__getitem__' method? {has_getitem_method}")


# Indexing with [] goes through __getitem__; uncomment to inspect one row:
# print(f"First item in the dataset: {hf_dataset[0]}")

dataset loaded
Does the dataset have a '__len__' method? True
Does the dataset have a '__getitem__' method? True


## Testing Dataloader

In [3]:
batch_size = 2

# Shuffled loader over the dataset loaded in the previous cell.
dataloader = DataLoader(
    hf_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# ------------------------------
# Test first batch
# ------------------------------
# Pull exactly one batch to confirm the loader yields data.
batch_iterator = iter(dataloader)
first_batch = next(batch_iterator)

print("✅ First batch loaded:")
print(first_batch)

# Each entry should come back with shape (batch_size, sequence length).
input_ids_shape = first_batch["input_ids"].shape
target_ids_shape = first_batch["target_ids"].shape
print("input_ids shape:", input_ids_shape)
print("target_ids shape:", target_ids_shape)

✅ First batch loaded:
{'input_ids': tensor([[ 5902,  2260,  6788,  ...,   586,  6073,   640],
        [28718, 16181,   349,  ...,   264,   320,  4635]]), 'target_ids': tensor([[ 2260,  6788,  2145,  ...,  6073,   640, 27136],
        [16181,   349,  2651,  ...,   320,  4635,    13]])}
input_ids shape: torch.Size([2, 512])
target_ids shape: torch.Size([2, 512])


## Getting lengths

In [4]:
# len() dispatches to the dataset's __len__ method
num_examples = len(hf_dataset)
print(f"Length of the dataset: {num_examples}")

# Length // batch size (16) // gradient accumulation (4).
# Left as the cell's final expression so the notebook displays the value.
num_examples // 16 // 4

Length of the dataset: 1551706


24245

In [4]:
import os

In [5]:
# Show the directory the notebook kernel is running in.
current_dir = os.getcwd()
print(current_dir)

/home/remote/u1138167/JoeyLLM/data


In [None]:
# Show the directory the notebook kernel is running in.
# (The original `os.getc` is not an attribute of the os module and would
# raise AttributeError; `os.getcwd()` matches the earlier cell.)
print(os.getcwd())