In [1]:
from transformers import AutoTokenizer
# Load the JoeyLLM tokenizer by its hub identifier, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    "SouthernCrossAI/JoeyLLM_Tokenizer",
    use_fast=True,
)


In [2]:
# Sanity check: report the vocabulary size exposed by the loaded tokenizer.
print(tokenizer.vocab_size)


32000


In [None]:
from datasets import load_dataset
from torch.utils.data import IterableDataset, DataLoader
from tokenizers import Tokenizer
import torch
import os

In [None]:
# Report the Hugging Face cache-related environment variables as seen by this
# process; a value of None means the variable is not set.
for var in (
    "HF_HOME",
    "HF_DATASETS_CACHE",
    "TRANSFORMERS_CACHE",
    "HF_HUB_CACHE",
    "HF_DATASETS_HOME",  # flagged as deprecated by the original author
):
    print(f"{var} =", os.environ.get(var))


In [None]:
# ⚙️ Config
# -- DataLoader settings --
NUM_WORKERS = 8  # passed to DataLoader(num_workers=...)
BATCH_SIZE = 32  # passed to DataLoader(batch_size=...)
# -- Tokenization / chunking settings --
BUFFER_TEXT_SIZE = 1_000  # Number of samples to buffer before tokenizing (tune this)
CHUNK_SIZE = 512  # token ids per emitted input/label sequence

In [None]:
# 🌊 Load streaming dataset
# streaming=True requests an iterable dataset rather than a fully materialized one.
hf_dataset = load_dataset(
    path="HuggingFaceFW/fineweb",
    split="train",
    data_dir="sample/10BT",
    streaming=True,
)

In [None]:
# Left disabled: a streaming dataset does not support len(), so this call would fail.
# print(len(hf_dataset))

In [None]:
class BufferedStreamTokenChunkDataset(IterableDataset):
    """Turn a stream of text examples into fixed-length next-token training pairs.

    Texts are collected until ``buffer_text_size`` of them are pending, joined
    with single spaces and tokenized in one call (no special tokens are added,
    so nothing but the space separates consecutive documents). The token ids
    are appended to a rolling buffer from which windows are emitted. Every
    emitted item is a dict with:

    * ``"input_ids"``: ``chunk_size`` consecutive token ids (``torch.long``)
    * ``"labels"``: the same window shifted right by one token (``torch.long``)

    Consecutive windows advance by ``chunk_size`` tokens, so the last label of
    one item is the first input id of the next. Tokens left at the end of the
    stream that cannot fill a window plus its one look-ahead token are dropped.

    Args:
        hf_streaming_dataset: Iterable of examples, each indexable by ``"text"``.
        tokenizer: Callable taking a string plus the keyword arguments
            ``return_attention_mask``, ``return_token_type_ids`` and
            ``add_special_tokens``; must return a mapping whose ``"input_ids"``
            entry is a sequence of token ids.
        chunk_size: Number of token ids per emitted sequence; must be >= 1.
        buffer_text_size: Number of texts to accumulate before each tokenizer call.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 (the chunking loop could
            never consume the buffer and would not terminate).

    NOTE(review): with ``DataLoader(num_workers > 0)`` every worker runs
    ``__iter__`` over ``self.dataset``; whether the stream is split between
    workers depends on the wrapped dataset -- confirm for the dataset in use.
    """

    def __init__(self, hf_streaming_dataset, tokenizer, chunk_size, buffer_text_size=10000):
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")
        self.dataset = hf_streaming_dataset
        self.tokenizer = tokenizer
        self.chunk_size = chunk_size
        self.buffer_text_size = buffer_text_size

    def _tokenize(self, texts):
        """Join ``texts`` with single spaces and return the token ids of the result."""
        return self.tokenizer(
            " ".join(texts),
            return_attention_mask=False,
            return_token_type_ids=False,
            add_special_tokens=False,
        )["input_ids"]

    def _drain_chunks(self, token_buffer):
        """Yield every complete chunk in ``token_buffer``, then trim it in place.

        Walks the buffer with an offset instead of re-slicing the whole list
        after each chunk, so draining costs time linear in the buffer length.
        """
        step = self.chunk_size
        start = 0
        # One extra token is required so the final label has a look-ahead value.
        while len(token_buffer) - start >= step + 1:
            window = token_buffer[start:start + step + 1]
            yield {
                "input_ids": torch.tensor(window[:-1], dtype=torch.long),
                "labels": torch.tensor(window[1:], dtype=torch.long)
            }
            start += step
        # Keep only the unconsumed tail; it is prepended to the next refill.
        del token_buffer[:start]

    def __iter__(self):
        """Iterate the wrapped dataset and yield ``input_ids``/``labels`` dicts."""
        pending_texts = []
        token_buffer = []

        for example in self.dataset:
            pending_texts.append(example["text"])
            if len(pending_texts) >= self.buffer_text_size:
                token_buffer.extend(self._tokenize(pending_texts))
                pending_texts = []
                yield from self._drain_chunks(token_buffer)

        # Final flush: tokenize whatever texts never reached the buffer threshold.
        if pending_texts:
            token_buffer.extend(self._tokenize(pending_texts))

        yield from self._drain_chunks(token_buffer)


In [None]:
# Wrap the streaming dataset so that iterating it yields fixed-length token chunks.
dataset = BufferedStreamTokenChunkDataset(
    hf_dataset,
    tokenizer,
    CHUNK_SIZE,
    buffer_text_size=BUFFER_TEXT_SIZE,
)


In [None]:

# Batch the chunk stream; batch size and worker count come from the config constants.
dataloader = DataLoader(
    dataset,
    BATCH_SIZE,
    pin_memory=True,
    num_workers=NUM_WORKERS,
)


In [None]:
# Pull a single batch from a fresh iterator over the DataLoader for inspection.
one_batch = next(iter(dataloader))


In [None]:

# Container type produced by batching (presumably a dict, since the dataset yields dicts -- confirm).
print(type(one_batch))

In [None]:
# Keys available in the batch; the dataset yields "input_ids" and "labels".
print(one_batch.keys())


In [None]:
# Shape of the batched inputs -- expected (BATCH_SIZE, CHUNK_SIZE); confirm against the output.
print(one_batch['input_ids'].size())

In [None]:
# Token ids of the first sequence in the batch.
print(one_batch['input_ids'][0])

In [None]:
# Length of the first sequence.
len(one_batch['input_ids'][0])

In [None]:
# Convert the first sequence to a plain Python list for decoding.
token_ids = one_batch['input_ids'][0].tolist()

In [None]:
# Map the token ids back to text with the same tokenizer used for encoding.
decoded_text = tokenizer.decode(token_ids)

In [None]:
print(decoded_text)

In [None]:
# Repeat of the earlier length check on the first sequence.
len(one_batch['input_ids'][0])

In [None]:
# Length of the list form; should match the length of the first sequence above.
len(token_ids)