In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, DatasetDict
import random

In [None]:
def clean_dataset(validation_size=2**14, seed=0, max_length=256):
    """Build a cleaned, deduplicated, tokenized TinyStories train/validation split.

    The upstream "validation" and "train" splits are merged, every story is
    reduced to its ASCII characters, exact duplicates are dropped, and the
    remainder is shuffled and re-split before being tokenized.

    Args:
        validation_size: Number of shuffled stories held out as the new
            validation split; all remaining stories form the training split.
        seed: Seed for the shuffle. With a fixed seed the same input data
            always yields the same split; pass None to get a different
            shuffle on every call.
        max_length: Maximum number of tokens kept per story; longer stories
            are truncated by the tokenizer.

    Returns:
        The result of mapping the tokenizer over a DatasetDict with "train"
        and "validation" splits, with the raw "text" column removed.

    Raises:
        ValueError: If validation_size is negative.
    """
    if validation_size < 0:
        raise ValueError(f"validation_size must be non-negative, got {validation_size}")

    dataset = load_dataset("roneneldan/TinyStories")

    # Validation first, then train: a fixed concatenation order is the
    # starting point for a reproducible split.
    full = list(dataset["validation"]["text"]) + list(dataset["train"]["text"])

    # Silently drop every non-ASCII character instead of failing on it.
    cleaned = (s.encode("ascii", "ignore").decode("ascii") for s in full)

    # dict.fromkeys removes duplicates while keeping first-seen order. A set
    # iterates in an order that depends on per-process string hashing, so even
    # a seeded shuffle of list(set(...)) would differ between kernel restarts.
    deduped = list(dict.fromkeys(cleaned))

    # A private Random instance makes the shuffle reproducible without
    # touching (or depending on) the global random state.
    random.Random(seed).shuffle(deduped)

    train, validation = deduped[validation_size:], deduped[:validation_size]

    train_ds = Dataset.from_dict(dict(text=train))
    validation_ds = Dataset.from_dict(dict(text=validation))

    dataset_dict = DatasetDict({"train": train_ds, "validation": validation_ds})

    # Load tokenizer and tokenize the dataset
    tokenizer = AutoTokenizer.from_pretrained("tdooms/ts-tokenizer-4096", pad_token="[EOS]")

    def tokenize_function(examples):
        # NOTE(review): padding=True presumably pads to the longest sequence
        # of each map batch, so padded lengths may differ between batches --
        # confirm this is what downstream consumers expect.
        return tokenizer(examples["text"], truncation=True, padding=True, max_length=max_length)

    tokenized_dataset = dataset_dict.map(tokenize_function, batched=True, remove_columns=["text"])

    return tokenized_dataset

In [None]:
# Driver cell: run the preprocessing pipeline once and persist its result.
# clean_dataset() loads the data and the tokenizer itself, so this cell needs
# nothing beyond the function definition above.
print("Cleaning and tokenizing dataset...")
tokenized_ds = clean_dataset()

# Write the tokenized splits to the local directory 'ts-tokenized-final'
# (path is relative to the kernel's working directory).
print("Saving tokenized dataset to 'ts-tokenized-final'...")
tokenized_ds.save_to_disk("ts-tokenized-final")

print("Done!")

Cleaning and tokenizing dataset...


Map:   0%|          | 0/1798254 [00:00<?, ? examples/s]

Map:   0%|          | 0/16384 [00:00<?, ? examples/s]

Saving tokenized dataset to 'ts-tokenized-final'...


Saving the dataset (0/6 shards):   0%|          | 0/1798254 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/16384 [00:00<?, ? examples/s]

Done!
