In [1]:
pip install tokenizers datasets

Collecting tokenizers
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from tokenizers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-

In [2]:
from datasets import load_dataset

# Fetch the "train" split of the base dataset, then of its augmented variant.
original, augmented = (
    load_dataset(repo_id)["train"]
    for repo_id in (
        "colesimmons/SumTablets_English",
        "colesimmons/SumTablets_English-augmented",
    )
)

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 1907/1907 [00:00<00:00, 43501.05 examples/s]
Generating validation split: 100%|██████████| 107/107 [00:00<00:00, 31133.58 examples/s]
Generating test split: 100%|██████████| 113/113 [00:00<00:00, 27799.66 examples/s]
Generating train split: 100%|██████████| 1772/1772 [00:00<00:00, 142662.85 examples/s]


In [None]:
# Gather the "transliteration" field of every record (original first, then
# augmented) and write them to one UTF-8 text file, one stripped entry per line.

transliterations = [
    record["transliteration"]
    for split in (original, augmented)
    for record in split
]

with open("sumerian_transliterations.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(text.strip() + "\n" for text in transliterations)

In [4]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers
from tokenizers.normalizers import NFD, StripAccents, Lowercase, Sequence
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

In [6]:
# Build and train a BPE tokenizer on the transliteration corpus.

# The unknown token has to be registered on the BPE model itself. Listing
# "<unk>" only in the trainer's special_tokens reserves a vocabulary id for it,
# but leaves the model with no token to emit for characters missing from the vocabulary.
UNK_TOKEN = "<unk>"
tokenizer = Tokenizer(models.BPE(unk_token=UNK_TOKEN))

# Normalize text: Unicode-decompose (NFD), lowercase, then drop combining marks.
# NOTE(review): StripAccents folds letters with diacritics onto their base letter
# (e.g. "š" -> "s") and Lowercase folds case — confirm both are intended for this corpus.
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])

# Pre-tokenize: Whitespace() splits with the pattern \w+|[^\w\s]+, so besides
# splitting on whitespace it also separates punctuation such as "-" from words.
tokenizer.pre_tokenizer = Whitespace()

# Set training rules; the special tokens are added to the vocabulary in this order.
trainer = BpeTrainer(
    vocab_size=10000,
    show_progress=True,
    special_tokens=["<pad>", "<s>", "</s>", UNK_TOKEN],
)

# Train on the corpus file written earlier in the notebook
tokenizer.train(["sumerian_transliterations.txt"], trainer=trainer)







In [7]:
# Post-processing: surround every encoded sequence with <s> ... </s>
# (sequence pairs are joined as <s> A </s> </s> B </s>).

from tokenizers.processors import TemplateProcessing

tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> </s> $B </s>",
    # Each marker is paired with the id it received in the trained vocabulary.
    special_tokens=[
        (marker, tokenizer.token_to_id(marker)) for marker in ("<s>", "</s>")
    ],
)

In [None]:
import os

# Serialize the trained tokenizer to <output_dir>/tokenizer.json,
# creating the directory first if it does not exist yet.
output_dir = "sumerian_bpe_tokenizer"
tokenizer_path = os.path.join(output_dir, "tokenizer.json")

os.makedirs(output_dir, exist_ok=True)
tokenizer.save(tokenizer_path)

In [None]:
# Example: encode one sample transliteration and print the resulting tokens.

sample_text = "en-lil2 kur2-ta e3-a"
encoding = tokenizer.encode(sample_text)
print(encoding.tokens)

['<s>', 'en', '-', 'lil', '2', 'kur', '2', '-', 'ta', 'e', '3', '-', 'a', '</s>']
