## BPE for transliteration

In [1]:
pip install tokenizers datasets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset

original = load_dataset("colesimmons/SumTablets_English")["train"]
augmented = load_dataset("colesimmons/SumTablets_English-augmented")["train"]

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#Concatenate all transliteration lines into a single .txt file

transliterations = [item["transliteration"] for item in original] + \
                   [item["transliteration"] for item in augmented]

with open("sumerian_transliterations.txt", "w", encoding="utf-8") as f:
    for line in transliterations:
        f.write(line.strip() + "\n")

In [4]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers
from tokenizers.normalizers import NFD, StripAccents, Lowercase, Sequence
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

In [None]:
# Initialize empty BPE tokenizer
tokenizer = Tokenizer(models.BPE())

# Normalize text
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])

# Tokenize by whitespace 
tokenizer.pre_tokenizer = Whitespace()

# Set training rules
trainer = BpeTrainer(
    vocab_size=10000,  
    show_progress=True,
    special_tokens=["<pad>", "<s>", "</s>", "<unk>"]
)

# Train on your corpus
tokenizer.train(["sumerian_transliterations.txt"], trainer=trainer)







In [None]:
#Wrap sequences with <s> and </s> for model compatibility

from tokenizers.processors import TemplateProcessing

tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> </s> $B </s>",
    special_tokens=[
        ("<s>", tokenizer.token_to_id("<s>")),
        ("</s>", tokenizer.token_to_id("</s>")),
    ],
)

In [7]:
import os

output_dir = "sumerian_bpe_tokenizer"
os.makedirs(output_dir, exist_ok=True)  

tokenizer.save(os.path.join(output_dir, "tokenizer.json"))

In [None]:
#Example

encoding = tokenizer.encode("en-lil2 kur2-ta e3-a")
print(encoding.tokens)
# print(encoding.ids)
# print(encoding.offsets)
# print(encoding.attention_mask)
# print(encoding.special_tokens_mask)
# print(encoding.type_ids)
# print(encoding.encoded_offsets)


['<s>', 'en', '-', 'lil', '2', 'kur', '2', '-', 'ta', 'e', '3', '-', 'a', '</s>']


Mapping to Unicode glyphs

In [59]:
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("tokenizer.json")

In [60]:
sumtablets = load_dataset("colesimmons/SumTablets")["train"]

In [83]:
from collections import defaultdict, Counter

def is_cuneiform(char):
    return 0x12000 <= ord(char) <= 0x123FF

# Collect token-glyph alignments
token_to_glyphs = defaultdict(list)
aligned_count = 0

for entry in sumtablets:
    translit = entry["transliteration"]
    glyphs_raw = entry.get("glyphs", "")  # safer than ["glyphs"] in case of key error

    # Tokenize and filter glyphs
    tokens = tokenizer.encode(translit).tokens
    glyphs = [g for g in glyphs_raw if is_cuneiform(g)]

    # Skip empty alignments
    if not tokens or not glyphs:
        continue

    # Align as much as possible (partial alignments)
    for token, glyph in zip(tokens, glyphs):
        token_to_glyphs[token].append(glyph)

    aligned_count += 1

# Build frequency-based glyph vocab
token_glyph_vocab = {
    token: Counter(glyphs).most_common(5)  # top 5 most frequent glyphs
    for token, glyphs in token_to_glyphs.items()
}

print(f"Successfully aligned {aligned_count} transliterations.\n")

Successfully aligned 82432 transliterations.



In [88]:
sample_token = "en"
print(f"Token '{sample_token}' maps to:")
for glyph, freq in token_glyph_vocab.get(sample_token, []):
    print(f"  {glyph}  (x{freq})")

Token 'en' maps to:
  𒀭  (x679)
  𒀀  (x372)
  𒆠  (x350)
  𒈬  (x341)
  𒁀  (x266)
