### Mapping from glyphs to translitterations

We start by building a dictionary to store the most frequent glyph to translitteration match

In [31]:
from collections import defaultdict, Counter
from datasets import load_dataset

dataset = load_dataset("colesimmons/SumTablets")["train"]

# Helper
def is_cuneiform(char):
    return 0x12000 <= ord(char) <= 0x123FF

# Step 1: build mapping glyph ‚Üí transliteration
glyph_to_translit = defaultdict(list)

for entry in dataset:
    glyphs = [g for g in entry["glyphs"] if is_cuneiform(g)]
    translits = entry["transliteration"].strip().split()

    if len(glyphs) != len(translits):
        continue  # skip unaligned entries

    for g, t in zip(glyphs, translits):
        glyph_to_translit[g].append(t)

# Reduce to most frequent translit per glyph
glyph_to_top_translit = {
    g: Counter(toks).most_common(1)[0][0]
    for g, toks in glyph_to_translit.items()
}

print("Built dictionary of", len(glyph_to_top_translit), "glyphs.")
print("Example:")
for g in list(glyph_to_top_translit)[:5]:
    print(f"{g}: {glyph_to_top_translit[g]}")


Built dictionary of 350 glyphs.
Example:
íÅπ: <SURFACE>
íÑ≠: i‚ÇÉ
íáΩ: <SURFACE>
íÄÄ: <SURFACE>
íäÆ: <unk>


We'll collect rows where:

$\cdot$ number of cuneiform glyphs == number of transliteration tokens,

$\cdot$ all glyphs are valid Unicode signs.

This gives us a good training/testing base.

In [32]:
clean_data = []

for entry in dataset:
    glyphs = [g for g in entry["glyphs"] if is_cuneiform(g)]
    translits = entry["transliteration"].strip().split()

    if len(glyphs) == len(translits) and glyphs and translits:
        clean_data.append({
            "glyphs": "".join(glyphs),
            "tokens": " ".join(translits)
        })

print("Clean aligned pairs:", len(clean_data))
print("Example:\n", clean_data[0])

Clean aligned pairs: 1092
Example:
 {'glyphs': 'íÅπíÑ≠íáΩíÄÄíäÆíâÑíÅçíÖÜíÅï', 'tokens': '<SURFACE> <COLUMN> ...1(di≈°) ...du‚ÇÅ‚ÇÄ...lu‚ÇÇ... <COLUMN> a-≈°a‚ÇÉ ildu‚ÇÉ ≈°i-da ...'}


In [33]:
from collections import Counter
counter = Counter(c for _, t in data for c in t)
print(counter.most_common(20))

[(' ', 28192), ('a', 13832), ('i', 13162), ('u', 11481), ('≈°', 9707), ('.', 9657), ('(', 9248), (')', 9248), ('n', 7210), ('-', 7066), ('d', 7029), ('g', 6739), ('‚ÇÇ', 5335), ('r', 3787), ('l', 3777), ('b', 3436), ('<', 3421), ('>', 3421), ('e', 3275), ('k', 3203)]


In [34]:
#Export Dataset for Training (glyphs ‚Üí transliteration)

import json

with open("sumerian_glyph2translit_clean.jsonl", "w", encoding="utf-8") as f:
    for row in clean_data:
        json.dump({"input": row["glyphs"], "target": row["tokens"]}, f, ensure_ascii=False)
        f.write("\n")

print("Exported training data to sumerian_glyph2translit_clean.jsonl")


Exported training data to sumerian_glyph2translit_clean.jsonl


### Character-level sequence-to-sequence LSTM model

In [35]:
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from tqdm import tqdm
import random
import numpy as np

# Load the clean dataset
data = []
with open("sumerian_glyph2translit_clean.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        data.append((item["input"], item["target"]))

# Optional: reduce data for testing
data = data[:50000]

In [36]:
def build_vocab(seqs):
    chars = set(c for seq in seqs for c in seq)
    vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
    vocab.update({c: i+4 for i, c in enumerate(sorted(chars))})
    return vocab

input_vocab = build_vocab([g for g, t in data])
target_vocab = build_vocab([t for g, t in data])
inv_target_vocab = {i: c for c, i in target_vocab.items()}


In [37]:
class TransliterationDataset(Dataset):
    def __init__(self, pairs, in_vocab, out_vocab, max_len=64):
        self.pairs = pairs
        self.in_vocab = in_vocab
        self.out_vocab = out_vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def encode_seq(self, seq, vocab):
        return [vocab.get(c, vocab["<unk>"]) for c in seq]

    def __getitem__(self, idx):
        glyphs, translit = self.pairs[idx]
        src = self.encode_seq(glyphs, self.in_vocab)
        tgt = [self.out_vocab["<sos>"]] + self.encode_seq(translit, self.out_vocab) + [self.out_vocab["<eos>"]]
        return torch.tensor(src), torch.tensor(tgt)

dataset = TransliterationDataset(data, input_vocab, target_vocab)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=lambda batch: zip(*batch))

In [38]:
class Seq2Seq(nn.Module):
    def __init__(self, in_vocab_size, out_vocab_size, hidden=512):
        super().__init__()
        self.encoder_emb = nn.Embedding(in_vocab_size, hidden)
        self.encoder = nn.LSTM(hidden, hidden, batch_first=True)

        self.decoder_emb = nn.Embedding(out_vocab_size, hidden)
        self.decoder = nn.LSTM(hidden, hidden, batch_first=True)
        self.output = nn.Linear(hidden, out_vocab_size)

    def forward(self, src, tgt):
        # Encode
        src_emb = self.encoder_emb(src)
        _, (h, c) = self.encoder(src_emb)

        # Decode
        tgt_emb = self.decoder_emb(tgt[:, :-1])
        output, _ = self.decoder(tgt_emb, (h, c))
        logits = self.output(output)
        return logits

In [40]:
device = torch.device(("cuda" if torch.cuda.is_available() else "cpu"))
model = Seq2Seq(len(input_vocab), len(target_vocab)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=target_vocab["<pad>"])

for epoch in range(10):
    model.train()
    total_loss = 0
    for srcs, tgts in tqdm(dataloader):
        srcs = nn.utils.rnn.pad_sequence(srcs, batch_first=True, padding_value=0).to(device)
        tgts = nn.utils.rnn.pad_sequence(tgts, batch_first=True, padding_value=0).to(device)

        optimizer.zero_grad()
        logits = model(srcs, tgts)
        loss = criterion(logits.view(-1, logits.shape[-1]), tgts[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1} loss: {total_loss:.2f}")


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35/35 [01:32<00:00,  2.66s/it]


Epoch 1 loss: 75.59


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35/35 [01:40<00:00,  2.87s/it]


Epoch 2 loss: 43.26


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35/35 [01:40<00:00,  2.88s/it]


Epoch 3 loss: 37.81


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35/35 [01:40<00:00,  2.87s/it]


Epoch 4 loss: 35.24


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35/35 [01:48<00:00,  3.11s/it]


Epoch 5 loss: 33.54


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35/35 [01:47<00:00,  3.06s/it]


Epoch 6 loss: 32.12


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35/35 [01:45<00:00,  3.02s/it]


Epoch 7 loss: 30.99


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35/35 [01:46<00:00,  3.06s/it]


Epoch 8 loss: 30.09


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35/35 [01:48<00:00,  3.11s/it]


Epoch 9 loss: 29.35


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35/35 [01:46<00:00,  3.03s/it]

Epoch 10 loss: 28.33





In [41]:
# Greedy decoding

def decode_glyphs(glyph_seq, max_len=100):
    model.eval()
    src = torch.tensor([input_vocab.get(c, input_vocab["<unk>"]) for c in glyph_seq], device=device).unsqueeze(0)
    with torch.no_grad():
        src_emb = model.encoder_emb(src)
        _, (h, c) = model.encoder(src_emb)

        dec_input = torch.tensor([[target_vocab["<sos>"]]], device=device)
        output = ""

        for _ in range(max_len):
            emb = model.decoder_emb(dec_input)
            dec_out, (h, c) = model.decoder(emb, (h, c))
            logits = model.output(dec_out[:, -1])
            next_token = logits.argmax(dim=-1).item()
            if next_token == target_vocab["<eos>"]:
                break
            output += inv_target_vocab.get(next_token, "?")
            dec_input = torch.tensor([[next_token]], device=device)
    return output

# Try decoding one example
test_glyphs = data[42][0]
print("Glyphs:", test_glyphs)
print("Predicted transliteration:", decode_glyphs(test_glyphs))
print("Ground truth:", data[42][1])


Glyphs: íä∫íÑ•íäïíÖÖíÇçíá∑íá∑íãÉíãóíÅÄíãæ
Predicted transliteration: <SURFACE> <COLUMN> 1(u) 6(di≈°) udu 1(di≈°) udu 1(di≈°) udu 1(di≈°) udu 1(di≈°) udu 1(di≈°) udu 1(di≈°) udu
Ground truth: <SURFACE> 7(a≈°) ≈°e gur sag gal‚ÇÇ <unk> e‚ÇÇ-li-li sanga ≈°u ba-ti


In [42]:
# Measure character-level accuracy
correct = 0
total = 0

for glyphs, true_translit in data[100:200]:  # eval subset
    pred = decode_glyphs(glyphs)
    min_len = min(len(pred), len(true_translit))
    for i in range(min_len):
        if pred[i] == true_translit[i]:
            correct += 1
        total += 1

print(f"Character-level accuracy: {correct / total:.2%}")

Character-level accuracy: 19.46%


In [43]:
print(f"Exact matches: {np.array([pred == true_translit for glyphs, true_translit in data[100:200] for pred in [decode_glyphs(glyphs).strip()]]).sum()/len(data[100:200]):.2%}")

Exact matches: 0.00%
