In [1]:
!pip install evaluate -U
!pip install tf-keras -U
!pip install sacrebleu -U
!pip install hf_xet -U
!pip install jupyter -U
!pip install ipywidgets -U
!pip install transformers[torch] -U
!pip install accelerate -U

Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Using cached dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Using cached xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Using cached multiprocess-0.70.18-py310-none-any.whl.metadata (7.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub>=0.7.0 (from evaluate)
  Using cached huggingface_hub-0.31.1-py3-none-any.whl.metadata (13 kB)
Collecting filelock (from datasets>=2.0.0->evaluate)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting dill (from evaluate)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)

## Data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers
from tokenizers.normalizers import NFD, StripAccents, Lowercase, Sequence
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer, EncoderDecoderModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, BertGenerationConfig

from datasets import Dataset, DatasetDict

2025-05-12 10:12:02.429805: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747044723.429734   19799 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747044723.710751   19799 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747044726.234810   19799 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747044726.234843   19799 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747044726.234846   19799 computation_placer.cc:177] computation placer alr

In [2]:
# read data sumerian_english
train_dataset = pd.read_csv('datasets/SumTablets_English_train.csv')
test_dataset = pd.read_csv('datasets/SumTablets_English_test.csv')
val_dataset = pd.read_csv('datasets/SumTablets_English_validation.csv')

In [3]:
test_dataset

Unnamed: 0,id,period,genre,transliteration,translation
0,P459086,Ur III,Administrative,\n...guruš engar dumu-ni\n...ur-mes\n1(u) 1(di...,"n male laborers, plowman and his sons,\nforema..."
1,P465343,Ur III,Royal Inscription,\n1(diš) udu gir-ru-um niga\n2(diš) udu eme-gi...,"1 kirrum sheep, grain-fed,\n2 emegi rams,\nfor..."
2,P480378,Ur III,Administrative,\n2(diš) udu-nita₂ kur-ra bar-gal₂\n1(diš) sil...,"2 male sheep of the mountain, with fleece,\n1 ..."
3,P346107,Old Babylonian,Literary,\n...nin₉ ki aŋ₂ {d}dumu-zid-de₃\n...gur₃-ru k...,... beloved sister of Dumuzi\nExuding/bearing ...
4,P454330,Ur III,Administrative,\n<unk> nin\ndub-sar\ndumu šeš-kal-la,"Šu-Suen,\nstrong king,\nking of Ur:\nAḫuni,\nc..."
...,...,...,...,...,...
108,P273525,Ur III,Administrative,\n1(diš) udu bar-gal₂ ba-uš₂\nki ku₃-ga-ni-ta\...,"1 sheep, with fleece, slaughtered,\nfrom Kugan..."
109,P131769,Ur III,Administrative,\n\n3(diš) gal sag-kul zabar\nki-la₂-bi 2(diš)...,"3 large (bowls?), ..., bronze.\nTheir weight: ..."
110,P136312,Ur III,Administrative,\npisan dub-ba\nzi-ga\nu₃ kurušda-e ib₂-dab₅\n...,Basket-of-tablets:\nxxx\nxxx\nxxx\nxxx\nxxx\nxxx
111,P139619,Ur III,Administrative,\npisan dub-ba\nab₂ e₂-tur₃-ra\ngu₄{geš}apin\n...,Basket-of-tablets:\nxxx\nxxx\nxxx\nxxx\nxxx\nx...


In [4]:
# Concatenate all transliteration lines into a single .txt file
with open("sumerian_transliterations.txt", "a", encoding="utf-8") as f:
    for line in train_dataset["transliteration"]:
        f.write(line.strip() + "\n")

In [5]:
# Initialize empty BPE tokenizer
tokenizer = Tokenizer(models.BPE())

# Normalize text
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])

# Tokenize by whitespace 
tokenizer.pre_tokenizer = Whitespace()

# Set training rules
trainer = BpeTrainer(
    vocab_size=10000,  
    show_progress=True,
    special_tokens=["<unk>"]
)

# Train on your corpus
tokenizer.train(["sumerian_transliterations.txt"], trainer=trainer)






In [6]:
# Output trained tokenizer to a file
output_dir = "sumerian_bpe_tokenizer"
os.makedirs(output_dir, exist_ok=True)  
tokenizer.save(os.path.join(output_dir, "tokenizer.json"))

## Model

In [2]:
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding

import torch

## BERT on Sumerian Dataset

In [8]:
train_ds = Dataset.from_pandas(train_dataset)
test_ds = Dataset.from_pandas(test_dataset)
val_ds = Dataset.from_pandas(val_dataset)

dataset_dict = DatasetDict({
    "train": train_ds,
    "test": test_ds,
    "validation": val_ds
})

In [9]:
source_tokenizer = Tokenizer.from_file(os.path.join("sumerian_bpe_tokenizer", "tokenizer.json"))
target_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# For a simple encoder-decoder approach
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased", "bert-base-uncased"
)

# Configure the model for generation
model.config.decoder_start_token_id = target_tokenizer.cls_token_id
model.config.eos_token_id = target_tokenizer.sep_token_id
model.config.pad_token_id = target_tokenizer.pad_token_id
model.config.vocab_size = target_tokenizer.vocab_size

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [11]:
def preprocess_function(examples):
    # Custom BPE tokenization for Sumerian transliterations (source)
    source_texts = examples["transliteration"]
    source_encodings = {"input_ids": [], "attention_mask": []}
    
    for text in source_texts:
        # Handle None or empty strings
        if not text or pd.isna(text):
            text = "<unk>"
            
        # Tokenize using the custom BPE tokenizer
        encoded = source_tokenizer.encode(str(text))
        ids = encoded.ids
        
        # Apply truncation and padding
        if len(ids) > 128:
            ids = ids[:128]
        else:
            # Pad with zeros (assuming 0 is the pad token ID)
            pad_length = 128 - len(ids)
            ids = ids + [0] * pad_length
            
        # Create attention mask (1 for real tokens, 0 for padding)
        attention_mask = [1] * min(len(encoded.ids), 128) + [0] * max(0, 128 - len(encoded.ids))
        
        source_encodings["input_ids"].append(ids)
        source_encodings["attention_mask"].append(attention_mask)
    
    # Use BERT tokenizer for English translations (target)
    # Handle potential None values in translations
    translations = [str(t) if t and not pd.isna(t) else "" for t in examples["translation"]]
    
    target_encodings = target_tokenizer(
        translations,
        truncation=True,
        max_length=128,
        padding="max_length"
    )
    
    # Add labels from target encodings
    source_encodings["labels"] = target_encodings["input_ids"].copy()
    
    # Replace padding token id with -100 so it's ignored in loss calculation
    for i in range(len(source_encodings["labels"])):
        source_encodings["labels"][i] = [
            -100 if token == target_tokenizer.pad_token_id else token 
            for token in source_encodings["labels"][i]
        ]
    
    return source_encodings

# Apply the preprocessing function to the datasets
tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer=target_tokenizer, model=model)

training_params = Seq2SeqTrainingArguments(
    output_dir="sumerian-translation-model",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    predict_with_generate=True,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [13]:
from evaluate import load
bleu = load("sacrebleu")

# Ensure all config objects have the necessary token IDs set
# Main model config
model.config.decoder_start_token_id = target_tokenizer.cls_token_id
model.config.pad_token_id = target_tokenizer.pad_token_id
model.config.eos_token_id = target_tokenizer.sep_token_id
model.config.bos_token_id = target_tokenizer.cls_token_id
model.config.vocab_size = target_tokenizer.vocab_size
model.config.max_length = 128

# Decoder model config
model.decoder.config.decoder_start_token_id = target_tokenizer.cls_token_id
model.decoder.config.bos_token_id = target_tokenizer.cls_token_id
model.decoder.config.eos_token_id = target_tokenizer.sep_token_id
model.decoder.config.pad_token_id = target_tokenizer.pad_token_id
model.decoder.config.vocab_size = target_tokenizer.vocab_size

# Create and set a proper generation config
from transformers import GenerationConfig
generation_config = GenerationConfig(
    max_length=128,
    decoder_start_token_id=target_tokenizer.cls_token_id,
    bos_token_id=target_tokenizer.cls_token_id,
    eos_token_id=target_tokenizer.sep_token_id,
    pad_token_id=target_tokenizer.pad_token_id,
    num_beams=5,
    no_repeat_ngram_size=2,
)

# Apply the generation config to the model
model.generation_config = generation_config

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    
    # Decode generated translations
    decoded_preds = target_tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    # Replace -100 with pad token id
    labels = np.where(labels != -100, labels, target_tokenizer.pad_token_id)
    decoded_labels = target_tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Compute BLEU score
    result = bleu.compute(predictions=decoded_preds, references=[[ref] for ref in decoded_labels])
    
    return {"bleu": result["score"]}

# Update the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_params,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=target_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Seq2SeqTrainer(
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch,Training Loss,Validation Loss,Bleu
1,4.587,3.086918,3.042728
2,3.097,2.601013,11.064868
3,2.6886,2.417015,10.616672
4,2.4441,2.329031,15.890447
5,2.2597,2.248966,13.467342
6,2.1298,2.205425,14.48527
7,2.0504,2.183482,13.479695
8,1.9769,2.153433,16.025667
9,1.9147,2.147298,16.031876
10,1.881,2.150081,16.165771


There were missing keys in the checkpoint model loaded: ['decoder.cls.predictions.decoder.weight', 'decoder.cls.predictions.decoder.bias'].


TrainOutput(global_step=2390, training_loss=2.5029253588560736, metrics={'train_runtime': 941.9544, 'train_samples_per_second': 20.245, 'train_steps_per_second': 2.537, 'total_flos': 2924642424038400.0, 'train_loss': 2.5029253588560736, 'epoch': 10.0})

In [4]:
# SAsumerian-translation-modelVE BEST CHECKPOINT
from transformers import EncoderDecoderModel, AutoTokenizer
import pandas as pd

best_checkpoint_path = "./sumerian-translation-model/checkpoint-2390" # Replace with your actual best checkpoint
output_path = "./sumerian-translation-model/best_model"

# Load the model and tokenizer from the best checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(best_checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained(best_checkpoint_path)

# Save them in the new inference-ready directory
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)

print(f"Inference-ready model saved to: {output_path}")

Inference-ready model saved to: ./sumerian-translation-model/best_model


In [5]:
# remove the old checkpoints
import shutil
import os
import glob
import re

# Define the directory containing the checkpoints
checkpoints_dir = "./sumerian-translation-model"
# Get all checkpoint directories
checkpoints = glob.glob(os.path.join(checkpoints_dir, "checkpoint-*"))

for checkpoint in checkpoints:
    shutil.rmtree(checkpoint)
    print(f"Removed checkpoint: {checkpoint}")

Removed checkpoint: ./sumerian-translation-model/checkpoint-1673
Removed checkpoint: ./sumerian-translation-model/checkpoint-2390
Removed checkpoint: ./sumerian-translation-model/checkpoint-239
Removed checkpoint: ./sumerian-translation-model/checkpoint-2151
Removed checkpoint: ./sumerian-translation-model/checkpoint-956
Removed checkpoint: ./sumerian-translation-model/checkpoint-1434
Removed checkpoint: ./sumerian-translation-model/checkpoint-1912


In [17]:
# Test the model on a few examples
test_samples = tokenized_datasets["test"].select(range(10))

for i, sample in enumerate(test_samples):
    input_ids = torch.tensor([sample["input_ids"]]).to(device)
    attention_mask = torch.tensor([sample["attention_mask"]]).to(device)
    
    outputs = model.generate(
        input_ids=input_ids, 
        attention_mask=attention_mask,
        max_length=50,
        num_beams=5,
        no_repeat_ngram_size=2
    )
    
    # Decode the predicted translation
    predicted_translation = target_tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # For source_tokenizer, decode the input ids
    source_tokens = [source_tokenizer.decode([id]) for id in sample["input_ids"] if id != 0]
    actual_transliteration = " ".join(source_tokens)
    
    # Get the actual translation from labels
    actual_translation = target_tokenizer.decode([l for l in sample["labels"] if l != -100], skip_special_tokens=True)
    
    # Also print the original text
    print(f"ID: {test_dataset['id'][i]}")
    print(f"Original Sumerian: {test_dataset['transliteration'][i]}")
    print(f"Decoded Sumerian: {actual_transliteration}")
    print(f"Actual translation: {actual_translation}")
    print(f"Predicted translation: {predicted_translation}")
    print("-" * 50)

ID: P459086
Original Sumerian: 
...guruš engar dumu-ni
...ur-mes
1(u) 1(diš) guruš ugula ur-lugal
8(diš) guruš ugula ab-ba-sag₁₀
6(diš) guruš ugula lugal-ku₃-zu
3(diš) guruš ugula šeš-kal-la
2(diš) guruš ugula lugal-iti-da
4(diš) guruš ugula lu₂-dingir-ra
7(diš) guruš ugula ur-am₃-ma
4(diš) guruš ugula ur-e₂-nun-na

1(geš₂) guruš ugula al-la-igi-še₃-du
gurum₂ u₄ 2(diš)-kam
ki-su₇ ka-ma-ri₂ gub-ba
giri₃ i₃-kal-la
iti še-kar-ra-gal₂-la
mu {d}šu{d}suen lugal uri₅-ma{ki}...da za-ab-ša-li{ki} mu-hul
Decoded Sumerian: ... gurus engar dumu - ni ... ur - mes 1 ( u ) 1 ( dis ) gurus ugula ur - lugal 8 ( dis ) gurus ugula ab - ba - sag ₁₀ 6 ( dis ) gurus ugula lugal - ku ₃- zu 3 ( dis ) gurus ugula ses - kal - la 2 ( dis ) gurus ugula lugal - iti - da 4 ( dis ) gurus ugula lu ₂- dingir - ra 7 ( dis ) gurus ugula ur - am ₃- ma 4 ( dis ) gurus ugula ur - e ₂- nun - na 1 ( ges ₂) gurus ugula al - la - igi - se ₃- du gurum ₂ u ₄ 2 ( dis )- kam ki
Actual translation: n male laborers, plowman and his 