In [None]:
!pip install evaluate -U
!pip install tf-keras -U
!pip install sacrebleu -U
!pip install hf_xet -U
!pip install jupyter -U
!pip install ipywidgets -U
!pip install transformers[torch] -U
!pip install accelerate -U

## Data

In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers
from tokenizers.normalizers import NFD, StripAccents, Lowercase, Sequence
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer, EncoderDecoderModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, BertGenerationConfig

from datasets import Dataset, DatasetDict

In [31]:
# read data sumerian_english
train_dataset = pd.read_csv('datasets/SumTablets_English_train.csv')
test_dataset = pd.read_csv('datasets/SumTablets_English_test.csv')
val_dataset = pd.read_csv('datasets/SumTablets_English_validation.csv')

In [32]:
# Concatenate all transliteration lines into a single .txt file
with open("sumerian_transliterations.txt", "a", encoding="utf-8") as f:
    for line in train_dataset["transliteration"]:
        f.write(line.strip() + "\n")

In [33]:
# Initialize empty BPE tokenizer
tokenizer = Tokenizer(models.BPE())

# Normalize text
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])

# Tokenize by whitespace 
tokenizer.pre_tokenizer = Whitespace()

# Set training rules
trainer = BpeTrainer(
    vocab_size=10000,  
    show_progress=True,
    special_tokens=["<unk>"]
)

# Train on your corpus
tokenizer.train(["sumerian_transliterations.txt"], trainer=trainer)

In [35]:
# Output trained tokenizer to a file
output_dir = "sumerian_bpe_tokenizer"
os.makedirs(output_dir, exist_ok=True)  
tokenizer.save(os.path.join(output_dir, "tokenizer.json"))

## Model

In [49]:
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding

import torch

## BERT on Sumerian Dataset

In [25]:
train_ds = Dataset.from_pandas(train_dataset)
test_ds = Dataset.from_pandas(test_dataset)
val_ds = Dataset.from_pandas(val_dataset)

dataset_dict = DatasetDict({
    "train": train_ds,
    "test": test_ds,
    "validation": val_ds
})

In [None]:
source_tokenizer = Tokenizer.from_file(os.path.join("sumerian_bpe_tokenizer", "tokenizer.json"))
target_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# For a simple encoder-decoder approach
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased", "bert-base-uncased"
)

# Configure the model for generation
model.config.decoder_start_token_id = target_tokenizer.cls_token_id
model.config.eos_token_id = target_tokenizer.sep_token_id
model.config.pad_token_id = target_tokenizer.pad_token_id
model.config.vocab_size = target_tokenizer.vocab_size

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [None]:
def preprocess_function(examples):
    # Custom BPE tokenization for Sumerian transliterations (source)
    source_texts = examples["transliteration"]
    source_encodings = {"input_ids": [], "attention_mask": []}
    
    for text in source_texts:
        # Handle None or empty strings
        if not text or pd.isna(text):
            text = "<unk>"
            
        # Tokenize using the custom BPE tokenizer
        encoded = source_tokenizer.encode(str(text))
        ids = encoded.ids
        
        # Apply truncation and padding
        if len(ids) > 128:
            ids = ids[:128]
        else:
            # Pad with zeros (assuming 0 is the pad token ID)
            pad_length = 128 - len(ids)
            ids = ids + [0] * pad_length
            
        # Create attention mask (1 for real tokens, 0 for padding)
        attention_mask = [1] * min(len(encoded.ids), 128) + [0] * max(0, 128 - len(encoded.ids))
        
        source_encodings["input_ids"].append(ids)
        source_encodings["attention_mask"].append(attention_mask)
    
    # Use BERT tokenizer for English translations (target)
    # Handle potential None values in translations
    translations = [str(t) if t and not pd.isna(t) else "" for t in examples["translation"]]
    
    target_encodings = target_tokenizer(
        translations,
        truncation=True,
        max_length=128,
        padding="max_length"
    )
    
    # Add labels from target encodings
    source_encodings["labels"] = target_encodings["input_ids"].copy()
    
    # Replace padding token id with -100 so it's ignored in loss calculation
    for i in range(len(source_encodings["labels"])):
        source_encodings["labels"][i] = [
            -100 if token == target_tokenizer.pad_token_id else token 
            for token in source_encodings["labels"][i]
        ]
    
    return source_encodings

# Apply the preprocessing function to the datasets
tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer=target_tokenizer, model=model)

training_params = Seq2SeqTrainingArguments(
    output_dir="sumerian-translation-model",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [51]:
from evaluate import load
bleu = load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    
    # Decode generated translations
    decoded_preds = target_tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    # Replace -100 with pad token id
    labels = np.where(labels != -100, labels, target_tokenizer.pad_token_id)
    decoded_labels = target_tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Compute BLEU score
    result = bleu.compute(predictions=decoded_preds, references=[[ref] for ref in decoded_labels])
    
    return {"bleu": result["score"]}

# Update the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_params,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=target_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Seq2SeqTrainer(


KeyboardInterrupt: 

In [53]:
# Test the model on a few examples
test_samples = tokenized_datasets["test"].select(range(5))

for i, sample in enumerate(test_samples):
    input_ids = torch.tensor([sample["input_ids"]]).to(device)
    attention_mask = torch.tensor([sample["attention_mask"]]).to(device)
    
    outputs = model.generate(
        input_ids=input_ids, 
        attention_mask=attention_mask,
        max_length=50,
        num_beams=5,
        no_repeat_ngram_size=2
    )
    
    # Decode the predicted translation
    predicted_translation = target_tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # For source_tokenizer, decode the input ids
    source_tokens = [source_tokenizer.decode([id]) for id in sample["input_ids"] if id != 0]
    actual_transliteration = " ".join(source_tokens)
    
    # Get the actual translation from labels
    actual_translation = target_tokenizer.decode([l for l in sample["labels"] if l != -100], skip_special_tokens=True)
    
    # Also print the original text
    print(f"ID: {test_dataset['id'][i]}")
    print(f"Original Sumerian: {test_dataset['transliteration'][i]}")
    print(f"Decoded Sumerian: {actual_transliteration}")
    print(f"Actual translation: {actual_translation}")
    print(f"Predicted translation: {predicted_translation}")
    print("-" * 50)

ValueError: `decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation.