In [1]:
!pip install sentencepiece



In [1]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import (
    MT5ForConditionalGeneration,
    MT5Tokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from evaluate import load

2025-05-31 10:30:44.554435: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748687445.478548    3455 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748687445.738190    3455 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748687448.150198    3455 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748687448.150235    3455 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748687448.150237    3455 computation_placer.cc:177] computation placer alr

In [2]:

MODEL_NAME = "google/mt5-small"
OUTPUT_DIR = "./sumerian_mt5_model"
LOG_DIR = "./sumerian_mt5_logs"

# Create output directories
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

print(f"Loading model: {MODEL_NAME}")
# Fix the tokenizer initialization by using AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"Device being used: {device}")
print(f"Model loaded with {sum(p.numel() for p in model.parameters())/1e6:.2f}M parameters")
print(f"Tokenizer vocabulary size: {len(tokenizer)}")


Loading model: google/mt5-small


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Device being used: cuda
Model loaded with 300.18M parameters
Tokenizer vocabulary size: 250100


In [None]:
print("Loading data...")
train_data = pd.read_csv('../datasets/SumTablets_English_train.csv')

# For evaluation, use a separate test set if available, otherwise split from train
try:
    test_data = pd.read_csv('../datasets/SumTablets_English_test.csv')
    print(f"Loaded {len(train_data)} training examples and {len(test_data)} test examples")
except:
    print("No separate test file found. Will split from training data.")
    test_data = train_data

train_data

Loading data...
Loaded 1907 training examples and 113 test examples
22
24
19
17
22
23
59
10
22
27
18
18
48
34
31
89
38
22
23
18
21
24
16
18
40
33
23
10
11
19
34
104
52
14
24
5
19
26
29
22
67
21
17
19
16
15
56
72
31
25
18
33
15
12
9
18
8
19
50
92
37
37
30
24
115
27
31
25
25
12
20
22
33
20
21
32
20
25
30
66
20
35
253
171
129
106
104
26
105
62
170
56
19
89
96
431
3
3
4
3
3
1
6
4
5
3
4
3
3
2
4
4
5
3
9
4
4
4
14
3
6
4
4
5
4
4
3
2
4
4
7
9
4
4
10
4
4
4
5
3
9
3
10
2
4
6
14
15
14
14
13
13
16
16
16
13
14
13
16
1
12
10
6
9
10
14
17
19
17
16
11
13
13
15
16
14
15
13
16
17
17
15
15
13
13
16
15
15
13
16
15
16
15
17
14
14
12
15
15
15
15
16
17
15
14
15
15
15
15
17
16
15
13
14
13
15
13
15
13
14
14
15
14
15
16
15
15
18
14
16
15
15
15
16
15
15
16
15
17
13
15
15
14
14
14
13
12
15
16
18
13
14
15
14
15
16
14
13
33
16
12
17
13
5
11
5
3
15
14
3
3
4
4
15
14
13
14
15
14
6
30
20
20
5
21
63
22
24
125
32
14
60
41
42
36
13
9
6
11
21
32
36
36
25
21
38
159
10
16
15
35
33
17
40
19
20
20
32
18
81
16
34
45
8
69
20
26
30
3

In [26]:
MAX_SOURCE_LENGTH = 256
MAX_TARGET_LENGTH = 256

class SumerianEnglishDataset(Dataset):
    def __init__(self, data, tokenizer, max_source_len, max_target_len):
        self.tokenizer = tokenizer
        self.data = data
        self.max_source_len = max_source_len
        self.max_target_len = max_target_len
        
        # Filter out rows with missing data
        self.filtered_data = []

        for idx, row in data.iterrows():
            if isinstance(row['transliteration'], str) and isinstance(row['translation'], str):
                self.filtered_data.append({
                    'sumerian': row['transliteration'].replace('\n', ' '),
                    'english': row['translation'].replace('\n', ' ')
                })

    def __len__(self):
        return len(self.filtered_data)
    
    def __getitem__(self, idx):
        example = self.filtered_data[idx]
        
        # For MT5, we prepend a task prefix to clarify the task
        source_text = f"translate Sumerian to English: {example['sumerian']}"
        target_text = example['english']
        
        # Tokenize inputs
        source_encoding = self.tokenizer(
            source_text,
            max_length=self.max_source_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        # Tokenize targets
        target_encoding = self.tokenizer(
            target_text,
            max_length=self.max_target_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        # Replace padding token id's with -100 for loss calculation
        target_ids = target_encoding["input_ids"]
        target_ids[target_ids == self.tokenizer.pad_token_id] = -100
        
        return {
            "input_ids": source_encoding["input_ids"].squeeze().numpy(),
            "attention_mask": source_encoding["attention_mask"].squeeze().numpy(),
            "labels": target_ids.squeeze().numpy()
        }

# Create training dataset
train_dataset = SumerianEnglishDataset(
    train_data, 
    tokenizer, 
    max_source_len=MAX_SOURCE_LENGTH, 
    max_target_len=MAX_TARGET_LENGTH
)

# Create test dataset
test_dataset = SumerianEnglishDataset(
    test_data, 
    tokenizer, 
    max_source_len=MAX_SOURCE_LENGTH, 
    max_target_len=MAX_TARGET_LENGTH
)

In [27]:
TRAIN_VALID_SPLIT = 0.1

# Split into training and validation sets
if TRAIN_VALID_SPLIT > 0:
    train_size = int((1 - TRAIN_VALID_SPLIT) * len(train_dataset))
    valid_size = len(train_dataset) - train_size
    train_dataset, eval_dataset = random_split(train_dataset, [train_size, valid_size])
    print(f"Split into {train_size} training and {valid_size} validation samples")
else:
    train_dataset = train_dataset
    eval_dataset = None

Split into 1714 training and 191 validation samples


In [31]:
# Training hyperparameters
NUM_EPOCHS = 1
LEARNING_RATE = 2e-5
BATCH_SIZE = 8

# --- Define evaluation metrics ---
def compute_metrics(eval_preds):
    bleu_metric = load("bleu")
    meteor_metric = load("meteor")
    rouge_metric = load("rouge")
    
    preds, labels = eval_preds
    
    # Replace -100 with pad token id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Clean up predictions and labels
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]
    
    # Print some examples for debugging
    print("\nSample predictions (first 2):")
    for i in range(min(2, len(decoded_preds))):
        print(f"Pred: '{decoded_preds[i]}'")
        print(f"Label: '{decoded_labels[i]}'")
        print("---")
    
    # Check if we have any valid predictions/labels to work with
    if not decoded_preds or not decoded_labels:
        print("Warning: Empty predictions or labels")
        return {
            "bleu": 0.0,
            "meteor": 0.0, 
            "rougeL": 0.0,
            "gen_len": 0.0
        }
    
    # Ensure all predictions and labels have content (not empty strings)
    valid_pairs = [(p, l) for p, l in zip(decoded_preds, decoded_labels) if p.strip() and l.strip()]
    if not valid_pairs:
        print("Warning: No valid (non-empty) prediction-label pairs found")
        return {
            "bleu": 0.0,
            "meteor": 0.0, 
            "rougeL": 0.0,
            "gen_len": 0.0
        }
    
    # Unzip the valid pairs
    valid_preds, valid_labels = zip(*valid_pairs)
    
    # Format references for BLEU
    references_for_bleu = [[label] for label in valid_labels]
    
    # Calculate metrics
    results = {}
    
    try:
        # BLEU
        bleu_results = bleu_metric.compute(predictions=valid_preds, references=references_for_bleu)
        results["bleu"] = bleu_results["bleu"] if bleu_results else 0.0
        
        # METEOR
        meteor_results = meteor_metric.compute(predictions=valid_preds, references=valid_labels)
        results["meteor"] = meteor_results["meteor"] if meteor_results else 0.0
        
        # ROUGE
        rouge_results = rouge_metric.compute(predictions=valid_preds, references=valid_labels)
        results["rougeL"] = rouge_results["rougeL"] if rouge_results else 0.0
        
        # Add prediction length
        pred_lens = [len(pred.split()) for pred in valid_preds]
        results["gen_len"] = np.mean(pred_lens) if pred_lens else 0.0
    
    except Exception as e:
        print(f"Error computing metrics: {str(e)}")
        import traceback
        traceback.print_exc()
        # Return zeros for all metrics if computation fails
        return {
            "bleu": 0.0,
            "meteor": 0.0, 
            "rougeL": 0.0,
            "gen_len": 0.0
        }
    
    return {k: round(v, 4) if isinstance(v, float) else v for k, v in results.items()}
    
# --- Data collator ---
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8 if torch.cuda.is_available() else None
)

# --- 7. Training arguments ---
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    logging_dir=LOG_DIR,
    logging_steps=100,
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,                      # gradient clipping
    generation_max_length=MAX_TARGET_LENGTH,
    report_to="tensorboard",
    warmup_steps=500,
    lr_scheduler_type="cosine",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

# --- 8. Initialize trainer ---
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# --- 9. Train the model ---
print("Starting training...")
trainer.train()

# --- 10. Save the model ---
print(f"Saving model to {OUTPUT_DIR}")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

  trainer = Seq2SeqTrainer(


Starting training...


Epoch,Training Loss,Validation Loss,Bleu,Meteor,Rougel,Gen Len
0,0.0,,0.0,0.0,0.0,0.0


[nltk_data] Downloading package wordnet to /home/default/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/default/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/default/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



Sample predictions (first 2):
Pred: ''
Label: '2 nannies, ... of the sukkalmaḫ, via Šeškalla, the household manager, booked out; year: “Šašrum was destroyed.”'
---
Pred: ''
Label: '11 lambs, 1 billy goat, 3rd day, from Abbasaga Intaea accepted; month: “Piglet-feast,” year: “Šašru was destroyed;” (total:) 12.'
---
Saving model to ./sumerian_mt5_model


('./sumerian_mt5_model/tokenizer_config.json',
 './sumerian_mt5_model/special_tokens_map.json',
 './sumerian_mt5_model/spiece.model',
 './sumerian_mt5_model/added_tokens.json',
 './sumerian_mt5_model/tokenizer.json')

In [None]:
print("\nTesting on example data...")

def generate_translation(sumerian_text):
    # Clean and truncate input text to avoid potential issues
    sumerian_text = sumerian_text.strip()
    if len(sumerian_text) > 1000:  # Arbitrary limit to prevent very long inputs
        sumerian_text = sumerian_text[:1000] + "..."
    
    input_text = f"translate Sumerian to English: {sumerian_text}"
    
    try:
        # Process input with truncation to avoid sequence length issues
        inputs = tokenizer(
            input_text, 
            return_tensors="pt", 
            padding=True, 
            truncation=True,
            max_length=MAX_SOURCE_LENGTH
        )
        
        # Move to CPU if CUDA issues persist
        if torch.cuda.is_available():
            try:
                inputs = inputs.to(device)
                
                # Try with beam search (safer parameters)
                outputs = model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_length=MAX_TARGET_LENGTH,
                    min_length=5,
                    num_beams=2,
                    length_penalty=1.0,
                    early_stopping=True,
                    do_sample=False
                )
                
            except RuntimeError as e:
                print(f"CUDA error: {e}. Falling back to CPU.")
                # Fall back to CPU
                inputs = {k: v.cpu() for k, v in inputs.items()}
                model.cpu()
                
                outputs = model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_length=MAX_TARGET_LENGTH,
                    min_length=5,
                    num_beams=2,
                    do_sample=False
                )
                
                # Move model back to the original device
                model.to(device)
        else:
            # Already on CPU
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=MAX_TARGET_LENGTH,
                min_length=5,
                num_beams=2,
                do_sample=False
            )
        
        translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
    except Exception as e:
        print(f"Error in translation: {e}")
        translation = f"Translation error: {str(e)[:100]}..."
        
    return translation

for i, row in test_data.head(5).iterrows():
    if isinstance(row['transliteration'], str):
        sumerian_text = row['transliteration'].replace('\n', ' ')
        actual_translation = row['translation'].replace('\n', ' ') if isinstance(row['translation'], str) else "N/A"
        
        print(f"\nExample {i+1}:")
        print(f"Sumerian: {sumerian_text}")
        print(f"Actual Translation: {actual_translation}")
        
        generated_translation = generate_translation(sumerian_text)
        print(f"MT5 Translation: {generated_translation}")
        print("-" * 50)


Testing on example data...

Example 1:
Sumerian:  ...guruš engar dumu-ni ...ur-mes 1(u) 1(diš) guruš ugula ur-lugal 8(diš) guruš ugula ab-ba-sag₁₀ 6(diš) guruš ugula lugal-ku₃-zu 3(diš) guruš ugula šeš-kal-la 2(diš) guruš ugula lugal-iti-da 4(diš) guruš ugula lu₂-dingir-ra 7(diš) guruš ugula ur-am₃-ma 4(diš) guruš ugula ur-e₂-nun-na  1(geš₂) guruš ugula al-la-igi-še₃-du gurum₂ u₄ 2(diš)-kam ki-su₇ ka-ma-ri₂ gub-ba giri₃ i₃-kal-la iti še-kar-ra-gal₂-la mu {d}šu{d}suen lugal uri₅-ma{ki}...da za-ab-ša-li{ki} mu-hul
Actual Translation: n male laborers, plowman and his sons, foreman: Ur-mes, 11 male laborers, foreman: Ur-lugal, 8 male laborers, foreman: Abba-saga, 6 male laborers, foreman: Lugal-kuzu, 3 male laborers, foreman: Šeš-kalla, 2 male laborers, foreman: Lugal-itida, 4 male laborers, foreman: Lu-dingira, 7 male laborers, foreman: Ur-amma, 4 male laborers, foreman: Ur-enunna, 60 male laborers, foreman: Alla-palil; inspection of the second day, on the threshing floor Ka-ma-ri2 stati

: 