In [3]:
import os
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import (
    MT5ForConditionalGeneration,
    MT5Tokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import GenerationConfig, EarlyStoppingCallback

from datasets import load_dataset, Dataset as HFDataset
from load_dataset import preprocess_dataset
from compute_metrics import compute_metrics

In [4]:
MODEL_NAME = "google/mt5-small"
# Directory to save the fine-tuned model
OUTPUT_DIR = "./mt5_model"
# Directory for TensorBoard logs
LOGGING_DIR = "./mt5_logs"

# Some hyperparameters
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 512
BATCH_SIZE = 8
LEARNING_RATE = 1e-5
NUM_TRAIN_EPOCHS = 100

# check if dirs exist, if not create them
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOGGING_DIR, exist_ok=True)

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model.to(device)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
preprocessed_train = preprocess_dataset('../datasets/SumTablets_English_train.csv')
preprocessed_val = preprocess_dataset('../datasets/SumTablets_English_validation.csv')
preprocessed_test = preprocess_dataset('../datasets/SumTablets_English_test.csv')

train_data = [{
    'source': row['sumerian'],
    'target': row['english']
} for _, row in preprocessed_train.iterrows()]

val_data = [{
    'source': row['sumerian'],
    'target': row['english']
} for _, row in preprocessed_val.iterrows()]

test_data = [{
    'source': row['sumerian'],
    'target': row['english']
} for _, row in preprocessed_test.iterrows()]

Loaded 1907 examples from ../datasets/SumTablets_English_train.csv
Preprocessed dataset contains 1905 examples
Loaded 107 examples from ../datasets/SumTablets_English_validation.csv
Preprocessed dataset contains 107 examples
Loaded 113 examples from ../datasets/SumTablets_English_test.csv
Preprocessed dataset contains 113 examples


In [6]:
def preprocess_function(examples):
    """
    Tokenizes the source (Sumerian) and target (English) texts.
    """
    inputs = examples['source']
    targets = examples['target']

    # DEBUG: Print flag if any of inputs or targets are none
    if any(x is None for x in inputs) or any(x is None for x in targets):
        print("Warning: Found None values in inputs or targets. This may affect training.")


    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True, padding="max_length")

    # Tokenize targets (English) using the newer approach
    labels = tokenizer(text_target=targets, max_length=MAX_TARGET_LENGTH, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Convert lists to Hugging Face Dataset objects
train_dataset = HFDataset.from_list(train_data)
val_dataset = HFDataset.from_list(val_data)

# Apply preprocessing to the datasets
print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

Tokenizing datasets...


Map:   0%|          | 0/1905 [00:00<?, ? examples/s]

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

In [7]:
print("Example of tokenized input:")
print(tokenized_train_dataset[0])

Example of tokenized input:
{'source': ' 1(u) la₂ 1(diš) udu u₄ 2(u) 8(diš)-kam ki ab-ba-sa₆-ga-ta na-lu₅ i₃-dab₅   iti <unk> bi₂-gu₇ mu en-unu₆-gal {d}inana unu{ki}ga ba-hun  1(u) la₂ 1(diš)', 'target': '9 rams, 28th day, from Abba-saga, Nalu accepted; month: “ubi-feast,” year: “Enunugal of Inanna of Uruk was installed;” (total:) 9 (rams).', 'input_ids': [333, 312, 273, 271, 283, 338, 333, 312, 720, 1166, 271, 259, 17278, 259, 273, 410, 356, 312, 273, 271, 630, 312, 720, 1166, 271, 264, 13555, 504, 1995, 264, 835, 264, 263, 262, 451, 264, 743, 264, 422, 294, 264, 1696, 428, 259, 266, 328, 264, 31256, 428, 259, 2650, 2, 837, 14528, 2871, 487, 890, 289, 264, 14031, 451, 264, 6362, 785, 285, 1354, 348, 1238, 18308, 596, 650, 1354, 743, 810, 264, 20544, 333, 312, 273, 271, 283, 338, 333, 312, 720, 1166, 271, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
# Set the training arguments for the Seq2SeqTrainer
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,                  # Directory to save the model

    num_train_epochs=NUM_TRAIN_EPOCHS,      # Number of training epochs
    per_device_train_batch_size=BATCH_SIZE, # Batch size for training
    per_device_eval_batch_size=BATCH_SIZE,  # Batch size for evaluation

    learning_rate=LEARNING_RATE,            # Learning rate for the optimizer
    weight_decay=0.01,                      # Weight decay for regularization
    warmup_ratio=0.1,                       # Warmup ratio for learning rate scheduler
    gradient_accumulation_steps=1,          # Gradient accumulation steps to simulate larger batch sizes
    lr_scheduler_type="cosine",             # Use cosine learning rate scheduler
    label_smoothing_factor=0.1,             # Label smoothing factor for better generalization
    max_grad_norm=1.0,                      # gradient clipping

    save_total_limit=1,                     # Only keep the last checkpoint
    predict_with_generate=True,             # Enable generation during evaluation
    report_to="tensorboard",                # Report metrics to TensorBoard
    logging_dir=LOGGING_DIR,                # Directory for TensorBoard logs
    logging_steps=50,                       # Log every 50 steps

    eval_strategy="epoch",                  # Evaluate at the end of each epoch
    save_strategy="epoch",                  # Save model at the end of each epoch
    load_best_model_at_end=True,            # Load the best model at the end of training
    metric_for_best_model="meteor",         # Metric to determine the best model
    fp16=torch.cuda.is_available(),         # Use mixed precision training if GPU is available
)

# Set up generation configuration for the model
generation_config = GenerationConfig(
    max_length=MAX_TARGET_LENGTH,           # Maximum length of the generated sequences
    early_stopping=True,                    # Stop generation when all beams reach the EOS token
    num_beams=4,                            # Number of beams for beam search
    no_repeat_ngram_size=3,                 # Prevent repetition of n-grams in the generated text
    forced_bos_token_id=0,                  # Force the beginning of the sequence to be the BOS token
    pad_token_id=tokenizer.pad_token_id,    # Padding token ID for the tokenizer
    eos_token_id=tokenizer.eos_token_id,    # End of sequence token ID for the tokenizer
    decoder_start_token_id=tokenizer.bos_token_id if tokenizer.bos_token_id is not None else tokenizer.eos_token_id   # Decoder start token ID for the model
)
model.generation_config = generation_config


# Data collator for padding
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    max_length=MAX_INPUT_LENGTH
)

# Initialize the Seq2SeqTrainer with the model, training arguments, datasets, tokenizer, data collator, and metrics computation
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda p: compute_metrics(p, tokenizer),        # Function to compute metrics during evaluation
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]    # Early stopping callback to prevent overfitting
    )

# Start the training process
print("Starting model training...")
try:
    trainer.train()
    print("Training finished successfully!")

    # Save the final model and tokenizer
    trainer.save_model(f"{OUTPUT_DIR}/final_model")
    tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_model_tokenizer")
    print(f"Final model saved to {OUTPUT_DIR}/final_model")

except Exception as e:
    print(f"An error occurred during training: {e}")

  trainer = Seq2SeqTrainer(


Starting model training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


An error occurred during training: CUDA out of memory. Tried to allocate 3.82 GiB. GPU 0 has a total capacity of 15.77 GiB of which 1.13 GiB is free. Including non-PyTorch memory, this process has 14.64 GiB memory in use. Of the allocated memory 14.15 GiB is allocated by PyTorch, and 121.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


: 

In [None]:
print("\nTesting on example data...")

def generate_translation(sumerian_text):
    # Clean and truncate input text to avoid potential issues
    sumerian_text = sumerian_text.strip()
    if len(sumerian_text) > 1000:  # Arbitrary limit to prevent very long inputs
        sumerian_text = sumerian_text[:1000] + "..."
    
    input_text = f"translate Sumerian to English: {sumerian_text}"
    
    try:
        # Process input with truncation to avoid sequence length issues
        inputs = tokenizer(
            input_text, 
            return_tensors="pt", 
            padding=True, 
            truncation=True,
            max_length=MAX_SOURCE_LENGTH
        )
        
        # Move to CPU if CUDA issues persist
        if torch.cuda.is_available():
            try:
                inputs = inputs.to(device)
                
                # Try with beam search (safer parameters)
                outputs = model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_length=MAX_TARGET_LENGTH,
                    min_length=5,
                    num_beams=2,
                    length_penalty=1.0,
                    early_stopping=True,
                    do_sample=False
                )
                
            except RuntimeError as e:
                print(f"CUDA error: {e}. Falling back to CPU.")
                # Fall back to CPU
                inputs = {k: v.cpu() for k, v in inputs.items()}
                model.cpu()
                
                outputs = model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_length=MAX_TARGET_LENGTH,
                    min_length=5,
                    num_beams=2,
                    do_sample=False
                )
                
                # Move model back to the original device
                model.to(device)
        else:
            # Already on CPU
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=MAX_TARGET_LENGTH,
                min_length=5,
                num_beams=2,
                do_sample=False
            )
        
        translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
    except Exception as e:
        print(f"Error in translation: {e}")
        translation = f"Translation error: {str(e)[:100]}..."
        
    return translation

for i, row in test_data.head(5).iterrows():
    if isinstance(row['transliteration'], str):
        sumerian_text = row['transliteration'].replace('\n', ' ')
        actual_translation = row['translation'].replace('\n', ' ') if isinstance(row['translation'], str) else "N/A"
        
        print(f"\nExample {i+1}:")
        print(f"Sumerian: {sumerian_text}")
        print(f"Actual Translation: {actual_translation}")
        
        generated_translation = generate_translation(sumerian_text)
        print(f"MT5 Translation: {generated_translation}")
        print("-" * 50)


Testing on example data...

Example 1:
Sumerian:  ...guruš engar dumu-ni ...ur-mes 1(u) 1(diš) guruš ugula ur-lugal 8(diš) guruš ugula ab-ba-sag₁₀ 6(diš) guruš ugula lugal-ku₃-zu 3(diš) guruš ugula šeš-kal-la 2(diš) guruš ugula lugal-iti-da 4(diš) guruš ugula lu₂-dingir-ra 7(diš) guruš ugula ur-am₃-ma 4(diš) guruš ugula ur-e₂-nun-na  1(geš₂) guruš ugula al-la-igi-še₃-du gurum₂ u₄ 2(diš)-kam ki-su₇ ka-ma-ri₂ gub-ba giri₃ i₃-kal-la iti še-kar-ra-gal₂-la mu {d}šu{d}suen lugal uri₅-ma{ki}...da za-ab-ša-li{ki} mu-hul
Actual Translation: n male laborers, plowman and his sons, foreman: Ur-mes, 11 male laborers, foreman: Ur-lugal, 8 male laborers, foreman: Abba-saga, 6 male laborers, foreman: Lugal-kuzu, 3 male laborers, foreman: Šeš-kalla, 2 male laborers, foreman: Lugal-itida, 4 male laborers, foreman: Lu-dingira, 7 male laborers, foreman: Ur-amma, 4 male laborers, foreman: Ur-enunna, 60 male laborers, foreman: Alla-palil; inspection of the second day, on the threshing floor Ka-ma-ri2 stati

: 