In [30]:
import pandas as pd
import numpy as np
from load_dataset import preprocess_dataset

from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer

import torch
from torch.utils.data import DataLoader, Dataset

import sys  
sys.path.insert(1, '../utils')

from rclone import list_onedrive_folders, download_folder_from_onedrive

In [14]:
# Get test dataset
test_data = preprocess_dataset('../datasets/SumTablets_English_test.csv')
test_data

Loaded 113 examples from ../datasets/SumTablets_English_test.csv
Preprocessed dataset contains 113 examples


Unnamed: 0,sumerian,english
0,...guruš engar dumu-ni ...ur-mes 1(u) 1(diš) ...,"n male laborers, plowman and his sons, foreman..."
1,1(diš) udu gir-ru-um niga 2(diš) udu eme-gi-r...,"1 kirrum sheep, grain-fed, 2 emegi rams, for H..."
2,2(diš) udu-nita₂ kur-ra bar-gal₂ 1(diš) sila₄...,"2 male sheep of the mountain, with fleece, 1 m..."
3,...nin₉ ki aŋ₂ {d}dumu-zid-de₃ ...gur₃-ru ki ...,... beloved sister of Dumuzi Exuding/bearing ....
4,<unk> nin dub-sar dumu šeš-kal-la,"Šu-Suen, strong king, king of Ur: Aḫuni, cup-b..."
...,...,...
108,1(diš) udu bar-gal₂ ba-uš₂ ki ku₃-ga-ni-ta ki...,"1 sheep, with fleece, slaughtered, from Kugani..."
109,3(diš) gal sag-kul zabar ki-la₂-bi 2(diš) 1/...,"3 large (bowls?), ..., bronze. Their weight: 2..."
110,pisan dub-ba zi-ga u₃ kurušda-e ib₂-dab₅ ša₃ ...,Basket-of-tablets: xxx xxx xxx xxx xxx xxx
111,pisan dub-ba ab₂ e₂-tur₃-ra gu₄{geš}apin udu ...,Basket-of-tablets: xxx xxx xxx xxx xxx xxx xxx


In [16]:
# remove rows with 'xxx' as part of 'english' column
test_data_cleaned = test_data[~test_data['english'].str.contains('xxx', na=False)]
test_data_cleaned

Unnamed: 0,sumerian,english
0,...guruš engar dumu-ni ...ur-mes 1(u) 1(diš) ...,"n male laborers, plowman and his sons, foreman..."
1,1(diš) udu gir-ru-um niga 2(diš) udu eme-gi-r...,"1 kirrum sheep, grain-fed, 2 emegi rams, for H..."
2,2(diš) udu-nita₂ kur-ra bar-gal₂ 1(diš) sila₄...,"2 male sheep of the mountain, with fleece, 1 m..."
3,...nin₉ ki aŋ₂ {d}dumu-zid-de₃ ...gur₃-ru ki ...,... beloved sister of Dumuzi Exuding/bearing ....
4,<unk> nin dub-sar dumu šeš-kal-la,"Šu-Suen, strong king, king of Ur: Aḫuni, cup-b..."
...,...,...
102,gan-kun-sig eriš-dingir {d}pa-bil₃-sag,"Gan-kunsig, the ereš-dingir priestess of the g..."
103,2(u) ma-na siki ki ur-ba-ba-ta da-da-mu šu ba...,20 ma-na of wool. From Ur-Baba. Dadamu receive...
107,1(u) 1(diš) sila₄ niga sa₂-du₁₁ u₄ 1(u) 1(diš...,"11 male grain-fed lambs, regular offering, 11t..."
108,1(diš) udu bar-gal₂ ba-uš₂ ki ku₃-ga-ni-ta ki...,"1 sheep, with fleece, slaughtered, from Kugani..."


In [2]:
list_onedrive_folders()

['FRCNN',
 'bart_large_model',
 '  ├── final_model',
 '  ├── final_model_tokenizer',
 'bart_model',
 '  ├── checkpoint-12428',
 '  ├── final_model',
 '  ├── final_model_tokenizer',
 'datasets',
 'sumerian_gpt2_finetuned',
 '  ├── checkpoint-7894',
 '  ├── checkpoint-8300',
 'sumerian_gpt2_finetuned_logs',
 'synthetic_sentences_labels']

In [None]:
results = pd.DataFrame(columns=['id', 'sumerian', 'english', 'BART_base', 'BART_large', 'GPT2'])

### BART model

In [None]:
download_folder_from_onedrive('bart_model', 'bart_model')

Downloading 'bart_model' from OneDrive to 'bart_model'...
rclone command: rclone copy onedrive_bocconi:AI-project/bart_model bart_model -P
SUCCESS: Folder downloaded successfully.


True

In [22]:
OUTPUT_DIR = 'bart_model'
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 512

model = BartForConditionalGeneration.from_pretrained(f"{OUTPUT_DIR}/final_model")
tokenizer = BartTokenizer.from_pretrained(f"{OUTPUT_DIR}/final_model_tokenizer")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def translate_sumerian_to_english(text, trained_model, trained_tokenizer, device):
    """
    Translates a Sumerian text to English using the fine-tuned model.

    Args:
        text (str): The Sumerian text to translate.
        trained_model (BartForConditionalGeneration): The fine-tuned BART model.
        trained_tokenizer (BartTokenizer): The tokenizer used for the model.
        device (torch.device): The device to run the model on (CPU or GPU).
    Returns:
        str: The translated English text.
    """
    
    # Set model to evaluation mode
    trained_model.eval()
    trained_model.to(device)

    # Prepare the input text
    inputs = trained_tokenizer(text, return_tensors="pt", max_length=MAX_INPUT_LENGTH, truncation=True, padding=True)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    # Generate translation
    with torch.no_grad():   # Disable gradient calculations for inference
        outputs = trained_model.generate(
            input_ids,
            attention_mask=attention_mask,      # Use attention mask to ignore padding tokens
            max_length=MAX_TARGET_LENGTH + 2,   # +2 for start/end tokens
            num_beams=5,                        # Beam search width
            early_stopping=True                 # Stop when all beams reach the end token
        )

    # Decode the generated ids to text
    translated_text = trained_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text

for index, row in test_data_cleaned.iterrows():
    sumerian_text = row['sumerian']
    english_translation = translate_sumerian_to_english(sumerian_text, model, tokenizer, device)
    true_english_translation = row['english']
    
    print(f"Sumerian: {sumerian_text}")
    print(f"Predicted English: {english_translation}")
    print(f"True English: {true_english_translation}")
    print("-" * 50)

    # add results to the results DataFrame with corresponding index
    results.loc[index, 'sumerian'] = sumerian_text
    results.loc[index, 'english'] = true_english_translation
    results.loc[index, 'BART_base'] = english_translation

Sumerian:  ...guruš engar dumu-ni ...ur-mes 1(u) 1(diš) guruš ugula ur-lugal 8(diš) guruš ugula ab-ba-sag₁₀ 6(diš) guruš ugula lugal-ku₃-zu 3(diš) guruš ugula šeš-kal-la 2(diš) guruš ugula lugal-iti-da 4(diš) guruš ugula lu₂-dingir-ra 7(diš) guruš ugula ur-am₃-ma 4(diš) guruš ugula ur-e₂-nun-na  1(geš₂) guruš ugula al-la-igi-še₃-du gurum₂ u₄ 2(diš)-kam ki-su₇ ka-ma-ri₂ gub-ba giri₃ i₃-kal-la iti še-kar-ra-gal₂-la mu {d}šu{d}suen lugal uri₅-ma{ki}...da za-ab-ša-li{ki} mu-hul
Predicted English: n male laborers, plowmen, son of Umes; 11 male laborers: foreman: Ur-lugal; 8 male laborer: Abba-saga; 6 male laborers (from) Lugal-kuzu; 3 male laborers stationed: Šeškalla; 2 male laborers for Lugalitida; 4 male laborers from (the account of) Lu-dingira; 7 male laborers of (the accounts of) Ur-amma; 4 workmen, foreman of Ur-Enunna; 90 male laborers foreman (of) Alla-igiše, the threshing floor, the 2nd day, from Ka’ama(’s account) booked out, via Ikalla; month: “Barley at the quay,” year: ”Šu-Sue

### BART large model

In [24]:
download_folder_from_onedrive('bart_large_model', 'bart_large_model')

Downloading 'bart_large_model' from OneDrive to 'bart_large_model'...
rclone command: rclone copy onedrive_bocconi:AI-project/bart_large_model bart_large_model -P
SUCCESS: Folder downloaded successfully.


True

In [25]:
OUTPUT_DIR = 'bart_large_model'
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 512

model = BartForConditionalGeneration.from_pretrained(f"{OUTPUT_DIR}/final_model")
tokenizer = BartTokenizer.from_pretrained(f"{OUTPUT_DIR}/final_model_tokenizer")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [26]:
for index, row in test_data_cleaned.iterrows():
    sumerian_text = row['sumerian']
    english_translation = translate_sumerian_to_english(sumerian_text, model, tokenizer, device)
    true_english_translation = row['english']
    
    print(f"Sumerian: {sumerian_text}")
    print(f"Predicted English: {english_translation}")
    print(f"True English: {true_english_translation}")
    print("-" * 50)

    # add results to the results DataFrame with corresponding index
    results.loc[index, 'BART_large'] = english_translation

Sumerian:  ...guruš engar dumu-ni ...ur-mes 1(u) 1(diš) guruš ugula ur-lugal 8(diš) guruš ugula ab-ba-sag₁₀ 6(diš) guruš ugula lugal-ku₃-zu 3(diš) guruš ugula šeš-kal-la 2(diš) guruš ugula lugal-iti-da 4(diš) guruš ugula lu₂-dingir-ra 7(diš) guruš ugula ur-am₃-ma 4(diš) guruš ugula ur-e₂-nun-na  1(geš₂) guruš ugula al-la-igi-še₃-du gurum₂ u₄ 2(diš)-kam ki-su₇ ka-ma-ri₂ gub-ba giri₃ i₃-kal-la iti še-kar-ra-gal₂-la mu {d}šu{d}suen lugal uri₅-ma{ki}...da za-ab-ša-li{ki} mu-hul
Predicted English: n male laborers, plow-oxen, for Ur-mes; 11 laborers, foreman: Ur-lugal; 8 laborers, overseer: Abba-saga; 6 laborers,foreman: Lugal-kuzu; 3 laborers, Foreman: Šeškalla; 2 laborers,Foreman:Lugal-itida; 4 laborers,FOREman: Lu-dingira; 7 laborers, (foreman): Ur-amma; 4 workers, (for) Ur-E’nunna; 60 laborers, are foreman of Alla-igiše’du; of the 2nd day, from the reservoir of Kamari stationed, via Ikalla; month: “Barley at the quay,” year: � “Šu-Suen, king of Ur, the lands of Zabšali destroyed.”
True E

### GPT2

In [29]:
download_folder_from_onedrive('gpt2_model', 'gpt2_model')

Downloading 'gpt2_model' from OneDrive to 'gpt2_model'...
rclone command: rclone copy onedrive_bocconi:AI-project/gpt2_model gpt2_model -P
SUCCESS: Folder downloaded successfully.


True

In [58]:
OUTPUT_DIR = 'gpt2_model'

model = GPT2LMHeadModel.from_pretrained(f"{OUTPUT_DIR}").to(device)
tokenizer = GPT2Tokenizer.from_pretrained(f"{OUTPUT_DIR}")

def translate_sumerian_to_english_gpt(text, trained_model, trained_tokenizer, device, temperature):
    """
    Translates a Sumerian text to English using the fine-tuned model.

    Args:
        text (str): The Sumerian text to translate.
        trained_model (GPT2LMHeadModel): The fine-tuned GPT-2 model.
        trained_tokenizer (GPT2Tokenizer): The tokenizer used for the model.
        device (torch.device): The device to run the model on (CPU or GPU).
    Returns:
        str: The translated English text.
    """
    
    # Set model to evaluation mode
    trained_model.eval()
    trained_model.to(device)

    # Prepare the input text
    inputs = tokenizer.encode(text, return_tensors="pt")
    input_ids = inputs.to(device)

    # Generate translation
    output_sequences = model.generate(
        input_ids=input_ids,    
        max_length=200,             # Max length of prompt + generated text
        temperature=temperature,    # Controls randomness. Lower is more deterministic.
        do_sample=True,             # Enable sampling
        top_k=40,                   # Considers the top K most probable tokens at each step.
        top_p=0.9,                  # Nucleus sampling: considers tokens with cumulative probability >= P.
        repetition_penalty=1,       # Penalizes repetition.
        num_return_sequences=1,     # Number of different sequences to generate.
        pad_token_id=tokenizer.eos_token_id, # Pad token ID for generation
        no_repeat_ngram_size=3,     # Prevent 3-gram repetition
        early_stopping=True,        # Stop when EOS is generated
        length_penalty=1.0,         # Neutral - neither favor short nor long outputs
        num_beams=3                 # Use beam search instead of sampling
    )

    # --- Decodifica e Stampa ---
    for i, generated_sequence in enumerate(output_sequences):
        translated_text = tokenizer.decode(generated_sequence, skip_special_tokens=True)
    
    return translated_text


for index, row in test_data_cleaned.iterrows():
    sumerian_text = row['sumerian']
    try:
        english_translation = translate_sumerian_to_english_gpt(sumerian_text, model, tokenizer, device, temperature=0.2).split('English: ')[-1]
    except Exception as e:
        print(f"Error translating Sumerian text: {sumerian_text}")
        print(f"Exception: {e}")
        english_translation = "Translation Error"
    true_english_translation = row['english']

    print(f"Sumerian: {sumerian_text}")
    print(f"Predicted English: {english_translation}")
    print(f"True English: {true_english_translation}")
    print("-" * 50)

    # add results to the results DataFrame with corresponding index
    results.loc[index, 'GPT_2:temp=0.2'] = english_translation

Error translating Sumerian text:  ...guruš engar dumu-ni ...ur-mes 1(u) 1(diš) guruš ugula ur-lugal 8(diš) guruš ugula ab-ba-sag₁₀ 6(diš) guruš ugula lugal-ku₃-zu 3(diš) guruš ugula šeš-kal-la 2(diš) guruš ugula lugal-iti-da 4(diš) guruš ugula lu₂-dingir-ra 7(diš) guruš ugula ur-am₃-ma 4(diš) guruš ugula ur-e₂-nun-na  1(geš₂) guruš ugula al-la-igi-še₃-du gurum₂ u₄ 2(diš)-kam ki-su₇ ka-ma-ri₂ gub-ba giri₃ i₃-kal-la iti še-kar-ra-gal₂-la mu {d}šu{d}suen lugal uri₅-ma{ki}...da za-ab-ša-li{ki} mu-hul
Exception: Input length of input_ids is 301, but `max_length` is set to 200. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
Sumerian:  ...guruš engar dumu-ni ...ur-mes 1(u) 1(diš) guruš ugula ur-lugal 8(diš) guruš ugula ab-ba-sag₁₀ 6(diš) guruš ugula lugal-ku₃-zu 3(diš) guruš ugula šeš-kal-la 2(diš) guruš ugula lugal-iti-da 4(diš) guruš ugula lu₂-dingir-ra 7(diš) guruš ugula ur-am₃-ma 4(diš) guruš ugula ur-e₂-nun-na  

Token indices sequence length is longer than the specified maximum sequence length for this model (1034 > 1024). Running this sequence through the model will result in indexing errors


Sumerian:  2(diš) udu-nita₂ kur-ra bar-gal₂ 1(diš) sila₄ nita₂ kur-ra bar-gal₂ ri-ri-ga  ki ur-ru-ta kišib₃ lu₂-kal-la iti {d}li₉-si₄ mu hu-uh₂-nu-ri{ki} ba-hul  lu₂-kal-la dub-sar dumu ur-e₁₁-e šuš₃
Predicted English: 1 sheep, barley-fed, “barley-fed” (for) the Kitchen; 1 lamb (for the) Kitchen (?), with Rib-gaza; under seal of Lukalla; month: “Lisi,”
True English: 2 male sheep of the mountain, with fleece, 1 male lamb of the mountain, with fleece, fallen; from Urru, under seal of Lukalla; month: “Lisi,” year: “Ḫuḫnuri was destroyed.” Lukalla, the scribe, son of Ur-E’e, chief livestock administrator.
--------------------------------------------------
Error translating Sumerian text:  ...nin₉ ki aŋ₂ {d}dumu-zid-de₃ ...gur₃-ru ki aŋ₂ {d}dur₇-dur₇-e a...zid-de₃ šag₄-ga ri-a nin₉-ŋu₁₀ nam-nin-e tud-da {e₂}tur₃-e i₃ gara₂...mu-un-da-ab-si amaš-e i₃...un-da-an <unk>... an-eden-ne₂...a-ŋu₁₀ {d}ŋeštin-an-na me-en a ki-sikil...nam dumu banda₃{da}...na-nam unug{ki}ga...bi na-nam kul-aba₄{ki}...

In [62]:
for index, row in test_data_cleaned.iterrows():
    sumerian_text = row['sumerian']
    try:
        english_translation = translate_sumerian_to_english_gpt(sumerian_text, model, tokenizer, device, temperature=0.5).split('English: ')[-1]
    except Exception as e:
        print(f"Error translating Sumerian text: {sumerian_text}")
        print(f"Exception: {e}")
        english_translation = "Translation Error"
    true_english_translation = row['english']

    print(f"Sumerian: {sumerian_text}")
    print(f"Predicted English: {english_translation}")
    print(f"True English: {true_english_translation}")
    print("-" * 50)

    # add results to the results DataFrame with corresponding index
    results.loc[index, 'GPT_2:temp=0.5'] = english_translation

Error translating Sumerian text:  ...guruš engar dumu-ni ...ur-mes 1(u) 1(diš) guruš ugula ur-lugal 8(diš) guruš ugula ab-ba-sag₁₀ 6(diš) guruš ugula lugal-ku₃-zu 3(diš) guruš ugula šeš-kal-la 2(diš) guruš ugula lugal-iti-da 4(diš) guruš ugula lu₂-dingir-ra 7(diš) guruš ugula ur-am₃-ma 4(diš) guruš ugula ur-e₂-nun-na  1(geš₂) guruš ugula al-la-igi-še₃-du gurum₂ u₄ 2(diš)-kam ki-su₇ ka-ma-ri₂ gub-ba giri₃ i₃-kal-la iti še-kar-ra-gal₂-la mu {d}šu{d}suen lugal uri₅-ma{ki}...da za-ab-ša-li{ki} mu-hul
Exception: Input length of input_ids is 301, but `max_length` is set to 200. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
Sumerian:  ...guruš engar dumu-ni ...ur-mes 1(u) 1(diš) guruš ugula ur-lugal 8(diš) guruš ugula ab-ba-sag₁₀ 6(diš) guruš ugula lugal-ku₃-zu 3(diš) guruš ugula šeš-kal-la 2(diš) guruš ugula lugal-iti-da 4(diš) guruš ugula lu₂-dingir-ra 7(diš) guruš ugula ur-am₃-ma 4(diš) guruš ugula ur-e₂-nun-na  

  results.loc[index, 'GPT_2:temp=0.5'] = english_translation


Sumerian:  1(diš) udu gir-ru-um niga 2(diš) udu eme-gi-ra nita₂ ma₂-an-na unu{ki} iti-ta u₄ 2(u) 6(diš) ba-ra-zal  ki lugal-nir-ta giri₃ ba-qar-tum  iti {d}dumu-zi mu en {d}inana unu{ki} maš₂-e i₃-pa₃   {d}šu{d}suen lugal kal-ga lugal uri₅{ki}ma lugal an ub-da limmu₂-ba  wa-qar-tum nin₉-a-ni
Predicted English:  1(diš) udu gir-ru-um niga 2(diš) udu eme-gi-ra nita₂ ma₂-an-na unu{ki} iti-ta u₄ 2(u) 6(diš) ba-ra-zal  ki lugal-nir-ta giri₃ ba-qar-tum  iti {d}dumu-zi mu en {d}inana unu{ki} maš₂-e i₃-pa₃   {d}šu{d}suen lugal kal-ga lugal uri₅{ki}ma lugal an ub-da limmu₂-ba  wa-qar-tum nin₉-a-ni dub-sar dumu ur-sa₆-ga
True English: 1 kirrum sheep, grain-fed, 2 emegi rams, for Heaven-barge of Uruk, of the month, the 26th day passed; from Lugal-nir, via Baqartum. month: “Dumuzi,” year: “The high-priestess of Inanna of Uruk by extispicy was chosen.” Šu-Suen, strong king, king of Ur, king of the four quarters: Waqartum, his sister.
--------------------------------------------------
Sumerian:  2(di

In [63]:
for index, row in test_data_cleaned.iterrows():
    sumerian_text = row['sumerian']
    try:
        english_translation = translate_sumerian_to_english_gpt(sumerian_text, model, tokenizer, device, temperature=0.7).split('English: ')[-1]
    except Exception as e:
        print(f"Error translating Sumerian text: {sumerian_text}")
        print(f"Exception: {e}")
        english_translation = "Translation Error"
    true_english_translation = row['english']

    print(f"Sumerian: {sumerian_text}")
    print(f"Predicted English: {english_translation}")
    print(f"True English: {true_english_translation}")
    print("-" * 50)

    # add results to the results DataFrame with corresponding index
    results.loc[index, 'GPT_2:temp=0.7'] = english_translation

Error translating Sumerian text:  ...guruš engar dumu-ni ...ur-mes 1(u) 1(diš) guruš ugula ur-lugal 8(diš) guruš ugula ab-ba-sag₁₀ 6(diš) guruš ugula lugal-ku₃-zu 3(diš) guruš ugula šeš-kal-la 2(diš) guruš ugula lugal-iti-da 4(diš) guruš ugula lu₂-dingir-ra 7(diš) guruš ugula ur-am₃-ma 4(diš) guruš ugula ur-e₂-nun-na  1(geš₂) guruš ugula al-la-igi-še₃-du gurum₂ u₄ 2(diš)-kam ki-su₇ ka-ma-ri₂ gub-ba giri₃ i₃-kal-la iti še-kar-ra-gal₂-la mu {d}šu{d}suen lugal uri₅-ma{ki}...da za-ab-ša-li{ki} mu-hul
Exception: Input length of input_ids is 301, but `max_length` is set to 200. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
Sumerian:  ...guruš engar dumu-ni ...ur-mes 1(u) 1(diš) guruš ugula ur-lugal 8(diš) guruš ugula ab-ba-sag₁₀ 6(diš) guruš ugula lugal-ku₃-zu 3(diš) guruš ugula šeš-kal-la 2(diš) guruš ugula lugal-iti-da 4(diš) guruš ugula lu₂-dingir-ra 7(diš) guruš ugula ur-am₃-ma 4(diš) guruš ugula ur-e₂-nun-na  

  results.loc[index, 'GPT_2:temp=0.7'] = english_translation


Sumerian:  1(diš) udu gir-ru-um niga 2(diš) udu eme-gi-ra nita₂ ma₂-an-na unu{ki} iti-ta u₄ 2(u) 6(diš) ba-ra-zal  ki lugal-nir-ta giri₃ ba-qar-tum  iti {d}dumu-zi mu en {d}inana unu{ki} maš₂-e i₃-pa₃   {d}šu{d}suen lugal kal-ga lugal uri₅{ki}ma lugal an ub-da limmu₂-ba  wa-qar-tum nin₉-a-ni
Predicted English:  1(diš) udu gir-ru-um niga 2(diš) udu eme-gi-ra nita₂ ma₂-an-na unu{ki} iti-ta u₄ 2(u) 6(diš) ba-ra-zal  ki lugal-nir-ta giri₃ ba-qar-tum  iti {d}dumu-zi mu en {d}inana unu{ki} maš₂-e i₃-pa₃   {d}šu{d}suen lugal kal-ga lugal uri₅{ki}ma lugal an ub-da limmu₂-ba  wa-qar-tum nin₉-a-ni dub-sar dumu ur-sa₆-ga
True English: 1 kirrum sheep, grain-fed, 2 emegi rams, for Heaven-barge of Uruk, of the month, the 26th day passed; from Lugal-nir, via Baqartum. month: “Dumuzi,” year: “The high-priestess of Inanna of Uruk by extispicy was chosen.” Šu-Suen, strong king, king of Ur, king of the four quarters: Waqartum, his sister.
--------------------------------------------------
Sumerian:  2(di

In [64]:
for index, row in test_data_cleaned.iterrows():
    sumerian_text = row['sumerian']
    try:
        english_translation = translate_sumerian_to_english_gpt(sumerian_text, model, tokenizer, device, temperature=1.0).split('English: ')[-1]
    except Exception as e:
        print(f"Error translating Sumerian text: {sumerian_text}")
        print(f"Exception: {e}")
        english_translation = "Translation Error"
    true_english_translation = row['english']

    print(f"Sumerian: {sumerian_text}")
    print(f"Predicted English: {english_translation}")
    print(f"True English: {true_english_translation}")
    print("-" * 50)

    # add results to the results DataFrame with corresponding index
    results.loc[index, 'GPT_2:temp=1.0'] = english_translation

Error translating Sumerian text:  ...guruš engar dumu-ni ...ur-mes 1(u) 1(diš) guruš ugula ur-lugal 8(diš) guruš ugula ab-ba-sag₁₀ 6(diš) guruš ugula lugal-ku₃-zu 3(diš) guruš ugula šeš-kal-la 2(diš) guruš ugula lugal-iti-da 4(diš) guruš ugula lu₂-dingir-ra 7(diš) guruš ugula ur-am₃-ma 4(diš) guruš ugula ur-e₂-nun-na  1(geš₂) guruš ugula al-la-igi-še₃-du gurum₂ u₄ 2(diš)-kam ki-su₇ ka-ma-ri₂ gub-ba giri₃ i₃-kal-la iti še-kar-ra-gal₂-la mu {d}šu{d}suen lugal uri₅-ma{ki}...da za-ab-ša-li{ki} mu-hul
Exception: Input length of input_ids is 301, but `max_length` is set to 200. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
Sumerian:  ...guruš engar dumu-ni ...ur-mes 1(u) 1(diš) guruš ugula ur-lugal 8(diš) guruš ugula ab-ba-sag₁₀ 6(diš) guruš ugula lugal-ku₃-zu 3(diš) guruš ugula šeš-kal-la 2(diš) guruš ugula lugal-iti-da 4(diš) guruš ugula lu₂-dingir-ra 7(diš) guruš ugula ur-am₃-ma 4(diš) guruš ugula ur-e₂-nun-na  

  results.loc[index, 'GPT_2:temp=1.0'] = english_translation


Sumerian:  1(diš) udu gir-ru-um niga 2(diš) udu eme-gi-ra nita₂ ma₂-an-na unu{ki} iti-ta u₄ 2(u) 6(diš) ba-ra-zal  ki lugal-nir-ta giri₃ ba-qar-tum  iti {d}dumu-zi mu en {d}inana unu{ki} maš₂-e i₃-pa₃   {d}šu{d}suen lugal kal-ga lugal uri₅{ki}ma lugal an ub-da limmu₂-ba  wa-qar-tum nin₉-a-ni
Predicted English:  1(diš) udu gir-ru-um niga 2(diš) udu eme-gi-ra nita₂ ma₂-an-na unu{ki} iti-ta u₄ 2(u) 6(diš) ba-ra-zal  ki lugal-nir-ta giri₃ ba-qar-tum  iti {d}dumu-zi mu en {d}inana unu{ki} maš₂-e i₃-pa₃   {d}šu{d}suen lugal kal-ga lugal uri₅{ki}ma lugal an ub-da limmu₂-ba  wa-qar-tum nin₉-a-ni dub-sar dumu ur... <unk> zu

True English: 1 kirrum sheep, grain-fed, 2 emegi rams, for Heaven-barge of Uruk, of the month, the 26th day passed; from Lugal-nir, via Baqartum. month: “Dumuzi,” year: “The high-priestess of Inanna of Uruk by extispicy was chosen.” Šu-Suen, strong king, king of Ur, king of the four quarters: Waqartum, his sister.
--------------------------------------------------
Sumerian:

In [47]:
# create new columns
results['GPT_2:temp=1.0'] = np.nan

In [67]:
# remove column GPT_2:temp=1
results.drop(columns=['id'], inplace=True, errors='ignore')


In [68]:
results

Unnamed: 0,sumerian,english,BART_base,BART_large,GPT_2:temp=0.2,GPT_2:temp=0.5,GPT_2:temp=0.7,GPT_2:temp=1.0
0,...guruš engar dumu-ni ...ur-mes 1(u) 1(diš) ...,"n male laborers, plowman and his sons, foreman...","n male laborers, plowmen, son of Umes; 11 male...","n male laborers, plow-oxen, for Ur-mes; 11 lab...",Translation Error,Translation Error,Translation Error,Translation Error
1,1(diš) udu gir-ru-um niga 2(diš) udu eme-gi-r...,"1 kirrum sheep, grain-fed, 2 emegi rams, for H...","1 sheep, Girru-um, grain-fed; 2 sheep, eme-gir...","1 sheep Girrum, barley-fed, 2 sheep Eme-gira, ...",1(diš) udu gir-ru-um niga 2(diš) udu eme-gi-r...,1(diš) udu gir-ru-um niga 2(diš) udu eme-gi-r...,1(diš) udu gir-ru-um niga 2(diš) udu eme-gi-r...,1(diš) udu gir-ru-um niga 2(diš) udu eme-gi-r...
2,2(diš) udu-nita₂ kur-ra bar-gal₂ 1(diš) sila₄...,"2 male sheep of the mountain, with fleece, 1 m...","2 rams, mountain range, barigal; 1 male lamb, ...","2 rams, grass-fed, full-grown, bar-gal; 1 lamb...","1 sheep, barley-fed, “barley-fed” (for) the Ki...","1 sheep, barley-fed, “barley-fed” (for) the Ki...","1 sheep, barley-fed, “barley-fed” (for) the Ki...","1 sheep, barley-fed, “barley-fed” (for) the Ki..."
3,...nin₉ ki aŋ₂ {d}dumu-zid-de₃ ...gur₃-ru ki ...,... beloved sister of Dumuzi Exuding/bearing ....,"Nin, from the sand dunes of Dumuzi, on the ban...","... from the temple of Dumu-zidde, ... from th...",Translation Error,Translation Error,Translation Error,Translation Error
4,<unk> nin dub-sar dumu šeš-kal-la,"Šu-Suen, strong king, king of Ur: Aḫuni, cup-b...",Basket-of-tablets: xxx xxx,"Nin, scribe, son of Šeškalla.","Šū-Nin, scribe, son of Šeškalla. Šu-Šimašda, h...","Šū-Nin, scribe, son of Šeškalla. Šu-Sukkalla, ...","Šū-Nin, scribe, son of Šeškalla. Šu-Sîn, his b...","Šū-Nin, scribe, son of Šeškalla. Ša-bani, his ..."
...,...,...,...,...,...,...,...,...
102,gan-kun-sig eriš-dingir {d}pa-bil₃-sag,"Gan-kunsig, the ereš-dingir priestess of the g...","Gankun-sig: Eriš-ilī, his servant.","Gan-kun-sig, scribe, son of Pbil-saga.",The one who possesses silver (is) the one who ...,A pig takes away: its burden (is) lifted (from...,The one who possesses silver (and) lapis lazul...,"A.K. (To) the gran-kun: Eriduš-ingir, the prie..."
103,2(u) ma-na siki ki ur-ba-ba-ta da-da-mu šu ba...,20 ma-na of wool. From Ur-Baba. Dadamu receive...,20 mana of wool from Ur-Baba did Dadamu receiv...,"20 mana wool, from Ur-Baba, Dada-mu received; ...","10 minas of silver, from Ur-Baba, Dadamu recei...","10 minas of silver, from Ur-Baba, Dadamu recei...","10 minas of silver, from Ur-Baba, Dadamu recei...","10 minas of silver, from Ur-Baba, Dadamu recei..."
107,1(u) 1(diš) sila₄ niga sa₂-du₁₁ u₄ 1(u) 1(diš...,"11 male grain-fed lambs, regular offering, 11t...","11 male lambs, grain-fed, regular rations, 11t...","11 sila3 barley, regular offerings, on the 11t...",Translation Error,Translation Error,Translation Error,Translation Error
108,1(diš) udu bar-gal₂ ba-uš₂ ki ku₃-ga-ni-ta ki...,"1 sheep, with fleece, slaughtered, from Kugani...","1 sheep, bar-gal, slaughtered, from Kugani, un...","1 sheep, slaughtered, from Kugani, under seal ...","1 sheep, slaughtered, from Kugani, under seal ...","1 sheep, Bar-gal, slaughtered, from Kugani und...","1 sheep, Bar-gal, slaughtered, from Kugani und...","1 sheep, Baraga slaughtered, from Kugani under..."


### Save as a file

In [69]:
# write results to a CSV file. Since there are sentences, use some appropriate separator
results.to_csv('test_results.csv', index=False, sep='|', encoding='utf-8')