In [None]:
import os
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

In [None]:
OUTPUT_DIR = './sumerian_gpt2_finetuned'    # Directory to save the fine-tuned model
MAX_WORDS = 528                             # Whitespace-word cap for one formatted example

train_data = pd.read_csv('datasets/SumTablets_English_train.csv')
# Load the held-out split for evaluation; reading the train CSV here would
# make test_data an exact copy of train_data.
# NOTE(review): assumes the test split is stored next to the train CSV under
# this file name -- confirm against the datasets/ directory.
test_data = pd.read_csv('datasets/SumTablets_English_test.csv')

# Format the data for GPT-2:
# Each example is "Sumerian: <transliteration>\nEnglish: <translation><|endoftext|>".
# GPT-2 will learn to generate the English part after seeing "English: ".
# The <|endoftext|> token is GPT-2's standard end-of-sequence token.
formatted_texts = []
for sumerian, english in zip(train_data['transliteration'], train_data['translation']):
    # Missing cells are read by pandas as float NaN; skip rows without both texts.
    if not (isinstance(sumerian, str) and isinstance(english, str)):
        continue
    # Newlines inside a field are flattened so the only "\n" left in an
    # example is the separator between the two languages.
    sumerian = sumerian.replace('\n', ' ')
    english = english.replace('\n', ' ')
    formatted_texts.append(f"Sumerian: {sumerian}\nEnglish: {english}<|endoftext|>")
print(f"Loaded {len(formatted_texts)} formatted examples.")

# Length of every example in whitespace-separated words (computed before filtering).
# Only summary statistics are printed; dumping the full list floods the cell output.
lengths = [len(text.split()) for text in formatted_texts]
if lengths:
    mean_length = np.mean(lengths)
    pct_too_long = sum(length > MAX_WORDS for length in lengths) / len(lengths) * 100
    print(f"Mean length of the texts: {mean_length} words")
    print(f"Percentage of texts longer than {MAX_WORDS} words: {pct_too_long:.2f}%")
else:
    # Nothing usable was loaded: avoid dividing by zero in the statistics.
    mean_length = float('nan')
    print("No usable examples: check the 'transliteration' and 'translation' columns.")

# Remove texts longer than MAX_WORDS words, reusing the counts computed above.
formatted_texts = [
    text for text, length in zip(formatted_texts, lengths) if length <= MAX_WORDS
]
print(len(formatted_texts), "texts after filtering by length.")

if formatted_texts:
    print(f"\nExample formatted text:\n{formatted_texts[0]}")

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

N_EXAMPLES = 30         # Number of leading test rows to translate
MAX_NEW_TOKENS = 200    # Token budget for the generated translation (prompt excluded)

# Load the fine-tuned model and its tokenizer from the checkpoint directory.
model = GPT2LMHeadModel.from_pretrained(f"{OUTPUT_DIR}").to(device)
tokenizer = GPT2Tokenizer.from_pretrained(f"{OUTPUT_DIR}")

# Prompt and generated tokens must fit together in the model's position
# embeddings, so the prompt may use at most this many tokens.
max_prompt_tokens = model.config.n_positions - MAX_NEW_TOKENS

for _, row in test_data.head(N_EXAMPLES).iterrows():
    sumerian_texts = row['transliteration']
    english_translations = row['translation']
    # Missing cells are read by pandas as float NaN; skip them instead of
    # failing on .replace (same guard as the training-data formatting loop).
    if not (isinstance(sumerian_texts, str) and isinstance(english_translations, str)):
        continue
    sumerian_texts = sumerian_texts.replace('\n', ' ')
    english_translations = english_translations.replace('\n', ' ')
    # Must match the training format exactly ("Sumerian: ...\nEnglish: ..."):
    # no space before the newline, otherwise the prompt tokenizes differently
    # from the examples the model was fine-tuned on.
    prompt_text = f"Sumerian: {sumerian_texts}\nEnglish:"

    # --- Tokenize the input (ids plus attention mask) ---
    encoded = tokenizer(prompt_text, return_tensors='pt').to(device)
    prompt_length = encoded['input_ids'].shape[-1]
    if prompt_length > max_prompt_tokens:
        # Generating past the context window would index outside the position
        # embeddings; truncating would cut the "\nEnglish:" cue off the prompt.
        print(f"Prompt troppo lungo ({prompt_length} token): esempio saltato.")
        continue

    # --- Generate the text ---
    print("Generazione del testo...")
    try:
        # Deterministic beam search. Sampling knobs (temperature, top_k, top_p)
        # are not passed because they have no effect unless do_sample=True.
        output_sequences = model.generate(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],  # Explicit mask: pad id equals EOS id
            max_new_tokens=MAX_NEW_TOKENS,  # Limit on generated tokens only, independent of prompt length
            num_beams=3,                    # Beam search with 3 beams
            early_stopping=True,            # Stop once num_beams finished candidates exist
            length_penalty=1.0,             # Neutral - neither favor short nor long outputs
            no_repeat_ngram_size=3,         # Prevent 3-gram repetition
            repetition_penalty=1.0,         # 1.0 = no repetition penalty
            num_return_sequences=1,         # Number of different sequences to return
            pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; reuse EOS
        )

        print(f"Testo di input: {prompt_text}")
        print(f"Traduzione effettiva: {english_translations}")

        # --- Decode and print ---
        for generated_sequence in output_sequences:
            # The output starts with the prompt tokens; decode only the
            # continuation so it lines up with the reference translation.
            text = tokenizer.decode(generated_sequence[prompt_length:], skip_special_tokens=True)
            print('Testo Generato:', text.strip())
            print('---')

    except Exception as e:
        # Best effort: report the failure and move on to the next example.
        print(f"Errore durante la generazione del testo: {e}")