In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

model = GPT2LMHeadModel.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Test model is working

prompt_text = "Once upon a time, in a land far, far away,"

# Imposta il pad_token se non è definito
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

# --- Device Configuration (GPU o CPU) ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Modello caricato su: {device}")

# --- Tokenizzazione dell'input ---
input_ids = tokenizer.encode(prompt_text, return_tensors='pt').to(device)

# --- Generazione del testo ---
print("Generazione del testo...")
try:
    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=150,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.1,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )

    # --- Decodifica e Stampa ---
    for i, generated_sequence in enumerate(output_sequences):
        text = tokenizer.decode(generated_sequence, skip_special_tokens=True)
        print(f"\n--- Testo Generato {i+1} ---")
        print(text)

except Exception as e:
    print(f"Errore durante la generazione del testo: {e}")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Modello caricato su: cpu
Generazione del testo...

--- Testo Generato 1 ---
Once upon a time, in a land far, far away, the world was not so much as one of those things that is called "the earth." It had been there for thousands of years. And now it has come to pass; and we are all living on this planet with our own hands!
The Earth's surface temperature rose by about 1 degree Celsius (3 degrees Fahrenheit) during last century alone—a record high since records began being made at least 20 million years ago. The average annual increase over these past two centuries would have taken place only if temperatures were kept constant throughout history: today, they're just below zero or even above 0°C/century — which means no warming whatsoever from human activity until 2100. But what happens


In [5]:
# Import necessary libraries
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
import pandas as pd

# --- 1. Configuration & Parameters ---
# Model choice: you can start with 'gpt2' and try 'gpt2-medium' if you have more resources
MODEL_NAME = 'gpt2'
OUTPUT_DIR = './sumerian_gpt2_finetuned' # Directory to save the fine-tuned model
LOG_DIR = './logs'                     # Directory for training logs

# Training hyperparameters (adjust these based on your dataset size and resources)
NUM_EPOCHS = 3                         # Number of training epochs
BATCH_SIZE_PER_DEVICE = 4              # Batch size for training and evaluation (adjust based on GPU memory)
LEARNING_RATE = 5e-5                   # Learning rate
WARMUP_STEPS = 100                     # Number of warmup steps for learning rate scheduler
WEIGHT_DECAY = 0.01                    # Weight decay
MAX_LENGTH = 256                       # Maximum sequence length for tokenizer (adjust based on your data)
TRAIN_VALID_SPLIT = 0.1                # Proportion of data to use for validation

# --- 2. Load and Prepare Your Dataset ---
# Assume you have your data as two lists: `sumerian_texts` and `english_translations`
# Example:
# sumerian_texts = ["transliteration 1", "transliteration 2", ...]
# english_translations = ["translation 1", "translation 2", ...]

train_data = pd.read_csv('datasets/SumTablets_English_train.csv')
test_data = pd.read_csv('datasets/SumTablets_English_train.csv')


# Format the data for GPT-2:
# We'll combine Sumerian and English with a separator.
# GPT-2 will learn to generate the English part after seeing "English: ".
# The <|endoftext|> token is GPT-2's standard end-of-sequence token.
formatted_texts = []
for index, row in train_data.iterrows():
    sumerian_texts = row['transliteration']
    english_translations = row['translation']
    if isinstance(sumerian_texts, str) and isinstance(english_translations, str):
        formatted_texts.append(f"Sumerian: {sumerian_texts.strip()} English: {english_translations.strip()}")
print(f"Loaded {len(formatted_texts)} formatted examples.")

if formatted_texts:
    print(f"Example formatted text: {formatted_texts[0]}")

# --- 3. Initialize Tokenizer ---
# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)

# GPT-2 doesn't have a pad token by default. We'll use the eos_token as the pad_token.
# This is a common practice.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"Set tokenizer.pad_token to tokenizer.eos_token ({tokenizer.eos_token})")

# --- 4. Create a PyTorch Dataset ---
class SumerianEnglishDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length
        self.encodings = []
        for text in texts:
            # Tokenize the combined text
            # truncation=True ensures that sequences longer than max_length are cut.
            # padding='max_length' pads shorter sequences to max_length.
            # return_tensors='pt' returns PyTorch tensors.
            encoding = self.tokenizer(
                text,
                truncation=True,
                max_length=self.max_length,
                padding="max_length", # Ensure all sequences have the same length for batching
                return_attention_mask=True,
                return_tensors='pt'  # Explicitly specify to return PyTorch tensors
            )
            # For language modeling, the 'labels' are typically the same as 'input_ids'.
            # The model will learn to predict the next token.
            # The DataCollatorForLanguageModeling will handle shifting labels for us.
            self.encodings.append({
                "input_ids": encoding["input_ids"].squeeze(), # Remove batch dimension if present
                "attention_mask": encoding["attention_mask"].squeeze()
            })

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        item = self.encodings[idx]
        # The labels are the input_ids themselves for language modeling.
        # The model is trained to predict the next token in the sequence.
        # The DataCollatorForLanguageModeling will shift them appropriately.
        return {"input_ids": item["input_ids"], "attention_mask": item["attention_mask"], "labels": item["input_ids"].clone()}

# Create the full dataset
full_dataset = SumerianEnglishDataset(formatted_texts, tokenizer, MAX_LENGTH)

# Split into training and validation sets
if TRAIN_VALID_SPLIT > 0:
    num_train = int((1 - TRAIN_VALID_SPLIT) * len(full_dataset))
    num_valid = len(full_dataset) - num_train
    train_dataset, eval_dataset = random_split(full_dataset, [num_train, num_valid])
    print(f"Split dataset into {len(train_dataset)} training samples and {len(eval_dataset)} validation samples.")
else:
    train_dataset = full_dataset
    eval_dataset = None # No validation
    print(f"Using all {len(train_dataset)} samples for training. No validation set.")


# --- 5. Initialize Model ---
# Load GPT-2 model with a language modeling head
# model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

# Resize token embeddings if you added special tokens (not strictly necessary here as we used eos_token as pad_token)
# model.resize_token_embeddings(len(tokenizer)) # Uncomment if you explicitly added new tokens

# Set the pad_token_id in the model configuration (important for generation and padding)
model.config.pad_token_id = tokenizer.pad_token_id
print(f"Set model.config.pad_token_id to {tokenizer.pad_token_id}")


# --- 6. Data Collator ---
# The DataCollatorForLanguageModeling will automatically create batches and
# shift the input_ids to create labels for causal language modeling (predicting the next token).
# It also handles padding. `mlm=False` means we are doing Causal Language Modeling (CLM), not Masked Language Modeling (MLM).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal Language Modeling for GPT-2
)

# --- 7. Define Training Arguments ---
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,                  # Directory to save model checkpoints and outputs
    num_train_epochs=NUM_EPOCHS,            # Total number of training epochs
    per_device_train_batch_size=BATCH_SIZE_PER_DEVICE, # Batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE_PER_DEVICE,  # Batch size for evaluation
    warmup_steps=WARMUP_STEPS,              # Number of warmup steps for learning rate scheduler
    weight_decay=WEIGHT_DECAY,              # Strength of weight decay
    logging_dir=LOG_DIR,                    # Directory for storing logs
    logging_steps=10,                       # Log every X updates steps
    eval_strategy="epoch" if eval_dataset else "no", # Evaluate at the end of each epoch if eval_dataset exists
    save_strategy="epoch",                  # Save a checkpoint at the end of each epoch
    load_best_model_at_end=True if eval_dataset else False, # Load the best model found during training (based on eval loss)
    fp16=torch.cuda.is_available(),         # Use 16-bit (mixed) precision training if a GPU is available
    # report_to="tensorboard",              # You can integrate with TensorBoard, WandB, etc.
)

# --- 8. Initialize Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# --- 9. Start Fine-tuning ---
print("Starting fine-tuning...")
try:
    trainer.train()
    print("Fine-tuning completed.")
except Exception as e:
    print(f"An error occurred during training: {e}")
    # Potentially save the current state if an error occurs mid-training
    # model.save_pretrained(f"{OUTPUT_DIR}_interrupted")
    # tokenizer.save_pretrained(f"{OUTPUT_DIR}_interrupted")
    # print(f"Interrupted model saved to {OUTPUT_DIR}_interrupted")
    raise e


# --- 10. Save the Fine-tuned Model and Tokenizer ---
print(f"Saving model to {OUTPUT_DIR}")
trainer.save_model(OUTPUT_DIR) # Saves the model state_dict and configuration
tokenizer.save_pretrained(OUTPUT_DIR) # Saves the tokenizer
print(f"Model and tokenizer saved to {OUTPUT_DIR}")


# --- 11. Inference Example (How to use the fine-tuned model) ---
print("\n--- Inference Example ---")
# Load the fine-tuned model and tokenizer
fine_tuned_model = GPT2LMHeadModel.from_pretrained(OUTPUT_DIR)
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained(OUTPUT_DIR)

# Ensure the pad token is set for the loaded tokenizer (it should be saved, but good to double check)
if fine_tuned_tokenizer.pad_token is None:
    fine_tuned_tokenizer.pad_token = fine_tuned_tokenizer.eos_token
    fine_tuned_model.config.pad_token_id = fine_tuned_tokenizer.eos_token_id


# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fine_tuned_model.to(device)
fine_tuned_model.eval() # Set the model to evaluation mode

# Example Sumerian transliteration to translate
sumerian_prompt = "dingir inana za-me-en" # "Goddess Inana, you are"

# Format the prompt exactly as done during training, up to the point where generation should start
prompt_for_generation = f"Sumerian: {sumerian_prompt.strip()} English:"
print(f"Prompt for generation: '{prompt_for_generation}'")

# Tokenize the prompt
input_ids = fine_tuned_tokenizer.encode(prompt_for_generation, return_tensors='pt').to(device)

# Generate text
# Adjust generation parameters as needed
# max_new_tokens is often preferred over max_length for more control over the generated part
# For this example, we'll use max_length relative to the prompt.
output_sequences = fine_tuned_model.generate(
    input_ids=input_ids,
    max_length=MAX_LENGTH, # Max length of prompt + generated text
    # max_new_tokens=50, # Alternative: specify only the number of new tokens to generate
    temperature=0.7,          # Controls randomness. Lower is more deterministic.
    top_k=50,                 # Considers the top K most probable tokens at each step.
    top_p=0.95,               # Nucleus sampling: considers tokens with cumulative probability >= P.
    repetition_penalty=1.2,   # Penalizes repetition.
    num_return_sequences=1,   # Number of different sequences to generate.
    pad_token_id=fine_tuned_tokenizer.eos_token_id # Crucial for generation
)

# Decode and print the generated text
for generated_sequence in output_sequences:
    full_text = fine_tuned_tokenizer.decode(generated_sequence, skip_special_tokens=False) # Keep special tokens initially for inspection
    # Extract only the generated English part
    # This depends on your prompt format. We look for text after "English: "
    generated_english = full_text.split(prompt_for_generation)[-1]
    # Remove the <|endoftext|> token if present at the end
    generated_english = generated_english.replace(fine_tuned_tokenizer.eos_token, "").strip()

    print(f"Sumerian Input: {sumerian_prompt}")
    print(f"Generated English: {generated_english}")
    # For more detailed inspection:
    # print(f"Full generated sequence: {full_text}")

print("\nScript finished.")

# To run this script:
# 1. Save it as a Python file (e.g., `finetune_sumerian_gpt2.py`).
# 2. Make sure you have your Sumerian and English data ready and update `load_your_data()`.
# 3. Install the necessary libraries: pip install torch transformers datasets (datasets is not used here but often useful)
# 4. Run from your terminal: python finetune_sumerian_gpt2.py

Loaded 1905 formatted examples.
Example formatted text: Sumerian: 1(u) la₂ 1(diš) udu
u₄ 2(u) 8(diš)-kam
ki ab-ba-sa₆-ga-ta
na-lu₅ i₃-dab₅


iti <unk> bi₂-gu₇
mu en-unu₆-gal {d}inana unu{ki}ga ba-hun

1(u) la₂ 1(diš) English: 9 rams,
28th day,
from Abba-saga,
Nalu accepted;
month: “ubi-feast,”
year: “Enunugal of Inanna of Uruk was installed;”
(total:) 9 (rams).
Set tokenizer.pad_token to tokenizer.eos_token (<|endoftext|>)
Split dataset into 1714 training samples and 191 validation samples.
Set model.config.pad_token_id to 50256


  trainer = Trainer(


Starting fine-tuning...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 