In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from transformers import GenerationConfig
from transformers import EarlyStoppingCallback
from datasets import load_dataset, Dataset as HFDataset
import pandas as pd
from load_dataset import preprocess_dataset
from compute_metrics import compute_metrics

In [None]:
large = False  # Set to True if using the large version of BART

MODEL_NAME = "facebook/bart-base" if not large else "facebook/bart-large"

MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 512
BATCH_SIZE = 8
LEARNING_RATE = 1e-5
NUM_TRAIN_EPOCHS = 100

# Directory to save the fine-tuned model
OUTPUT_DIR = "./bart_model" if not large else "./bart_large_model"

# Directory for TensorBoard logs
LOGGING_DIR = "./bart_logs" if not large else "./bart_large_logs"

# check if dirs exist, if not create them
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOGGING_DIR, exist_ok=True)

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BartForConditionalGeneration.from_pretrained(MODEL_NAME)
model.to(device)

tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)

if not hasattr(model.config, "decoder_start_token_id") or model.config.decoder_start_token_id is None:
    model.config.decoder_start_token_id = tokenizer.bos_token_id if tokenizer.bos_token_id is not None else tokenizer.eos_token_id

In [8]:
preprocessed_train = preprocess_dataset('../datasets/SumTablets_English_train.csv')
preprocessed_val = preprocess_dataset('../datasets/SumTablets_English_validation.csv')
preprocessed_test = preprocess_dataset('../datasets/SumTablets_English_test.csv')

train_data = [{
    'source': row['sumerian'],
    'target': row['english']
} for _, row in preprocessed_train.iterrows()]

val_data = [{
    'source': row['sumerian'],
    'target': row['english']
} for _, row in preprocessed_val.iterrows()]

test_data = [{
    'source': row['sumerian'],
    'target': row['english']
} for _, row in preprocessed_test.iterrows()]

Loaded 1907 examples from ../datasets/SumTablets_English_train.csv
Preprocessed dataset contains 1905 examples
Loaded 107 examples from ../datasets/SumTablets_English_validation.csv
Preprocessed dataset contains 107 examples
Loaded 113 examples from ../datasets/SumTablets_English_test.csv
Preprocessed dataset contains 113 examples


In [9]:
def preprocess_function(examples):
    """
    Tokenizes the source (Sumerian) and target (English) texts.
    """
    inputs = examples['source']
    targets = examples['target']

    # print flag if any of inputs or targets are none
    if any(x is None for x in inputs) or any(x is None for x in targets):
        print("Warning: Found None values in inputs or targets. This may affect training.")


    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True, padding="max_length")

    # Tokenize targets (English) using the newer approach
    labels = tokenizer(text_target=targets, max_length=MAX_TARGET_LENGTH, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Convert lists to Hugging Face Dataset objects
train_dataset = HFDataset.from_list(train_data)
val_dataset = HFDataset.from_list(val_data)

# Apply preprocessing to the datasets
print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

Tokenizing datasets...


Map:   0%|          | 0/1905 [00:00<?, ? examples/s]

NameError: name 'MAX_INPUT_LENGTH' is not defined

In [10]:
print("Example of tokenized input:")
print(tokenized_train_dataset[0])

Example of tokenized input:
{'source': ' 1(u) la₂ 1(diš) udu u₄ 2(u) 8(diš)-kam ki ab-ba-sa₆-ga-ta na-lu₅ i₃-dab₅   iti <unk> bi₂-gu₇ mu en-unu₆-gal {d}inana unu{ki}ga ba-hun  1(u) la₂ 1(diš)', 'target': '9 rams, 28th day, from Abba-saga, Nalu accepted; month: “ubi-feast,” year: “Enunugal of Inanna of Uruk was installed;” (total:) 9 (rams).', 'input_ids': [0, 112, 1640, 257, 43, 897, 24987, 9264, 9264, 112, 1640, 7506, 4654, 43, 1717, 6588, 1717, 24987, 9264, 11936, 132, 1640, 257, 43, 290, 1640, 7506, 4654, 19281, 330, 424, 27651, 4091, 12, 3178, 12, 11146, 24987, 9264, 27819, 12, 2538, 12, 4349, 2750, 12, 6487, 24987, 9264, 5782, 939, 24987, 9264, 862, 12, 417, 873, 24987, 9264, 5782, 1437, 1437, 24, 118, 1437, 3, 4003, 24987, 9264, 9264, 12, 5521, 24987, 9264, 6382, 14701, 1177, 12, 879, 257, 24987, 9264, 27819, 12, 9487, 25522, 417, 24303, 179, 1113, 542, 257, 45152, 3144, 24303, 2538, 17279, 12, 18458, 1437, 112, 1640, 257, 43, 897, 24987, 9264, 9264, 112, 1640, 7506, 4654, 43, 2,

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=500,          # Warmup steps for learning rate scheduler
    gradient_accumulation_steps=1, 
    lr_scheduler_type="cosine", # Use cosine learning rate scheduler
    label_smoothing_factor=0.1,
    generation_max_length=MAX_TARGET_LENGTH,
    generation_num_beams=1,   # Number of beams for beam search during generation

    save_total_limit=3,         # Only keep the last 3 checkpoints
    predict_with_generate=True, # Important for generation tasks like translation
    logging_dir=LOGGING_DIR,
    logging_steps=50,          # Log training loss every N steps (e.g. 100)
    eval_strategy="epoch", # Evaluate at the end of each epoch
    save_strategy="epoch",       # Save model at the end of each epoch
    load_best_model_at_end=True, # Optionally load the best model at the end of training
    metric_for_best_model="meteor", # Metric to determine the best model (e.g., 'bleu' if you add custom metrics)
    fp16=torch.cuda.is_available(), # Use mixed precision training if GPU is available
    report_to="tensorboard" # To visualize logs with TensorBoard
)

# --- 6. Data Collator ---
# The DataCollatorForSeq2Seq handles padding dynamically batch-wise for inputs and labels.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# --- 7. Initialize Trainer ---
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda p: compute_metrics(p, tokenizer)  # Use the compute_metrics function defined in compute_metrics.py
    )
    
# --- 8. Train the Model ---
print("Starting model training...")
try:
    trainer.train()
    print("Training finished successfully!")
    # Save the final model and tokenizer
    trainer.save_model(f"{OUTPUT_DIR}/final_model")
    tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_model_tokenizer")
    print(f"Final model saved to {OUTPUT_DIR}/final_model")
except Exception as e:
    print(f"An error occurred during training: {e}")

Starting model training...


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Bleu,Meteor,Rougel,Gen Len
1,8.5441,5.227262,0.0062,0.0596,0.0428,190.9346




KeyboardInterrupt: 

: 

In [3]:
model = BartForConditionalGeneration.from_pretrained(f"{OUTPUT_DIR}/final_model")
tokenizer = BartTokenizer.from_pretrained(f"{OUTPUT_DIR}/final_model_tokenizer")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# --- 9. Inference (Translation) ---
def translate_samerian_to_english(text, trained_model, trained_tokenizer, device_to_use):
    """
    Translates a Sumerian text to English using the fine-tuned model.
    """
    trained_model.eval() # Set model to evaluation mode
    trained_model.to(device_to_use)

    # Prepare the input text
    inputs = trained_tokenizer(text, return_tensors="pt", max_length=MAX_INPUT_LENGTH, truncation=True, padding=True)
    input_ids = inputs.input_ids.to(device_to_use)
    attention_mask = inputs.attention_mask.to(device_to_use)

    # Generate translation
    # You can adjust generation parameters like num_beams, max_length, etc.
    with torch.no_grad(): # Disable gradient calculations for inference
        outputs = trained_model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=MAX_TARGET_LENGTH + 2,  # +2 for start/end tokens
            num_beams=5, # Beam search width
            early_stopping=True
        )

    # Decode the generated ids to text
    translated_text = trained_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text

testing_data = preprocess_dataset('../datasets/SumTablets_English_test.csv')

for index, row in testing_data.iterrows():
    sumerian_text = row['sumerian']
    english_translation = translate_samerian_to_english(sumerian_text, model, tokenizer, device)
    true_english_translation = row['english']
    print(f"Sumerian: {sumerian_text}")
    print(f"Predicted English: {english_translation}")
    print(f"True English: {true_english_translation}")
    print("-" * 50)

NameError: name 'device' is not defined

In [4]:
import sys  
sys.path.insert(1, '../utils')

from rclone import update_folder_on_onedrive

In [6]:
update_folder_on_onedrive("bart_large_model", "bart_large_model")

Updating 'bart_large_model' on OneDrive with 'bart_large_model'...
rclone command: rclone sync bart_large_model onedrive_bocconi:AI-project/bart_large_model -P
SUCCESS: Folder updated successfully.
Local folder 'bart_large_model' has been removed after successful update.


True