In [1]:
!pip install bitsandbytes
!pip install -q diffusers transformers accelerate peft

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl (67.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.46.0


In [10]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForSeq2SeqLM, 
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from evaluate import load
import numpy as np

In [None]:
# Initialize the model
import gc
import os
from getpass import getpass
import time

# set HF_TOKEN environment variable
if 'HF_TOKEN' not in os.environ:
    os.environ['HF_TOKEN'] = getpass("Enter your Hugging Face token: ")

# Re-running this cell in a live kernel would otherwise leave the previously
# loaded model (and anything still referencing it) resident on the GPU while a
# second copy is loaded. Drop those references and release cached blocks first
# so the cell is safe to re-execute.
for _stale_name in ("trainer", "data_collator", "model"):
    globals().pop(_stale_name, None)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

model_id = "google/gemma-3-1b-it"
# NOTE(review): bnb_config is built but is not passed to from_pretrained below,
# so the model is loaded unquantized. Passing it as `quantization_config` would
# presumably also require attaching PEFT adapters before training -- confirm
# before wiring it in.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_id, token=True)
model = AutoModelForCausalLM.from_pretrained(model_id, token=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Set padding token to the eos token if not set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    # Take the id from the tokenizer so model and tokenizer agree on a single
    # integer pad id, whatever form the model config's eos_token_id has.
    model.config.pad_token_id = tokenizer.pad_token_id

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.12 GiB. GPU 0 has a total capacity of 15.77 GiB of which 45.12 MiB is free. Including non-PyTorch memory, this process has 15.72 GiB memory in use. Of the allocated memory 15.30 GiB is allocated by PyTorch, and 55.37 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 

In [4]:
def load_data(file_path):
    """Read a CSV file and return its text pair under the notebook's column names.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of a CSV that has
            ``transliteration`` and ``translation`` columns.

    Returns:
        A new DataFrame holding only those two columns, renamed to
        ``sumerian`` and ``italian`` respectively.

    Raises:
        KeyError: If either expected column is missing from the file.
    """
    # Source column -> name used by the rest of the notebook (order matters).
    column_map = {'transliteration': 'sumerian', 'translation': 'italian'}

    raw_frame = pd.read_csv(file_path)
    return raw_frame[list(column_map)].rename(columns=column_map)

# Replace these paths with your actual data files
split_paths = {
    "train": 'datasets/SumTablets_English_train.csv',
    "validation": 'datasets/SumTablets_English_validation.csv',
    "test": 'datasets/SumTablets_English_test.csv',
}
train_df, val_df, test_df = (load_data(csv_path) for csv_path in split_paths.values())

# Convert to HuggingFace datasets
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# Bundle the three splits under their split names (train / validation / test)
dataset_dict = DatasetDict(
    dict(zip(split_paths, (train_dataset, val_dataset, test_dataset)))
)

print(f"Dataset loaded: {len(train_dataset)} training, {len(val_dataset)} validation, {len(test_dataset)} test examples")

Dataset loaded: 1907 training, 107 validation, 113 test examples


In [5]:
# Configure for translation task (Gemma is a causal LM, not seq2seq)
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of examples for causal-LM fine-tuning.

    Each example becomes one sequence of the form
    ``"Translate from Sumerian to Italian: {sumerian} => {italian}{eos}"``,
    truncated/padded to ``max_input_length + max_target_length`` tokens.

    Args:
        examples: Batch mapping with ``"sumerian"`` and ``"italian"`` lists
            (as passed by ``Dataset.map(..., batched=True)``). ``None``
            targets are treated as empty strings.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Labels equal
        ``input_ids`` on the target tokens and are ``-100`` on padding,
        leading BOS and prompt tokens, so only the translation contributes
        to the loss.
    """
    # Format inputs for causal language modeling
    prompts = ["Translate from Sumerian to Italian: " + str(text) + " => " for text in examples["sumerian"]]

    # Ensure all targets are strings (handle None values)
    targets = [str(target) if target is not None else "" for target in examples["italian"]]

    # Concatenate inputs and targets for training
    full_texts = [prompt + target + tokenizer.eos_token for prompt, target in zip(prompts, targets)]

    # Plain Python lists are enough here: Dataset.map stores lists anyway.
    model_inputs = tokenizer(
        full_texts,
        max_length=max_input_length + max_target_length,
        truncation=True,
        padding="max_length",
    )

    # Prompt lengths without special tokens; a leading BOS is accounted for
    # per row below by inspecting the actual encoded sequence.
    prompt_token_ids = tokenizer(prompts, add_special_tokens=False)["input_ids"]

    labels = []
    for ids, mask, prompt_ids in zip(
        model_inputs["input_ids"], model_inputs["attention_mask"], prompt_token_ids
    ):
        ids, mask = list(ids), list(mask)

        # The first attended position marks where the real sequence starts;
        # this is 0 for right padding and > 0 for left padding.
        first_real = mask.index(1) if 1 in mask else len(mask)
        has_bos = first_real < len(ids) and ids[first_real] == tokenizer.bos_token_id
        target_start = first_real + (1 if has_bos else 0) + len(prompt_ids)

        # Keep a label only on attended target positions; everything else
        # (padding, BOS, prompt) is ignored by the loss via -100.
        labels.append([
            token if (attended == 1 and position >= target_start) else -100
            for position, (token, attended) in enumerate(zip(ids, mask))
        ])

    model_inputs["labels"] = labels

    return model_inputs

# Apply preprocessing to all datasets, dropping the raw text columns so only
# the tokenizer outputs and labels remain.
raw_column_names = dataset_dict["train"].column_names
tokenized_datasets = dataset_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_column_names,
)

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

In [18]:
batch_size = 8
output_dir = "./gemma-finetuned"

# NOTE(review): predict_with_generate=True makes evaluation call generate() on
# the evaluation batches. For this causal LM those batches already contain the
# prompt AND the reference translation padded to full length, so the reported
# BLEU presumably does not measure real translation quality -- confirm and
# consider an evaluation set that holds prompts only.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # Use mixed precision if available
    push_to_hub=False,  # Set to True if you want to upload to HF Hub
    report_to="tensorboard",
)

# 5. Define data collator
# padding="max_length" needs an explicit max_length; without one the target
# width is derived from the tokenizer's model_max_length instead of the
# sequence length used during preprocessing.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="max_length",
    max_length=max_input_length + max_target_length
)

# 6. Define evaluation metric (BLEU)
bleu = load("sacrebleu")

def _round_metric(value, ndigits=4):
    """Round a scalar metric, or every element of a list/tuple of metrics."""
    if isinstance(value, (list, tuple)):
        return [_round_metric(item, ndigits) for item in value]
    return round(value, ndigits)


def compute_metrics(eval_preds):
    """Decode predictions and references and score them with the BLEU metric.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            token ids. ``-100`` entries in either array are treated as padding.

    Returns:
        Dict with every entry returned by ``bleu.compute`` plus ``gen_len``
        (mean whitespace-token count of the decoded predictions). Scalars are
        rounded to 4 decimals; list-valued entries are rounded element-wise.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is an ignore/padding marker, not a valid token id; map it to the
    # pad token in BOTH arrays before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Postprocess to compute BLEU properly
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]  # BLEU expects a list of lists

    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    # Add mean generated length
    prediction_lens = [len(pred.split()) for pred in decoded_preds]
    result["gen_len"] = np.mean(prediction_lens)

    # Some entries of the metric result are lists, which plain round() rejects.
    return {k: _round_metric(v) for k, v in result.items()}

# 7. Initialize trainer
# Wires together the model, the training arguments, the tokenized train and
# validation splits, the collator and the BLEU metric function.
# NOTE(review): the recorded output shows a warning pointing at this call;
# presumably it concerns the `tokenizer=` argument -- confirm against the
# installed transformers version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Train the model
# NOTE(review): the recorded run stopped here with a CUDA out-of-memory error,
# so none of the steps below have recorded output.
print("Starting training...")
trainer.train()

# 9. Save the model
# Model and tokenizer are written to the same directory.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

# 10. Evaluate on test set
# Runs the trainer's evaluation (including compute_metrics) on the test split.
print("Evaluating on test set...")
test_results = trainer.evaluate(tokenized_datasets["test"])
print(f"Test results: {test_results}")

# 11. Translation examples
def translate(sumerian_text):
    """Generate a translation for one Sumerian text with the fine-tuned model.

    Args:
        sumerian_text: The source text; it is converted with ``str()`` and
            wrapped in the same prompt template used by
            ``preprocess_function``.

    Returns:
        The decoded continuation produced after the prompt, with special
        tokens removed and surrounding whitespace stripped.
    """
    # Must match the training format exactly, otherwise the model is queried
    # with an input shape it was never fine-tuned on.
    prompt = "Translate from Sumerian to Italian: " + str(sumerian_text) + " => "
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
    prompt_length = len(inputs["input_ids"][0])

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # For a causal LM, max_length would also count the prompt tokens;
        # max_new_tokens bounds only the generated translation.
        max_new_tokens=max_target_length,
        num_beams=5,
        early_stopping=True
    )

    # A causal LM returns prompt + continuation; keep only the continuation.
    generated_tokens = outputs[0][prompt_length:]
    translation = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return translation.strip()

# Show the model's output next to the reference for the first five test rows
test_examples = test_dataset.select(range(5))
print("\nTranslation examples:")
for example in test_examples:
    sumerian, actual_italian = example["sumerian"], example["italian"]
    predicted_italian = translate(sumerian)
    # One print call; sep="\n" yields the same four lines as separate prints.
    print(
        f"Sumerian:          {sumerian}",
        f"Actual Italian:    {actual_italian}",
        f"Predicted Italian: {predicted_italian}",
        "-" * 80,
        sep="\n",
    )

  trainer = Seq2SeqTrainer(


Starting training...


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 15.77 GiB of which 45.12 MiB is free. Including non-PyTorch memory, this process has 15.72 GiB memory in use. Of the allocated memory 15.30 GiB is allocated by PyTorch, and 55.37 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)