In [None]:
!pip install bitsandbytes
!pip install -q diffusers transformers accelerate peft

Defaulting to user installation because normal site-packages is not writeable
Collecting bitsandbytes
  Using cached bitsandbytes-0.46.0-py3-none-win_amd64.whl.metadata (10 kB)
Downloading bitsandbytes-0.46.0-py3-none-win_amd64.whl (66.5 MB)
   ---------------------------------------- 0.0/66.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/66.5 MB ? eta -:--:--
   ---------------------------------------- 0.3/66.5 MB ? eta -:--:--
    --------------------------------------- 1.0/66.5 MB 2.5 MB/s eta 0:00:27
   - -------------------------------------- 1.8/66.5 MB 3.0 MB/s eta 0:00:22
   - -------------------------------------- 2.1/66.5 MB 2.9 MB/s eta 0:00:23
   - -------------------------------------- 3.1/66.5 MB 3.0 MB/s eta 0:00:21
   -- ------------------------------------- 3.7/66.5 MB 3.2 MB/s eta 0:00:20
   -- ------------------------------------- 4.2/66.5 MB 3.0 MB/s eta 0:00:21
   -- ------------------------------------- 5.0/66.5 MB 3.1 MB/s eta 0:00:21
   --- --

In [7]:
import inspect
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from evaluate import load
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)

In [None]:
# Initialize the model
import os
from getpass import getpass
import time

# Ask for the Hugging Face token only when the environment does not already
# provide one, so the secret is never written into the notebook itself.
if 'HF_TOKEN' not in os.environ:
    os.environ['HF_TOKEN'] = getpass("Enter your Hugging Face token: ")

# Single source of truth for the checkpoint name used by tokenizer and model.
model_id = "google/gemma-2b"

# 4-bit NF4 quantization settings with bfloat16 compute.
# NOTE(review): this config is constructed but not passed to from_pretrained,
# so the model below loads unquantized. Wiring it in presumably also requires
# training through PEFT adapters and dropping the explicit .to(device) call;
# confirm the intended setup before enabling it.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# token=True presumably makes the hub client use the stored/env token (HF_TOKEN) - confirm.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=True)
model = AutoModelForCausalLM.from_pretrained(model_id, token=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Fall back to EOS as the padding token when the tokenizer defines none.
# The model config takes the tokenizer's pad id so both sides agree on the
# exact id the tokenizer will emit for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

In [None]:

# 1. Load and prepare the dataset
# Assuming you have Sumerian-Italian parallel data in CSV/TSV format
# Format should have 'sumerian' and 'italian' columns

def load_data(file_path, sep='\t'):
    """Read a parallel-corpus file into a DataFrame and check its schema.

    Args:
        file_path: Location of the delimited text file, passed straight to
            ``pd.read_csv``.
        sep: Field delimiter. Defaults to a tab (the previous hard-coded
            behaviour); pass ``','`` for comma-separated files.

    Returns:
        pandas.DataFrame containing at least the 'sumerian' and 'italian'
        columns.

    Raises:
        AssertionError: If either required column is missing.
    """
    df = pd.read_csv(file_path, sep=sep)

    # Fail early with a clear message rather than with a KeyError during tokenization.
    for column in ('sumerian', 'italian'):
        assert column in df.columns, f"Dataset must contain '{column}' column"

    return df

# Placeholder locations: point these at the real train/validation/test files.
train_df, val_df, test_df = map(
    load_data,
    ('path/to/train_data.csv', 'path/to/val_data.csv', 'path/to/test_data.csv'),
)

# Wrap each pandas frame as a HuggingFace Dataset, keeping the split order.
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_pandas,
    (train_df, val_df, test_df),
)

# Bundle the three splits under the names the rest of the notebook indexes by.
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

print(f"Dataset loaded: {len(train_dataset)} training, {len(val_dataset)} validation, {len(test_dataset)} test examples")

# 2. Configure the tokenizer for the translation task
# The target language is Italian, matching the 'italian' column used for labels.
# NOTE(review): these are plain attributes set on the tokenizer object;
# presumably only multilingual seq2seq tokenizers consult them - confirm they
# have any effect for this checkpoint.
tokenizer.src_lang = "sumerian"
tokenizer.tgt_lang = "italian"

# 3. Preprocess the data
# Token-count limits: inputs and targets are truncated/padded to these lengths
# in preprocess_function, and max_target_length also bounds generation in translate.
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of Sumerian/Italian pairs into model inputs and labels.

    Args:
        examples: Batch mapping with "sumerian" and "italian" entries, each a
            list of strings (the form ``Dataset.map(..., batched=True)`` passes).

    Returns:
        The tokenizer output for the Sumerian texts, truncated/padded to
        ``max_input_length``, with an added "labels" entry holding the Italian
        token ids truncated/padded to ``max_target_length`` where every padding
        position is replaced by -100.
    """
    # NOTE(review): the model loaded in this notebook is a decoder-only causal LM,
    # while source and target are tokenized here as two separate sequences (the
    # encoder-decoder recipe) - confirm this is the intended training format.
    sources = list(examples["sumerian"])
    targets = list(examples["italian"])

    model_inputs = tokenizer(
        sources, max_length=max_input_length, truncation=True, padding="max_length"
    )

    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    target_encodings = tokenizer(
        text_target=targets, max_length=max_target_length, truncation=True, padding="max_length"
    )

    # Mask padding by attention mask instead of by token id: when the pad token
    # falls back to EOS, id-based masking would also hide the real EOS token
    # from the loss. -100 positions are ignored in the loss calculation.
    model_inputs["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(target_encodings["input_ids"], target_encodings["attention_mask"])
    ]

    return model_inputs

# Apply preprocessing to all datasets
tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

# 4. Configure training
batch_size = 16  # Adjust based on your GPU memory
output_dir = "./gemma-sumerian-italian"

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old name was later dropped. Pick whichever field this
# installation's arguments dataclass actually declares.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in Seq2SeqTrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # Use mixed precision if available
    push_to_hub=False,  # Set to True if you want to upload to HF Hub
    report_to="tensorboard",
    # Evaluate once per epoch, matching the per-epoch save strategy above.
    **{eval_strategy_arg: "epoch"},
)

# 5. Define data collator
# Every example is already padded to a fixed length in preprocess_function, so
# pad-to-longest leaves batches unchanged. Requesting padding="max_length"
# without also giving a max_length leaves the collator with no length to pad to.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
)

# 6. Define evaluation metric (BLEU)
bleu = load("sacrebleu")

def _round_metric(value, ndigits=4):
    """Round floats (including floats nested in lists/tuples); pass other values through."""
    if isinstance(value, (list, tuple)):
        return [_round_metric(item, ndigits) for item in value]
    if isinstance(value, (float, np.floating)):
        return round(float(value), ndigits)
    return value


def compute_metrics(eval_preds):
    """Decode predictions and references and score them with BLEU.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also be a tuple whose first element is the token-id array.

    Returns:
        dict with every entry produced by ``bleu.compute`` plus "gen_len" (the
        mean whitespace-separated word count of the decoded predictions).
        Float values, including those inside list entries, are rounded to
        4 decimals; other values are returned unchanged.
    """
    preds, labels = eval_preds
    # Some models hand back a tuple; the generated ids are its first element.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and is not a decodable token id, so swap it
    # for the pad id in both arrays before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predictions and labels
    decoded_preds = [
        text.strip() for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    # BLEU expects a list of reference lists (one list per prediction).
    decoded_labels = [
        [text.strip()] for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    # Add mean generated length (0.0 for an empty batch instead of NaN).
    prediction_lens = [len(pred.split()) for pred in decoded_preds]
    result["gen_len"] = float(np.mean(prediction_lens)) if prediction_lens else 0.0

    # The metric output mixes floats, ints and lists, so a bare round() on
    # every value would raise on the list-valued entries.
    return {key: _round_metric(value) for key, value in result.items()}

# 7. Initialize trainer
# Newer transformers releases accept the tokenizer under `processing_class`
# and deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Use whichever name this installation's constructor signature declares.
tokenizer_arg = (
    "processing_class"
    if "processing_class" in inspect.signature(Seq2SeqTrainer.__init__).parameters
    else "tokenizer"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_arg: tokenizer},
)

# 8. Train the model
print("Starting training...")
# Runs fine-tuning with the datasets and arguments the trainer was built with.
trainer.train()

# 9. Save the model
# Model and tokenizer are written to the same directory (output_dir).
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

# 10. Evaluate on test set
print("Evaluating on test set...")
# The tokenized test split is passed explicitly; the trainer itself was
# constructed with the validation split as its eval_dataset.
test_results = trainer.evaluate(tokenized_datasets["test"])
print(f"Test results: {test_results}")

# 11. Translation examples
def translate(sumerian_text):
    """Generate a translation for one Sumerian text with the current model.

    Args:
        sumerian_text: Source text to translate.

    Returns:
        The decoded generation with special tokens removed. For a decoder-only
        model the prompt tokens are stripped so only the newly generated text
        is returned.
    """
    inputs = tokenizer(sumerian_text, return_tensors="pt", padding=True).to(device)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # Bound the number of generated tokens; `max_length` would also count
        # the prompt tokens for a decoder-only model.
        max_new_tokens=max_target_length,
        num_beams=5,
        early_stopping=True,
    )

    generated = outputs[0]
    # A decoder-only model returns prompt + continuation; drop the prompt so
    # the source text is not echoed back as part of the translation.
    if not getattr(model.config, "is_encoder_decoder", False):
        prompt_length = inputs["input_ids"].shape[1]
        generated = generated[prompt_length:]

    translation = tokenizer.decode(generated, skip_special_tokens=True)
    return translation

# Test translation on a few examples
# Cap the sample at the size of the test split: select() raises on indices
# beyond the end of the dataset.
num_examples = min(5, len(test_dataset))
test_examples = test_dataset.select(range(num_examples))
print("\nTranslation examples:")
for example in test_examples:
    sumerian = example["sumerian"]
    actual_italian = example["italian"]
    predicted_italian = translate(sumerian)
    print(f"Sumerian:          {sumerian}")
    print(f"Actual Italian:    {actual_italian}")
    print(f"Predicted Italian: {predicted_italian}")
    print("-" * 80)