In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict

from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

In [2]:
# --- 1. Configuration ---
MODEL_CHECKPOINT = "alakxender/flan-t5-base-dhivehi-en-latin"
NEW_MODEL_NAME = "models/flan-t5-base-dv2latin-mihaaru"
SOURCE_LANG = "dhivehi"
TARGET_LANG = "latin"
PREFIX = "dv2latin: "

## Load the data

In [3]:
def load_and_prepare_translation_data(train_csv_path: str, test_csv_path: str, source_lang: str, target_lang: str) -> DatasetDict:
    """
    Loads translation data from separate train and test CSV files, cleans it,
    and returns a Hugging Face DatasetDict.

    Args:
        train_csv_path (str): Path to the training CSV file.
        test_csv_path (str): Path to the testing CSV file.
        source_lang (str): Column name for the source language text.
        target_lang (str): Column name for the target language text.

    Returns:
        DatasetDict: A Hugging Face DatasetDict containing 'train' and 'test' splits.
    """

    # Load training data
    df_train = pd.read_csv(train_csv_path)
    df_train.dropna(subset=[source_lang, target_lang], inplace=True)
    df_train[source_lang] = df_train[source_lang].astype(str)
    df_train[target_lang] = df_train[target_lang].astype(str)
    train_dataset = Dataset.from_pandas(df_train)

    # Load testing data
    df_test = pd.read_csv(test_csv_path)
    df_test.dropna(subset=[source_lang, target_lang], inplace=True)
    df_test[source_lang] = df_test[source_lang].astype(str)
    df_test[target_lang] = df_test[target_lang].astype(str)
    test_dataset = Dataset.from_pandas(df_test)

    # Create a DatasetDict
    dataset = DatasetDict({
        'train': train_dataset,
        'test': test_dataset
    })

    return dataset

In [4]:
dataset = load_and_prepare_translation_data("data/train.csv", "data/test.csv", SOURCE_LANG, TARGET_LANG)
print("Dataset prepared and split:", dataset)

Dataset prepared and split: DatasetDict({
    train: Dataset({
        features: ['latin', 'dhivehi'],
        num_rows: 17979
    })
    test: Dataset({
        features: ['latin', 'dhivehi'],
        num_rows: 1998
    })
})


## Tokenizer and preprocessing

In [5]:
# --- 3. Load Tokenizer and Model ---
print(f"Loading tokenizer and model from '{MODEL_CHECKPOINT}'...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

# --- 4. Preprocessing Function ---
def preprocess_function(examples):
    """Tokenizes the source and target texts."""
    # Add the task-specific prefix to the inputs
    inputs = [PREFIX + doc for doc in examples[SOURCE_LANG]]
    
    # Tokenize the inputs
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    
    # Tokenize the targets (labels)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples[TARGET_LANG], max_length=128, truncation=True, padding="max_length")
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("Tokenizing the dataset...")
tokenized_datasets = dataset.map(preprocess_function, batched=True)
print("Tokenization complete.")

Loading tokenizer and model from 'alakxender/flan-t5-base-dhivehi-en-latin'...


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Tokenizing the dataset...


Map:   0%|          | 0/17979 [00:00<?, ? examples/s]



Map:   0%|          | 0/1998 [00:00<?, ? examples/s]

Tokenization complete.


## Training

In [6]:
# --- 5. Set up Training ---
# Data collator is responsible for creating batches of data
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=model
)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=NEW_MODEL_NAME,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8, # Adjust based on your GPU memory
    per_device_eval_batch_size=8,  # Adjust based on your GPU memory
    weight_decay=0.01,
    save_total_limit=5,
    num_train_epochs=5, # Start with 3 epochs and adjust as needed
    predict_with_generate=True,
    fp16=True, # Use mixed precision for faster training if you have a compatible GPU
    push_to_hub=False, # Set to True if you want to upload to Hugging Face Hub
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# --- 6. Start Fine-Tuning ---
print("\nStarting the fine-tuning process...")
trainer.train()
print("Fine-tuning complete.")

# --- 7. Save the Fine-Tuned Model ---
print(f"Saving the fine-tuned model to '{NEW_MODEL_NAME}'...")
trainer.save_model(NEW_MODEL_NAME)
tokenizer.save_pretrained(NEW_MODEL_NAME)
print("Model saved successfully.")

  trainer = Seq2SeqTrainer(



Starting the fine-tuning process...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.24,0.195387
2,0.1928,0.170716
3,0.1636,0.160836
4,0.1526,0.156717
5,0.1461,0.155982


Fine-tuning complete.
Saving the fine-tuned model to 'models/flan-t5-base-dv2latin-mihaaru'...
Model saved successfully.


## Example inference

In [7]:
# Load the fine-tuned model and tokenizer
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(NEW_MODEL_NAME)
fine_tuned_model = T5ForConditionalGeneration.from_pretrained(NEW_MODEL_NAME)

# Example text
source_text = "އިންޑިއާގައި ފޭކް އެމްބަސީއެއް ހަދައިގެން އުޅުނު މީހަކު ހައްޔަރުކޮށްފި"
prompt = f"dv2latin: {source_text.strip()}"

# Generate translation
inputs = fine_tuned_tokenizer(prompt, return_tensors="pt")
output_ids = fine_tuned_model.generate(**inputs, max_length=128)
result = fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(f"\nSource (Dhivehi): {source_text}")
print(f"Result (Latin): {result}")


Source (Dhivehi): އިންޑިއާގައި ފޭކް އެމްބަސީއެއް ހަދައިގެން އުޅުނު މީހަކު ހައްޔަރުކޮށްފި
Result (Latin): India gai fake embassy eh hadhaigen ulhunu meehaku hayyaru koffi


In [9]:
# inference function
def transliterate_dhivehi_to_latin(text):
    """
    Transliterates Dhivehi text to Latin script using the fine-tuned model.
    
    Args:
        text (str): The Dhivehi text to transliterate.
        
    Returns:
        str: The transliterated Latin text.
    """
    prompt = f"dv2latin: {text.strip()}"
    inputs = fine_tuned_tokenizer(prompt, return_tensors="pt")
    output_ids = fine_tuned_model.generate(**inputs, max_length=128)
    result = fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return result

transliterate_dhivehi_to_latin("އިންޑިއާގައި ފޭކް އެމްބަސީއެއް ހަދައިގެން އުޅުނު މީހަކު ހައްޔަރުކޮށްފި")

'India gai fake embassy eh hadhaigen ulhunu meehaku hayyaru koffi'

In [13]:
# test running on a batch
# Example texts (batch)
source_texts = [
    "އިންޑިއާގައި ފޭކް އެމްބަސީއެއް ހަދައިގެން އުޅުނު މީހަކު ހައްޔަރުކޮށްފި",
    "މިއީ ދިވެހި ބަހުން ލިޔެފައިވާ ނަމޫނާ ޖުމްލައެކެވެ.",
    "ކޮންމެ ދުވަހަކުވެސް ފަތިހު ހޭލާށެވެ."
]

# Prepare prompts for the batch
prompts = [f"dv2latin: {text.strip()}" for text in source_texts]

# Generate translations for the batch
# The tokenizer can handle a list of strings directly for batch processing
inputs = fine_tuned_tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)

# Generate translations
# The generate method will process all inputs in the batch
output_ids = fine_tuned_model.generate(**inputs, max_length=128)

# Decode results
results = [fine_tuned_tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output_ids]

# Print source and result for each item in the batch
for i in range(len(source_texts)):
    print(f"\nSource (Dhivehi): {source_texts[i]}")
    print(f"Result (Latin): {results[i]}")


Source (Dhivehi): އިންޑިއާގައި ފޭކް އެމްބަސީއެއް ހަދައިގެން އުޅުނު މީހަކު ހައްޔަރުކޮށްފި
Result (Latin): India gai fake embassy eh hadhaigen ulhunu meehaku hayyaru koffi

Source (Dhivehi): މިއީ ދިވެހި ބަހުން ލިޔެފައިވާ ނަމޫނާ ޖުމްލައެކެވެ.
Result (Latin): Miee dhivehi bahun liye namoonaa jumla eh

Source (Dhivehi): ކޮންމެ ދުވަހަކުވެސް ފަތިހު ހޭލާށެވެ.
Result (Latin): Komme dhuvahaku ves fathihu heyley
