In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

def fine_tune_dhivehi_transliterator():
    """
    This script fine-tunes the alakxender/flan-t5-base-dhivehi-en-latin model
    for the dv2latin transliteration task.
    """
    # --- 1. Configuration ---
    MODEL_CHECKPOINT = "alakxender/flan-t5-base-dhivehi-en-latin"
    CSV_FILE_PATH = "training_data.csv"  # <-- IMPORTANT: Change this to the path of your CSV file
    NEW_MODEL_NAME = "flan-t5-base-dv2latin-finetuned"
    SOURCE_LANG = "dhivehi"
    TARGET_LANG = "latin"
    PREFIX = "dv2latin: "
    
    # --- 2. Load and Prepare the Dataset ---
    print("Loading and preparing dataset...")
    
    # Load your data from the CSV file
    try:
        df = pd.read_csv(CSV_FILE_PATH)
    except FileNotFoundError:
        print(f"Error: The file '{CSV_FILE_PATH}' was not found.")
        print("Please create this file in the specified format or update the CSV_FILE_PATH variable.")

    # --- FIX STARTS HERE ---
    # Clean the data: remove rows with missing values and ensure data is string type
    df.dropna(subset=[SOURCE_LANG, TARGET_LANG], inplace=True)
    df[SOURCE_LANG] = df[SOURCE_LANG].astype(str)
    df[TARGET_LANG] = df[TARGET_LANG].astype(str)
    # --- FIX ENDS HERE ---

    # Convert pandas DataFrame to Hugging Face Dataset
    raw_dataset = Dataset.from_pandas(df)

    # Split the dataset into training and testing sets (e.g., 90% train, 10% test)
    train_test_split = raw_dataset.train_test_split(test_size=0.1)
    
    # Create a DatasetDict which is the standard format
    dataset = DatasetDict({
        'train': train_test_split['train'],
        'test': train_test_split['test']
    })
    
    print("Dataset prepared and split:")
    print(dataset)

    # --- 3. Load Tokenizer and Model ---
    print(f"Loading tokenizer and model from '{MODEL_CHECKPOINT}'...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
    model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

    # --- 4. Preprocessing Function ---
    def preprocess_function(examples):
        """Tokenizes the source and target texts."""
        # Add the task-specific prefix to the inputs
        inputs = [PREFIX + doc for doc in examples[SOURCE_LANG]]
        
        # Tokenize the inputs
        model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
        
        # Tokenize the targets (labels)
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(examples[TARGET_LANG], max_length=128, truncation=True, padding="max_length")
        
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    print("Tokenizing the dataset...")
    tokenized_datasets = dataset.map(preprocess_function, batched=True)
    print("Tokenization complete.")

    # --- 5. Set up Training ---
    
    # Data collator is responsible for creating batches of data
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer, 
        model=model
    )

    # Define training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=NEW_MODEL_NAME,
        evaluation_strategy="epoch",  # Evaluate at the end of each epoch
        learning_rate=2e-5,
        per_device_train_batch_size=8, # Adjust based on your GPU memory
        per_device_eval_batch_size=8,  # Adjust based on your GPU memory
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=3, # Start with 3 epochs and adjust as needed
        predict_with_generate=True,
        fp16=True, # Use mixed precision for faster training if you have a compatible GPU
        push_to_hub=False, # Set to True if you want to upload to Hugging Face Hub
    )

    # Initialize the Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # --- 6. Start Fine-Tuning ---
    print("\nStarting the fine-tuning process...")
    trainer.train()
    print("Fine-tuning complete.")

    # --- 7. Save the Fine-Tuned Model ---
    print(f"Saving the fine-tuned model to '{NEW_MODEL_NAME}'...")
    trainer.save_model(NEW_MODEL_NAME)
    tokenizer.save_pretrained(NEW_MODEL_NAME)
    print("Model saved successfully.")

    # --- 8. Example Usage of the Fine-Tuned Model ---
    print("\n--- Testing the fine-tuned model ---")
    
    # Load the fine-tuned model and tokenizer
    fine_tuned_tokenizer = AutoTokenizer.from_pretrained(NEW_MODEL_NAME)
    fine_tuned_model = T5ForConditionalGeneration.from_pretrained(NEW_MODEL_NAME)
    
    # Example text
    source_text = "އާ ޓާމިނަލް ހުޅުވަން އެރުވި ހަވާ ދުނިޔޭގެ ރެކޯޑް ފޮތުން ޖާގަ ހޯދައިފި"
    prompt = f"dv2latin: {source_text.strip()}"
    
    # Generate translation
    inputs = fine_tuned_tokenizer(prompt, return_tensors="pt")
    output_ids = fine_tuned_model.generate(**inputs, max_length=128)
    result = fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    print(f"\nSource (Dhivehi): {source_text}")
    print(f"Result (Latin): {result}")


if __name__ == '__main__':
    # Before running, make sure you have the required libraries installed:
    # pip install transformers[torch] datasets pandas sentencepiece accelerate
    fine_tune_dhivehi_transliterator()

Loading and preparing dataset...
Dataset prepared and split:
DatasetDict({
    train: Dataset({
        features: ['latin', 'dhivehi', '__index_level_0__'],
        num_rows: 17979
    })
    test: Dataset({
        features: ['latin', 'dhivehi', '__index_level_0__'],
        num_rows: 1998
    })
})
Loading tokenizer and model from 'alakxender/flan-t5-base-dhivehi-en-latin'...


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Tokenizing the dataset...


Map:   0%|          | 0/17979 [00:00<?, ? examples/s]



Map:   0%|          | 0/1998 [00:00<?, ? examples/s]

Tokenization complete.


  trainer = Seq2SeqTrainer(



Starting the fine-tuning process...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.2439,0.196933
2,0.1944,0.176245
3,0.175,0.172525


Fine-tuning complete.
Saving the fine-tuned model to 'flan-t5-base-dv2latin-finetuned'...
Model saved successfully.

--- Testing the fine-tuned model ---

Source (Dhivehi): އާ ޓާމިނަލް ހުޅުވަން އެރުވި ހަވާ ދުނިޔޭގެ ރެކޯޑް ފޮތުން ޖާގަ ހޯދައިފި
Result (Latin): Aa terminal hulhuvan eruvi havaa dhuniyeyge record fothun jaaga hoadhaifi


In [4]:
NEW_MODEL_NAME = "flan-t5-base-dv2latin-finetuned"

# Load the fine-tuned model and tokenizer
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(NEW_MODEL_NAME)
fine_tuned_model = T5ForConditionalGeneration.from_pretrained(NEW_MODEL_NAME)

# Example text
source_text = "އިޒްރޭލު"
prompt = f"dv2latin: {source_text.strip()}"

# Generate translation
inputs = fine_tuned_tokenizer(prompt, return_tensors="pt")
output_ids = fine_tuned_model.generate(**inputs, max_length=128)
result = fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(f"\nSource (Dhivehi): {source_text}")
print(f"Result (Latin): {result}")


Source (Dhivehi): އިޒްރޭލު
Result (Latin): Israel
