In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

print(f"‚úÖ Loaded {model_name}")
print(f"üìä Vocabulary size: {tokenizer.vocab_size:,}")


‚úÖ Loaded t5-small
üìä Vocabulary size: 32,100


## Get Dataset

In [1]:
from get_dataset import create_comprehensive_yoda_dataset

yoda_dataset = create_comprehensive_yoda_dataset()

  from .autonotebook import tqdm as notebook_tqdm


## Pre-processing

In [6]:
'''
add prefix to the dataset
'''
def preprocess_yoda_function(examples):
    """
    Format: "translate to yoda: [normal text]" ‚Üí "[yoda text]"
    """
    # Create task-specific inputs with clear task description
    inputs = [f"translate to yoda voice: {text}" for text in examples["input"]]
    targets = examples["target"]
    
    # Tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding=True
    )
    
    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=128,
            truncation=True,
            padding=True
        )
    
    # Add labels for training
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

# process the dataset
tokenized_yoda_dataset = yoda_dataset.map(
    preprocess_yoda_function,
    batched=True,
    remove_columns=yoda_dataset.column_names,
    desc="Tokenizing yoda dataset"
)


Tokenizing yoda dataset:   0%|          | 0/50 [00:00<?, ? examples/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tokenizing yoda dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 2048.42 examples/s]


### Train/Test

In [7]:
from sklearn.model_selection import train_test_split

# Split data (80% train, 20% eval)
dataset_size = len(tokenized_yoda_dataset)
train_size = int(0.8 * dataset_size)

train_dataset = tokenized_yoda_dataset.select(range(train_size))
eval_dataset = tokenized_yoda_dataset.select(range(train_size, dataset_size))

print(f"üìä Train dataset: {len(train_dataset)} examples")
print(f"üìä Eval dataset:  {len(eval_dataset)} examples")

üìä Train dataset: 40 examples
üìä Eval dataset:  10 examples


## Train Config

In [11]:
## metric evaluation function

import evaluate
import numpy as np

def compute_metrics_robust(eval_pred):
    """
    Robust metrics computation that handles early training issues
    """
    predictions, labels = eval_pred
    
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Filter out empty predictions (common early in training)
    valid_pairs = []
    for pred, label in zip(decoded_preds, decoded_labels):
        if pred.strip() and label.strip():  # Both must be non-empty
            valid_pairs.append((pred.strip(), label.strip()))
    
    if len(valid_pairs) == 0:
        # No valid predictions yet - return zero scores
        return {
            "bleu": 0.0,
            "valid_predictions": 0,
            "total_predictions": len(decoded_preds)
        }
    
    # Separate valid predictions and references
    valid_preds, valid_refs = zip(*valid_pairs)
    
    try:
        # Try to compute BLEU
        bleu_metric = evaluate.load("bleu")
        result = bleu_metric.compute(
            predictions=list(valid_preds), 
            references=[[ref] for ref in valid_refs]
        )
        
        return {
            "bleu": result["bleu"] if result["bleu"] is not None else 0.0,
            "valid_predictions": len(valid_pairs),
            "total_predictions": len(decoded_preds)
        }
        
    except (ZeroDivisionError, ValueError) as e:
        # Fallback: simple accuracy-like metric
        print(f"BLEU computation failed: {e}. Using fallback metric.")
        
        # Simple token-level accuracy
        correct_tokens = 0
        total_tokens = 0
        
        for pred, ref in valid_pairs:
            pred_tokens = pred.split()
            ref_tokens = ref.split()
            
            for i in range(min(len(pred_tokens), len(ref_tokens))):
                total_tokens += 1
                if pred_tokens[i] == ref_tokens[i]:
                    correct_tokens += 1
        
        accuracy = correct_tokens / max(total_tokens, 1)
        
        return {
            "bleu": accuracy,  # Use accuracy as proxy for BLEU
            "valid_predictions": len(valid_pairs),
            "total_predictions": len(decoded_preds)
        }

print("‚úÖ Robust metrics function created")

‚úÖ Robust metrics function created


In [12]:
## training config

from transformers import Seq2SeqTrainingArguments

# More conservative training arguments
training_args_fixed = Seq2SeqTrainingArguments(
    output_dir="./yoda-translator",
    eval_strategy="steps",  # Change to steps instead of epoch
    eval_steps=50,          # Evaluate every 50 steps
    save_strategy="steps",
    save_steps=50,
    logging_steps=10,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    
    # Training hyperparameters
    num_train_epochs=10,     # Reduced epochs
    per_device_train_batch_size=2,  # Smaller batch size
    per_device_eval_batch_size=2,
    learning_rate=1e-4,      # Lower learning rate
    weight_decay=0.01,
    warmup_steps=50,         # Fewer warmup steps
    
    # Generation settings
    predict_with_generate=True,
    generation_max_length=64,  # Shorter max length
    generation_num_beams=2,    # Fewer beams for stability
    
    # Performance
    fp16=False,
    dataloader_pin_memory=False,
    report_to=None,
    
    # Skip first evaluations to avoid early errors
    skip_memory_metrics=True,
)

print("‚úÖ Fixed training arguments created")

‚úÖ Fixed training arguments created


### Trainer

In [13]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args_fixed,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_robust,
)

print("‚úÖ Yoda Trainer created and ready!")

‚úÖ Yoda Trainer created and ready!


  trainer = Seq2SeqTrainer(


# TRAIN

In [14]:
print("üöÄ Training Yoda translator...")
print("This may take 15-30 minutes depending on your hardware...")

trainer.train()

# Save the final model
final_model_path = "./yoda-translator-final"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"‚úÖ Yoda translator saved to: {final_model_path}")

üöÄ Training Yoda translator...
This may take 15-30 minutes depending on your hardware...


Step,Training Loss,Validation Loss,Bleu,Valid Predictions,Total Predictions
50,2.107,1.396541,0.0,2,10
100,1.1688,0.985728,0.0,10,10
150,0.6594,0.862383,0.129307,10,10
200,0.7844,0.819774,0.169352,10,10


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


‚úÖ Yoda translator saved to: ./yoda-translator-final


# TEST

In [18]:
import torch

def test_yoda_model(model_path):
    """Test the trained model"""
    
    print(f"\nüß™ Testing model from: {model_path}")
    
    # Load the trained model
    try:
        trained_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
        trained_tokenizer = AutoTokenizer.from_pretrained(model_path)
        print("‚úÖ Model loaded successfully")
    except Exception as e:
        print(f"‚ùå Error loading model: {e}")
        raise e
    
    def translate_to_yoda_fixed(text):
        """Translate using the trained model"""
        input_text = f"translate to yoda: {text}"
        
        inputs = trained_tokenizer(
            input_text, 
            return_tensors="pt", 
            max_length=128, 
            truncation=True
        )
        
        with torch.no_grad():
            outputs = trained_model.generate(
                inputs["input_ids"],
                max_length=64,  # Shorter for stability
                num_beams=2,
                length_penalty=1.0,
                early_stopping=True,
                do_sample=False
            )
        
        result = trained_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return result if result else "Strong with errors, this model is. Try again, you must."
    
    # Test sentences
    test_sentences = [
        "You are very strong.",
        "I will help you.",
        "The Force is powerful.",
        "Trust your feelings.",
        "We must be patient."
    ]
    
    print("\nüê∏ YODA TRANSLATION TESTS:")
    print("-" * 40)
    
    for sentence in test_sentences:
        try:
            yoda_result = translate_to_yoda_fixed(sentence)
            print(f"Normal: {sentence}")
            print(f"Yoda:   {yoda_result}")
            print()
        except Exception as e:
            print(f"Error translating '{sentence}': {e}")
    
    return translate_to_yoda_fixed

# Test the model
translate_function = test_yoda_model('./yoda-translator-final')

print("\nüéâ Training complete! May the Force be with your model! ‚ú®")


üß™ Testing model from: ./yoda-translator-final
‚úÖ Model loaded successfully

üê∏ YODA TRANSLATION TESTS:
----------------------------------------
Normal: You are very strong.
Yoda:   Strong you are.

Normal: I will help you.
Yoda:   Help you, I will.

Normal: The Force is powerful.
Yoda:   Powerful the Force is.

Normal: Trust your feelings.
Yoda:   Trust your feelings, you must.

Normal: We must be patient.
Yoda:   Patient, we must be.


üéâ Training complete! May the Force be with your model! ‚ú®


In [27]:
## standalone function
def translate_to_yoda(text, model_path='./yoda-translator-final'):
    try:
        trained_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
        trained_tokenizer = AutoTokenizer.from_pretrained(model_path)
        print("‚úÖ Model loaded successfully")
    except Exception as e:
        print(f"‚ùå Error loading model: {e}")
        raise e
    

    input_text = f"translate to yoda voice: {text}"
        
    inputs = trained_tokenizer(
        input_text, 
        return_tensors="pt", 
        max_length=128, 
        truncation=True
    )
    
    with torch.no_grad():
        outputs = trained_model.generate(
            inputs["input_ids"],
            max_length=64,  # Shorter for stability
            num_beams=2,
            length_penalty=1.0,
            early_stopping=True,
            do_sample=False
        )
    
    result = trained_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result if result else "Strong with errors, this model is. Try again, you must."


# Example usage
input_text = "I will train you well."
yoda_translation = translate_to_yoda(input_text)
print(f"Input: {input_text}")
print(f"Yoda Translation: {yoda_translation}")

‚úÖ Model loaded successfully
Input: I will train you well.
Yoda Translation: Train you well, I will.


In [23]:
import os
print(os.getcwd())

/Users/stephen/Nottingham/fine_tuning/src/t5_seq_2_seq
