# Notebook Overview

#### This notebook fine-tunes a T5-small sequence-to-sequence model to translate between Modern English and Shakespearean English using instruction-based prefixes. It loads and cleans a parallel TSV dataset, builds a bidirectional training set, and evaluates performance with ROUGE and BLEU while logging training progress and saving the best checkpoint for inference.




Installs all libraries required for fine-tuning and evaluating a T5 model: the core Transformers stack, evaluation metrics, SentencePiece, and the Weights & Biases integration.

In [None]:

# Quietly (-q) install or upgrade (-U) the modelling, metric, tokenizer and experiment-logging packages used below.
!pip -q install -U transformers datasets accelerate evaluate sacrebleu sentencepiece wandb rouge_score nltk


Imports core libraries for data handling, model training, and evaluation.

In [None]:
import os, re, inspect, random
import numpy as np
import torch
from datasets import load_dataset, Dataset
from transformers import (
    T5TokenizerFast,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback,
)

import evaluate
import wandb

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# Make runs reproducible-ish: seed Python, NumPy and PyTorch with the same value.
seed = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
if device == "cuda":
    # Also seed every CUDA device's generator.
    torch.cuda.manual_seed_all(seed)


This cell authenticates Weights & Biases logging using an API key stored in Colab Secrets (under the name `WANDB_API_KEY`).

In [None]:
from google.colab import userdata
import wandb

# Colab's secret store raises (rather than returning None) when the secret is
# absent or this notebook has not been granted access to it, so translate
# those cases into "no key" and let the message below explain what happened.
try:
    WANDB_API_KEY = userdata.get("WANDB_API_KEY")
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    WANDB_API_KEY = None

if WANDB_API_KEY:
    wandb.login(key=WANDB_API_KEY)
    print("W&B login ok.")
else:
    print("No WANDB_API_KEY found in Colab Secrets.")


Uploads the TSV dataset into the Colab runtime.

In [None]:
from google.colab import files

# Open Colab's upload dialog and echo the names of the files that were received.
uploaded = files.upload()
print("Uploaded:", list(uploaded))


Loads the TSV file into a dataset and makes sure the columns are set up correctly. We also do light normalization like removing stray numbers and fixing whitespace so the model isn’t learning formatting junk instead of language.

In [None]:
def load_tsv(path="shakes_only_data.tsv"):
    """Load a parallel TSV file as a dataset with `shakespeare` and `modern` columns.

    The file is first read as header-less, with the two expected column names
    imposed. That result is returned only when its first row is populated and
    is not itself a header naming the two columns. Otherwise the file is read
    again with its first line treated as the header, and the columns are
    mapped onto the expected names (case-insensitively by name when possible,
    else by position).

    Args:
        path: Location of the tab-separated file.

    Returns:
        The loaded "train" split.

    Raises:
        ValueError: If the header-based read yields fewer than two columns and
            the expected column names are not present.
    """
    expected = ["shakespeare", "modern"]

    # Try no-header assumption first
    try:
        ds = load_dataset(
            "csv",
            data_files=path,
            delimiter="\t",
            column_names=expected,
            split="train"
        )
        first = ds[0]
        if first.get("shakespeare") and first.get("modern"):
            first_values = {str(first[name]).strip().lower() for name in expected}
            # A first row that merely spells out the column names is a header
            # that was read as data; fall through to the header-aware read.
            if first_values != set(expected):
                return ds
    except Exception as err:
        # Best effort: report the problem instead of hiding it, then retry below.
        print(f"Header-less read of {path!r} failed ({err!r}); retrying with a header row.")

    # Fallback: headered TSV
    ds = load_dataset(
        "csv",
        data_files=path,
        delimiter="\t",
        split="train"
    )
    cols = ds.column_names
    if "shakespeare" in cols and "modern" in cols:
        return ds

    # Accept headers that differ only in case or surrounding whitespace.
    by_normalized = {str(col).strip().lower(): col for col in cols}
    if all(name in by_normalized for name in expected):
        renames = {
            by_normalized[name]: name
            for name in expected
            if by_normalized[name] != name
        }
        return ds.rename_columns(renames)

    if len(cols) < 2:
        raise ValueError(
            f"{path!r} must provide two tab-separated columns, found columns: {cols}"
        )
    # Unknown header names: take the first two columns by position.
    return ds.rename_columns({cols[0]: "shakespeare", cols[1]: "modern"})

# Matches runs of digits that stand alone as a word (digits inside a word are kept).
number_re = re.compile(r"\b\d+\b")

def clean_text(s: str) -> str:
    """Strip stand-alone numbers and normalise whitespace in one sentence.

    Args:
        s: Raw text, or None for a missing cell.

    Returns:
        The text with stand-alone digit runs removed, every whitespace run
        collapsed to a single space, and the ends trimmed; "" when `s` is None.
    """
    if s is None:
        return ""
    without_numbers = number_re.sub("", s)
    single_spaced = re.sub(r"\s+", " ", without_numbers)
    return single_spaced.strip()

def clean_example(ex):
    """Return cleaned `shakespeare` and `modern` fields for one dataset row.

    A column that is absent from `ex` is treated as an empty string.
    """
    cleaned = {}
    for column in ("shakespeare", "modern"):
        cleaned[column] = clean_text(ex.get(column, ""))
    return cleaned

# Read the uploaded corpus and clean both text columns row by row.
raw = load_tsv("data.tsv").map(clean_example)

# Quick look at what was loaded: schema, row count and the first cleaned pair.
print("Columns:", raw.column_names)
print("Examples:", len(raw))
print("Sample:", raw[0], sep="\n")


Turns each Shakespeare–Modern pair into two training examples so the model learns both directions. We add instruction-style prefixes to tell T5 which way to translate, then split everything into train and validation sets for fair evaluation.

In [None]:
def make_bidir(ds):
    """Expand sentence pairs into instruction-prefixed examples for both directions.

    Each row with non-empty `shakespeare` and `modern` text contributes two
    examples: modern -> Shakespeare first, then Shakespeare -> modern. Rows
    where either side is empty after stripping are skipped.

    Returns:
        A Dataset with `src` (prefixed prompt) and `tgt` (expected output) columns.
    """
    pairs = []
    for row in ds:
        shakes_text = row["shakespeare"].strip()
        modern_text = row["modern"].strip()
        if not (shakes_text and modern_text):
            continue
        pairs.append((f"translate modern to shakespeare: {modern_text}", shakes_text))
        pairs.append((f"translate shakespeare to modern: {shakes_text}", modern_text))

    return Dataset.from_dict({
        "src": [prompt for prompt, _ in pairs],
        "tgt": [target for _, target in pairs],
    })

from datasets import DatasetDict

# Full bidirectional set, kept for the size report below.
bidir = make_bidir(raw)
print("Bidir size:", len(bidir))

# 90/10 split, done on the sentence PAIRS before expanding to both directions.
# Splitting the expanded set instead could put "modern -> shakespeare" for a
# pair in train and the mirrored "shakespeare -> modern" example in validation,
# so validation would contain sentences already seen during training.
pair_split = raw.train_test_split(test_size=0.1, seed=seed)
dataset = DatasetDict({
    "train": make_bidir(pair_split["train"]),
    "test": make_bidir(pair_split["test"]),
})
dataset


Loads the pretrained t5-small model and its tokenizer. We define how to tokenize our instruction prompts and targets, apply truncation for efficiency,
and set up dynamic padding so batches stay clean and fast.

In [None]:
# Checkpoint name shared by the tokenizer and the model weights.
model_name = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

# Token limits applied when truncating prompts and targets in `preprocess`.
max_src_len, max_tgt_len = 128, 128

def preprocess(batch):
    """Tokenize a batch of `src` prompts and `tgt` targets.

    Prompts are truncated to `max_src_len` tokens and targets to
    `max_tgt_len`. No padding is applied here.

    Returns:
        The prompt encoding with the target token ids stored under `labels`.
    """
    encoded = tokenizer(batch["src"], truncation=True, max_length=max_src_len)
    target_encoding = tokenizer(
        text_target=batch["tgt"],
        truncation=True,
        max_length=max_tgt_len,
    )
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Tokenize every split in batches and drop the raw text columns afterwards.
text_columns = dataset["train"].column_names
tokenized = dataset.map(preprocess, batched=True, remove_columns=text_columns)

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

print(tokenized)


Sets up ROUGE and BLEU so we can quantify how close the model outputs are to the references. We use these to track validation performance and compare results across models.

In [None]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    """Score generated sequences against the references with BLEU and ROUGE.

    Args:
        eval_pred: A `(predictions, labels)` pair of token-id arrays;
            `predictions` may itself be a tuple, in which case its first
            element is used.

    Returns:
        A dict with `bleu`, `rouge1`, `rougeL` and `gen_len` (mean number of
        non-pad prediction tokens), each rounded to 4 decimals.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is an ignore marker, not a vocabulary id, and cannot be decoded.
    # It is present in the labels and may also pad the gathered predictions,
    # so map it to the pad token in both arrays before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    bleu_result = bleu.compute(
        predictions=decoded_preds,
        # sacrebleu expects a list of references per prediction.
        references=[[ref] for ref in decoded_labels]
    )
    rouge_result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # Counted after the -100 replacement so padding never inflates the length.
    pred_lens = [np.count_nonzero(p != pad_id) for p in preds]

    return {
        "bleu": round(bleu_result["score"], 4),
        "rouge1": round(rouge_result["rouge1"], 4),
        "rougeL": round(rouge_result["rougeL"], 4),
        "gen_len": round(float(np.mean(pred_lens)), 4),
    }


Defines the training settings such as learning rate, batch size, epoch count, and checkpoint rules. Weights & Biases (W&B) reporting is enabled here to visualize and track progress.

In [None]:
def build_training_args():
    """Build the Seq2SeqTrainingArguments used for fine-tuning.

    Evaluation and checkpointing both happen once per epoch, the checkpoint
    with the highest `rougeL` is reloaded at the end, and metrics are reported
    to W&B. The keyword for the evaluation schedule is chosen from whatever the
    installed transformers version accepts.

    Returns:
        A configured Seq2SeqTrainingArguments instance.
    """
    base_kwargs = dict(
        output_dir="t5_shakespeare_bidir",
        run_name="t5-small-shakespeare-bidir",
        learning_rate=3e-4,
        warmup_ratio=0.05,
        weight_decay=0.01,
        num_train_epochs=50,
        per_device_train_batch_size=16 if device == "cuda" else 4,
        per_device_eval_batch_size=16 if device == "cuda" else 4,
        gradient_accumulation_steps=1,
        predict_with_generate=True,

        logging_strategy="steps",
        logging_steps=50,   # frequent enough to see epoch progress
        save_strategy="epoch",
        save_total_limit=2,

        load_best_model_at_end=True,
        metric_for_best_model="rougeL",
        greater_is_better=True,

        fp16=(device == "cuda"),
        report_to="wandb",
    )

    # Prefer the current keyword when the installed version accepts it, and
    # only fall back to the older spelling when it is the sole option. Some
    # releases accept both, and there the older one is deprecated.
    accepted = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
    if "eval_strategy" in accepted:
        base_kwargs["eval_strategy"] = "epoch"
    else:
        base_kwargs["evaluation_strategy"] = "epoch"

    return Seq2SeqTrainingArguments(**base_kwargs)

training_args = build_training_args()
training_args


Builds the Seq2SeqTrainer with our model, datasets, collator, and metrics. This is sort of like a control panel where all of the training pipeline comes together.

In [None]:
# The tokenizer keyword of the trainer was renamed across transformers
# releases (`tokenizer` -> `processing_class`). The install cell pulls the
# latest version, so pick whichever name the installed trainer accepts.
trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Early stopping is intentionally left disabled; uncomment to enable it.
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    **{tokenizer_kwarg: tokenizer},
)

print("Train size:", len(tokenized["train"]))
print("Val size:", len(tokenized["test"]))


Runs fine-tuning for the planned number of epochs.

In [None]:

# The trainer logs epoch automatically in its logging dict.
# Start fine-tuning; the value returned by train() is kept and shown as the cell output.

train_out = trainer.train()
train_out


Runs one final evaluation on the validation set after training finishes.

In [None]:
# Evaluate with the trainer's configured eval dataset and print the resulting metrics.
metrics = trainer.evaluate()
print(metrics)


Saves the best version of the fine-tuned T5 model and tokenizer for any future use/training

In [None]:
# Write the trainer's current model and the tokenizer into one directory.
best_dir = "t5_shakespeare_bidir_best"
trainer.save_model(best_dir)
tokenizer.save_pretrained(best_dir)

print("Saved:", best_dir)


Reloads the saved model and runs a few sample translations. This is a quick sanity check of how the model did, judged by human inspection rather than by a metric.

In [None]:
# Reload weights and tokenizer from the "t5_shakespeare_bidir_best" directory,
# rebinding the notebook-level `model` and `tokenizer` names used by the cells below.
model = T5ForConditionalGeneration.from_pretrained("t5_shakespeare_bidir_best").to(device)
tokenizer = T5TokenizerFast.from_pretrained("t5_shakespeare_bidir_best")

def gen(prompt, max_new_tokens=80):
    """Generate a translation for one already-prefixed prompt using beam search.

    Args:
        prompt: Full input text, including the instruction prefix.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first returned sequence decoded to text, special tokens removed.
    """
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    beam_options = dict(
        max_new_tokens=max_new_tokens,
        num_beams=6,
        length_penalty=1.0,
        early_stopping=True,
    )
    sequences = model.generate(**encoded, **beam_options)
    best_sequence = sequences[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# Hand-written probes for each translation direction.
tests_modern = [
    "I have a bad feeling about this.",
    "We must act quickly before night falls.",
    "The council will meet tomorrow.",
]

tests_shakes = [
    "I prithee, speak plain.",
    "Wherefore dost thou linger in the night?",
]

# (input label, output label, instruction prefix, sentences) per direction.
demo_runs = (
    ("Modern:", "Shakes:", "translate modern to shakespeare: ", tests_modern),
    ("Shakes:", "Modern:", "translate shakespeare to modern: ", tests_shakes),
)

for source_label, output_label, prefix, sentences in demo_runs:
    for t in sentences:
        print(source_label, t)
        print(output_label, gen(prefix + t))
        print()


# WER Setup

In [None]:
def translate(text):
    """Run the model on one prompt and return the decoded first output sequence.

    Generation uses the model's default decoding settings with `max_length=256`;
    special tokens are dropped from the decoded text.
    """
    model_inputs = tokenizer(text, return_tensors="pt")
    model_inputs = model_inputs.to(device)
    generated = model.generate(**model_inputs, max_length=256)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


In [None]:
from google.colab import files
# Upload the evaluation file here; the loading cell below reads it from ./test.tsv,
# so the uploaded file must be named test.tsv.
uploaded = files.upload()


In [None]:
from pathlib import Path
import re
from datasets import load_dataset

# Read the test TSV as a single split called "full", imposing the two column names.
test_path = Path("./test.tsv")
testset = load_dataset(
    "csv",
    data_files={"full": str(test_path)},
    delimiter="\t",
    column_names=["shakespeare", "modern"],
)


In [None]:
import re

def remove_numbers(row):
    """Delete every run of digits from the `shakespeare` and `modern` fields.

    Each value is converted with `str()` first, so non-string values
    (including None) become their string form. The row is modified in place
    and returned.
    """
    for column in ("shakespeare", "modern"):
        row[column] = re.sub(r"\d+", "", str(row[column]))
    return row

# Apply the digit removal to every row of the "full" split and store the result back.
testset["full"] = testset["full"].map(remove_numbers)


In [None]:
def too_long(testset):
    """Filter predicate for one row: True when the row should be KEPT.

    Despite the name, this returns True when both the `shakespeare` and the
    `modern` text contain at most 25 whitespace-separated words, and False
    when either side is longer.
    """
    word_counts = [
        len(str(testset[column]).split())
        for column in ("shakespeare", "modern")
    ]
    return all(count <= 25 for count in word_counts)

# Keep only the rows for which the predicate returns True (both sides at most 25 words).
testset["full"] = testset["full"].filter(too_long)


In [None]:
# Spot-check one test pair. The prompt must use the same instruction prefix the
# model was fine-tuned with ("translate shakespeare to modern: "); a different
# tag such as "<to_modern>" is text the model never saw during training.
sample = testset["full"][1]
inp = "translate shakespeare to modern: " + str(sample["shakespeare"])
print("shakes: " + inp)
print("modern: " + str(sample["modern"]))

print(translate(inp))


In [None]:
import csv

# Translate every Shakespearean test sentence to modern English and store the
# reference next to the model output, one tab-separated pair per line.
with open("modern_test.tsv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["original modern", "translated from early modern"])

    for i, row in enumerate(testset["full"]):
        original = row["shakespeare"]
        # Same instruction prefix as in training; the model was never trained
        # on a "<to_modern>" tag.
        prompt = "translate shakespeare to modern: " + str(original)

        translated = translate(prompt)
        print(i)
        print(original)
        print(translated)

        writer.writerow([str(row["modern"]), translated])


In [None]:
# Translate every modern test sentence to Shakespearean English and store the
# reference next to the model output, one tab-separated pair per line.
with open("shakespeare_test.tsv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["original early modern", "translated from modern"])

    for row in testset["full"]:
        original = row["modern"]
        # Same instruction prefix as in training; the model was never trained
        # on a "<to_shakespeare>" tag.
        prompt = "translate modern to shakespeare: " + str(original)

        translated = translate(prompt)

        writer.writerow([str(row["shakespeare"]), translated])
