In [1]:
import re
import torch
from datasets import load_dataset, Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

def clean(text):
    return re.sub(r"\s+", " ", text.strip())

def synthesize(example):
    original = clean(example["translation"]["en"])
    degraded = re.sub(r"[.,?!:;\"'()]", "", original).lower()
    return {
        "input_text": degraded,
        "target_text": original
    }

def preprocess(example):
    prefix = "restore punctuation and capitalization: "
    input_text = prefix + example["input_text"]
    model_input = tokenizer(input_text, truncation=True, padding="max_length", max_length=64)
    label = tokenizer(example["target_text"], truncation=True, padding="max_length", max_length=64)
    model_input["labels"] = label["input_ids"]
    return model_input

def restore(text):
    prompt = "restore punctuation and capitalization: " + text.lower().strip()
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=64)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        output = model.generate(**inputs, max_length=64)
    return tokenizer.decode(output[0], skip_special_tokens=True)

if __name__ == "__main__":
    if torch.backends.mps.is_available() and torch.backends.mps.is_built():
        device = torch.device("mps")
        print("Using device: MPS")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using device: CUDA")
    else:
        device = torch.device("cpu")
        print("Using device: CPU")

    # Load + clean + synthesize punctuation data
    raw_dataset = load_dataset("opus100", "en-fr", split="train[:5%]")

    filtered = [synthesize(x) for x in raw_dataset if x["translation"]["en"]]
    filtered = [x for x in filtered if x["input_text"] != x["target_text"]][:5000]
    dataset = Dataset.from_dict({
        "input_text": [x["input_text"] for x in filtered],
        "target_text": [x["target_text"] for x in filtered]
    })

    # Tokenize
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    model.to(device)

    tokenized = dataset.map(preprocess, batched=False, remove_columns=["input_text", "target_text"])
    split = tokenized.train_test_split(test_size=0.1)

    # Training config (10 epochs, use fp16 only on CUDA, else disabled)
    training_args = Seq2SeqTrainingArguments(
        output_dir="./t5_punct_model",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        logging_steps=50,
        save_total_limit=1,
        save_strategy="epoch",
        fp16=torch.cuda.is_available(),  
        predict_with_generate=True,
        report_to="none"
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=split["train"],
        eval_dataset=split["test"],
        tokenizer=tokenizer,
        data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
    )

    # Train and export model
    trainer.train()
    model.save_pretrained("t5_punct_model")
    tokenizer.save_pretrained("t5_punct_model")

    #  Test inference
    test_input = "its raining outside please take an umbrella with you and be careful"
    print("Input: ", test_input)
    print("Restored:", restore(test_input))


  from .autonotebook import tqdm as notebook_tqdm


Using device: MPS


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 5000/5000 [00:00<00:00, 7505.04 examples/s]
  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,2.8066
100,0.6467
150,0.4549
200,0.2909
250,0.2565
300,0.2311
350,0.2002
400,0.196
450,0.2065
500,0.1961




Input:  its raining outside please take an umbrella with you and be careful
Restored: It's raining outside, please take an umbrella with you and be careful.


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

model_dir = "t5_punct_model"

tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)
model.eval()

# Example inference function (same as before)
def restore(text):
    prompt = "restore punctuation and capitalization: " + text.lower().strip()
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=64)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        output = model.generate(**inputs, max_length=64)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test
print(restore("its raining outside please take an umbrella with you and be careful"))


It's raining outside, please take an umbrella with you and be careful.
