In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import textwrap


# Checkpoint identifier handed to `from_pretrained` below; the tokenizer and the
# model are both loaded from this same identifier so they stay in sync.
model_name = "pradhap1125/t5-small-sentence-validator"
# Tokenizer that turns text into model inputs and decodes generated ids back to text.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sequence-to-sequence model used by the later cell(s) via `model.generate`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



In [14]:
# Input transcript: raw text to be normalized by the model.
raw_transcript = "helloeveryonewelcome to todayssession"

# Prefix "normalize:" if your training used that
TASK_PREFIX = "normalize: "
# Generation settings, named here so they can be tuned in one place.
CHUNK_WIDTH = 400         # maximum characters per chunk sent to the model
MAX_OUTPUT_TOKENS = 256   # `max_length` passed to `model.generate` per chunk
NUM_BEAMS = 5             # beam-search width (deterministic decoding)

# Split the transcript into pieces of at most CHUNK_WIDTH characters.
# `textwrap.wrap` breaks on whitespace where it can and returns [] for
# empty/whitespace-only input, in which case the final output is "".
chunks = textwrap.wrap(raw_transcript, CHUNK_WIDTH)

# Normalize each chunk independently with beam search.
# NOTE: an earlier version of this cell also ran a sampled generation
# (do_sample=True, max_length=1024) whose result was never decoded or used;
# it was removed because it only cost time and consumed unseeded randomness.
normalized_chunks = []
with torch.no_grad():  # inference only: no gradients needed
    for chunk in chunks:
        input_text = TASK_PREFIX + chunk
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
        output_ids = model.generate(
            **inputs,
            max_length=MAX_OUTPUT_TOKENS,
            num_beams=NUM_BEAMS,
        )
        # `output_ids[0]` is the first returned sequence for this chunk.
        normalized_chunks.append(
            tokenizer.decode(output_ids[0], skip_special_tokens=True)
        )

# Re-join the per-chunk results with single spaces.
final_output = " ".join(normalized_chunks)

print("===== RAW TRANSCRIPT =====")
print(raw_transcript)
print("\n===== NORMALIZED OUTPUT =====")
print(final_output)

===== RAW TRANSCRIPT =====
helloeveryonewelcome to todayssession

===== NORMALIZED OUTPUT =====
Hallo, everyone welcome to today's session.
