## Load Pre-trained T5 Model

In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Checkpoint to fine-tune; "t5-base" or "t5-large" can be substituted here if needed
model_name = "t5-small"

# Fetch the tokenizer and the conditional-generation model for that checkpoint
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path=model_name)

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## Load Dataset

In [2]:
import pandas as pd
from datasets import Dataset

# Load the parallel corpus from CSV. Rows missing either side of the pair are
# dropped up front: the preprocessing step concatenates a string prefix with
# "modern_text" and tokenizes "kjv_text", so a NaN in either column would fail there.
df = (
    pd.read_csv("data/web_to_kjv.csv")
    .dropna(subset=["modern_text", "kjv_text"])
    .reset_index(drop=True)
)

# Convert to Hugging Face Dataset (preserve_index=False keeps the pandas index
# from being carried along as an extra column)
dataset = Dataset.from_pandas(df, preserve_index=False)

# Split into train and validation sets. A fixed seed makes the 90/10 split
# identical on every Restart & Run All, so validation loss is comparable across runs.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]

## Tokenize and Prepare Data

In [3]:
def preprocess_function(examples, max_length=512):
    """Tokenize a batch of modern/KJV sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with "modern_text" and "kjv_text" columns,
            each a list of strings (as supplied by `map(..., batched=True)`).
        max_length: Length to which both inputs and targets are truncated
            and padded. Defaults to 512, the value used originally.

    Returns:
        The tokenizer output for the prefixed inputs, with an added "labels"
        entry holding the target token ids. Padding positions in the labels
        are set to -100 so they are ignored by the loss.
    """
    # Define task prefix
    prefix = "translate modern to kjv: "

    # Apply prefix to every source sentence
    inputs = [prefix + text for text in examples["modern_text"]]
    targets = list(examples["kjv_text"])

    # Tokenize inputs and targets
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=max_length, truncation=True, padding="max_length")

    # Targets are padded to max_length, so most label positions are padding.
    # Mask them with -100 (the index the cross-entropy loss ignores); otherwise
    # the model is trained mainly to predict pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize both splits, feeding the preprocessing function whole batches at a time
tokenized_datasets = train_test_split.map(
    function=preprocess_function,
    batched=True,
)

100%|██████████| 3/3 [00:01<00:00,  2.99ba/s]
100%|██████████| 1/1 [00:00<00:00,  9.45ba/s]


## Fine-tune the Model

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-scripture-style",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`;
    # "epoch" runs validation once at the end of every epoch.
    eval_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,  # keep at most three checkpoints on disk
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
)

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # `processing_class` supersedes the deprecated `tokenizer=` keyword, which
    # emits a FutureWarning on recent transformers releases.
    processing_class=tokenizer,
)

# Start training
trainer.train()

  trainer = Seq2SeqTrainer(
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x107702610>>
Traceback (most recent call last):
  File "/Users/sethbrock/Desktop/IS693R/venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


Epoch,Training Loss,Validation Loss


## Evaluate & Test the Model

In [None]:
# Test the model with an example
input_text = "translate modern to kjv: Love your enemies and do good to those who hate you."

# Encode the prompt and move it to the model's device: after training the model
# may no longer be on the CPU, and generate() needs inputs on the same device.
inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(model.device)
input_ids = inputs.input_ids

# Generate output (passing the full encoding so the attention mask is used too)
output_ids = model.generate(**inputs, max_length=50, num_return_sequences=1, no_repeat_ngram_size=2)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(f"Generated KJV-like text: {output_text}")