## Load Pre-trained T5 Model

In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load pre-trained T5 model and tokenizer
# You can switch to "t5-small", "t5-base" or "t5-large" if needed, though anything besides
# 't5-small' is the only one that will work on our GPU
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## Load Dataset

In [2]:
import pandas as pd
from datasets import Dataset

# Load data from CSV
df = pd.read_csv("data/web_to_kjv.csv")

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split into train and validation sets
train_test_split = dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]

## Tokenize and Prepare Data

In [3]:
def preprocess_function(examples):
    # Define task prefix
    prefix = "translate modern to kjv: "
    
    # Clean and apply prefix, skipping None values
    inputs = [prefix + text if text is not None else "" for text in examples["modern_text"]]
    targets = [text if text is not None else "" for text in examples["kjv_text"]]
    
    # Tokenize inputs and targets
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
tokenized_datasets = train_test_split.map(preprocess_function, batched=True)

100%|██████████| 28/28 [00:15<00:00,  1.84ba/s]
100%|██████████| 4/4 [00:01<00:00,  2.44ba/s]


## Fine-tune the Model

In [4]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-scripture-style",
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs"
)

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer
)

# Start training
trainer.train()

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0666,0.058253
2,0.0543,0.052706


TrainOutput(global_step=5247, training_loss=0.06936240482493902, metrics={'train_runtime': 5436.5172, 'train_samples_per_second': 15.446, 'train_steps_per_second': 0.965, 'total_flos': 1.1359508081147904e+16, 'train_loss': 0.06936240482493902, 'epoch': 2.9985710202915117})

## Evaluate & Test the Model

In [5]:
import torch

# Test the model with an example
input_text = "translate modern to kjv: Love your enemies and do good to those who hate you."

# Move model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Tokenize input and move to the same device as the model
input_ids = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)

# Generate output
output_ids = model.generate(input_ids, max_length=50, num_return_sequences=1, no_repeat_ngram_size=2)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(f"Generated KJV-like text: {output_text}")

Generated KJV-like text: Love thy enemies, and do good unto them that hate thee.


## Save the Model

In [6]:
model.save_pretrained("./t5-scripture-style/fine_tuned_model")
tokenizer.save_pretrained("./t5-scripture-style/fine_tuned_model")

('./t5-scripture-style/fine_tuned_model/tokenizer_config.json',
 './t5-scripture-style/fine_tuned_model/special_tokens_map.json',
 './t5-scripture-style/fine_tuned_model/spiece.model',
 './t5-scripture-style/fine_tuned_model/added_tokens.json')

## Load and Test the Model

In [7]:
def prompt_for_kjv(prompt):
    from transformers import T5ForConditionalGeneration, T5Tokenizer
    import torch

    # Load the model and tokenizer from the saved directory
    model = T5ForConditionalGeneration.from_pretrained("./t5-scripture-style/fine_tuned_model")
    tokenizer = T5Tokenizer.from_pretrained("./t5-scripture-style/fine_tuned_model")
    
    # Move model to the appropriate device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    # Tokenize input and move to the same device as the model
    prompt = f"translate modern to kjv: {prompt}"
    input_ids = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
    
    # Generate output
    output_ids = model.generate(input_ids, max_length=50, num_return_sequences=1, no_repeat_ngram_size=2)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    return output_text

In [8]:
# Test the model and the function with an example
prompt = "As you go to your knees in fasting prayer, God will make known to you what you can do to help others."
output = prompt_for_kjv(prompt)
print(f"Generated KJV-like text: {output}")

Generated KJV-like text: And as ye go to your knees in fasting prayer, God shall make known unto you what thou canst do to help others.
