In [None]:
import torch
import pandas as pd
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer
import ast
from transformers import T5Tokenizer
from sklearn.model_selection import train_test_split
import os

In [105]:
recipes_data = pd.read_csv('preprocessed_recipes.csv')
output_dir = "./t5_recipe_generator_pretrained_model"

In [58]:
recipes_data.head()

Unnamed: 0.1,Unnamed: 0,id,Ingredients,Instructions,Simplified_Ingredients,Simplified_Instructions,Instr_Tok_Wrds,Ingredients_Tok,Instructions_Tok,Ingr_Len,Instr_Len,Ingr_Tnsr,Instr_Tnsr,Padded_Length_Ingr,Padded_Length_Instr
0,4,5,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,"['darkbrownsugar', 'hotwater', '.bourbon', '.f...",stir together brown sugar and hot water in a c...,"['stir', 'together', 'brown', 'sugar', 'and', ...","[4518, 15771, 26253, 2404, 2661, 5839, 22833, ...","[4518, 45648, 52523, 26349, 43483, 52523, 3540...",6,264,"[4518, 15771, 26253, 2404, 2661, 5839, 22833, ...","[4518, 45648, 52523, 26349, 43483, 52523, 3540...",25,410
1,5,6,"['2 chamomile tea bags', '1½ oz. reposado tequ...",Place 2 chamomile tea bags in a heatsafe vesse...,"['chamomileteabags', '.reposadotequila', '.fre...",place 2 chamomile tea bags in a heatsafe vesse...,"['place', '2', 'chamomile', 'tea', 'bags', 'in...","[4518, 11023, 3022, 2661, 52808, 4516]","[4518, 39014, 28084, 4741, 9862, 17629, 3834, ...",4,340,"[4518, 11023, 3022, 2661, 52808, 4516, 4517, 4...","[4518, 39014, 28084, 4741, 9862, 17629, 3834, ...",25,410
2,6,7,"['3 oz. Grand Marnier', '1 oz. Amaro Averna', ...","Add 3 oz. Grand Marnier, 1 oz. Amaro Averna, a...","['.grandmarnier', '.amaroaverna', 'smallpatsal...",add 3 oz grand marnier 1 oz amaro averna and a...,"['add', '3', 'oz', 'grand', 'marnier', '1', 'o...","[4518, 2707, 2334, 49621, 26127, 2661, 22781, ...","[4518, 4741, 15693, 15693, 4003, 35408, 22379,...",6,452,"[4518, 2707, 2334, 49621, 26127, 2661, 22781, ...","[4518, 4741, 15693, 15693, 4003, 35408, 22379,...",25,410
3,17,18,"['6 Tbsp. virgin coconut oil', '4 ripe (spotte...",Heat oil in a large nonstick skillet over medi...,"['tbsp.virgincoconutoil', 'ripeplátanosmanzano...",heat oil in a large nonstick skillet over medi...,"['heat', 'oil', 'in', 'a', 'large', 'nonstick'...","[4518, 53286, 45056, 27988, 41505, 45001, 5443...","[4518, 24818, 17629, 4741, 52523, 35408, 26349...",6,279,"[4518, 53286, 45056, 27988, 41505, 45001, 5443...","[4518, 24818, 17629, 4741, 52523, 35408, 26349...",25,410
4,31,32,"['3 garlic cloves, minced', '2 large onions, c...","In a large heavy kettle cook garlic, onions, c...","['garliccloves,minced', 'largeonions,choppedfi...",in a large heavy kettle cook garlic onions cel...,"['in', 'a', 'large', 'heavy', 'kettle', 'cook'...","[4518, 22595, 29501, 19176, 43993, 24140, 3558...","[4518, 26349, 34909, 4741, 28084, 4741, 43483,...",14,397,"[4518, 22595, 29501, 19176, 43993, 24140, 3558...","[4518, 26349, 34909, 4741, 28084, 4741, 43483,...",25,410


In [59]:
def format_ingredients(ingredients_list):
    """
    Formats ingredients with measurements for T5 input.
    """
    retstring = "Ingredients:\n"
    for ingredient in ast.literal_eval(ingredients_list):
        retstring += f"- {ingredient}\n"
    return f"{retstring.strip()}\nSteps:\n"

recipes_data["To_Input"] = recipes_data["Simplified_Ingredients"].apply(format_ingredients)

In [60]:
recipes_data.head()

Unnamed: 0.1,Unnamed: 0,id,Ingredients,Instructions,Simplified_Ingredients,Simplified_Instructions,Instr_Tok_Wrds,Ingredients_Tok,Instructions_Tok,Ingr_Len,Instr_Len,Ingr_Tnsr,Instr_Tnsr,Padded_Length_Ingr,Padded_Length_Instr,To_Input
0,4,5,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,"['darkbrownsugar', 'hotwater', '.bourbon', '.f...",stir together brown sugar and hot water in a c...,"['stir', 'together', 'brown', 'sugar', 'and', ...","[4518, 15771, 26253, 2404, 2661, 5839, 22833, ...","[4518, 45648, 52523, 26349, 43483, 52523, 3540...",6,264,"[4518, 15771, 26253, 2404, 2661, 5839, 22833, ...","[4518, 45648, 52523, 26349, 43483, 52523, 3540...",25,410,Ingredients:\n- darkbrownsugar\n- hotwater\n- ...
1,5,6,"['2 chamomile tea bags', '1½ oz. reposado tequ...",Place 2 chamomile tea bags in a heatsafe vesse...,"['chamomileteabags', '.reposadotequila', '.fre...",place 2 chamomile tea bags in a heatsafe vesse...,"['place', '2', 'chamomile', 'tea', 'bags', 'in...","[4518, 11023, 3022, 2661, 52808, 4516]","[4518, 39014, 28084, 4741, 9862, 17629, 3834, ...",4,340,"[4518, 11023, 3022, 2661, 52808, 4516, 4517, 4...","[4518, 39014, 28084, 4741, 9862, 17629, 3834, ...",25,410,Ingredients:\n- chamomileteabags\n- .reposadot...
2,6,7,"['3 oz. Grand Marnier', '1 oz. Amaro Averna', ...","Add 3 oz. Grand Marnier, 1 oz. Amaro Averna, a...","['.grandmarnier', '.amaroaverna', 'smallpatsal...",add 3 oz grand marnier 1 oz amaro averna and a...,"['add', '3', 'oz', 'grand', 'marnier', '1', 'o...","[4518, 2707, 2334, 49621, 26127, 2661, 22781, ...","[4518, 4741, 15693, 15693, 4003, 35408, 22379,...",6,452,"[4518, 2707, 2334, 49621, 26127, 2661, 22781, ...","[4518, 4741, 15693, 15693, 4003, 35408, 22379,...",25,410,Ingredients:\n- .grandmarnier\n- .amaroaverna\...
3,17,18,"['6 Tbsp. virgin coconut oil', '4 ripe (spotte...",Heat oil in a large nonstick skillet over medi...,"['tbsp.virgincoconutoil', 'ripeplátanosmanzano...",heat oil in a large nonstick skillet over medi...,"['heat', 'oil', 'in', 'a', 'large', 'nonstick'...","[4518, 53286, 45056, 27988, 41505, 45001, 5443...","[4518, 24818, 17629, 4741, 52523, 35408, 26349...",6,279,"[4518, 53286, 45056, 27988, 41505, 45001, 5443...","[4518, 24818, 17629, 4741, 52523, 35408, 26349...",25,410,Ingredients:\n- tbsp.virgincoconutoil\n- ripep...
4,31,32,"['3 garlic cloves, minced', '2 large onions, c...","In a large heavy kettle cook garlic, onions, c...","['garliccloves,minced', 'largeonions,choppedfi...",in a large heavy kettle cook garlic onions cel...,"['in', 'a', 'large', 'heavy', 'kettle', 'cook'...","[4518, 22595, 29501, 19176, 43993, 24140, 3558...","[4518, 26349, 34909, 4741, 28084, 4741, 43483,...",14,397,"[4518, 22595, 29501, 19176, 43993, 24140, 3558...","[4518, 26349, 34909, 4741, 28084, 4741, 43483,...",25,410,"Ingredients:\n- garliccloves,minced\n- largeon..."


In [61]:
recipes_data['Simplified_Instructions'][0]

'stir together brown sugar and hot water in a cocktail shaker to dissolve let cool then add bourbon lemon juice and apple butter and fill with ice shake until well chilled about 15 seconds strain into an ice filled rocks glass garnish with orange twist and cinnamon'

In [62]:
recipes_data["To_Input"] = recipes_data["To_Input"].astype(str)
recipes_data["Simplified_Instructions"] = recipes_data["Simplified_Instructions"].astype(str)


In [63]:
trimmed_recipes = recipes_data[["To_Input", "Simplified_Instructions"]].copy()
trimmed_recipes.head()

Unnamed: 0,To_Input,Simplified_Instructions
0,Ingredients:\n- darkbrownsugar\n- hotwater\n- ...,stir together brown sugar and hot water in a c...
1,Ingredients:\n- chamomileteabags\n- .reposadot...,place 2 chamomile tea bags in a heatsafe vesse...
2,Ingredients:\n- .grandmarnier\n- .amaroaverna\...,add 3 oz grand marnier 1 oz amaro averna and a...
3,Ingredients:\n- tbsp.virgincoconutoil\n- ripep...,heat oil in a large nonstick skillet over medi...
4,"Ingredients:\n- garliccloves,minced\n- largeon...",in a large heavy kettle cook garlic onions cel...


In [64]:
train_data, temp_data = train_test_split(trimmed_recipes, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")
print(f"Test size: {len(test_data)}")

Train size: 2037
Validation size: 437
Test size: 437


In [65]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

In [66]:
def preprocess_data(data, tokenizer, max_len_input=128, max_len_output=512):
    inputs = []
    outputs = []
    for _, row in data.iterrows():
        input_text = (
            f"Generate a recipe using these ingredients: {row['To_Input']}.\n"
            f"Include preparation steps and cooking instructions in a clear, step-by-step format."
        )

        output_text = row["Simplified_Instructions"]

        inputs.append(tokenizer(
            input_text,
            max_length=max_len_input,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )["input_ids"].squeeze(0))

        outputs.append(tokenizer(
            output_text,
            max_length=max_len_output,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )["input_ids"].squeeze(0))

    return inputs, outputs

train_inputs, train_outputs = preprocess_data(train_data, tokenizer)
val_inputs, val_outputs = preprocess_data(val_data, tokenizer)

In [67]:
class RecipeDataset(Dataset):
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs[idx],
            "labels": self.outputs[idx],
        }

train_dataset = RecipeDataset(train_inputs, train_outputs)
val_dataset = RecipeDataset(val_inputs, val_outputs)

In [68]:
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [69]:
os.environ["WANDB_DISABLED"] = "true"

In [None]:
training_args = TrainingArguments(
    output_dir="./t5-recipe-generation",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none",
    fp16=torch.cuda.is_available(),
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [71]:
trainer.train()

                                                  
 12%|█▎        | 255/2040 [01:36<10:59,  2.71it/s]

{'eval_loss': 0.5690538883209229, 'eval_runtime': 5.4239, 'eval_samples_per_second': 80.569, 'eval_steps_per_second': 10.14, 'epoch': 1.0}


 25%|██▍       | 500/2040 [03:04<08:51,  2.90it/s]  

{'loss': 0.982, 'grad_norm': 0.5530691146850586, 'learning_rate': 2.2647058823529413e-05, 'epoch': 1.96}


                                                  
 25%|██▌       | 510/2040 [03:13<08:09,  3.13it/s]

{'eval_loss': 0.5137606263160706, 'eval_runtime': 5.3128, 'eval_samples_per_second': 82.254, 'eval_steps_per_second': 10.352, 'epoch': 2.0}


                                                  
 38%|███▊      | 765/2040 [04:49<06:46,  3.14it/s]

{'eval_loss': 0.49404284358024597, 'eval_runtime': 5.287, 'eval_samples_per_second': 82.656, 'eval_steps_per_second': 10.403, 'epoch': 3.0}


 49%|████▉     | 1000/2040 [06:13<06:04,  2.85it/s]

{'loss': 0.532, 'grad_norm': 0.3396269977092743, 'learning_rate': 1.5294117647058822e-05, 'epoch': 3.92}


                                                   
 50%|█████     | 1020/2040 [06:26<05:38,  3.01it/s]

{'eval_loss': 0.4824254810810089, 'eval_runtime': 5.5676, 'eval_samples_per_second': 78.49, 'eval_steps_per_second': 9.879, 'epoch': 4.0}


                                                   
 62%|██████▎   | 1275/2040 [08:01<04:04,  3.13it/s]

{'eval_loss': 0.4750309884548187, 'eval_runtime': 5.2655, 'eval_samples_per_second': 82.993, 'eval_steps_per_second': 10.445, 'epoch': 5.0}


 74%|███████▎  | 1500/2040 [09:27<03:30,  2.57it/s]

{'loss': 0.5078, 'grad_norm': 0.24938392639160156, 'learning_rate': 7.941176470588236e-06, 'epoch': 5.88}


                                                   
 75%|███████▌  | 1530/2040 [09:44<03:07,  2.71it/s]

{'eval_loss': 0.47040730714797974, 'eval_runtime': 5.5641, 'eval_samples_per_second': 78.54, 'eval_steps_per_second': 9.885, 'epoch': 6.0}


                                                   
 88%|████████▊ | 1785/2040 [11:24<01:21,  3.13it/s]

{'eval_loss': 0.46788322925567627, 'eval_runtime': 5.3179, 'eval_samples_per_second': 82.176, 'eval_steps_per_second': 10.342, 'epoch': 7.0}


 98%|█████████▊| 2000/2040 [12:42<00:14,  2.78it/s]

{'loss': 0.4978, 'grad_norm': 0.26523786783218384, 'learning_rate': 5.882352941176471e-07, 'epoch': 7.84}


                                                   
100%|██████████| 2040/2040 [13:03<00:00,  3.03it/s]

{'eval_loss': 0.46705394983291626, 'eval_runtime': 5.5583, 'eval_samples_per_second': 78.622, 'eval_steps_per_second': 9.895, 'epoch': 8.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 2040/2040 [13:05<00:00,  2.60it/s]

{'train_runtime': 785.0101, 'train_samples_per_second': 20.759, 'train_steps_per_second': 2.599, 'train_loss': 0.6271211904637953, 'epoch': 8.0}





TrainOutput(global_step=2040, training_loss=0.6271211904637953, metrics={'train_runtime': 785.0101, 'train_samples_per_second': 20.759, 'train_steps_per_second': 2.599, 'total_flos': 551382499196928.0, 'train_loss': 0.6271211904637953, 'epoch': 8.0})

In [None]:
#Uncomment this if you want to save another model
# os.makedirs(output_dir, exist_ok=True)
# model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)

#Uncomment this if you want to load the model
# loaded_model = T5ForConditionalGeneration.from_pretrained(output_dir)
# loaded_tokenizer = T5Tokenizer.from_pretrained(output_dir)

def generate_recipe(ingredients_list, model, tokenizer, max_length=512):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    formatted_ingredients = ", ".join(ingredients_list)
    input_text = (
        f"Generate a recipe using these ingredients: {formatted_ingredients}.\n"
        f"Include preparation steps and cooking instructions in a clear, step-by-step format."
    )
    inputs = tokenizer(
        input_text, return_tensors="pt", padding=True, truncation=True, max_length=128
    ).to(device)

    outputs = model.generate(
        inputs["input_ids"],
        max_length=max_length,
        do_sample=True,
        top_p=0.9,
        temperature=0.8,
        no_repeat_ngram_size=3,
    )

    raw_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return clean_output(raw_output)

def clean_output(output_text):
    output_text = output_text.replace("<RECIPE>", "").replace("<INGR>", "").strip()
    output_text = output_text.capitalize()

    if not output_text.endswith("."):
        output_text += ".\n"
    
    output_text = " ".join(output_text.split())
    return output_text




In [111]:
ingredients_list_1 = ["chicken", "garlic", "onion", "salt", "wine"]
ingredients_list_2 = ["flour", "sugar", "butter", "eggs", "chocolate"]

for i in range(10):
  recipe_1 = generate_recipe(ingredients_list_1, model, tokenizer)
  print("Recipe" + str(i) + ":", recipe_1)


Recipe0: In a large skillet whisk chicken garlic garlic onion salt and wine in a medium saucepan over medium heat until mixture is softened about 12 minutes in dillio sauce melt chicken and simmer until garlic is tender and tender about 5 minutes transfer to a plate and serve as a foil in preheat oven to 350f.
Recipe1: 2 ingredients in a medium bowl combine chicken garlic garlic onion salt and wine stir until tender about 5 minutes place chicken in mash chicken over a large large bowl sprinkle in if desired add chicken garlic salt and a little wine in ice and cook until finely ground about 20 minutes.
Recipe2: A large bowl mix the chicken garlic onion salt and wine in a bowl over medium high heat the chicken and garlic in reposition the chicken in ice and reroll the chicken with the garlic and onion add the salt and garlic and bring to a boil the chicken together until it is tender about 2 minutes for a set of hours for the chicken to be browned about 1 minute in octroy the chicken bro