# Recipe-GPT
This notebook implements a project called Recipe-GPT transformer. The dataset used in this project is: https://huggingface.co/datasets/tengomucho/all-recipes-split

In [None]:
#Imports

import torch
import torch.nn as nn

import math
import os
import random
from pathlib import Path

# HuggingFace imports
from datasets import load_dataset, Dataset
from transformers import GPT2TokenizerFast, AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer


In [2]:
# Hyperparameters
dataset = 'tengomucho/all-recipes-split'
dataset_split = 'train'

# Set up device agnostic code
if torch.cuda.is_available():
    device = 'cuda'
elif torch.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print("Using device:", device)

Using device: cuda


### Download and preview dataset

In [None]:
def get_dataset(dataset, dataset_split):
    """
    Function to grab dataset based on `dataset` and `dataset_split`.
    """
    data = load_dataset(dataset, split=dataset_split)
    
    return data

data = get_dataset(dataset, dataset_split)


In [10]:
#Printing few sample data points
for i in range(3):
    print(f"\nExample {i+1}")
    print("Title", data[i]["title"])
    print("Ingredients", data[i]["ingredients"])
    print("Directions", data[i]["directions"])



2147248

Example 1
Title No-Bake Nut Cookies
Ingredients - 1 c. firmly packed brown sugar
- 1/2 c. evaporated milk
- 1/2 tsp. vanilla
- 1/2 c. broken nuts (pecans)
- 2 Tbsp. butter or margarine
- 3 1/2 c. bite size shredded rice biscuits
Directions - In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.
- Stir over medium heat until mixture bubbles all over top.
- Boil and stir 5 minutes more. Take off heat.
- Stir in vanilla and cereal; mix well.
- Using 2 teaspoons, drop and shape into 30 clusters on wax paper.
- Let stand until firm, about 30 minutes.

Example 2
Title Jewell Ball'S Chicken
Ingredients - 1 small jar chipped beef, cut up
- 4 boned chicken breasts
- 1 can cream of mushroom soup
- 1 carton sour cream
Directions - Place chipped beef on bottom of baking dish.
- Place chicken on top of beef.
- Mix soup and cream together; pour over chicken. Bake, uncovered, at 275° for 3 hours.

Example 3
Title Creamy Corn
Ingredients - 2 (16 oz.) pkg

In [None]:
def save_data(dataset, dataset_split):
    """
    Function to save data locally as a text file.
    """
    data = get_dataset(dataset, dataset_split)
    data_path = Path('../data/')

    if data_path.is_dir():
        print(f"Directory exists, skipping...")
    else:
        print("Creating diretory...")
        data_path.mkdir(parents=True, exist_ok=True)

    with open(data_path / "recipes.txt", "w", encoding="utf-8") as f:
        for i in range(len(data)):
            title = data[i].get("title", "").strip()
            ingredients = data[i].get("ingredients", "").strip()
            directions = data[i].get("directions", "").strip()

            recipe_block = f"<start>\n"  # Add special token to the beginning of each recipe
            if title:
                recipe_block += f"Title: {title}\n"
            recipe_block += f"Ingredients:\n{ingredients}\n"
            recipe_block += f"Directions:\n{directions}\n"
            recipe_block += f"<end>\n\n"  # Add a special token to the end of each recipe

            f.write(recipe_block) 

    print("File saved.")

save_data(dataset, dataset_split)


Directory exists, skipping...
File saved.


In [2]:
#loading a few examples from recipes.txt
with open("../data/recipes.txt", "r", encoding="utf-8") as f:
     recipes = f.read()

print(len(recipes))
print(recipes[:1000])

1604728762
<start>
Title: No-Bake Nut Cookies
Ingredients:
- 1 c. firmly packed brown sugar
- 1/2 c. evaporated milk
- 1/2 tsp. vanilla
- 1/2 c. broken nuts (pecans)
- 2 Tbsp. butter or margarine
- 3 1/2 c. bite size shredded rice biscuits
Directions:
- In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.
- Stir over medium heat until mixture bubbles all over top.
- Boil and stir 5 minutes more. Take off heat.
- Stir in vanilla and cereal; mix well.
- Using 2 teaspoons, drop and shape into 30 clusters on wax paper.
- Let stand until firm, about 30 minutes.
<end>

<start>
Title: Jewell Ball'S Chicken
Ingredients:
- 1 small jar chipped beef, cut up
- 4 boned chicken breasts
- 1 can cream of mushroom soup
- 1 carton sour cream
Directions:
- Place chipped beef on bottom of baking dish.
- Place chicken on top of beef.
- Mix soup and cream together; pour over chicken. Bake, uncovered, at 275° for 3 hours.
<end>

<start>
Title: Creamy Corn
Ingredients:


### Build Tokenizer
Using GPT2TokenizerFast from HuggingFace, based on byte-level Byte-Pair-Encoding.

(https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2TokenizerFast)

In [3]:
tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")

special_tokens = {
    "bos_token" : "<start>",
    "eos_token" : "<end>",
    "additional_special_tokens": []
}

tokenizer.add_special_tokens(special_tokens)
tokenizer.pad_token = tokenizer.eos_token

print(tokenizer.special_tokens_map)
print(tokenizer.convert_tokens_to_ids("<start>"))
print(tokenizer.vocab_size)


{'bos_token': '<start>', 'eos_token': '<end>', 'unk_token': '<|endoftext|>', 'pad_token': '<end>'}
50257
50257


### Tokenizing the Corpus
We now have a tokenizer setup with custom tokens, so the next goal is to save this to a Dataset class.

In [4]:
# Split recipes.txt by <start>

def process_recipes(recipes, tokenizer):
    """
    Splits recipes by "<start>", prepends "<start>" back to each block,
    and tokenizes each block.
    
    Args:
        recipes (str): Raw recipe text containing multiple recipes separated by "<start>"
        tokenizer (callable): Tokenizer function to apply to each recipe block
    
    Returns:
        list: List of tokenized recipe blocks
    """
    
    # Split by "<start>" and remove empty strings
    recipe_blocks = [block.strip() for block in recipes.split("<start>") if block.strip()]

    # Prepend <start> back to each block and tokenize

    tokenized_recipes = []
    for i, block in enumerate(recipe_blocks):
        if i == 50000:  # Processing first 200,000 recipe blocks due to memory constraints
            break
        full_block = "<start>\n" + block
        tokenized_block = tokenizer(full_block, truncation=True, max_length=512, padding=False)
        tokenized_recipes.append(tokenized_block)
    
    return tokenized_recipes

processed_recipes = process_recipes(recipes, tokenizer)
print(f"Number of recipe blocks processed: {len(processed_recipes)}")
lengths = [len(r["input_ids"]) for r in processed_recipes]
print(f"Max length: {max(lengths)} | Mean: {sum(lengths) // len(lengths)}")  # Comparing mean length of block with max_length of tokenized block



Number of recipe blocks processed: 50000
Max length: 512 | Mean: 148


### Wrapping in datasets.Dataset Object

This is required to train with HuggingFace's Trainer, enables batching, shuffling and streaming and preparing clearn train and val splits

In [5]:
def create_dataset(processed_recipes):
    """
    Extract input_ids and attention_mask from processed recipes and create a Dataset.
    
    Args:
        processed_recipes (list): List of tokenized recipe blocks
    
    Returns:
        Dataset: HuggingFace Dataset with input_ids and attention_mask
    """
    input_ids = []
    attention_mask = []

    for recipe in processed_recipes:
        input_ids.append(recipe['input_ids'])
        attention_mask.append(recipe['attention_mask'])

    dataset = Dataset.from_dict({
        "input_ids": input_ids,
        "attention_mask": attention_mask
    })

    return dataset

dataset = create_dataset(processed_recipes)
print(dataset)
print(dataset[random.randint(0, len(dataset) - 1)]) # Peek at a random sample from tokenized dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 50000
})
{'input_ids': [50257, 198, 19160, 25, 27601, 6932, 198, 41222, 25, 198, 12, 362, 357, 23, 15649, 2014, 279, 10025, 13, 8566, 9891, 198, 12, 352, 1588, 279, 10025, 13, 8891, 11, 20720, 198, 12, 604, 4077, 24681, 11, 20720, 198, 12, 362, 309, 24145, 13, 6366, 298, 357, 6230, 278, 8, 198, 12, 362, 23053, 13, 16597, 728, 1010, 10695, 10746, 198, 13470, 507, 25, 198, 12, 15561, 477, 9391, 290, 15936, 656, 257, 2613, 26, 1309, 900, 13417, 13, 1680, 4836, 287, 20720, 613, 66, 504, 13, 198, 50258], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


### Train/Validation Split and save tokenized dataset to disk

In [6]:
def split_and_save_dataset(dataset, train_split=0.9, save_dir="../data/tokenized_recipes"):
    """
    Split tokenized dataset into train/validation and save both to disk.
    
    Args:
        dataset (Dataset): HuggingFace Dataset to split
        train_split (float): Proportion for training set (default 0.9 = 90%)
        save_dir (str): Directory to save the datasets
    
    Returns:
        tuple: (train_dataset, val_dataset)
    """

    # Split the dataset
    split_dataset = dataset.train_test_split(test_size=1-train_split, seed=42)
    train_dataset = split_dataset['train']
    val_dataset = split_dataset['test']
    
    # Create save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    
    # Save datasets to disk
    train_path = os.path.join(save_dir, "train")
    val_path = os.path.join(save_dir, "validation")
    
    train_dataset.save_to_disk(train_path)
    val_dataset.save_to_disk(val_path)
    
    print(f"Datasets saved successfully!")
    print(f"Train dataset: {train_path} | ({len(train_dataset)} samples)")
    print(f"Validation dataset: {val_path} | ({len(val_dataset)} samples)")
    
    return train_dataset, val_dataset


def load_saved_datasets(save_dir="../data/tokenized_recipes"):
    """
    Load previously saved tokenized train/validation datasets from disk.
    
    Args:
        save_dir (str): Directory where datasets were saved
    
    Returns:
        tuple: (train_dataset, val_dataset)
    """
    
    train_path = os.path.join(save_dir, "train")
    val_path = os.path.join(save_dir, "validation")
    
    if not os.path.exists(train_path) or not os.path.exists(val_path):
        raise FileNotFoundError(f"Datasets not found in {save_dir}. Please run processing first.")
    
    train_dataset = Dataset.load_from_disk(train_path)
    val_dataset = Dataset.load_from_disk(val_path)
    
    print(f"Datasets loaded successfully!")
    print(f"Train dataset: {len(train_dataset)} samples")
    print(f"Validation dataset: {len(val_dataset)} samples")
    
    return train_dataset, val_dataset

In [7]:
split_and_save_dataset(dataset)


Saving the dataset (0/1 shards):   0%|          | 0/45000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Datasets saved successfully!
Train dataset: ../data/tokenized_recipes\train | (45000 samples)
Validation dataset: ../data/tokenized_recipes\validation | (5000 samples)


(Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 45000
 }),
 Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 5000
 }))

In [8]:
train_dataset, val_dataset = load_saved_datasets()

sample = train_dataset[0]

print("Input IDs (first 50):", sample["input_ids"][:50])
print("Attention Mask (first 50):", sample["attention_mask"][:50])
print("\nDecoded Text:\n", tokenizer.decode(sample["input_ids"]))

Datasets loaded successfully!
Train dataset: 45000 samples
Validation dataset: 5000 samples
Input IDs (first 50): [50257, 198, 19160, 25, 17973, 39528, 198, 41222, 25, 198, 12, 352, 3091, 1815, 676, 444, 198, 12, 352, 3091, 475, 1010, 25557, 354, 9113, 44670, 198, 12, 14380, 198, 12, 352, 3091, 11311, 9113, 44670, 198, 12, 362, 3661, 273, 18550, 9210, 198, 12, 352, 1588, 9290, 15226, 40930]
Attention Mask (first 50): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Decoded Text:
 <start>
Title: Simply Delicious
Ingredients:
- 1 box Twinkies
- 1 box butterscotch instant pudding
- nuts
- 1 box chocolate instant pudding
- 2 Skor candy bars
- 1 large container Cool Whip
Directions:
- Unwrap Twinkies and lay at the bottom of a baking dish.
- Mix both puddings in separate bowls.
- Pour one over the top of the Twinkies, then the other.
- Chop Skor candy bars in small pieces. Sprinkle over to

## Fine Tune GPT-2 Model

- Load the base GPT-2 model: `AutoModelForCausalLM`
- Resize embedding layer to accomodate `<start>` and `<end>`
- Define training configs
- Use Hugging Face's `Trainer` to train

In [9]:
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

# Resize token embeddings so that model learns new tokens
model.resize_token_embeddings(len(tokenizer))

print(model.config.vocab_size)  #Expected 50259
print(model.transformer.wte.weight.shape)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


50259
torch.Size([50259, 768])


In [11]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm = False #Causal LM, not BERT
)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="../models/recipe-gpt",
    overwrite_output_dir=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    save_total_limit=2,
    num_train_epochs=5,
    per_device_train_batch_size=4,  # adjust based on available memory
    per_device_eval_batch_size=4,
    warmup_steps=50,
    weight_decay=0.01,
    fp16=True,  # Set to True if using a GPU with FP16 support, using RTX 3070
    report_to="none"
)

#Set up trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)



  trainer = Trainer(


In [12]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,1.3843,1.370153
2,1.3046,1.302903
3,1.2611,1.272295
4,1.2493,1.255499
5,1.1499,1.251273


TrainOutput(global_step=56250, training_loss=1.3055569748263889, metrics={'train_runtime': 8357.7505, 'train_samples_per_second': 26.921, 'train_steps_per_second': 6.73, 'total_flos': 2.3194003654656e+16, 'train_loss': 1.3055569748263889, 'epoch': 5.0})

In [None]:
from transformers import AutoModelForCausalLM, GPT2TokenizerFast
import torch

# === 1. Load fine-tuned model and tokenizer ===
model_path = "../models/recipe-gpt/checkpoint-56250"
tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model.eval()  # set to inference mode

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# === 2. Define generation function ===
def generate_recipe(ingredients, max_length=300, temperature=0.8, top_k=50, top_p=0.95):
    prompt = "<start>\nIngredients:\n"
    for ingredient in ingredients:
        prompt += f"- {ingredient}\n"
    prompt += "Directions:\nFollow these steps to prepare:\n"

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            do_sample=True,
            # temperature=temperature,
            # top_k=top_k,
            # top_p=top_p,
            # max_length=max_length,
            temperature=0.7,
            top_k=40,
            top_p=0.9,
            max_length=140,
            eos_token_id=tokenizer.convert_tokens_to_ids("<end>")
        )

    generated = tokenizer.decode(output_ids[0], skip_special_tokens=False)

    # Extract directions block
    if "Directions:" in generated:
        generated = generated.split("Directions:")[1]
    if "<end>" in generated:
        generated = generated.split("<end>")[0]

    return generated.strip()

# === 3. Try a generation ===
ingredients = ["waffle", "chocolate", "milk"]
recipe = generate_recipe(ingredients)
print("Generated Recipe Directions:\n")
print(recipe)


Setting `pad_token_id` to `eos_token_id`:50258 for open-end generation.


Generated Recipe Directions:

- Line waffle iron with waffle iron.
- Pour chocolate over waffle iron.
- Sprinkle milk over chocolate.
- Put in oven.
- Cook on lowest heat until chocolate is melted.
- Serve with whipped cream or ice cream.
- Serve warm with chocolate syrup.
- Yields 4 servings.
- Can be frozen.
- You may use one of the other flavors, like orange or strawberry.
- You can use any other flavor.
- Can be frozen in freezer.
- It is better to use a blender or food processor
