In [1]:
# !pip uninstall accelerate -y
# !pip install transformers[torch] accelerate -U

In [None]:
import transformers
import accelerate

print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")


Transformers version: 4.41.2
Accelerate version: 0.31.0


In [None]:
import os
from datasets import load_dataset

# Load the CSV file
dataset = load_dataset('csv', data_files='/content/drive/MyDrive/RecipeGPT/Food Ingredients and Recipe Dataset with Image Name Mapping.csv')

# Inspect the dataset
print(dataset)



# Function to clean the Instructions column
def clean_instructions(entry):
    # Check if the entry is a string, if not, convert it to an empty string
    if isinstance(entry, str):
        return entry
    else:
        return ""

# Extract and clean the Instructions column for each split
cleaned_datasets = {}
for split in dataset.keys():
    cleaned_datasets[split] = dataset[split].map(lambda x: {'text': clean_instructions(x['Instructions'])})



from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Add a padding token
tokenizer.pad_token = tokenizer.eos_token

# Verify the padding token
print(f"Padding token: {tokenizer.pad_token}, ID: {tokenizer.pad_token_id}")

def tokenize_function(examples):
    tokens = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)
    tokens['labels'] = tokens['input_ids'].copy()  # Use input_ids as labels for language modeling
    return tokens

# Tokenize the cleaned datasets for each split
tokenized_datasets = {}
for split in cleaned_datasets.keys():
    tokenized_datasets[split] = cleaned_datasets[split].map(tokenize_function, batched=True)

# Print the tokenized datasets to check
print(tokenized_datasets)


DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Title', 'Ingredients', 'Instructions', 'Image_Name', 'Cleaned_Ingredients'],
        num_rows: 13501
    })
})
Padding token: <|endoftext|>, ID: 50256


Map:   0%|          | 0/13501 [00:00<?, ? examples/s]

{'train': Dataset({
    features: ['Unnamed: 0', 'Title', 'Ingredients', 'Instructions', 'Image_Name', 'Cleaned_Ingredients', 'text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 13501
})}


In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=100,
    save_steps=500,
    eval_steps=500,
    save_total_limit=5,
    learning_rate=3e-5,
)


In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'] if 'validation' in tokenized_datasets else tokenized_datasets['train'],  # Use validation set if available
)

# Train the model
trainer.train()

# Define the path to save the model in Google Drive
drive_model_path = '/content/drive/MyDrive/RecipeGPT/fine-tuned-gpt2-instructions'
os.makedirs(drive_model_path, exist_ok=True)

# Save the model
model.save_pretrained('./fine-tuned-gpt2-instructions')
tokenizer.save_pretrained('./fine-tuned-gpt2-instructions')


Step,Training Loss
2500,1.9972
5000,1.7792


('./fine-tuned-gpt2-instructions/tokenizer_config.json',
 './fine-tuned-gpt2-instructions/special_tokens_map.json',
 './fine-tuned-gpt2-instructions/vocab.json',
 './fine-tuned-gpt2-instructions/merges.txt',
 './fine-tuned-gpt2-instructions/added_tokens.json')

In [None]:
# Load the fine-tuned model and tokenizer
model_path = './fine-tuned-gpt2-instructions'
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Define a function to generate recipes
def generate_recipe(ingredient_list):
    # Prepare the prompt with the ingredient list
    prompt = f"Ingredients: {', '.join(ingredient_list)}\n\nRecipe:\n"

    # Tokenize the prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Create attention mask
    attention_mask = input_ids.ne(tokenizer.pad_token_id).long()

    # Generate the recipe
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=300,  # Adjust the max_length as needed
        num_return_sequences=1,
        temperature=0.3,  # Control the creativity of the generated text
        top_p=0.9,  # Use nucleus sampling
        do_sample=True,  # Enable sampling to generate more diverse outputs
        pad_token_id=tokenizer.eos_token_id  # Set pad_token_id to eos_token_id
    )

    # Decode the generated text
    generated_recipe = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_recipe

# Example usage
ingredient_list = ["2 eggs", "1 cup of flour", "1 cup of milk", "1 tsp of baking powder"]
generated_recipe = generate_recipe(ingredient_list)

print(generated_recipe)


Ingredients: 2 eggs, 1 cup of flour, 1 cup of milk, 1 tsp of baking powder

Recipe:

Prepare a grill for medium-high heat.
Place the chicken breasts on a grill rack set over medium-high heat.
While the chicken is cooking, heat the oil in a large skillet over medium-high. Add the onion and cook, stirring occasionally, until softened, about 5 minutes. Add the garlic and cook, stirring, until fragrant, about 3 minutes. Add the tomatoes and cook, stirring, until just beginning to soften, about 2 minutes. Add the tomatoes and cook, stirring, until just beginning to soften, about 2 minutes. Add the remaining 1/2 cup flour and cook, stirring, until just beginning to soften, about 2 minutes. Add the remaining 1/2 cup milk and cook, stirring, until just beginning to soften, about 2 minutes. Add the eggs, 1 cup of flour, 1 tsp of baking powder, and 1/2 tsp salt and cook, stirring, until just beginning to soften, about 2 minutes. Add the remaining 1/2 cup milk and cook, stirring, until just begin

In [None]:
from datasets import load_dataset, DatasetDict
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from google.colab import drive
import os
import torch
from datasets import load_metric

In [None]:

# Mount Google Drive
drive.mount('/content/drive')

# Load the CSV file
dataset = load_dataset('csv', data_files='/content/drive/MyDrive/RecipeGPT/Food Ingredients and Recipe Dataset with Image Name Mapping.csv')

# Create a train-validation split if it does not exist
if 'train' not in dataset or 'validation' not in dataset:
    train_test_data = dataset['train'].train_test_split(test_size=0.1)
    dataset = DatasetDict({
        'train': train_test_data['train'],
        'validation': train_test_data['test']
    })

# Function to clean the Instructions column
def clean_instructions(entry):
    # Check if the entry is a string, if not, convert it to an empty string
    if isinstance(entry, str):
        return entry
    else:
        return ""

# Extract and clean the Instructions column for each split
cleaned_datasets = {}
for split in dataset.keys():
    cleaned_datasets[split] = dataset[split].map(lambda x: {'text': clean_instructions(x['Instructions'])})

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Add a padding token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    tokens = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)
    tokens['labels'] = tokens['input_ids'].copy()  # Use input_ids as labels for language modeling
    return tokens

# Tokenize the cleaned datasets for each split
tokenized_datasets = {}
for split in cleaned_datasets.keys():
    tokenized_datasets[split] = cleaned_datasets[split].map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,  # Reduced batch size
    per_device_eval_batch_size=8,   # Reduced batch size
    logging_steps=500,
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
    learning_rate=3e-5,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'] if 'validation' in tokenized_datasets else tokenized_datasets['train'],  # Use validation set if available
)

# Train the model
trainer.train()

# Define the path to save the model in Google Drive
drive_model_path = '/content/drive/MyDrive/RecipeGPT/fine-tuned-gpt2-instructions'
os.makedirs(drive_model_path, exist_ok=True)

# Save the model and tokenizer to Google Drive
model.save_pretrained(drive_model_path)
tokenizer.save_pretrained(drive_model_path)

print(f"Model and tokenizer saved to {drive_model_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Step,Training Loss
500,2.2397
1000,2.0429
1500,1.9665
2000,1.8854
2500,1.8521
3000,1.8247
3500,1.7867
4000,1.7808
4500,1.7728


Model and tokenizer saved to /content/drive/My Drive/fine-tuned-gpt2-instructions


### Saving the model

In [None]:
# Define the path to save the model in Google Drive
drive_model_path = '/content/drive/MyDrive/RecipeGPT/fine-tuned-gpt2-instructions'
os.makedirs(drive_model_path, exist_ok=True)

# Save the model and tokenizer to Google Drive
model.save_pretrained(drive_model_path)
tokenizer.save_pretrained(drive_model_path)

print(f"Model and tokenizer saved to {drive_model_path}")


Model and tokenizer saved to /content/drive/MyDrive/RecipeGPT/fine-tuned-gpt2-instructions


### Generating Recipes

In [None]:
# Load the fine-tuned model and tokenizer
model_path = '/content/drive/MyDrive/RecipeGPT/fine-tuned-gpt2-instructions'
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Add a padding token if it's not already set
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

In [None]:
def generate_recipe(ingredient_list, max_length=300, num_return_sequences=1, temperature=0.7, top_p=0.9):
    # Prepare the prompt with the ingredient list
    prompt = f"Ingredients: {', '.join(ingredient_list)}\n\nRecipe:\n"

    # Tokenize the prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)

    # Generate the recipe
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text


In [None]:
# Example ingredient list
ingredient_list = ["2 eggs", "1 cup of flour", "1 cup of milk", "1 tsp of baking powder"]

# Generate the recipe
generated_recipe = generate_recipe(ingredient_list)

print(generated_recipe)


Ingredients: 2 eggs, 1 cup of flour, 1 cup of milk, 1 tsp of baking powder

Recipe:
Place 1/4 cup of flour, 1/2 cup of milk, 1 tsp of baking powder in a large bowl. Mix in 1/4 cup of water. Add yeast and whisk until smooth.
Divide dough in half and roll each half into a ball. Transfer to a 13x9" baking dish. Cover and chill for at least 2 hours.
To make the cake: Place the eggs and milk in a large bowl and stir in the flour, baking powder, salt, and pepper. Whisk until combined.
Using an electric mixer, beat the flour mixture until smooth, about 5 minutes. With the mixer on low speed, gradually add the eggs and milk and beat until combined.
Divide dough in half and roll each half into a ball. Cover and chill for at least 2 hours.
To make the filling: Place the dough in a large bowl and stir in the egg mixture. Cover and chill for at least 2 hours.
To make the filling: Place the dough in a large bowl and stir in the sugar and vanilla. Whisk until combined.
Add the milk and vanilla mixtu

In [None]:
ingredient_list = ["200g spaghetti", "100g pancetta", "2 large eggs", "50g pecorino cheese", "50g parmesan", "2 cloves of garlic", "Salt", "Black pepper"]
generated_recipe = generate_recipe(ingredient_list)
print(generated_recipe)


Ingredients: 200g spaghetti, 100g pancetta, 2 large eggs, 50g pecorino cheese, 50g parmesan, 2 cloves of garlic, Salt, Black pepper

Recipe:
Pizza Dough:
Pizza Dough:
1/2 cup flour, 1 tsp. salt, and 2 cups water
1/2 cup sugar
3/4 cup water
2 tsp. pepper
Preheat oven to 350°F. Butter a 9x13" baking dish.
In a medium bowl, whisk together flour, baking powder, baking soda, and salt.
In a medium bowl, combine remaining 1/2 cup sugar, 1/2 cup flour, baking soda, and salt.
In a medium bowl, combine eggs, remaining 1/2 cup sugar, and 1/2 cup water.
In a medium bowl, whisk together pasta, egg mixture, Parmesan, and garlic.
In a large bowl, whisk together breadcrumbs and remaining 1/2 cup sugar, then add flour mixture.
In a large bowl, whisk together remaining 1/2 cup sugar, 1/2 cup flour, and salt.
In a medium bowl, whisk together egg mixture, Parmesan, and garlic.
In a large bowl, whisk together breadcrumbs and remaining 1/2 cup sugar, then add breadcrumbs and remaining 1/2 cup sugar.
Pour ba

In [None]:
ingredient_list = ["1 kg chicken pieces", "2 onions", "3 tomatoes", "2 teaspoons ginger-garlic paste", "2 teaspoons chili powder", "1 teaspoon turmeric powder", "2 teaspoons garam masala", "Salt", "Oil"]
generated_recipe = generate_recipe(ingredient_list)
print(generated_recipe)


Ingredients: 1 kg chicken pieces, 2 onions, 3 tomatoes, 2 teaspoons ginger-garlic paste, 2 teaspoons chili powder, 1 teaspoon turmeric powder, 2 teaspoons garam masala, Salt, Oil

Recipe:
Chile: 1/4 cup cumin, 1/4 cup paprika, 1/4 teaspoon paprika powder, 1/4 teaspoon black pepper, 1/4 teaspoon cayenne, 1/4 teaspoon salt, 1/2 teaspoon black pepper, 1/4 teaspoon black pepper paste, 1/4 teaspoon coriander, 1/4 teaspoon black pepper paste, 1/4 teaspoon coriander, 1/2 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander paste, 1/4 teaspoon coriander pa

In [None]:
ingredient_list = ["1 kg beef chuck, cut into chunks", "2 tablespoons flour", "2 tablespoons olive oil", "1 large onion", "2 cloves garlic", "3 carrots", "2 potatoes", "2 cups beef broth", "1 cup red wine", "2 bay leaves", "Salt", "Pepper"]
generated_recipe = generate_recipe(ingredient_list)
print(generated_recipe)


Ingredients: 1 kg beef chuck, cut into chunks, 2 tablespoons flour, 2 tablespoons olive oil, 1 large onion, 2 cloves garlic, 3 carrots, 2 potatoes, 2 cups beef broth, 1 cup red wine, 2 bay leaves, Salt, Pepper

Recipe:
Preheat oven to 300°F. Cook beef in a large pot of boiling salted water until tender and cooked through, about 5 minutes per side. Drain. Transfer to a bowl with a slotted spoon and let cool.
Drain beef.
Combine all ingredients except beef in a large bowl and toss with a fork. Season to taste with salt and pepper.
Divide beef into 4 portions and toss to coat. Let sit until ready to use.
Divide beef between 2 large bowls and toss to coat.
