In [1]:
import ssl
import certifi
import urllib.request


ssl._create_default_https_context = lambda: ssl.create_default_context(cafile=certifi.where())

import spacy
from spacy_layout import spaCyLayout
import spacy_curated_transformers

nlp = spacy.load('en_core_web_trf')
layout = spaCyLayout(nlp)

doc = layout("cookbooks/pdfs/TheFeluCookbook_V2.0.pdf")

page_chunks = []
for page in doc.spans:
    if page.label_ == "PAGE":
        page_chunks.append(page.text)


  from .autonotebook import tqdm as notebook_tqdm
Downloading detection model, please wait. This may take several minutes depending upon your network connection.
Downloading recognition model, please wait. This may take several minutes depending upon your network connection.
  "scores": score[score > threshold],


AttributeError: 'str' object has no attribute 'label_'

In [10]:
layout_spans = doc.spans["layout"]

page_chunks = []
current_page = []

for span in layout_spans:
    if span.label_ == "PAGE":
        if current_page:
            page_chunks.append(" ".join([s.text for s in current_page]))
            current_page = []
    current_page.append(span)

# Add last page
if current_page:
    page_chunks.append(" ".join([s.text for s in current_page]))

In [14]:
import fitz  # PyMuPDF
import os

# Path to your PDF
pdf_path = "cookbooks/pdfs/TheFeluCookbook_V2.0.pdf"

# Output folder (change if you want)
output_dir = "cookbooks/pdfs/FeluCookbook"
os.makedirs(output_dir, exist_ok=True)

# Load the PDF
doc = fitz.open(pdf_path)

# Loop through pages and save text
for i, page in enumerate(doc):
    text = page.get_text()
    filename = os.path.join(output_dir, f"page_{i+1}.txt")
    
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"Saved Page {i+1} to {filename}")


Saved Page 1 to cookbooks/pdfs/FeluCookbook/page_1.txt
Saved Page 2 to cookbooks/pdfs/FeluCookbook/page_2.txt
Saved Page 3 to cookbooks/pdfs/FeluCookbook/page_3.txt
Saved Page 4 to cookbooks/pdfs/FeluCookbook/page_4.txt
Saved Page 5 to cookbooks/pdfs/FeluCookbook/page_5.txt
Saved Page 6 to cookbooks/pdfs/FeluCookbook/page_6.txt
Saved Page 7 to cookbooks/pdfs/FeluCookbook/page_7.txt
Saved Page 8 to cookbooks/pdfs/FeluCookbook/page_8.txt
Saved Page 9 to cookbooks/pdfs/FeluCookbook/page_9.txt
Saved Page 10 to cookbooks/pdfs/FeluCookbook/page_10.txt
Saved Page 11 to cookbooks/pdfs/FeluCookbook/page_11.txt
Saved Page 12 to cookbooks/pdfs/FeluCookbook/page_12.txt
Saved Page 13 to cookbooks/pdfs/FeluCookbook/page_13.txt
Saved Page 14 to cookbooks/pdfs/FeluCookbook/page_14.txt
Saved Page 15 to cookbooks/pdfs/FeluCookbook/page_15.txt
Saved Page 16 to cookbooks/pdfs/FeluCookbook/page_16.txt
Saved Page 17 to cookbooks/pdfs/FeluCookbook/page_17.txt
Saved Page 18 to cookbooks/pdfs/FeluCookbook/page

In [17]:
from pydantic import BaseModel, Field, model_validator
from typing import List, Literal
import re


class Ingredient(BaseModel):
    name: str = Field(..., description="Name of the ingredient", example="Flour")
    quantity: str = Field(..., description="Amount required (with units)", example="2 cups")

    @model_validator(mode="before")
    @classmethod
    def accept_string_or_dict(cls, value):
        if isinstance(value, str):
            # naive split: quantity first, rest is name
            match = re.match(r"^(\d+[^a-zA-Z]*)\s+(.*)", value.strip())
            if match:
                return {"quantity": match.group(1).strip(), "name": match.group(2).strip()}
            else:
                # fallback: put full string as name, unknown quantity
                return {"quantity": "unknown", "name": value.strip()}
        if isinstance(value, dict):
            return value
        raise ValueError("Ingredient must be a string or a dictionary with 'name' and 'quantity'.")


class Instruction(BaseModel):
    step: str = Field(..., description="Cooking step")

    @model_validator(mode="before")
    @classmethod
    def accept_string_or_dict(cls, value):
        if isinstance(value, str):
            return {"step": value}
        if isinstance(value, dict) and "step" in value:
            return value
        raise ValueError("Instruction must be a string or a dictionary with 'step'.")


class Recipe(BaseModel):
    recipe_title: str = Field(..., description="Title of the recipe")
    ingredients: List[Ingredient] = Field(..., description="List of ingredients with quantities")
    instructions: List[Instruction] = Field(..., description="Step-by-step instructions")
    servings: int = Field(..., description="Number of servings")
    calories: int = Field(..., description="Calories per serving")
    carbs: int = Field(..., description="Carbohydrates per serving (g)")
    protein: int = Field(..., description="Protein per serving (g)")
    fat: int = Field(..., description="Fat per serving (g)")
    meal: Literal["breakfast", "lunch", "dinner"] = Field(..., description="Meal type")


In [13]:
import openai
import os

with open("cookbooks/pdfs/FeluCookbook/page_153.txt", "r") as file:
    recipe_text = file.read()

prompt = f"""
Extract the following structured fields from the recipe below and return them as JSON:
- recipe_title: string
- ingredients: list of objects with 'name' and 'quantity'
- instructions: list of steps with numbers, ex. 1. 2. etc.
- servings: integer
- calories: integer (per serving)
- carbs: integer (grams per serving)
- protein: integer (grams per serving)
- fat: integer (grams per serving)
- meal: one of ['breakfast', 'lunch', 'dinner']

Here is the recipe text:

{recipe_text}
"""


In [14]:
client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

response = openai.responses.create(
    model="gpt-4.1-nano",
    instructions="You are a helpful assistant that extracts structured data from recipe text.",
    input = prompt,
    temperature=0
)

response_text = response.output_text

In [18]:
import json

try:
    recipe_data = json.loads(response_text)
    recipe = Recipe(**recipe_data)
    print(recipe)
except Exception as e:
    print("Error parsing recipe:", e)


recipe_title='Pesto on chicken with potatoes' ingredients=[Ingredient(name='pesto', quantity='125 g'), Ingredient(name='water', quantity='1'), Ingredient(name='chicken breast', quantity='180 g'), Ingredient(name='medium onion', quantity='50 g'), Ingredient(name='potatoes', quantity='250 g'), Ingredient(name='oil', quantity='6 g'), Ingredient(name='Salt', quantity='to taste'), Ingredient(name='black pepper', quantity='to taste')] instructions=[Instruction(step='Peel and cut the potatoes into thick slices, then into fries, and then again into bite-size cubes (2x2cm or 1x1 inch).'), Instruction(step='Microwave the potatoes in a bowl for 5 minutes at 600W.'), Instruction(step='Prepare 1 serving of pesto. Dice the onion and cut the chicken breast into equally sized pieces.'), Instruction(step='Add oil to a pan over medium heat and add the microwaved potatoes with a pinch of salt. Fry for 5 minutes until slightly golden brown.'), Instruction(step='Add another drizzle of oil and add the onion

In [19]:
import pandas as pd

def recipe_to_row(recipe: Recipe) -> pd.DataFrame:
    data = {
        "recipe_title": recipe.recipe_title,
        "servings": recipe.servings,
        "meal": recipe.meal,
        "calories": recipe.calories,
        "carbs": recipe.carbs,
        "protein": recipe.protein,
        "fat": recipe.fat,
        "ingredients": "; ".join([f"{ing.quantity} {ing.name}" for ing in recipe.ingredients]),
        "instructions": " | ".join([instr.step for instr in recipe.instructions]),
    }
    return pd.DataFrame([data])


In [22]:
df = recipe_to_row(recipe)
df.to_csv("db/recipe_data.csv", index=False)
