In [1]:
import glob

pdf_paths = glob.glob('../data/pdf/*.pdf')
print('[INFO] PDF files:', pdf_paths)

[INFO] PDF files: ['../data/pdf/Childrens2021.pdf', '../data/pdf/Wine2024.pdf', '../data/pdf/Lunch2024.pdf']


In [2]:
import pdfplumber

def pdf_to_text(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

pdf_text = pdf_to_text(pdf_paths[0])
print(f'[INFO] PDF content for {pdf_paths[0]}\n\n', pdf_text)

[INFO] PDF content for ../data/pdf/Childrens2021.pdf

 600 S. Brea Blvd. | Brea, CA 92821
(10 years and under)
These Prices are valid only in combination with a
regularly priced adult entree purchase.
Spaghetti with Tomato Sauce ...............................6.95
Spaghetti with Meat Sauce ..................................7.25
Spaghetti with Meat Ball .......................................7.25
Shell Macaroni with Meat Sauce .........................7.25
Ravioli (Cheese or Beef) with Tomato Sauce ......7.25
Lasagne with Tomato Sauce ................................7.50
Chicken Tenders (2)
Served with a side of Spaghetti .................7.50
Baked Pasta Shells with Cheese and Meat Sauce ..7.50
INCLUDES:
Salad with Choice of Dressing and Garlic Bread
or
Ministrone Soup and Garlic Bread
and
Ice Cream for Dessert
Add a Small Drink for $1.25



In [3]:
from pprint import pprint

def chunk_text(text, chunk_size=5000, overlap=400):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

chunks = chunk_text(pdf_text)
print('[INFO] Content after chunking:')
pprint(chunks)

[INFO] Content after chunking:
['600 S. Brea Blvd. | Brea, CA 92821\n'
 '(10 years and under)\n'
 'These Prices are valid only in combination with a\n'
 'regularly priced adult entree purchase.\n'
 'Spaghetti with Tomato Sauce ...............................6.95\n'
 'Spaghetti with Meat Sauce ..................................7.25\n'
 'Spaghetti with Meat Ball .......................................7.25\n'
 'Shell Macaroni with Meat Sauce .........................7.25\n'
 'Ravioli (Cheese or Beef) with Tomato Sauce ......7.25\n'
 'Lasagne with Tomato Sauce ................................7.50\n'
 'Chicken Tenders (2)\n'
 'Served with a side of Spaghetti .................7.50\n'
 'Baked Pasta Shells with Cheese and Meat Sauce ..7.50\n'
 'INCLUDES:\n'
 'Salad with Choice of Dressing and Garlic Bread\n'
 'or\n'
 'Ministrone Soup and Garlic Bread\n'
 'and\n'
 'Ice Cream for Dessert\n'
 'Add a Small Drink for $1.25\n']


### Data standardization

In [4]:
STANDARDIZATION_PROMPT = """I'm going to provide some text describing various menu items. Convert this text into a JSON array of objects, where each object represents a specific menu item with the following structure:
```json
{
    "item": "Name of the specific dish, appetizer, entrée, drink, etc.",
    "description": "Detailed description of the menu item.",
    "category": "Decide appropriate category such as Children's Menu, Appetizer, Entrée, Dessert, Drink, etc. The category could be multiple.",
    "price": Numeric value representing the item's price (e.g., 6.95),
    "additional_info": "Additional details included with the item, such as sides, drinks, or desserts. If none, leave blank or omit this field."
}
```

Important Guidelines:
- Specific Items Only: The item names must represent specific dishes, entrées, appetizers, or drinks, NOT general categories.
- Complete Entries Only: If the provided data chunk contains incomplete or unclear information at the beginning or end, skip these incomplete entries to ensure accuracy.
- Accurate Categories: Assign each item an appropriate category based on the data provided or inferred from context.
- Pricing: Include accurate numerical pricing data whenever available; if pricing information isn't provided, omit the field.
- Additional Info: Clearly include any mentioned sides, salads, soups, desserts, beverages, or extras that come with the item. If no additional info is provided, leave this field blank or omit it entirely.

NOTE: "Category" should be the a 2-4 categories/common search terms describing the item, for example (but not limited to) -> dessert, side, chicken, sandwich, cocktail, drink, burger, salad, etc etc -> Try to use multiple categories/descriptors
THIS DATA IS CHUNKED SO IF THE DATA AT THE BEGINNING OR END (FOR TITLE OR DESCRIPTION) SEEMS INCOMPLETE COMPARED TO THE REST OF THE ENTRIES, JUST SKIP IT
```
"""

In [5]:
from langchain_ollama import ChatOllama

llm = ChatOllama(model='deepseek-r1:14b', temperature=0)

In [6]:
import json
import re

all_results = []

def clean_json_from_response(response):
    json_match = re.search(r'```(?:json)?\s*([\s\S]*?)```', response)
    if json_match:
        json_str = json_match.group(1)
    else:
        json_str = response
    
    # Clean any thinking tags that might be present
    json_str = re.sub(r'<think>[\s\S]*?</think>', '', json_str)
    
    # Try to parse JSON
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        print(f"Failed to parse JSON from response: {response[:100]}...")
        return None

menus = []
for chunk in chunks:
    response = llm.invoke([
        (
            "system",
            STANDARDIZATION_PROMPT,
        ),
        ("human", f"ONLY RESPOND WITH THE JSON OBJECT AND NOTHING ELSE. CONVERT THE FULL DATA. DO NOT TRUNCATE OR STOP EARLY. FULL TEXT! HERE IS THE DATA I WANT YOU TO CONVERT: {chunk}"),
    ])

    result = clean_json_from_response(response.content)
    menus.extend(result)

In [7]:
import os
import json

# Create directory if it doesn't exist
os.makedirs("../data/json", exist_ok=True)
fname = pdf_paths[0].split('/')[-1].split('.')[0]

# Write the menus data to a jsonl file
with open(f"../data/json/{fname}.jsonl", "w") as f:
    for item in menus:
        f.write(json.dumps(item) + "\n")

print(f"[INFO] Saved {len(menus)} menu items to ../data/json/{fname}.jsonl")

[INFO] Saved 8 menu items to ../data/json/Childrens2021.jsonl
