In [1]:
!pip install requests beautifulsoup4 pandas selenium

Collecting beautifulsoup4
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting selenium
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Using cached soupsieve-2.5-py3-none-any.whl.metadata (4.7 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.0-py3-none-any.whl.metadata (8.8 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pysocks!=1.5.7,<2.0,>=1.5.6 (from urllib3[socks]<3,>=1.26->selenium)
  Downloading PySocks-1.7.1-py3-n

In [14]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

In [15]:
def scrape_recipes(category_urls, max_pages=5):
    recipes = []

    # Set up the Selenium WebDriver
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode

    # Update this path to the location where you have placed chromedriver
    service = Service("/opt/homebrew/bin/chromedriver")
    driver = webdriver.Chrome(service=service, options=chrome_options)

    for category_url in category_urls:
        for page in range(1, max_pages + 1):
            url = f"{category_url}?page={page}"
            print(url)
            driver.get(url)

            # Allow time for the page to fully load
            time.sleep(3)  # Adjust sleep time if necessary

            # Find all recipe tiles on the page
            recipe_elements = driver.find_elements(
                By.CSS_SELECTOR, ".fd-tile.fd-recipe"
            )
            for element in recipe_elements:
                title = element.find_element(By.CSS_SELECTOR, "h2.title").text.strip()
                recipe_url = element.find_element(
                    By.CSS_SELECTOR, "h2.title a"
                ).get_attribute("href")
                image_url = element.find_element(By.CSS_SELECTOR, "img").get_attribute(
                    "src"
                )
                author = element.find_element(
                    By.CSS_SELECTOR, "div.author a"
                ).text.strip()
                rating_percent = (
                    element.find_element(
                        By.CSS_SELECTOR, "div.fd-rating span.fd-rating-percent"
                    )
                    .get_attribute("style")
                    .split(":")[1]
                    .strip("%")
                )
                cook_time = element.find_element(
                    By.CSS_SELECTOR, "div.cook-time"
                ).text.strip()

                recipes.append(
                    {
                        "title": title,
                        "url": recipe_url,
                        "image_url": image_url,
                        "author": author,
                        "rating_percent": rating_percent,
                        "cook_time": cook_time,
                    }
                )

    driver.quit()
    return recipes

In [16]:
category_urls = [
    "https://www.food.com/recipe/all/trending",
    "https://www.food.com/recipe/all/quick-easy",
    "https://www.food.com/recipe/all/healthy",
    "https://www.food.com/recipe/all/editor-pick",
    "https://www.food.com/recipe/all/newest",
]

In [17]:
recipes = scrape_recipes(category_urls, max_pages=1)

# Print or process the list of recipes
for recipe in recipes:
    print(recipe)

https://www.food.com/recipe/all/trending?page=1
https://www.food.com/recipe/all/quick-easy?page=1
https://www.food.com/recipe/all/healthy?page=1
https://www.food.com/recipe/all/editor-pick?page=1
https://www.food.com/recipe/all/newest?page=1
{'title': "BARB'S GUMBO", 'url': 'https://www.food.com/recipe/barbs-gumbo-82288', 'image_url': 'https://img.sndimg.com/food/image/upload/w_483,h_372,fl_progressive,q_80,c_fill/v1/img/recipes/82/28/8/cB14froTlmpDfRF49wmQ_gumbo%20SITE-3.jpg', 'author': 'Barb G.', 'rating_percent': ' 98%;', 'cook_time': '1.1 h'}
{'title': "BARB'S GUMBO", 'url': 'https://www.food.com/recipe/barbs-gumbo-82288', 'image_url': 'https://img.sndimg.com/food/image/upload/w_483,h_372,fl_progressive,q_80,c_fill/v1/img/recipes/82/28/8/cB14froTlmpDfRF49wmQ_gumbo%20SITE-3.jpg', 'author': 'Barb G.', 'rating_percent': ' 98%;', 'cook_time': '1.1 h'}
{'title': 'BOURBON CHICKEN', 'url': 'https://www.food.com/recipe/bourbon-chicken-45809', 'image_url': 'https://img.sndimg.com/food/image

In [24]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def extract_recipe_details(recipe_url):
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # headless mode

    service = Service("/opt/homebrew/bin/chromedriver")
    driver = webdriver.Chrome(service=service, options=chrome_options)

    driver.get(recipe_url)

    # Wait for directions and ingredients to be present
    wait = WebDriverWait(driver, 10)
    directions_list = wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".direction-list li"))
    )
    ingredient_elements = wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".ingredient-list li"))
    )

    # Extract directions
    directions = "\n".join([li.text.strip() for li in directions_list])

    # Extract ingredients
    ingredients_dict = {}
    for element in ingredient_elements:
        quantity = element.find_element(
            By.CSS_SELECTOR, ".ingredient-quantity"
        ).text.strip()
        text = element.find_element(By.CSS_SELECTOR, ".ingredient-text").text.strip()
        ingredients_dict[quantity] = text

    driver.quit()

    return {"directions": directions, "ingredients": ingredients_dict}

In [25]:
all_recipes = {}
for recipe in recipes:
    try:
        details = extract_recipe_details(recipe["url"])
        all_recipes[recipe["url"]] = details
    except Exception as e:
        print(f"Failed to extract details for {recipe['url']}: {e}")

Failed to extract details for https://www.food.com/recipe/creamy-cajun-chicken-pasta-39087: Message: no such element: Unable to locate element: {"method":"css selector","selector":".ingredient-quantity"}
  (Session info: chrome-headless-shell=127.0.6533.89); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000103279088 cxxbridge1$str$ptr + 1887276
1   chromedriver                        0x0000000103271764 cxxbridge1$str$ptr + 1856264
2   chromedriver                        0x0000000102e8082c cxxbridge1$string$len + 88524
3   chromedriver                        0x0000000102ec4834 cxxbridge1$string$len + 367060
4   chromedriver                        0x0000000102ebae38 cxxbridge1$string$len + 327640
5   chromedriver                        0x0000000102efc48c cxxbridge1$string$len + 595500
6   chromedriver                        0x0000000