In [1]:
!pip install requests beautifulsoup4 pandas selenium

Collecting beautifulsoup4
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting selenium
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Using cached soupsieve-2.5-py3-none-any.whl.metadata (4.7 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.0-py3-none-any.whl.metadata (8.8 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pysocks!=1.5.7,<2.0,>=1.5.6 (from urllib3[socks]<3,>=1.26->selenium)
  Downloading PySocks-1.7.1-py3-n

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

In [3]:
import csv
def write_recipes_to_csv(recipes, page_number):
    filename = f"recipes_page_{page_number}.csv"
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(
            file,
            fieldnames=[
                "title",
                "url",
                "image_url",
                "author",
                "rating_percent",
                "cook_time",
            ],
        )
        writer.writeheader()
        writer.writerows(recipes)
    print(f"Data written to {filename}")

In [4]:
import requests
def fetch_recipe_data(page_number, timeout=60):
    url = f"https://api.food.com/services/mobile/fdc/search/sectionfront?pn={page_number}&recordType=Recipe&sortBy=trending&collectionId=17"

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses (4xx and 5xx)
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        if "API rate limit" in str(e):  # Check if it's an API rate limit error
            raise
        return {}

In [20]:
import random
def scrape_recipes(category_urls, target_unique_titles=82212):
    unique_titles = set()  # Use a set to track unique titles
    recipes = []
    last_page_number = 17891

    for category_url in category_urls:
        page_number = 17891
        while len(unique_titles) < target_unique_titles:
            print(f"Fetching URL: {category_url}?page={page_number}")
            try:
                data = fetch_recipe_data(page_number)
            except requests.exceptions.ReadTimeout as e:
                print(f"Read timeout error: {e}")
                # Save state and return if read timeout error
                return recipes, last_page_number
            except Exception as e:
                print(f"Error fetching data: {e}")
                if "API rate limit" in str(e):
                    # Save state and return if rate limit error
                    print("API rate limit reached. Saving progress and exiting.")
                    return recipes, last_page_number

                # Random sleep to avoid hitting API rate limits
                time.sleep(random.uniform(1, 5))
                continue

            if not data.get("response") or not data.get("response").get("results"):
                print("No results found or no response, ending.")
                break

            for recipe in data.get("response").get("results", []):
                title = recipe.get("main_title", "").strip()
                if title in unique_titles:
                    continue  # Skip duplicate titles

                recipe_url = recipe.get("record_url")
                image_url = recipe.get("recipe_photo_url")
                author = recipe.get("main_username")
                rating_percent = recipe.get("main_rating")
                cook_time = recipe.get("recipe_totaltime")

                recipes.append(
                    {
                        "title": title,
                        "url": recipe_url,
                        "image_url": image_url,
                        "author": author,
                        "rating_percent": rating_percent,
                        "cook_time": cook_time,
                    }
                )

                unique_titles.add(title)  # Add title to the set

            print(f"Current count of unique recipes: {len(unique_titles)}")

            if len(unique_titles) >= target_unique_titles:
                break  # Stop if the target number of unique titles is reached

            # Write data to CSV every 100 pages
            if page_number % 100 == 0:
                write_recipes_to_csv(recipes, page_number)
                recipes = []  # Clear recipes list after saving to CSV

            print("Moving to next page")
            page_number += 1  # Move to the next page
            last_page_number = page_number

    # Write remaining data to CSV if needed
    if recipes:
        write_recipes_to_csv(recipes, last_page_number)

    return recipes, last_page_number

In [6]:
category_urls = [
    "https://www.food.com/recipe/all/trending",
    "https://www.food.com/recipe/all/quick-easy",
    "https://www.food.com/recipe/all/healthy",
    "https://www.food.com/recipe/all/editor-pick",
    "https://www.food.com/recipe/all/newest",
]

In [21]:
recipes = scrape_recipes(category_urls)

# Print or process the list of recipes
for recipe in recipes:
    print(recipe)

Fetching URL: https://www.food.com/recipe/all/trending?page=17891
Current count of unique recipes: 10
Moving to next page
Fetching URL: https://www.food.com/recipe/all/trending?page=17892
Current count of unique recipes: 20
Moving to next page
Fetching URL: https://www.food.com/recipe/all/trending?page=17893
Current count of unique recipes: 30
Moving to next page
Fetching URL: https://www.food.com/recipe/all/trending?page=17894
Current count of unique recipes: 40
Moving to next page
Fetching URL: https://www.food.com/recipe/all/trending?page=17895
Current count of unique recipes: 50
Moving to next page
Fetching URL: https://www.food.com/recipe/all/trending?page=17896
Current count of unique recipes: 60
Moving to next page
Fetching URL: https://www.food.com/recipe/all/trending?page=17897
Current count of unique recipes: 70
Moving to next page
Fetching URL: https://www.food.com/recipe/all/trending?page=17898
Current count of unique recipes: 80
Moving to next page
Fetching URL: https://ww

In [10]:
len(recipes)

2

In [92]:
# Extract titles
titles = [recipe["title"] for recipe in recipes]

# Find unique titles
unique_titles = set(titles)

# Number of unique titles
num_unique_titles = len(unique_titles)

print(f"Number of unique titles: {num_unique_titles}")

Number of unique titles: 2504


In [1]:
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Counter for errors
error_count = 0


def extract_recipe_details(recipe_url, cook_time, recipe_rating):
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # headless mode

    service = Service("/opt/homebrew/bin/chromedriver")
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        driver.get(recipe_url)

        # Wait for directions and ingredients to be present
        wait = WebDriverWait(driver, 3)
        directions_list = wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".direction-list li"))
        )
        ingredient_elements = wait.until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, ".ingredient-list li")
            )
        )

        # Extract directions
        directions = "\n".join([li.text.strip() for li in directions_list])

        # Extract ingredients
        ingredients_dict = {}
        for element in ingredient_elements:
            quantity = element.find_element(
                By.CSS_SELECTOR, ".ingredient-quantity"
            ).text.strip()
            text = element.find_element(
                By.CSS_SELECTOR, ".ingredient-text"
            ).text.strip()
            ingredients_dict[quantity] = text

        return {
            "directions": directions,
            "ingredients": ingredients_dict,
            "cook_time": cook_time,
            "recipe_rating": recipe_rating,
        }

    except Exception as e:
        global error_count
        error_count += 1
        return {
            "directions": "N/A",
            "ingredients": {},
            "cook_time": cook_time,
            "recipe_rating": recipe_rating,
        }

    finally:
        driver.quit()


def process_csv_file(input_file, output_file):
    global error_count
    print(f"Processing file: {input_file}")

    # Read the CSV file
    recipes = pd.read_csv(input_file)

    # Prepare to collect results
    batch_size = 10
    all_recipes = []

    # Open the output file in append mode
    with open(output_file, "a") as f:
        # Write the header if the file is empty
        if os.stat(output_file).st_size == 0:
            pd.DataFrame(
                columns=[
                    "title",
                    "url",
                    "image_url",
                    "author",
                    "rating_percent",
                    "cook_time",
                    "directions",
                    "ingredients",
                ]
            ).to_csv(f, index=False)

    # Process recipes in batches
    for i, (_, recipe) in enumerate(recipes.iterrows()):
        try:
            details = extract_recipe_details(
                recipe["url"], recipe["cook_time"], recipe["rating_percent"]
            )
            all_recipes.append(
                {
                    "title": recipe["title"],
                    "url": recipe["url"],
                    "image_url": recipe["image_url"],
                    "author": recipe["author"],
                    "rating_percent": recipe["rating_percent"],
                    "cook_time": recipe["cook_time"],
                    "directions": details["directions"],
                    "ingredients": details["ingredients"],
                }
            )

            # Write to CSV every 10 recipes
            if (i + 1) % batch_size == 0:
                df = pd.DataFrame(all_recipes)
                df.to_csv(output_file, mode="a", header=False, index=False)
                all_recipes = []  # Reset list for the next batch
                print(f"Processed and saved batch of {batch_size} recipes.")

        except Exception:
            # Log general error message
            print(f"Failed to extract details for {recipe['url']}")

    # Process any remaining recipes that didn't fill a complete batch
    if all_recipes:
        df = pd.DataFrame(all_recipes)
        df.to_csv(output_file, mode="a", header=False, index=False)
        print(f"Processed and saved final batch of {len(all_recipes)} recipes.")

    # Print the total error count
    print(f"Total errors encountered: {error_count}")

    # Delete the input file once done
    os.remove(input_file)


input_directory = "/Users/snehsuresh/Desktop/Projects/recipe-recommender-system-pipeline/notebooks/data"
output_directory = "/Users/snehsuresh/Desktop/Projects/recipe-recommender-system-pipeline/notebooks/data/output"

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Get the list of CSV files and sort them
csv_files = [f for f in os.listdir(input_directory) if f.endswith(".csv")]
csv_files.sort()  # Sort filenames in ascending order

# Process each CSV file in the sorted order
for filename in csv_files:
    input_file = os.path.join(input_directory, filename)
    output_file = os.path.join(output_directory, f"processed_{filename}")
    process_csv_file(input_file, output_file)

Failed to extract details for https://www.food.com/recipe/oatmeal-chocolate-toffee-squares-198870: Message: no such element: Unable to locate element: {"method":"css selector","selector":".ingredient-quantity"}
  (Session info: chrome-headless-shell=127.0.6533.100); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000100555088 cxxbridge1$str$ptr + 1887276
1   chromedriver                        0x000000010054d764 cxxbridge1$str$ptr + 1856264
2   chromedriver                        0x000000010015c82c cxxbridge1$string$len + 88524
3   chromedriver                        0x00000001001a0834 cxxbridge1$string$len + 367060
4   chromedriver                        0x0000000100196e38 cxxbridge1$string$len + 327640
5   chromedriver                        0x00000001001d848c cxxbridge1$string$len + 595500
6   chromedriver                        0