## Assignment 2 - Recipe Website Scraping

In [1]:
import requests
import json
import time
import logging
import random
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Base URL
base_url = "https://www.epicurious.com"

# Headers to avoid being blocked
headers = {"User-Agent": "Mozilla/5.0"}

# Delay function (random 2-5 sec pause)
def delay():
    sleep_time = random.uniform(2, 5)
    logging.info(f"Sleeping for {sleep_time:.2f} seconds to avoid detection...")
    time.sleep(sleep_time)

# Step 1: Scrape all "recipes-menus" links from the main page
def get_links(url, keyword):
    try:
        logging.info(f"Fetching links from {url}...")
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        links = [urljoin(base_url, a["href"]) for a in soup.find_all("a", href=True) if keyword in a["href"]]
        logging.info(f"Found {len(links)} links matching '{keyword}'")
        return links
    except requests.RequestException as e:
        logging.error(f"Error fetching links from {url}: {e}")
        return []

# Step 2: Scrape "recipes/food/views" links from each "recipes-menus" page
def get_recipe_links(menu_links):
    recipe_links = []
    for index, menu_link in enumerate(menu_links[:5]):  # Limit to first 10 links
        logging.info(f"[{index+1}/10] Scraping menu page: {menu_link}")
        delay()
        recipe_links.extend(get_links(menu_link, "recipes/food/views"))
    
    recipe_links = list(set(recipe_links))[:5]  # Limit to first 10 unique recipe links
    logging.info(f"Total unique recipes found: {len(recipe_links)}")
    return recipe_links

# Step 3: Scrape ingredients and preparation steps from each recipe page
def scrape_recipe(url):
    try:
        logging.info(f"Scraping recipe: {url}")
        delay()
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Get ingredients
        ingredients_container = soup.find("div", {"data-testid": "IngredientList"})
        ingredients = [item.get_text(strip=True) for item in ingredients_container.find_all("div", class_="Description-cSrMCf")] if ingredients_container else ["Not found"]

        # Get preparation steps
        instructions_container = soup.find("div", {"data-testid": "InstructionsWrapper"})
        step_titles = instructions_container.find_all("h4") if instructions_container else []
        step_texts = instructions_container.find_all("p") if instructions_container else []
        steps = [f"{title.get_text(strip=True)}: {text.get_text(strip=True)}" for title, text in zip(step_titles, step_texts)] if step_titles else ["Not found"]

        return {"url": url, "ingredients": ingredients, "steps": steps}
    
    except requests.RequestException as e:
        logging.error(f"Error scraping recipe {url}: {e}")
        return {"url": url, "error": str(e)}

# Run the scraper
logging.info("Starting the scraper...")
menu_links = get_links("https://www.epicurious.com/recipes-menus", "recipes-menus")
recipe_links = get_recipe_links(menu_links)

recipes_data = []
for index, recipe_link in enumerate(recipe_links):
    logging.info(f"[{index+1}/{len(recipe_links)}] Scraping recipe page...")
    recipes_data.append(scrape_recipe(recipe_link))

# Save to JSON file
json_file = "epicurious_recipes.json"
with open(json_file, "w", encoding="utf-8") as f:
    json.dump(recipes_data, f, indent=4, ensure_ascii=False)

logging.info(f"Scraping completed! Data saved to '{json_file}'.")


2025-02-26 15:58:10,231 - INFO - Starting the scraper...
2025-02-26 15:58:10,232 - INFO - Fetching links from https://www.epicurious.com/recipes-menus...
2025-02-26 15:58:11,055 - INFO - Found 38 links matching 'recipes-menus'
2025-02-26 15:58:11,056 - INFO - [1/10] Scraping menu page: https://www.epicurious.com/recipes-menus
2025-02-26 15:58:11,057 - INFO - Sleeping for 4.49 seconds to avoid detection...
2025-02-26 15:58:15,545 - INFO - Fetching links from https://www.epicurious.com/recipes-menus...
2025-02-26 15:58:16,029 - INFO - Found 19 links matching 'recipes/food/views'
2025-02-26 15:58:16,030 - INFO - [2/10] Scraping menu page: https://www.epicurious.com/recipes-menus
2025-02-26 15:58:16,031 - INFO - Sleeping for 4.13 seconds to avoid detection...
2025-02-26 15:58:20,159 - INFO - Fetching links from https://www.epicurious.com/recipes-menus...
2025-02-26 15:58:20,540 - INFO - Found 19 links matching 'recipes/food/views'
2025-02-26 15:58:20,541 - INFO - [3/10] Scraping menu page: