# In [10]:
#!/usr/bin/env python3
"""
scrape_mcd_menu.py
BeautifulSoup-based scraper for McDonald’s US menu ─ max 200 items.
Outputs: mcd_menu.json (UTF-8, pretty-printed)
"""

import json, re, time
from pathlib import Path
from typing import Dict, List, Tuple
from urllib.parse import urljoin, urlparse
from urllib import robotparser

import requests
from bs4 import BeautifulSoup

# ────────────────────────── Config ──────────────────────────
# Site origin; relative product hrefs found on the menu page are resolved against it.
ROOT_URL       = "https://www.mcdonalds.com"
# Full-menu landing page: entry point of the crawl, also recorded as "scrape_source".
MENU_URL       = f"{ROOT_URL}/us/en-us/full-menu.html"
# Written verbatim into the "location" field of the output document.
LOCATION       = "USA"
# Sent as the User-Agent header on page requests and used as the agent name
# when consulting robots.txt.
USER_AGENT     = "Mozilla/5.0 (compatible; mcd-scraper/1.1)"
# Upper bound on the number of items collected in one run.
MAX_ITEMS      = 200
REQUEST_DELAY  = 1.2                       # seconds between item pages
# Destination of the JSON dump, relative to the current working directory.
OUT_FILE       = Path("mcd_menu.json")
# Compared against the start of the lower-cased description; a match means the
# scraped text is discarded in favour of a placeholder.
GENERIC_PREFIX = "our terms and conditions"  # phrase signalling legal banner
# ────────────────────────────────────────────────────────────


# ---------- helpers ----------------------------------------------------------
def allowed_by_robots(url: str, ua: str = USER_AGENT) -> bool:
    """Check the target host's robots.txt before fetching *url*.

    Args:
        url: Absolute URL that is about to be requested.
        ua: User-agent name to evaluate the robots rules for.

    Returns:
        True when the rules permit *ua* to fetch *url*. False when they
        forbid it, or when robots.txt itself could not be read (the check
        fails closed).
    """
    target = urlparse(url)
    # robots.txt always lives at the root of the scheme/host pair.
    parser = robotparser.RobotFileParser(
        f"{target.scheme}://{target.netloc}/robots.txt"
    )
    try:
        parser.read()
    except Exception:
        # Any failure while downloading/parsing is treated as "not allowed".
        return False
    else:
        return parser.can_fetch(ua, url)


def clean(txt: str | None) -> str:
    """Collapse whitespace & trim."""
    return re.sub(r"\s+", " ", txt or "").strip()


def fetch(url: str) -> BeautifulSoup:
    """Download *url* and parse the response body as HTML.

    The request carries the configured User-Agent header and a 15 second
    timeout.

    Returns:
        A BeautifulSoup tree built with the built-in ``html.parser``.

    Raises:
        requests.HTTPError: via ``raise_for_status()`` on a 4xx/5xx reply.
        Other ``requests`` exceptions raised by ``requests.get`` propagate
        unchanged.
    """
    response = requests.get(
        url,
        headers={"User-Agent": USER_AGENT},
        timeout=15,
    )
    response.raise_for_status()
    html = response.text
    return BeautifulSoup(html, "html.parser")


# ---------- menu-level extraction -------------------------------------------
def extract_restaurant_meta(menu_soup: BeautifulSoup) -> Tuple[str | None, str | None]:
    """Pull opening hours and phone number out of the page's JSON-LD blocks.

    Every ``<script type="application/ld+json">`` element is decoded; scripts
    whose content cannot be decoded are skipped. Within each decoded payload
    (a single object or a list of objects) only dicts whose ``@type`` equals
    ``"Restaurant"`` are consulted.

    Returns:
        ``(openingHours, telephone)``; either element is ``None`` when no
        Restaurant object supplied a value for it.
    """
    opening_hours = None
    telephone = None

    for tag in menu_soup.find_all("script", type="application/ld+json"):
        try:
            payload = json.loads(tag.string)
        except Exception:
            # Empty or malformed script body: ignore and move on.
            continue

        # Treat a lone object like a one-element list.
        candidates = payload if isinstance(payload, list) else [payload]
        for entry in candidates:
            if not isinstance(entry, dict):
                continue
            if entry.get("@type") != "Restaurant":
                continue
            # First non-empty value wins; later objects only fill gaps.
            if not opening_hours:
                opening_hours = entry.get("openingHours")
            if not telephone:
                telephone = entry.get("telephone")

        # Stop scanning further scripts once both fields are known.
        if opening_hours and telephone:
            return opening_hours, telephone

    return opening_hours, telephone


def extract_menu_links(menu_soup: BeautifulSoup) -> List[Tuple[str, str]]:
    """Collect product links from the menu page, labelled by heading.

    For each ``<h2>``/``<h3>`` the first ``<ul>`` or ``<div>`` returned by
    ``find_next`` is scanned for anchors whose href contains ``/product/``.

    Returns:
        ``(category, absolute_url)`` pairs in page order, where *category* is
        the whitespace-normalised heading text. Duplicates are not removed.
    """
    pairs: List[Tuple[str, str]] = []
    for heading in menu_soup.select("h2, h3"):
        container = heading.find_next(["ul", "div"])
        if container:
            label = clean(heading.get_text())
            pairs.extend(
                (label, urljoin(ROOT_URL, anchor["href"]))
                for anchor in container.find_all("a", href=True)
                if "/product/" in anchor["href"]
            )
    return pairs


# ---------- product-level extraction ----------------------------------------
def special_features_from_text(text: str) -> List[str]:
    """Derive coarse feature tags from a product page's visible text.

    Matching is case-insensitive and purely keyword based:

    * ``"Vegetarian / Vegan option"`` when "vegetarian" or "vegan" occurs,
    * ``"Spicy"`` when "spicy" occurs,
    * ``"Allergens: <list>"`` when the text contains "allergen"/"allergens"
      (optionally followed by a colon); ``<list>`` is the lower-cased text
      that follows, up to the end of that sentence or the end of the text.

    Args:
        text: Plain text of the page; may contain arbitrary whitespace.

    Returns:
        The matching tags in the order listed above (empty list if none).

    NOTE(review): keyword matching cannot tell "vegetarian" from
    "not vegetarian"; disclaimers on a page will still trigger the tag.
    """
    # Text extracted from HTML keeps the source's line breaks. Collapse all
    # whitespace first so an allergen sentence that wraps across lines is
    # still matched ('.' does not cross a newline and '$' only anchors at the
    # very end of the string).
    normalized = " ".join((text or "").split()).lower()
    features: List[str] = []

    if "vegetarian" in normalized or "vegan" in normalized:
        features.append("Vegetarian / Vegan option")
    if "spicy" in normalized:
        features.append("Spicy")

    # Lazy capture up to the first ". " (sentence end) or end of text.
    allergen_match = re.search(r"allergens?:?\s*(.+?)(?:\.\s|$)", normalized)
    if allergen_match:
        features.append("Allergens: " + allergen_match.group(1).strip())
    return features


def extract_product_details(product_url: str) -> Dict:
    """Scrape a single product page.

    Args:
        product_url: Absolute URL of the product page.

    Returns:
        A dict with the keys ``item_name``, ``description``,
        ``special_features`` (list of tags, or ``None`` when empty),
        ``product_url`` and ``price`` (always ``None``).

    Raises:
        Whatever ``fetch`` raises when the page cannot be downloaded.
    """
    page = fetch(product_url)

    # Name: text of the first <h1>/<h2> on the page, if there is one.
    heading = page.find(["h1", "h2"])
    if heading:
        title = clean(heading.get_text())
    else:
        title = "Unnamed item"

    # Description: prefer <meta name="description">, else the first <p>.
    meta = page.find("meta", attrs={"name": "description"})
    content = meta.get("content") if meta else None
    summary = clean(content) if content else ""
    if not summary:
        first_para = page.find("p")
        summary = clean(first_para.get_text()) if first_para else ""

    # Nothing usable, or only the generic legal banner: use a placeholder.
    if not summary or summary.lower().startswith(GENERIC_PREFIX):
        summary = "Description not available"

    # Feature tags are derived from the page's full visible text.
    page_text = page.get_text(" ", strip=True)
    tags = special_features_from_text(page_text)

    return {
        "item_name": title,
        "description": summary,
        "special_features": tags or None,
        "product_url": product_url,
        "price": None  # price not public on full-menu pages
    }


# ---------- main workflow ----------------------------------------------------
def scrape_mcd_menu() -> Dict:
    """Crawl the full-menu page and every distinct product page it links to.

    Workflow: verify robots.txt permits MENU_URL, fetch the menu page, read
    restaurant metadata from it, then visit each unique product URL (first
    occurrence decides its category) until MAX_ITEMS items were collected.
    Product pages that fail to download or parse are reported on stdout and
    skipped.

    Returns:
        A dict with ``restaurant_name``, ``location``, ``opening_hours``,
        ``contact_info``, ``scrape_source``, ``item_count`` and ``items``
        (one dict per scraped product, each with an added ``category``).

    Raises:
        RuntimeError: if robots.txt disallows (or cannot confirm) MENU_URL.
        Exceptions from fetching the menu page itself propagate.

    NOTE(review): only MENU_URL is checked against robots.txt; individual
    product URLs are fetched without a per-URL check.
    """
    if not allowed_by_robots(MENU_URL):
        raise RuntimeError("Blocked by robots.txt – cannot scrape.")

    menu_soup = fetch(MENU_URL)
    opening_hours, telephone = extract_restaurant_meta(menu_soup)

    all_links = extract_menu_links(menu_soup)

    results: List[Dict] = []
    seen: set[str] = set()
    requested_any = False

    for category, url in all_links:
        if len(results) >= MAX_ITEMS:
            break
        if url in seen:
            continue
        seen.add(url)

        # Pause *before* every item request except the first, so the delay is
        # honoured after failed requests too and no sleep is wasted after the
        # final item.
        if requested_any:
            time.sleep(REQUEST_DELAY)
        requested_any = True

        try:
            details = extract_product_details(url)
        except Exception as exc:
            # Best effort: one broken product page must not abort the crawl.
            print(f"[warn] {url} skipped ({exc})")
            continue
        details["category"] = category
        results.append(details)

    return {
        "restaurant_name": "McDonald's",
        "location": LOCATION,
        "opening_hours": opening_hours,
        "contact_info": telephone,
        "scrape_source": MENU_URL,
        "item_count": len(results),
        "items": results,
    }


if __name__ == "__main__":
    # Run the crawl, then persist the result as pretty-printed UTF-8 JSON
    # (non-ASCII characters are written as-is rather than \u-escaped).
    data = scrape_mcd_menu()
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    OUT_FILE.write_text(serialized, encoding="utf-8")
    print(f"✅ Saved {data['item_count']} items → {OUT_FILE.resolve()}")


# Sample output from a previous run:
# ✅ Saved 93 items → C:\Users\prmsr\OneDrive - iitkgp.ac.in\Desktop\Zomato\mcd_menu.json
