In [13]:
#!/usr/bin/env python3
"""
scrape_haldirams_menu.py  ·  May-2025
Outputs haldirams_menu.json with the exact McDonald's-style schema.

Robustness features
───────────────────
• Tries every sitemap URL listed in robots.txt *and* a fallback list.
• If all sitemaps fail (404/403), crawls the main-nav category links.
• Honors robots.txt, polite delays, normal desktop User-Agent.
• Flags vegetarian/vegan, spicy, allergen keywords.
"""

from __future__ import annotations
import json, re, time, random, io
from pathlib import Path
from typing import Dict, List, Iterable
from urllib import robotparser
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from lxml import etree

# ───────── CONFIG ─────────
ROOT  = "https://www.haldirams.com"
UA    = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
         "AppleWebKit/537.36 (KHTML, like Gecko) "
         "Chrome/124.0.0.0 Safari/537.36")
MAX_ITEMS  = 200
DELAY_SEC  = 1.0
OUT_FILE   = Path("haldirams_menu.json")
FALLBACK_SITEMAPS = [
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/sitemap/sitemap.xml",
    "/media/sitemap/sitemap.xml",
]
# ──────────────────────────


# ---------- networking helpers ----------
def fetch(url: str, binary=False) -> str | bytes:
    hdr = {
        "User-Agent": UA,
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.9",
    }
    r = requests.get(url, headers=hdr, timeout=30, allow_redirects=True)
    r.raise_for_status()
    return r.content if binary else r.text


def robots_allows(path: str) -> bool:
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(ROOT, "/robots.txt"))
    try:
        rp.read()
        return rp.can_fetch(UA, path)
    except Exception:
        return True


# ---------- sitemap discovery ----------
def discover_sitemap_urls() -> Iterable[str]:
    # read robots.txt
    try:
        robots = fetch(urljoin(ROOT, "/robots.txt"))
        for line in robots.splitlines():
            if line.lower().startswith("sitemap:"):
                yield line.split(":", 1)[1].strip()
    except Exception:
        pass
    # fallbacks
    for path in FALLBACK_SITEMAPS:
        yield urljoin(ROOT, path)


def get_first_live_sitemap() -> str | None:
    for url in discover_sitemap_urls():
        try:
            fetch(url, binary=True)       # just to test 200 
            return url
        except Exception:
            continue
    return None


def extract_product_urls_from_sitemap(xml_bytes: bytes) -> List[str]:
    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    doc = etree.fromstring(xml_bytes)
    locs = [e.text for e in doc.xpath("//sm:url/sm:loc", namespaces=ns)]
    return [
        u for u in locs
        if u.endswith(".html")
        and "/blog/" not in u
        and "/category/" not in u
    ]


# ---------- HTML fallback (nav crawl) ----
def nav_category_links() -> List[str]:
    soup = BeautifulSoup(fetch(ROOT), "html.parser")
    links = []
    for a in soup.select("nav a[href]"):
        href = a["href"]
        if href.endswith(".html") and "javascript:" not in href:
            full = href if href.startswith("http") else urljoin(ROOT, href)
            links.append(full)
    # dedupe preserve order
    seen, out = set(), []
    for link in links:
        if link not in seen:
            seen.add(link)
            out.append(link)
    return out


def product_links_from_category(cat_url: str) -> List[str]:
    soup = BeautifulSoup(fetch(cat_url), "html.parser")
    out = []
    for a in soup.select("a.product-item-link[href]"):
        href = a["href"]
        if href.startswith("http"):
            out.append(href)
        else:
            out.append(urljoin(ROOT, href))
    return out


# ---------- feature flags ---------------
def special_features(text: str) -> List[str]:
    low = text.lower()
    feats = []
    if any(k in low for k in ("veg", "vegan", "vegetarian")):
        feats.append("Vegetarian / Vegan option")
    if "spicy" in low or "chilli" in low:
        feats.append("Spicy")
    if "allergen" in low or ("contains" in low and "nuts" in low):
        feats.append("Contains allergen information")
    return feats


# ---------- product parser --------------
def parse_product(url: str) -> Dict:
    soup = BeautifulSoup(fetch(url), "html.parser")

    # name
    h1 = soup.find("h1")
    name = h1.get_text(strip=True) if h1 else ""
    if not name:
        ogt = soup.find("meta", property="og:title")
        name = ogt["content"].strip() if ogt and ogt.get("content") else url.rsplit("/", 1)[-1]

    # description
    box = soup.find("div", class_="product attribute overview")
    desc = box.get_text(" ", strip=True) if box else ""
    if not desc:
        ogd = soup.find("meta", property="og:description")
        desc = ogd["content"].strip() if ogd and ogd.get("content") else "Description not available"
    else:
        desc = re.sub(r"\s+", " ", desc)

    # category breadcrumb
    cat = "Uncategorised"
    bc = soup.select("ul.breadcrumbs li")
    if len(bc) >= 2:
        cat = bc[1].get_text(strip=True)

    return {
        "item_name": name,
        "description": desc,
        "special_features": special_features(desc) or None,
        "product_url": url,
        "price": None,
        "category": cat,
    }


# ---------- main workflow ---------------
def scrape_haldirams() -> Dict:
    live_sitemap = get_first_live_sitemap()
    product_urls: List[str] = []

    if live_sitemap:
        print(f"[info] using sitemap {live_sitemap}")
        xml = fetch(live_sitemap, binary=True)
        product_urls = extract_product_urls_from_sitemap(xml)
    else:
        print("[warn] no live sitemap – falling back to nav crawl")
        cats = nav_category_links()
        for c in cats:
            product_urls.extend(product_links_from_category(c))
            time.sleep(0.6)

    # dedupe & trim
    seen, uniq = set(), []
    for u in product_urls:
        if u not in seen:
            seen.add(u)
            uniq.append(u)
        if len(uniq) >= MAX_ITEMS:
            break

    items: List[Dict] = []
    for url in uniq:
        path = urlparse(url).path
        if not robots_allows(path):
            continue
        try:
            items.append(parse_product(url))
        except Exception as e:
            print(f"[warn] {url} skipped – {e}")
        time.sleep(DELAY_SEC + random.uniform(0, 0.4))
        if len(items) >= MAX_ITEMS:
            break

    return {
        "restaurant_name": "Haldiram's",
        "location": "India",
        "opening_hours": None,
        "contact_info": None,
        "scrape_source": live_sitemap or "HTML-navigation fallback",
        "item_count": len(items),
        "items": items,
    }


# ---------- save ------------------------
if __name__ == "__main__":
    data = scrape_haldirams()
    OUT_FILE.write_text(
        json.dumps(data, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"✅ Saved {data['item_count']} items → {OUT_FILE.resolve()}")


[info] using sitemap https://www.haldirams.com/media/sitemap/sitemap.xml
[warn] https://www.haldirams.com/corporate-gifting.html skipped – 404 Client Error: Not Found for url: https://www.haldirams.com/corporate-gifting.html
✅ Saved 199 items → C:\Users\prmsr\OneDrive - iitkgp.ac.in\Desktop\Zomato\haldirams_menu.json


In [None]:
pip freeze > requirements.txt