In [6]:
import os
import json
import re
from bs4 import BeautifulSoup

folder_path = "rewe_html"
output_file = "rewe_products.json"
products = []

# Compiled once instead of once per title token / per file.
# A size token is a whole whitespace-separated word such as "250g", "1,5l" or "6Stück".
_SIZE_TOKEN = re.compile(r"\d+(,\d+)?(g|ml|kg|l|Stück)")
_PROPSTORE_ID = re.compile(r"^pdpr-propstore")


def select_product_data(raw_blocks):
    """Pick the product description out of the page's JSON-LD script texts.

    Args:
        raw_blocks: iterable of the raw text of each
            ``<script type="application/ld+json">`` tag; entries may be
            ``None`` or invalid JSON, in which case they are skipped.

    Returns:
        The first JSON object whose ``@type`` is (or contains) ``"Product"``.
        If none is typed that way, the first JSON object found; ``{}`` when
        nothing usable was parsed.
    """
    candidates = []
    for raw in raw_blocks:
        if not raw:
            # Tag was empty or had several children, so there is no single string.
            continue
        try:
            payload = json.loads(raw)
        except (ValueError, TypeError):
            # Malformed JSON-LD: keep going, this is a best-effort scrape.
            continue
        # JSON-LD may be a single object or an array of objects.
        entries = payload if isinstance(payload, list) else [payload]
        candidates.extend(entry for entry in entries if isinstance(entry, dict))

    for entry in candidates:
        kind = entry.get("@type")
        if kind == "Product" or (isinstance(kind, list) and "Product" in kind):
            return entry
    return candidates[0] if candidates else {}


def extract_size(title):
    """Return the first size token (e.g. ``"250g"``, ``"1,5l"``) in *title*.

    Returns ``"N/A"`` when *title* is not a string or contains no such token.
    """
    if not isinstance(title, str):
        return "N/A"
    for word in title.split():
        if _SIZE_TOKEN.fullmatch(word):
            return word
    return "N/A"


def format_price(raw_store):
    """Format the price found in the raw ``pdpr-propstore`` script text.

    The value at ``productData -> pricing -> price`` is divided by 100 (the
    script treats it as cents) and rendered as ``"<amount> €"`` with two
    decimals. Returns ``"N/A"`` when the text is missing, is not valid JSON,
    lacks the expected keys, or the price is not a number.
    """
    if not raw_store:
        return "N/A"
    try:
        price_cents = json.loads(raw_store)["productData"]["pricing"]["price"]
    except (ValueError, TypeError, KeyError, IndexError):
        # ValueError: invalid JSON; the rest: unexpected structure.
        return "N/A"
    if isinstance(price_cents, bool) or not isinstance(price_cents, (int, float)):
        return "N/A"
    return f"{price_cents / 100:.2f} €"


def parse_product_page(soup, filename):
    """Build one output record from a parsed product page.

    Args:
        soup: the ``BeautifulSoup`` document for the page.
        filename: name of the HTML file, stored in the record as-is.

    Returns:
        A dict with the keys ``filename``, ``title``, ``size``, ``image`` and
        ``price``; fields that cannot be determined are ``"N/A"``.
    """
    product_data = select_product_data(
        tag.string
        for tag in soup.find_all("script", {"type": "application/ld+json"})
    )
    title = product_data.get("name", "N/A")
    store_tag = soup.find("script", {"id": _PROPSTORE_ID})
    return {
        "filename": filename,
        "title": title,
        "size": extract_size(title),
        "image": product_data.get("image", "N/A"),
        "price": format_price(store_tag.string if store_tag else None),
    }


def scrape_folder(folder):
    """Parse every ``.html`` file in *folder* and return the list of records.

    Files are processed in sorted name order so the output is reproducible
    (``os.listdir`` returns entries in arbitrary order).
    """
    records = []
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".html"):
            continue
        # Only the parse needs the file handle; close it before extracting.
        with open(os.path.join(folder, filename), "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "lxml")
        records.append(parse_product_page(soup, filename))
    return records


if __name__ == "__main__":
    products.extend(scrape_folder(folder_path))

    # Save all to JSON
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(products, f, ensure_ascii=False, indent=2)

    print(f"✅ Scraping complete. Data saved to {output_file}")


✅ Scraping complete. Data saved to rewe_products.json
