In [27]:
import os
import json
from bs4 import BeautifulSoup

# Folder that holds the downloaded HTML pages.
directory = "aldi_html"

# Page numbers to process: 1 through 21 inclusive (zero-padded to 001..021
# when the file name is built).
first_page, last_page = 1, 21
file_range = range(first_page, last_page + 1)

# Collected records; one dict is appended per successfully processed page.
products = []

def _meta_content(soup, prop):
    """Return the stripped ``content`` of the first ``<meta property=prop>`` tag.

    Falls back to "N/A" when the tag is absent *or* when it carries no
    ``content`` attribute, so a malformed meta tag no longer raises KeyError
    and costs the page its whole record.
    """
    tag = soup.find("meta", property=prop)
    content = tag.get("content") if tag is not None else None
    return content.strip() if content is not None else "N/A"


# Walk every numbered page, pull out the product fields and collect them in
# ``products``. Each page is handled best-effort: a failure is reported and
# the loop moves on to the next file.
for i in file_range:
    filename = f"page_{i:03}.html"
    filepath = os.path.join(directory, filename)

    try:
        # BeautifulSoup consumes the whole stream while constructing the
        # tree, so the file handle can be released before the lookups.
        with open(filepath, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")  # Use 'lxml' if installed

        # Title and image URL come from the Open Graph meta tags.
        title = _meta_content(soup, "og:title")
        image_url = _meta_content(soup, "og:image")

        # Amount info is the text of the first span with class "price__unit".
        amount_tag = soup.find("span", class_="price__unit")
        amount = amount_tag.text.strip() if amount_tag is not None else "N/A"

        # Price is the text of the first span with class "price__wrapper";
        # newlines inside the text are removed.
        price_wrapper = soup.find("span", class_="price__wrapper")
        if price_wrapper is not None:
            price = price_wrapper.text.strip().replace("\n", "")
        else:
            price = "N/A"

        products.append({
            "file": filename,
            "title": title,
            "amount": amount,
            "price": price,
            "image": image_url
        })

    except Exception as e:
        # Per-file boundary: missing/unreadable files (or any unexpected
        # parser failure) are logged and skipped rather than aborting the run.
        print(f"Error processing {filename}: {e}")

# Write the collected records as pretty-printed JSON into the same folder as
# the HTML pages; ensure_ascii=False keeps non-ASCII characters readable.
output_path = os.path.join(directory, "aldi_products.json")
serialized = json.dumps(products, indent=2, ensure_ascii=False)
with open(output_path, mode="w", encoding="utf-8") as out_file:
    out_file.write(serialized)

print(f"✅ Extracted data saved to: {output_path}")


✅ Extracted data saved to: aldi_html/aldi_products.json
