In [1]:
import os
import requests

# Create output directory
os.makedirs("aldi_html", exist_ok=True)

# Read URLs from file
with open("aldi-links.txt", "r", encoding="utf-8") as f:
    urls = [line.strip() for line in f if line.strip()]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/114.0.0.0 Safari/537.36"
}

for i, url in enumerate(urls, start=1):
    try:
        print(f"[INFO] Downloading {url}")
        response = requests.get(url.split("#")[0], headers=headers, timeout=15)

        if response.status_code == 200:
            filename = f"aldi_html/page_{i:03}.html"
            with open(filename, "w", encoding="utf-8") as f:
                f.write(response.text)
            print(f"[SAVED] {filename}")
        else:
            print(f"[ERROR] Failed to fetch {url} (Status code: {response.status_code})")

    except Exception as e:
        print(f"[ERROR] Exception occurred for {url}: {e}")


[INFO] Downloading https://www.aldi-nord.de/angebote/aktion-mo-02-06/frischkaesezubereitung-1011877-0-0.article.html
[SAVED] aldi_html/page_001.html
[INFO] Downloading https://www.aldi-nord.de/produkt/bio-feta-4575-0-0.article.html#/sortiment/kuehlung-tiefkuehlung/kaese-milch-milchprodukte/kaese
[SAVED] aldi_html/page_002.html
[INFO] Downloading https://www.aldi-nord.de/produkt/premium-cornichons-4096-0-0.article.html#/sortiment/nahrungsmittel/konserven
[SAVED] aldi_html/page_003.html
[INFO] Downloading https://www.aldi-nord.de/produkt/energy-drink-classic-1004898-0-0.article.html#/sortiment/getraenke/sport-energy-drinks
[SAVED] aldi_html/page_004.html
[INFO] Downloading https://www.aldi-nord.de/angebote/aktion-mo-02-06/bananen-6151-0-0.article.html
[SAVED] aldi_html/page_005.html
[INFO] Downloading https://www.aldi-nord.de/produkt/merci-3967-0-0.article.html#/sortiment/snacks-suessigkeiten/schokolade
[SAVED] aldi_html/page_006.html
[INFO] Downloading https://www.aldi-nord.de/angebote/

In [2]:
import os
import json
from bs4 import BeautifulSoup

# Directory containing the HTML files
directory = "aldi_html"

# Range of file numbers
file_range = range(1, 22)  # 001 to 021

# Output list
products = []

for i in file_range:
    filename = f"page_{i:03}.html"
    filepath = os.path.join(directory, filename)

    try:
        with open(filepath, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")  # Use 'lxml' if installed

            # Extract product title
            title_tag = soup.find("meta", property="og:title")
            title = title_tag["content"].strip() if title_tag else "N/A"

            # Extract amount info
            amount_tag = soup.find("span", class_="price__unit")
            amount = amount_tag.text.strip() if amount_tag else "N/A"

            # Extract image URL
            image_tag = soup.find("meta", property="og:image")
            image_url = image_tag["content"].strip() if image_tag else "N/A"

            # Extract price
            price_wrapper = soup.find("span", class_="price__wrapper")
            price = price_wrapper.text.strip().replace("\n", "") if price_wrapper else "N/A"

            products.append({
                "file": filename,
                "title": title,
                "amount": amount,
                "price": price,
                "image": image_url
            })

    except Exception as e:
        print(f"Error processing {filename}: {e}")

# Save to JSON
output_path = os.path.join(directory, "aldi_products.json")
with open(output_path, "w", encoding="utf-8") as out_file:
    json.dump(products, out_file, indent=2, ensure_ascii=False)

print(f"✅ Extracted data saved to: {output_path}")


Error processing page_016.html: [Errno 2] No such file or directory: 'aldi_html/page_016.html'
Error processing page_017.html: [Errno 2] No such file or directory: 'aldi_html/page_017.html'
Error processing page_018.html: [Errno 2] No such file or directory: 'aldi_html/page_018.html'
Error processing page_019.html: [Errno 2] No such file or directory: 'aldi_html/page_019.html'
Error processing page_020.html: [Errno 2] No such file or directory: 'aldi_html/page_020.html'
Error processing page_021.html: [Errno 2] No such file or directory: 'aldi_html/page_021.html'
✅ Extracted data saved to: aldi_html/aldi_products.json
