zip 10623

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import re

def extract_category(url: str) -> str:
    """Return the category slug that follows ``/c/`` in *url*.

    The slug is the run of characters after ``/c/`` up to (not including)
    the next ``/`` or ``?``. A URL without a ``/c/<slug>`` part is returned
    unchanged.
    """
    found = re.search(r"/c/([^/?]+)", url)
    if found is None:
        return url
    return found.group(1)

def read_links_from_file(file_path: str):
    """Read *file_path* as UTF-8 and return its non-blank lines, stripped.

    Lines that are empty or contain only whitespace are skipped; the
    remaining lines keep their file order.
    """
    links = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                links.append(cleaned)
    return links

def accept_cookies(driver):
    """Click the cookie-consent button if one is on the current page.

    Waits up to 15 seconds for ``<button>`` elements to be present, then
    clicks the first one whose text contains "alle erlauben" (compared in
    lower case) and pauses two seconds. Any exception raised along the way
    (including the wait timing out) is reported as an info message and not
    re-raised. If buttons exist but none matches, nothing is printed.
    """
    try:
        candidates = WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, "button"))
        )
        # Lazy generator: button texts are read one at a time, stopping at the first hit.
        consent = next(
            (b for b in candidates if "alle erlauben" in b.text.lower()),
            None,
        )
        if consent is not None:
            consent.click()
            print("[INFO] Cookies accepted.")
            time.sleep(2)
    except Exception:
        print("[INFO] No cookie modal or already accepted.")

def select_abholservice(driver):
    """Click the "Abholservice" (pickup service) button if one is present.

    Waits up to 15 seconds for ``<button>`` elements to be present, then
    clicks the first one whose lower-cased text contains "abholservice" and
    pauses two seconds. Any exception (including the wait timing out) is
    reported as an info message and not re-raised. If buttons exist but none
    matches, nothing is printed.
    """
    try:
        candidates = WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, "button"))
        )
        # Lazy generator: button texts are read one at a time, stopping at the first hit.
        pickup = next(
            (b for b in candidates if "abholservice" in b.text.lower()),
            None,
        )
        if pickup is not None:
            pickup.click()
            print("[INFO] Abholservice selected.")
            time.sleep(2)
    except Exception:
        print("[INFO] No Abholservice modal or already handled.")

def enter_plz_and_select_market(driver, plz: str = "10115"):
    """Submit a postal code (PLZ) and click the "Abholmarkt wählen" button.

    Args:
        driver: Selenium WebDriver showing the page to interact with.
        plz: Postal code typed into the input field. Defaults to ``"10115"``,
            the value that used to be hard-coded, so existing calls without
            this argument behave as before.

    Both steps are best-effort: failures are printed and never raised. The
    market-selection step runs even if entering the PLZ failed.
    """
    try:
        time.sleep(2)
        # NOTE(review): this takes the first <input> on the page, which is not
        # necessarily the PLZ field — confirm against the live markup.
        plz_input = driver.find_element(By.TAG_NAME, "input")
        plz_input.clear()
        plz_input.send_keys(plz)
        plz_input.submit()
        print("[INFO] PLZ entered and submitted.")
        time.sleep(3)
    except Exception:
        print("[WARN] Failed to enter PLZ.")

    try:
        # Poll up to 20 times, one second apart, for the market button to show up.
        for _ in range(20):
            buttons = driver.find_elements(By.TAG_NAME, "button")
            for btn in buttons:
                if "abholmarkt wählen" in btn.text.lower():
                    btn.click()
                    print("[INFO] Abholmarkt selected.")
                    time.sleep(3)
                    return
            time.sleep(1)
        print("[WARN] No 'Abholmarkt wählen' button found.")
    except Exception:
        print("[ERROR] Error selecting Abholmarkt.")

def enable_stealth(driver):
    """Register a script, via CDP, to be evaluated on each new document.

    The script redefines three ``navigator`` getters: ``webdriver`` returns
    ``undefined``, ``languages`` returns ``['en-US', 'en']`` and ``plugins``
    returns a five-element array.
    """
    stealth_script = """
        Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
        Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']});
        Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
        """
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": stealth_script},
    )

def scrape_products_from_category(driver, url):
    """Load *url* and collect the product tiles shown on that page.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Category/search page to open.

    Returns:
        A list of dicts with the keys ``name``, ``price``, ``unit`` and
        ``imageUrl``. A field that cannot be read is an empty string.

    Errors while waiting for or reading the tiles are printed, not raised;
    whatever was collected up to that point is returned. Only the page that
    *url* loads is read — no pagination is followed.
    """
    driver.get(url)
    accept_cookies(driver)
    select_abholservice(driver)

    products = []
    page_num = 1  # used only in the log messages

    try:
        wait = WebDriverWait(driver, 30)
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "article.search-service-product")))
        time.sleep(2)
        print(f"[INFO] Scraping products on page {page_num}...")

        product_elements = driver.find_elements(By.CSS_SELECTOR, "article.search-service-product")

        for el in product_elements:
            # `except Exception` instead of a bare `except:` so Ctrl-C and
            # SystemExit are not swallowed while iterating the tiles.
            try:
                image_elem = el.find_element(By.TAG_NAME, "img")
                # get_attribute returns None for a missing attribute; without
                # the `or ""` a missing alt raised and discarded the URL too.
                name = (image_elem.get_attribute("alt") or "").strip()
                image_url = image_elem.get_attribute("src") or ""
            except Exception:
                name = ""
                image_url = ""

            try:
                price = el.find_element(By.CSS_SELECTOR, "div.productPrice, .search-service-productPrice").text.strip().replace("\n", "")
            except Exception:
                price = ""

            try:
                unit = el.find_element(By.CSS_SELECTOR, "div.productGrammage, .search-service-productGrammage").text.strip()
            except Exception:
                unit = ""

            products.append({"name": name, "price": price, "unit": unit, "imageUrl": image_url})

        print(f"[INFO] Found {len(product_elements)} products on page {page_num}.")

    except Exception as e:
        print(f"[ERROR] Error scraping products: {e}")

    print(f"[INFO] Total products found: {len(products)}")
    return products

def main():
    """Scrape every URL listed in ``links.txt`` and save the products as JSON.

    The first link is opened once to accept cookies, choose the pickup
    service and select a market; the user is then prompted to solve any
    CAPTCHA by hand before all links are scraped. Results are written to a
    JSON object mapping category -> list of product dicts.
    """
    output_path = "rewe-products-10115.json"

    # Read the links before launching Chrome so that a missing or empty file
    # does not leave a browser window open.
    links = read_links_from_file("links.txt")
    if not links:
        print("[ERROR] No links found in links.txt")
        return

    options = Options()
    options.add_argument("--disable-blink-features=AutomationControlled")
    # Do NOT run headless because CAPTCHA needs manual solving

    result = {}
    driver = webdriver.Chrome(options=options)
    try:
        driver.set_window_size(1920, 1080)
        enable_stealth(driver)

        first_url = links[0]
        print(f"[INFO] Setting PLZ and market on: {first_url}")
        driver.get(first_url)
        accept_cookies(driver)
        select_abholservice(driver)
        enter_plz_and_select_market(driver)

        print("[ACTION] If a CAPTCHA appears, please solve it manually in the browser.")
        input("Press Enter after solving the CAPTCHA to continue scraping...")

        for url in links:
            category = extract_category(url)
            print(f"[INFO] Scraping category: {category} -> {url}")
            products = scrape_products_from_category(driver, url)
            # Several links can resolve to the same category (e.g. two
            # /c/tee/ URLs with different queries); accumulate rather than
            # overwrite the earlier results.
            result.setdefault(category, []).extend(products)
            print(f"[INFO] {category}: {len(products)} products scraped.")
    finally:
        # Always close the browser, even if scraping raised.
        driver.quit()

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    # Report the file that was actually written.
    print(f"[DONE] All products saved to {output_path}")

if __name__ == "__main__":
    main()


[INFO] Setting PLZ and market on: https://shop.rewe.de/p/philadelphia-natur-doppelrahmstufe-175g/575043
[WARN] Failed to enter PLZ.
[WARN] No 'Abholmarkt wählen' button found.
[ACTION] If a CAPTCHA appears, please solve it manually in the browser.
[INFO] Scraping category: https://shop.rewe.de/p/philadelphia-natur-doppelrahmstufe-175g/575043 -> https://shop.rewe.de/p/philadelphia-natur-doppelrahmstufe-175g/575043
[INFO] No cookie modal or already accepted.
[INFO] No Abholservice modal or already handled.
[ERROR] Error scraping products: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=137.0.7151.68)
Stacktrace:
0   chromedriver                        0x0000000104cb8708 cxxbridge1$str$ptr + 2729312
1   chromedriver                        0x0000000104cb096c cxxbridge1$str$ptr + 2697156
2   chromedriver                        0x0000000104802728 cxxbridge1$string$len + 90444
3   chromedriver                        0x000000

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=137.0.7151.68)
Stacktrace:
0   chromedriver                        0x0000000104cb8708 cxxbridge1$str$ptr + 2729312
1   chromedriver                        0x0000000104cb096c cxxbridge1$str$ptr + 2697156
2   chromedriver                        0x0000000104802728 cxxbridge1$string$len + 90444
3   chromedriver                        0x00000001047dc744 chromedriver + 132932
4   chromedriver                        0x0000000104871c9c cxxbridge1$string$len + 546496
5   chromedriver                        0x000000010488abe0 cxxbridge1$string$len + 648708
6   chromedriver                        0x000000010483dbc0 cxxbridge1$string$len + 333284
7   chromedriver                        0x0000000104c7c298 cxxbridge1$str$ptr + 2482416
8   chromedriver                        0x0000000104c7f52c cxxbridge1$str$ptr + 2495364
9   chromedriver                        0x0000000104c5dae0 cxxbridge1$str$ptr + 2357560
10  chromedriver                        0x0000000104c7fdb4 cxxbridge1$str$ptr + 2497548
11  chromedriver                        0x0000000104c4edec cxxbridge1$str$ptr + 2296900
12  chromedriver                        0x0000000104c9fc4c cxxbridge1$str$ptr + 2628260
13  chromedriver                        0x0000000104c9fdd8 cxxbridge1$str$ptr + 2628656
14  chromedriver                        0x0000000104cb05b8 cxxbridge1$str$ptr + 2696208
15  libsystem_pthread.dylib             0x00000001850fa034 _pthread_start + 136
16  libsystem_pthread.dylib             0x00000001850f4e3c thread_start + 8


10117

In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import re

def extract_category(url: str) -> str:
    """Return the category slug that follows ``/c/`` in *url*.

    The slug is the run of characters after ``/c/`` up to (not including)
    the next ``/`` or ``?``. A URL without a ``/c/<slug>`` part is returned
    unchanged.
    """
    found = re.search(r"/c/([^/?]+)", url)
    if found is None:
        return url
    return found.group(1)

def read_links_from_file(file_path: str):
    """Read *file_path* as UTF-8 and return its non-blank lines, stripped.

    Lines that are empty or contain only whitespace are skipped; the
    remaining lines keep their file order.
    """
    links = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                links.append(cleaned)
    return links

def accept_cookies(driver):
    """Click the cookie-consent button if one is on the current page.

    Waits up to 15 seconds for ``<button>`` elements to be present, then
    clicks the first one whose text contains "alle erlauben" (compared in
    lower case) and pauses two seconds. Any exception raised along the way
    (including the wait timing out) is reported as an info message and not
    re-raised. If buttons exist but none matches, nothing is printed.
    """
    try:
        candidates = WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, "button"))
        )
        # Lazy generator: button texts are read one at a time, stopping at the first hit.
        consent = next(
            (b for b in candidates if "alle erlauben" in b.text.lower()),
            None,
        )
        if consent is not None:
            consent.click()
            print("[INFO] Cookies accepted.")
            time.sleep(2)
    except Exception:
        print("[INFO] No cookie modal or already accepted.")

def select_abholservice(driver):
    """Click the "Abholservice" (pickup service) button if one is present.

    Waits up to 15 seconds for ``<button>`` elements to be present, then
    clicks the first one whose lower-cased text contains "abholservice" and
    pauses two seconds. Any exception (including the wait timing out) is
    reported as an info message and not re-raised. If buttons exist but none
    matches, nothing is printed.
    """
    try:
        candidates = WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, "button"))
        )
        # Lazy generator: button texts are read one at a time, stopping at the first hit.
        pickup = next(
            (b for b in candidates if "abholservice" in b.text.lower()),
            None,
        )
        if pickup is not None:
            pickup.click()
            print("[INFO] Abholservice selected.")
            time.sleep(2)
    except Exception:
        print("[INFO] No Abholservice modal or already handled.")

def enter_plz_and_select_market(driver, plz: str = "10117"):
    """Submit a postal code (PLZ) and click the "Abholmarkt wählen" button.

    Args:
        driver: Selenium WebDriver showing the page to interact with.
        plz: Postal code typed into the input field. Defaults to ``"10117"``,
            the value that used to be hard-coded, so existing calls without
            this argument behave as before.

    Both steps are best-effort: failures are printed and never raised. The
    market-selection step runs even if entering the PLZ failed.
    """
    try:
        time.sleep(2)
        # NOTE(review): this takes the first <input> on the page, which is not
        # necessarily the PLZ field — confirm against the live markup.
        plz_input = driver.find_element(By.TAG_NAME, "input")
        plz_input.clear()
        plz_input.send_keys(plz)
        plz_input.submit()
        print("[INFO] PLZ entered and submitted.")
        time.sleep(3)
    except Exception:
        print("[WARN] Failed to enter PLZ.")

    try:
        # Poll up to 20 times, one second apart, for the market button to show up.
        for _ in range(20):
            buttons = driver.find_elements(By.TAG_NAME, "button")
            for btn in buttons:
                if "abholmarkt wählen" in btn.text.lower():
                    btn.click()
                    print("[INFO] Abholmarkt selected.")
                    time.sleep(3)
                    return
            time.sleep(1)
        print("[WARN] No 'Abholmarkt wählen' button found.")
    except Exception:
        print("[ERROR] Error selecting Abholmarkt.")

def enable_stealth(driver):
    """Register a script, via CDP, to be evaluated on each new document.

    The script redefines three ``navigator`` getters: ``webdriver`` returns
    ``undefined``, ``languages`` returns ``['en-US', 'en']`` and ``plugins``
    returns a five-element array.
    """
    stealth_script = """
        Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
        Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']});
        Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
        """
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": stealth_script},
    )

def scrape_products_from_category(driver, url):
    """Load *url* and collect the product tiles shown on that page.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Category/search page to open.

    Returns:
        A list of dicts with the keys ``name``, ``price``, ``unit`` and
        ``imageUrl``. A field that cannot be read is an empty string.

    Errors while waiting for or reading the tiles are printed, not raised;
    whatever was collected up to that point is returned. Only the page that
    *url* loads is read — no pagination is followed.
    """
    driver.get(url)
    accept_cookies(driver)
    select_abholservice(driver)

    products = []
    page_num = 1  # used only in the log messages

    try:
        wait = WebDriverWait(driver, 30)
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "article.search-service-product")))
        time.sleep(2)
        print(f"[INFO] Scraping products on page {page_num}...")

        product_elements = driver.find_elements(By.CSS_SELECTOR, "article.search-service-product")

        for el in product_elements:
            # `except Exception` instead of a bare `except:` so Ctrl-C and
            # SystemExit are not swallowed while iterating the tiles.
            try:
                image_elem = el.find_element(By.TAG_NAME, "img")
                # get_attribute returns None for a missing attribute; without
                # the `or ""` a missing alt raised and discarded the URL too.
                name = (image_elem.get_attribute("alt") or "").strip()
                image_url = image_elem.get_attribute("src") or ""
            except Exception:
                name = ""
                image_url = ""

            try:
                price = el.find_element(By.CSS_SELECTOR, "div.productPrice, .search-service-productPrice").text.strip().replace("\n", "")
            except Exception:
                price = ""

            try:
                unit = el.find_element(By.CSS_SELECTOR, "div.productGrammage, .search-service-productGrammage").text.strip()
            except Exception:
                unit = ""

            products.append({"name": name, "price": price, "unit": unit, "imageUrl": image_url})

        print(f"[INFO] Found {len(product_elements)} products on page {page_num}.")

    except Exception as e:
        print(f"[ERROR] Error scraping products: {e}")

    print(f"[INFO] Total products found: {len(products)}")
    return products

def main():
    """Scrape every URL listed in ``links.txt`` and save the products as JSON.

    The first link is opened once to accept cookies, choose the pickup
    service and select a market; the user is then prompted to solve any
    CAPTCHA by hand before all links are scraped. Results are written to a
    JSON object mapping category -> list of product dicts.
    """
    output_path = "rewe-products-10117.json"

    # Read the links before launching Chrome so that a missing or empty file
    # does not leave a browser window open.
    links = read_links_from_file("links.txt")
    if not links:
        print("[ERROR] No links found in links.txt")
        return

    options = Options()
    options.add_argument("--disable-blink-features=AutomationControlled")
    # Do NOT run headless because CAPTCHA needs manual solving

    result = {}
    driver = webdriver.Chrome(options=options)
    try:
        driver.set_window_size(1920, 1080)
        enable_stealth(driver)

        first_url = links[0]
        print(f"[INFO] Setting PLZ and market on: {first_url}")
        driver.get(first_url)
        accept_cookies(driver)
        select_abholservice(driver)
        enter_plz_and_select_market(driver)

        print("[ACTION] If a CAPTCHA appears, please solve it manually in the browser.")
        input("Press Enter after solving the CAPTCHA to continue scraping...")

        for url in links:
            category = extract_category(url)
            print(f"[INFO] Scraping category: {category} -> {url}")
            products = scrape_products_from_category(driver, url)
            # Several links can resolve to the same category (e.g. two
            # /c/tee/ URLs with different queries); accumulate rather than
            # overwrite the earlier results.
            result.setdefault(category, []).extend(products)
            print(f"[INFO] {category}: {len(products)} products scraped.")
    finally:
        # Always close the browser, even if scraping raised.
        driver.quit()

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    # Report the file that was actually written.
    print(f"[DONE] All products saved to {output_path}")

if __name__ == "__main__":
    main()


[INFO] Setting PLZ and market on: https://shop.rewe.de/c/olivenoel/?search=oliven%C3%B6l
[INFO] Abholservice selected.
[WARN] Failed to enter PLZ.
[INFO] Abholmarkt selected.
[ACTION] If a CAPTCHA appears, please solve it manually in the browser.
[INFO] Scraping category: olivenoel -> https://shop.rewe.de/c/olivenoel/?search=oliven%C3%B6l
[INFO] Scraping products on page 1...
[INFO] Found 19 products on page 1.
[INFO] Total products found: 19
[INFO] olivenoel: 19 products scraped.
[INFO] Scraping category: sossen -> https://shop.rewe.de/c/sossen/?search=pesto
[INFO] Scraping products on page 1...
[INFO] Found 24 products on page 1.
[INFO] Total products found: 24
[INFO] sossen: 24 products scraped.
[INFO] Scraping category: tee -> https://shop.rewe.de/c/tee/?objectsPerPage=80&search=tee
[INFO] Scraping products on page 1...
[INFO] Found 80 products on page 1.
[INFO] Total products found: 80
[INFO] tee: 80 products scraped.
[INFO] Scraping category: tee -> https://shop.rewe.de/c/tee/?ob

10119

In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import re

def extract_category(url: str) -> str:
    """Return the category slug that follows ``/c/`` in *url*.

    The slug is the run of characters after ``/c/`` up to (not including)
    the next ``/`` or ``?``. A URL without a ``/c/<slug>`` part is returned
    unchanged.
    """
    found = re.search(r"/c/([^/?]+)", url)
    if found is None:
        return url
    return found.group(1)

def read_links_from_file(file_path: str):
    """Read *file_path* as UTF-8 and return its non-blank lines, stripped.

    Lines that are empty or contain only whitespace are skipped; the
    remaining lines keep their file order.
    """
    links = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                links.append(cleaned)
    return links

def accept_cookies(driver):
    """Click the cookie-consent button if one is on the current page.

    Waits up to 15 seconds for ``<button>`` elements to be present, then
    clicks the first one whose text contains "alle erlauben" (compared in
    lower case) and pauses two seconds. Any exception raised along the way
    (including the wait timing out) is reported as an info message and not
    re-raised. If buttons exist but none matches, nothing is printed.
    """
    try:
        candidates = WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, "button"))
        )
        # Lazy generator: button texts are read one at a time, stopping at the first hit.
        consent = next(
            (b for b in candidates if "alle erlauben" in b.text.lower()),
            None,
        )
        if consent is not None:
            consent.click()
            print("[INFO] Cookies accepted.")
            time.sleep(2)
    except Exception:
        print("[INFO] No cookie modal or already accepted.")

def select_abholservice(driver):
    """Click the "Abholservice" (pickup service) button if one is present.

    Waits up to 15 seconds for ``<button>`` elements to be present, then
    clicks the first one whose lower-cased text contains "abholservice" and
    pauses two seconds. Any exception (including the wait timing out) is
    reported as an info message and not re-raised. If buttons exist but none
    matches, nothing is printed.
    """
    try:
        candidates = WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, "button"))
        )
        # Lazy generator: button texts are read one at a time, stopping at the first hit.
        pickup = next(
            (b for b in candidates if "abholservice" in b.text.lower()),
            None,
        )
        if pickup is not None:
            pickup.click()
            print("[INFO] Abholservice selected.")
            time.sleep(2)
    except Exception:
        print("[INFO] No Abholservice modal or already handled.")

def enter_plz_and_select_market(driver, plz: str = "10119"):
    """Submit a postal code (PLZ) and click the "Abholmarkt wählen" button.

    Args:
        driver: Selenium WebDriver showing the page to interact with.
        plz: Postal code typed into the input field. Defaults to ``"10119"``,
            the value that used to be hard-coded, so existing calls without
            this argument behave as before.

    Both steps are best-effort: failures are printed and never raised. The
    market-selection step runs even if entering the PLZ failed.
    """
    try:
        time.sleep(2)
        # NOTE(review): this takes the first <input> on the page, which is not
        # necessarily the PLZ field — confirm against the live markup.
        plz_input = driver.find_element(By.TAG_NAME, "input")
        plz_input.clear()
        plz_input.send_keys(plz)
        plz_input.submit()
        print("[INFO] PLZ entered and submitted.")
        time.sleep(3)
    except Exception:
        print("[WARN] Failed to enter PLZ.")

    try:
        # Poll up to 20 times, one second apart, for the market button to show up.
        for _ in range(20):
            buttons = driver.find_elements(By.TAG_NAME, "button")
            for btn in buttons:
                if "abholmarkt wählen" in btn.text.lower():
                    btn.click()
                    print("[INFO] Abholmarkt selected.")
                    time.sleep(3)
                    return
            time.sleep(1)
        print("[WARN] No 'Abholmarkt wählen' button found.")
    except Exception:
        print("[ERROR] Error selecting Abholmarkt.")

def enable_stealth(driver):
    """Register a script, via CDP, to be evaluated on each new document.

    The script redefines three ``navigator`` getters: ``webdriver`` returns
    ``undefined``, ``languages`` returns ``['en-US', 'en']`` and ``plugins``
    returns a five-element array.
    """
    stealth_script = """
        Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
        Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']});
        Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
        """
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": stealth_script},
    )

def scrape_products_from_category(driver, url):
    """Load *url* and collect the product tiles shown on that page.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Category/search page to open.

    Returns:
        A list of dicts with the keys ``name``, ``price``, ``unit`` and
        ``imageUrl``. A field that cannot be read is an empty string.

    Errors while waiting for or reading the tiles are printed, not raised;
    whatever was collected up to that point is returned. Only the page that
    *url* loads is read — no pagination is followed.
    """
    driver.get(url)
    accept_cookies(driver)
    select_abholservice(driver)

    products = []
    page_num = 1  # used only in the log messages

    try:
        wait = WebDriverWait(driver, 30)
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "article.search-service-product")))
        time.sleep(2)
        print(f"[INFO] Scraping products on page {page_num}...")

        product_elements = driver.find_elements(By.CSS_SELECTOR, "article.search-service-product")

        for el in product_elements:
            # `except Exception` instead of a bare `except:` so Ctrl-C and
            # SystemExit are not swallowed while iterating the tiles.
            try:
                image_elem = el.find_element(By.TAG_NAME, "img")
                # get_attribute returns None for a missing attribute; without
                # the `or ""` a missing alt raised and discarded the URL too.
                name = (image_elem.get_attribute("alt") or "").strip()
                image_url = image_elem.get_attribute("src") or ""
            except Exception:
                name = ""
                image_url = ""

            try:
                price = el.find_element(By.CSS_SELECTOR, "div.productPrice, .search-service-productPrice").text.strip().replace("\n", "")
            except Exception:
                price = ""

            try:
                unit = el.find_element(By.CSS_SELECTOR, "div.productGrammage, .search-service-productGrammage").text.strip()
            except Exception:
                unit = ""

            products.append({"name": name, "price": price, "unit": unit, "imageUrl": image_url})

        print(f"[INFO] Found {len(product_elements)} products on page {page_num}.")

    except Exception as e:
        print(f"[ERROR] Error scraping products: {e}")

    print(f"[INFO] Total products found: {len(products)}")
    return products

def main():
    """Scrape every URL listed in ``links.txt`` and save the products as JSON.

    The first link is opened once to accept cookies, choose the pickup
    service and select a market; the user is then prompted to solve any
    CAPTCHA by hand before all links are scraped. Results are written to a
    JSON object mapping category -> list of product dicts.
    """
    output_path = "rewe-products-10119.json"

    # Read the links before launching Chrome so that a missing or empty file
    # does not leave a browser window open.
    links = read_links_from_file("links.txt")
    if not links:
        print("[ERROR] No links found in links.txt")
        return

    options = Options()
    options.add_argument("--disable-blink-features=AutomationControlled")
    # Do NOT run headless because CAPTCHA needs manual solving

    result = {}
    driver = webdriver.Chrome(options=options)
    try:
        driver.set_window_size(1920, 1080)
        enable_stealth(driver)

        first_url = links[0]
        print(f"[INFO] Setting PLZ and market on: {first_url}")
        driver.get(first_url)
        accept_cookies(driver)
        select_abholservice(driver)
        enter_plz_and_select_market(driver)

        print("[ACTION] If a CAPTCHA appears, please solve it manually in the browser.")
        input("Press Enter after solving the CAPTCHA to continue scraping...")

        for url in links:
            category = extract_category(url)
            print(f"[INFO] Scraping category: {category} -> {url}")
            products = scrape_products_from_category(driver, url)
            # Several links can resolve to the same category (e.g. two
            # /c/tee/ URLs with different queries); accumulate rather than
            # overwrite the earlier results.
            result.setdefault(category, []).extend(products)
            print(f"[INFO] {category}: {len(products)} products scraped.")
    finally:
        # Always close the browser, even if scraping raised.
        driver.quit()

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    # Report the file that was actually written.
    print(f"[DONE] All products saved to {output_path}")

if __name__ == "__main__":
    main()


[INFO] Setting PLZ and market on: https://shop.rewe.de/c/olivenoel/?search=oliven%C3%B6l
[INFO] Abholservice selected.
[WARN] Failed to enter PLZ.
[INFO] Abholmarkt selected.
[ACTION] If a CAPTCHA appears, please solve it manually in the browser.
[INFO] Scraping category: olivenoel -> https://shop.rewe.de/c/olivenoel/?search=oliven%C3%B6l
[INFO] Scraping products on page 1...
[INFO] Found 22 products on page 1.
[INFO] Total products found: 22
[INFO] olivenoel: 22 products scraped.
[INFO] Scraping category: sossen -> https://shop.rewe.de/c/sossen/?search=pesto
[INFO] Scraping products on page 1...
[INFO] Found 40 products on page 1.
[INFO] Total products found: 40
[INFO] sossen: 40 products scraped.
[INFO] Scraping category: tee -> https://shop.rewe.de/c/tee/?objectsPerPage=80&search=tee
[INFO] Scraping products on page 1...
[INFO] Found 80 products on page 1.
[INFO] Total products found: 80
[INFO] tee: 80 products scraped.
[INFO] Scraping category: tee -> https://shop.rewe.de/c/tee/?ob