In [2]:
import json
import urllib.parse
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ─── 1) Selenium setup ─────────────────────────────────────────────────────────
options = webdriver.ChromeOptions()
options.page_load_strategy = 'eager'   # return after DOMContentLoaded
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(5)
wait = WebDriverWait(driver, 20)

# ─── 2) Build the filtered-collection URL ───────────────────────────────────────
base       = "https://www.scout.mk/product-category/obleka/"
filter_val = urllib.parse.quote("деца", safe="")
collection_url = f"{base}?filter_pol={filter_val}"

# ─── 3) Paginate & collect detail-page URLs ────────────────────────────────────
product_urls = []
seen         = set()
page         = 1

while True:
    if page == 1:
        url = collection_url
    else:
        url = f"{base}page/{page}/?filter_pol={filter_val}"

    driver.get(url)
    try:
        wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "ul.products li.product")
        ))
    except TimeoutException:
        print(f"⚠️  No products on page {page}, stopping pagination.")
        break

    cards = driver.find_elements(By.CSS_SELECTOR, "ul.products li.product")
    print(f"Page {page}: found {len(cards)} products")
    if not cards:
        break

    for i in range(len(cards)):
        try:
            card = driver.find_elements(By.CSS_SELECTOR, "ul.products li.product")[i]
            href = card.find_element(By.TAG_NAME, "a").get_attribute("href")
            if href and "/product/" in href and href not in seen:
                seen.add(href)
                product_urls.append(href)
        except (NoSuchElementException, IndexError):
            continue
        except Exception as e:
            print(f"⚠️  Error on card {i}: {e}")
            continue


    page += 1

print(f"\n→ Collected {len(product_urls)} product URLs total.\n")

# ─── 4) Visit each product & scrape details ────────────────────────────────────
data = []
for idx, href in enumerate(product_urls, start=1):
    driver.get(href)
    try:
        # wait for either <h1> or <h2> with the product title class
        wait.until(EC.visibility_of_element_located((
            By.CSS_SELECTOR,
            "h1.product_title.entry-title, h2.product_title.entry-title"
        )))
    except TimeoutException:
        print(f"Skipping {href} (title not found)")
        continue

    # Name
    title_el = driver.find_element(
        By.CSS_SELECTOR,
        "h1.product_title.entry-title, h2.product_title.entry-title"
    )
    name = title_el.text.strip()

    # SKU
    try:
        sku = driver.find_element(By.CSS_SELECTOR, "span.sku").text.strip()
    except NoSuchElementException:
        sku = ""

    # Price
    try:
        price = driver.find_element(By.CSS_SELECTOR, "p.price").text.strip()
    except NoSuchElementException:
        price = ""

    # Short description
    try:
        desc = driver.find_element(
            By.CSS_SELECTOR,
            "div.description.woocommerce-product-details__short-description"
        ).text.strip()
    except NoSuchElementException:
        desc = ""

    # Main image URL
    try:
        img = driver.find_element(
            By.CSS_SELECTOR,
            "div.woocommerce-product-gallery__image img"
        )
        image_url = img.get_attribute("data-src") or img.get_attribute("src")
    except NoSuchElementException:
        image_url = ""

    # Variations JSON → extract sizes & colors
    sizes, colors = [], []
    try:
        raw = driver.find_element(By.CSS_SELECTOR, "form.variations_form") \
                    .get_attribute("data-product_variations")
        variants = json.loads(raw)
        sizes  = sorted(v["attributes"]["attribute_pa_size"]
                        for v in variants
                        if v["attributes"].get("attribute_pa_size"))
        colors = sorted(v["attributes"]["attribute_pa_boja"]
                        for v in variants
                        if v["attributes"].get("attribute_pa_boja"))
    except Exception:
        pass

    print(f"[{idx}/{len(product_urls)}] {name} → Sizes: {sizes or ['—']}  Colors: {colors or ['—']}")

    data.append({
        "Name":        name,
        "SKU":         sku,
        "Price":       price,
        "Description": desc,
        "Image_URL":   image_url,
        "Sizes":       ", ".join(sizes),
        "Colors":      ", ".join(colors),
        "Product_URL": href
    })

# ─── 5) Save to CSV ─────────────────────────────────────────────────────────────
df = pd.DataFrame(data)
df.to_csv("scout_obleka_deca.csv", index=False, encoding="utf-8-sig")
print(f"\n✔ Done! Scraped {len(df)} products → scout_obleka_deca.csv")

driver.quit()

Page 1: found 12 products
Page 2: found 12 products
Page 3: found 24 products
⚠️  No products on page 4, stopping pagination.

→ Collected 36 product URLs total.

[1/36] ADLT SNAPBACK CAP M → Sizes: ['0', '0']  Colors: ['navy', 'olive']
[2/36] Baseball Cap K 1901012 → Sizes: ['—']  Colors: ['turquoise']
[3/36] Canyon Cap K 1911351 → Sizes: ['m']  Colors: ['pink']
[4/36] HIGHTON TRS M → Sizes: ['36', '38']  Colors: ['black', 'black']
[5/36] HOT SHOT II → Sizes: ['11', '11', '11', '12', '13', '13', '13', '14', '5', '5', '5', '7', '7', '9', '9', '9', '9']  Colors: ['grey', 'grey', 'grey', 'navy', 'navy', 'navy', 'navy', 'pink', 'violet', 'violet', 'violet', 'violet', 'violet', 'violet', 'white', 'white', 'white']
[6/36] Kid G Jacket 3H19925 → Sizes: ['140']  Colors: ['pink']
[7/36] Kid G Jacket Fix Hood 32Z1105 → Sizes: ['110', '128', '140']  Colors: ['orange', 'orange', 'orange']
[8/36] Kid G Jacket Fix Hood 39X7985 → Sizes: ['128']  Colors: ['blue']
[9/36] Kid G Jacket Fix Hood 3A29385N

In [3]:
import json
import urllib.parse
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ─── 1) Selenium setup ─────────────────────────────────────────────────────────
options = webdriver.ChromeOptions()
options.page_load_strategy = 'eager'
prefs = {"profile.managed_default_content_settings.images": 2}  # disable image loading
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(5)
wait = WebDriverWait(driver, 20)

# ─── 2) Base URL setup ─────────────────────────────────────────────────────────
base       = "https://www.scout.mk/product-category/obleka/"
filter_val = urllib.parse.quote("деца", safe="")  # change to "мажи" for men
collection_url = f"{base}?filter_pol={filter_val}"

# ─── 3) Collect all product links ──────────────────────────────────────────────
product_urls = []
seen         = set()
page         = 1

while True:
    if page == 1:
        url = collection_url
    else:
        url = f"{base}page/{page}/?filter_pol={filter_val}"

    driver.get(url)
    try:
        wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "ul.products li.product")
        ))
    except TimeoutException:
        print(f"⚠️  No products on page {page}, stopping pagination.")
        break

    cards = driver.find_elements(By.CSS_SELECTOR, "ul.products li.product")
    if not cards:
        break

    print(f"Page {page}: found {len(cards)} products")
    for card in cards:
        try:
            href = card.find_element(By.TAG_NAME, "a").get_attribute("href")
            if href and "/product/" in href and href not in seen:
                seen.add(href)
                product_urls.append(href)
        except Exception as e:
            print(f"⚠️  Skipping card due to error: {e}")
            continue

    page += 1

print(f"\n→ Collected {len(product_urls)} product URLs total.\n")

# ─── 4) Visit each product page and scrape data ────────────────────────────────
data = []
for idx, href in enumerate(product_urls, start=1):
    driver.get(href)
    try:
        wait.until(EC.visibility_of_element_located((
            By.CSS_SELECTOR,
            "h1.product_title.entry-title, h2.product_title.entry-title"
        )))
    except TimeoutException:
        print(f"⚠️  Skipping {href} (title not found)")
        continue

    # Name
    name = driver.find_element(
        By.CSS_SELECTOR,
        "h1.product_title.entry-title, h2.product_title.entry-title"
    ).text.strip()

    # SKU
    try:
        sku = driver.find_element(By.CSS_SELECTOR, "span.sku").text.strip()
    except NoSuchElementException:
        sku = ""

    # Price
    try:
        price = driver.find_element(By.CSS_SELECTOR, "p.price").text.strip()
    except NoSuchElementException:
        try:
            price = driver.find_element(By.CSS_SELECTOR, "span.woocommerce-Price-amount").text.strip()
        except NoSuchElementException:
            price = ""

    # Description
    try:
        desc = driver.find_element(
            By.CSS_SELECTOR,
            "div.description.woocommerce-product-details__short-description"
        ).text.strip()
    except NoSuchElementException:
        desc = ""

    # Main image
    try:
        img = driver.find_element(
            By.CSS_SELECTOR, "div.woocommerce-product-gallery__image img"
        )
        image_url = img.get_attribute("data-src") or img.get_attribute("src")
    except NoSuchElementException:
        image_url = ""

    # Sizes & Colors
    sizes, colors = [], []
    try:
        raw = driver.find_element(By.CSS_SELECTOR, "form.variations_form") \
                    .get_attribute("data-product_variations")
        variants = json.loads(raw)
        sizes = sorted({
            v["attributes"].get("attribute_pa_size")
            for v in variants
            if v["attributes"].get("attribute_pa_size")
        })
        colors = sorted({
            v["attributes"].get("attribute_pa_boja")
            for v in variants
            if v["attributes"].get("attribute_pa_boja")
        })
    except Exception:
        pass

    print(f"[{idx}/{len(product_urls)}] {name} → Price: {price or '—'} | Sizes: {sizes or ['—']} | Colors: {colors or ['—']}")

    data.append({
        "Name":        name,
        "SKU":         sku,
        "Price":       price,
        "Description": desc,
        "Image_URL":   image_url,
        "Sizes":       ", ".join(sizes),
        "Colors":      ", ".join(colors),
        "Product_URL": href
    })

# ─── 5) Save to CSV ─────────────────────────────────────────────────────────────
df = pd.DataFrame(data)
df.to_csv("scout_obleka_decatest.csv", index=False, encoding="utf-8-sig")
print(f"\n✔ Done! Scraped {len(df)} products → scout_obleka_decatest.csv")

driver.quit()


Page 1: found 12 products
Page 2: found 24 products
⚠️  Skipping card due to error: Message: stale element reference: stale element not found in the current frame
  (Session info: chrome=135.0.7049.115); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
	GetHandleVerifier [0x00007FF6FAAAEFA5+77893]
	GetHandleVerifier [0x00007FF6FAAAF000+77984]
	(No symbol) [0x00007FF6FA8791BA]
	(No symbol) [0x00007FF6FA880AEC]
	(No symbol) [0x00007FF6FA883B5C]
	(No symbol) [0x00007FF6FA883C2F]
	(No symbol) [0x00007FF6FA8CEA17]
	(No symbol) [0x00007FF6FA8CF41C]
	(No symbol) [0x00007FF6FA8C1B1C]
	(No symbol) [0x00007FF6FA8F716F]
	(No symbol) [0x00007FF6FA8C19E6]
	(No symbol) [0x00007FF6FA8F7340]
	(No symbol) [0x00007FF6FA91F07F]
	(No symbol) [0x00007FF6FA8F6F03]
	(No symbol) [0x00007FF6FA8C0328]
	(No symbol) [0x00007FF6FA8C1093]
	GetHandleVerifier [0x00007FF6FAD67B6D+2931725]
	GetHandleVerif

In [18]:
import json
import urllib.parse
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re

# ─── 1) Selenium setup ─────────────────────────────────────────────────────────
options = webdriver.ChromeOptions()
options.page_load_strategy = 'eager'
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(5)
wait = WebDriverWait(driver, 20)

# ─── 2) Build the filtered-collection URL ───────────────────────────────────────
base       = "https://www.scout.mk/product-category/obleka/"
filter_val = urllib.parse.quote("деца", safe="")
collection_url = f"{base}?filter_pol={filter_val}"

# ─── 3) Paginate & collect detail-page URLs ────────────────────────────────────
product_urls = []
seen         = set()
page         = 1

while True:
    if page == 1:
        url = collection_url
    else:
        url = f"{base}page/{page}/?filter_pol={filter_val}"

    driver.get(url)
    try:
        wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "ul.products li.product")
        ))
    except TimeoutException:
        print(f"⚠️  No products on page {page}, stopping pagination.")
        break

    cards = driver.find_elements(By.CSS_SELECTOR, "ul.products li.product")
    print(f"Page {page}: found {len(cards)} products")
    if not cards:
        break

    for i in range(len(cards)):
        try:
            card = driver.find_elements(By.CSS_SELECTOR, "ul.products li.product")[i]
            href = card.find_element(By.TAG_NAME, "a").get_attribute("href")
            if href and "/product/" in href and href not in seen:
                seen.add(href)
                product_urls.append(href)
        except (NoSuchElementException, IndexError):
            continue
        except Exception as e:
            print(f"⚠️  Error on card {i}: {e}")
            continue

    page += 1

print(f"\n→ Collected {len(product_urls)} product URLs total.\n")

# ─── 4) Visit each product & scrape details ────────────────────────────────────
data = []
for idx, href in enumerate(product_urls, start=1):
    driver.get(href)
    try:
        wait.until(EC.visibility_of_element_located((
            By.CSS_SELECTOR,
            "h1.product_title.entry-title, h2.product_title.entry-title"
        )))
    except TimeoutException:
        print(f"Skipping {href} (title not found)")
        continue

    # Name
    title_el = driver.find_element(
        By.CSS_SELECTOR,
        "h1.product_title.entry-title, h2.product_title.entry-title"
    )
    name = title_el.text.strip()

    # SKU
    try:
        sku = driver.find_element(By.CSS_SELECTOR, "span.sku").text.strip()
    except NoSuchElementException:
        sku = ""

    #Price & Discount Price
    try:
        price_text = driver.find_element(By.CSS_SELECTOR, "p.price").get_attribute("innerText").replace("\n", " ").strip()
    # Example format: "1,890.00ден Original price was: 1,890.00ден.1,323.00денCurrent price is: 1,323.00ден."
    # Extract numbers that look like prices ending with "ден"
        import re
        prices = re.findall(r"\d[\d.,]*ден", price_text)

        if len(prices) >= 2:
            original_price = prices[0]
            discount_price = prices[1]
        elif len(prices) == 1:
            original_price = prices[0]
            discount_price = ""
        else:
            original_price = ""
            discount_price = ""
    except NoSuchElementException:
        original_price = ""
        discount_price = ""


    # Short description
    try:
        desc = driver.find_element(
            By.CSS_SELECTOR,
            "div.description.woocommerce-product-details__short-description"
        ).text.strip()
    except NoSuchElementException:
        desc = ""

    # Main image URL
    try:
        img_el = driver.find_element(
            By.CSS_SELECTOR,
            "div.img-thumbnail div.inner img"
        )
        image_url = img_el.get_attribute("data-large_image") or img_el.get_attribute("src")
    except NoSuchElementException:
        image_url = ""

    # Variations JSON → extract sizes & colors
    sizes, colors = [], []
    try:
        raw = driver.find_element(By.CSS_SELECTOR, "form.variations_form") \
                    .get_attribute("data-product_variations")
        variants = json.loads(raw)
        sizes  = sorted(v["attributes"]["attribute_pa_size"]
                        for v in variants
                        if v["attributes"].get("attribute_pa_size"))
        colors = sorted(v["attributes"]["attribute_pa_boja"]
                        for v in variants
                        if v["attributes"].get("attribute_pa_boja"))
    except Exception:
        pass

    print(f"[{idx}/{len(product_urls)}] {name} → Sizes: {sizes or ['—']}  Colors: {colors or ['—']}")


    data.append({
        "Name":           name,
        "SKU":            sku,
        "Original Price": original_price,
        "Discount Price": discount_price,
        "Description":    desc,
        "Image URL":      image_url,
        "Sizes":          sizes,
        "Colors":         colors,
        "URL":            href
    })



# ─── 5) Save to CSV ─────────────────────────────────────────────────────────────
df = pd.DataFrame(data)
df.to_csv("scout_obleka_deca.csv", index=False, encoding="utf-8-sig")
print(f"\n✔ Done! Scraped {len(df)} products → scout_obleka_deca.csv")

driver.quit()


Page 1: found 12 products
Page 2: found 24 products
Page 3: found 12 products
⚠️  No products on page 4, stopping pagination.

→ Collected 36 product URLs total.

[1/36] ADLT SNAPBACK CAP M → Sizes: ['0', '0']  Colors: ['navy', 'olive']
[2/36] Baseball Cap K 1901012 → Sizes: ['—']  Colors: ['turquoise']
[3/36] Canyon Cap K 1911351 → Sizes: ['m']  Colors: ['pink']
[4/36] HIGHTON TRS M → Sizes: ['36', '38']  Colors: ['black', 'black']
[5/36] HOT SHOT II → Sizes: ['11', '11', '11', '12', '13', '13', '13', '14', '5', '5', '5', '7', '7', '9', '9', '9', '9']  Colors: ['grey', 'grey', 'grey', 'navy', 'navy', 'navy', 'navy', 'pink', 'violet', 'violet', 'violet', 'violet', 'violet', 'violet', 'white', 'white', 'white']
[6/36] Kid G Jacket 3H19925 → Sizes: ['140']  Colors: ['pink']
[7/36] Kid G Jacket Fix Hood 32Z1105 → Sizes: ['110', '128', '140']  Colors: ['orange', 'orange', 'orange']
[8/36] Kid G Jacket Fix Hood 39X7985 → Sizes: ['128']  Colors: ['blue']
[9/36] Kid G Jacket Fix Hood 3A29385N