In [1]:
import json
import urllib.parse
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re

# ─── 1) Selenium setup ─────────────────────────────────────────────────────────
options = webdriver.ChromeOptions()
options.page_load_strategy = 'eager'
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(5)
wait = WebDriverWait(driver, 20)

# ─── 2) Build the filtered-collection URL ───────────────────────────────────────
base       = "https://www.scout.mk/product-category/obleka/"
filter_val = urllib.parse.quote("мажи", safe="")
collection_url = f"{base}?filter_pol={filter_val}"

# ─── 3) Paginate & collect detail-page URLs ────────────────────────────────────
product_urls = []
seen         = set()
page         = 1

while True:
    if page == 1:
        url = collection_url
    else:
        url = f"{base}page/{page}/?filter_pol={filter_val}"

    driver.get(url)
    try:
        wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "ul.products li.product")
        ))
    except TimeoutException:
        print(f"⚠️  No products on page {page}, stopping pagination.")
        break

    cards = driver.find_elements(By.CSS_SELECTOR, "ul.products li.product")
    print(f"Page {page}: found {len(cards)} products")
    if not cards:
        break

    for i in range(len(cards)):
        try:
            card = driver.find_elements(By.CSS_SELECTOR, "ul.products li.product")[i]
            href = card.find_element(By.TAG_NAME, "a").get_attribute("href")
            if href and "/product/" in href and href not in seen:
                seen.add(href)
                product_urls.append(href)
        except (NoSuchElementException, IndexError):
            continue
        except Exception as e:
            print(f"⚠️  Error on card {i}: {e}")
            continue

    page += 1

print(f"\n→ Collected {len(product_urls)} product URLs total.\n")

# ─── 4) Visit each product & scrape details ────────────────────────────────────
data = []
for idx, href in enumerate(product_urls, start=1):
    driver.get(href)
    try:
        wait.until(EC.visibility_of_element_located((
            By.CSS_SELECTOR,
            "h1.product_title.entry-title, h2.product_title.entry-title"
        )))
    except TimeoutException:
        print(f"Skipping {href} (title not found)")
        continue

    # Name
    title_el = driver.find_element(
        By.CSS_SELECTOR,
        "h1.product_title.entry-title, h2.product_title.entry-title"
    )
    name = title_el.text.strip()

    # SKU
    try:
        sku = driver.find_element(By.CSS_SELECTOR, "span.sku").text.strip()
    except NoSuchElementException:
        sku = ""

    #Price & Discount Price
    try:
        price_text = driver.find_element(By.CSS_SELECTOR, "p.price").get_attribute("innerText").replace("\n", " ").strip()
    # Example format: "1,890.00ден Original price was: 1,890.00ден.1,323.00денCurrent price is: 1,323.00ден."
    # Extract numbers that look like prices ending with "ден"
        import re
        prices = re.findall(r"\d[\d.,]*ден", price_text)

        if len(prices) >= 2:
            original_price = prices[0]
            discount_price = prices[1]
        elif len(prices) == 1:
            original_price = prices[0]
            discount_price = ""
        else:
            original_price = ""
            discount_price = ""
    except NoSuchElementException:
        original_price = ""
        discount_price = ""


    # Short description
    try:
        desc = driver.find_element(
            By.CSS_SELECTOR,
            "div.description.woocommerce-product-details__short-description"
        ).text.strip()
    except NoSuchElementException:
        desc = ""

    # Main image URL
    try:
        img_el = driver.find_element(
            By.CSS_SELECTOR,
            "div.img-thumbnail div.inner img"
        )
        image_url = img_el.get_attribute("data-large_image") or img_el.get_attribute("src")
    except NoSuchElementException:
        image_url = ""

    # Variations JSON → extract sizes & colors
    sizes, colors = [], []
    try:
        raw = driver.find_element(By.CSS_SELECTOR, "form.variations_form") \
                    .get_attribute("data-product_variations")
        variants = json.loads(raw)
        sizes  = sorted(v["attributes"]["attribute_pa_size"]
                        for v in variants
                        if v["attributes"].get("attribute_pa_size"))
        colors = sorted(v["attributes"]["attribute_pa_boja"]
                        for v in variants
                        if v["attributes"].get("attribute_pa_boja"))
    except Exception:
        pass

    print(f"[{idx}/{len(product_urls)}] {name} → Sizes: {sizes or ['—']}  Colors: {colors or ['—']}")


    data.append({
        "Name":           name,
        "SKU":            sku,
        "Original Price": original_price,
        "Discount Price": discount_price,
        "Description":    desc,
        "Image URL":      image_url,
        "Sizes":          sizes,
        "Colors":         colors,
        "URL":            href
    })



# ─── 5) Save to CSV ─────────────────────────────────────────────────────────────
df = pd.DataFrame(data)
df.to_csv("scout_obleka_mazhi.csv", index=False, encoding="utf-8-sig")
print(f"\n✔ Done! Scraped {len(df)} products → scout_obleka_mazhi.csv")

driver.quit()


Page 1: found 12 products
Page 2: found 24 products
Page 3: found 12 products
Page 4: found 24 products
Page 5: found 24 products
Page 6: found 12 products
Page 7: found 12 products
Page 8: found 12 products
Page 9: found 24 products
Page 10: found 24 products
Page 11: found 24 products
Page 12: found 24 products
Page 13: found 24 products
Page 14: found 24 products
Page 15: found 24 products
Page 16: found 24 products
Page 17: found 24 products
Page 18: found 24 products
Page 19: found 24 products
Page 20: found 24 products
Page 21: found 22 products
⚠️  No products on page 22, stopping pagination.

→ Collected 251 product URLs total.

[1/251] 1960 Logo T-shirt F87313 → Sizes: ['m', 's']  Colors: ['green', 'green']
[2/251] Abisko Hike Shirt LS F82263 → Sizes: ['—']  Colors: ['—']
[3/251] Abisko Lite Fleece Half Zip F87113 → Sizes: ['l', 'm', 'm', 'm', 's', 'xl']  Colors: ['green', 'green', 'navy', 'orange', 'orange', 'orange']
[4/251] Abisko Lite Fleece Jacket F86971 → Sizes: ['l', 'm