In [2]:
import re
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.common.by import By

# --- setup ---
base = "https://www.fashiongroup.com.mk/"
driver = webdriver.Chrome()

# Maps a normalized category slug (last URL path segment, with "-" and "+"
# replaced by "_") to the absolute href of that link.
markets = {}
try:
    driver.get(base)

    # --- collect all “Облека” links ---
    anchors = driver.find_elements(By.CSS_SELECTOR, "a[href*='/obleka/']")

    for a in anchors:
        href = a.get_attribute("href")
        if not href:
            # Nothing to slug; skip rather than crash inside urlparse/split.
            continue
        slug = urlparse(href).path.rstrip("/").split("/")[-1]
        # normalize both hyphens and pluses into underscores
        key  = slug.replace("-", "_").replace("+", "_")
        # Later links with the same slug overwrite earlier ones.
        markets[key] = href
finally:
    # Always release the browser, even if navigation or element lookup fails,
    # so a failed run does not leave a Chrome process behind in the kernel.
    driver.quit()

# now pick them out by the new, fully-normalized keys
# (a KeyError here means the expected category link was not found on the page)
url_obleka_maski  = markets["maski"]
url_obleka_zenski = markets["zenski"]
url_obleka_deca   = markets["devojcinja_momcinja_novorodencinja_bebe_devojcinja_bebe_momcinja"]

print("Maski URL:   ", url_obleka_maski)
print("Zenski URL:  ", url_obleka_zenski)
print("Deca URL:    ", url_obleka_deca)

Maski URL:    https://www.fashiongroup.com.mk/obleka/maski/
Zenski URL:   https://www.fashiongroup.com.mk/obleka/zenski/
Deca URL:     https://www.fashiongroup.com.mk/obleka/devojcinja+momcinja+novorodencinja+bebe-devojcinja+bebe-momcinja


In [None]:
import csv
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# === CONFIG ===
# Listing URL that scrape_all_masks() opens; pages are requested as "?page=N".
BASE_URL             = "https://www.fashiongroup.com.mk/obleka/maski/"
# CSS selector used to locate each product card on a listing page.
ITEM_SELECTOR        = "div.item-data.col-xs-12.col-sm-12"
# CSS selector for pagination links; their numeric labels give the page count.
PAGINATION_SELECTOR  = "ul.pagination li a"
# Timeout (seconds) handed to WebDriverWait while waiting for product cards.
LISTING_WAIT         = 10
# Timeout (seconds) handed to requests.get() for each detail page.
REQUESTS_TIMEOUT     = 10
# HTTP headers sent with every detail-page request made by parse_detail().
HEADERS              = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
# =================

def parse_detail(detail_url):
    """Fetch a product detail page and extract its brand, code and composition.

    Args:
        detail_url: Absolute URL of the product detail page.

    Returns:
        A dict with the keys ``"Brand"``, ``"Code"`` and ``"Description"``.
        ``"Description"`` holds the value of the "Состав" (composition) row of
        the attribute table. Any field that cannot be found is an empty
        string; if the request itself fails, all three are empty strings.
    """
    try:
        resp = requests.get(detail_url, headers=HEADERS, timeout=REQUESTS_TIMEOUT)
        resp.raise_for_status()
    except requests.RequestException:
        # Best-effort enrichment: a failed detail page must not abort the run.
        return {"Brand": "", "Code": "", "Description": ""}

    # Parse the raw bytes instead of resp.text. When the server omits a charset
    # for text/html, Requests falls back to ISO-8859-1, which would mangle the
    # Cyrillic text and make the "Состав" comparison below never match. Honor
    # an explicitly declared charset; otherwise let BeautifulSoup detect it
    # from the document itself.
    content_type = resp.headers.get("Content-Type", "")
    declared_encoding = resp.encoding if "charset" in content_type.lower() else None
    soup = BeautifulSoup(resp.content, "html.parser", from_encoding=declared_encoding)

    # Brand
    brand_el = soup.select_one("div.block.product-details-info div.brand")
    brand    = brand_el.get_text(strip=True) if brand_el else ""

    # Code
    code_el = soup.select_one("div.block.product-details-info div.code span")
    code    = code_el.get_text(strip=True) if code_el else ""

    # Composition (“Состав”): first two-cell row whose label cell is "Состав".
    composition = ""
    for row in soup.select("table.product-attrbite-table tr"):
        tds = row.find_all("td")
        if len(tds) == 2 and tds[0].get_text(strip=True) == "Состав":
            composition = tds[1].get_text(strip=True)
            break

    return {"Brand": brand, "Code": code, "Description": composition}


def _detect_last_page(driver):
    """Return the largest numeric label among the pagination links (min. 1)."""
    last_page = 1
    for link in driver.find_elements(By.CSS_SELECTOR, PAGINATION_SELECTOR):
        label = link.text.strip()
        if label.isdigit():
            last_page = max(last_page, int(label))
    return last_page


def _parse_card(card):
    """Convert one listing-card element into a row dict (without detail fields)."""
    detail_url = card.find_element(By.CSS_SELECTOR, "a.product-link")\
                     .get_attribute("href")
    thumb_src  = card.find_element(By.CSS_SELECTOR, "div.img-wrapper img")\
                     .get_attribute("src")
    image_url  = urljoin(BASE_URL, thumb_src)
    name       = card.find_element(By.CSS_SELECTOR, "div.title a").text.strip()

    # Price splitting: when an old price is shown, the current price is the
    # discounted one; otherwise the current price is the regular price.
    curr = card.find_element(By.CSS_SELECTOR, "div.current-price .value").text.strip()
    olds = card.find_elements(By.CSS_SELECTOR, "div.prev-old-price")
    if olds:
        regular_price  = olds[0].text.strip()
        discount_price = curr
    else:
        regular_price  = curr
        discount_price = ""

    # Available sizes (no 'disabled'). get_attribute may return None when the
    # attribute is absent, so fall back to "" before the substring test.
    sizes = [
        sz.get_attribute("data-productsize-name") or sz.text.strip()
        for sz in card.find_elements(By.CSS_SELECTOR, "div.product-sizes .item.btn")
        if "disabled" not in (sz.get_attribute("class") or "")
    ]

    return {
        "Image URL":       image_url,
        "Name":            name,
        "Regular Price":   regular_price + " MKД",
        "Discount Price":  (discount_price + " MKД") if discount_price else "",
        "Available Sizes": ", ".join(sizes),
        "Detail URL":      detail_url,
    }


def scrape_all_masks():
    """Scrape every listing page under BASE_URL and enrich each product.

    The listing pages are rendered with headless Chrome (images disabled);
    each product's detail page is then fetched through ``parse_detail``.

    Returns:
        A list of dicts, one per product card, with the keys "Image URL",
        "Name", "Regular Price", "Discount Price", "Available Sizes",
        "Detail URL", plus whatever keys ``parse_detail`` returns.

    Raises:
        Selenium exceptions (e.g. a wait timeout or a missing element inside
        a card) propagate to the caller; the browser is closed first.
    """
    # 1) Launch headless Chrome for the listing
    opts = webdriver.ChromeOptions()
    opts.add_argument("--headless=new")
    opts.add_experimental_option("prefs", {
        "profile.managed_default_content_settings.images": 2
    })
    driver = webdriver.Chrome(options=opts)

    items = []
    try:
        wait = WebDriverWait(driver, LISTING_WAIT)

        # 2) Determine how many pages there are
        driver.get(BASE_URL)
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ITEM_SELECTOR)))
        last_page = _detect_last_page(driver)
        print(f"Detected {last_page} pages.")

        # 3) Loop through every page
        for page in range(1, last_page + 1):
            url = f"{BASE_URL}?page={page}"
            print(f"→ Scraping listing page {page}/{last_page}")
            driver.get(url)
            time.sleep(1)
            wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ITEM_SELECTOR)))

            cards = driver.find_elements(By.CSS_SELECTOR, ITEM_SELECTOR)
            print(f"   Found {len(cards)} products.")

            items.extend(_parse_card(card) for card in cards)
    finally:
        # Close the browser even when a wait times out or a card is malformed,
        # so a failed run does not leave a headless Chrome process behind.
        driver.quit()

    # 4) Enrich each item via requests + BS4
    print("Fetching detail pages…")
    for idx, it in enumerate(items, 1):
        if idx % 20 == 0 or idx == len(items):
            print(f"  → detail {idx}/{len(items)}")
        it.update(parse_detail(it["Detail URL"]))

    return items


def main():
    """Run the scraper and export the selected columns to a fully quoted CSV."""
    export_columns = [
        "Image URL",
        "Name",
        "Regular Price",
        "Discount Price",
        "Available Sizes",
        "Brand",
        "Code",
        "Description",
    ]

    # Build the frame from the scraped records, keeping only the export columns
    # (in this order); "Detail URL" is intentionally left out of the file.
    products = pd.DataFrame(scrape_all_masks())[export_columns]

    csv_options = {
        "index": False,
        "encoding": "utf-8",
        "quotechar": '"',
        "quoting": csv.QUOTE_ALL,
    }
    products.to_csv("fashiongroup_maski.csv", **csv_options)

    print(f"\n✅ Scraped {len(products)} products → fashiongroup_maski.csv")


if __name__ == "__main__":
    main()