In [39]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import webcolors
!pip install selenium beautifulsoup4 pandas webcolors




In [40]:


# Listing pages to scrape: two women's pages, two men's pages, one kids' page.
urls = [
    "https://www2.hm.com/de_de/damen/produkte/jeans.html",
    "https://www2.hm.com/de_de/damen/produkte/jeans.html?page=2",
    "https://www2.hm.com/de_de/herren/produkte/jeans.html",
    "https://www2.hm.com/de_de/herren/produkte/jeans.html?page=2",
    "https://www2.hm.com/de_de/kinder/9-14j/kleidung/jeans.html"
]

# Collected as (article_tag, gender) tuples across all listing pages.
all_products = []

# How often END is sent to the page, and how long to wait after each press.
SCROLL_ROUNDS = 10
SCROLL_PAUSE_SECONDS = 2

# Step 1: Scrape every listing page
for url in urls:
    # The gender label is derived from the German category segment of the URL.
    if "damen" in url:
        gender = "Women"
    elif "herren" in url:
        gender = "Men"
    elif "kinder" in url:
        gender = "Kid"
    else:
        gender = "Unknown"

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Repeatedly jump to the bottom of the page before reading the HTML
        # (presumably so lazily loaded products get rendered -- TODO confirm).
        body = driver.find_element(By.TAG_NAME, "body")
        for _ in range(SCROLL_ROUNDS):
            body.send_keys(Keys.END)
            time.sleep(SCROLL_PAUSE_SECONDS)

        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always close the browser, even when loading or scrolling fails,
        # so a failed page does not leave a Chrome process behind.
        driver.quit()

    grid = soup.find("ul", attrs={"data-elid": "product-grid"})
    if grid:
        products = grid.find_all("article")
        for p in products:
            all_products.append((p, gender))
    else:
        # Make a layout change or blocked page visible instead of skipping it silently.
        print(f"⚠️ No product grid found for {url}")

print(f"✅ Total scraped: {len(all_products)} products")

✅ Total scraped: 180 products


In [52]:
# Colour vocabulary that closest_color() searches when matching an RGB value.
FALLBACK_COLOR_NAMES = [
    "BLACK", "WHITE", "RED", "GREEN", "BLUE", "YELLOW", "CYAN", "MAGENTA",
    "GRAY", "ORANGE", "PURPLE", "PINK", "BROWN", "NAVY", "MAROON",
    "OLIVE", "TEAL", "LIME", "SILVER", "AQUA", "FUCHSIA", "GOLD", "BEIGE",
]

# Maps a detected colour name to the name that should be reported instead.
# Written as (reported name, detected names) groups; the resulting dict has
# one entry per detected name, in the order listed here.
COLOR_CORRECTIONS = {
    detected: reported
    for reported, detected_names in (
        ("GRAY", ("SLATEGRAY", "LIGHTGRAY", "DARKGRAY")),
        ("BROWN", ("PEACHPUFF", "LAVENDER")),
        ("BLUE", ("TEAL", "SILVER", "INDIGO", "DARKGREEN")),
        ("WHITE", ("BEIGE",)),
        ("BLUE", ("UNKNOWN",)),  # value returned when no colour was found at all
        ("RED", ("MAROON",)),
    )
    for detected in detected_names
}

def closest_color(rgb_tuple):
    """Return the name in FALLBACK_COLOR_NAMES closest to an RGB value.

    Closeness is the squared Euclidean distance between the first three
    components of ``rgb_tuple`` and the RGB value webcolors reports for
    each candidate name. On ties the earlier name in the list wins.

    Args:
        rgb_tuple: Indexable sequence whose items 0, 1 and 2 are the red,
            green and blue components.

    Returns:
        The best-matching upper-case name, or "BLUE" when no candidate
        could be compared (for example a malformed ``rgb_tuple``).
    """
    best_name = None
    best_distance = float('inf')
    for name in FALLBACK_COLOR_NAMES:
        try:
            ref_r, ref_g, ref_b = webcolors.name_to_rgb(name.lower())
            distance = (
                (ref_r - rgb_tuple[0]) ** 2
                + (ref_g - rgb_tuple[1]) ** 2
                + (ref_b - rgb_tuple[2]) ** 2
            )
        except (ValueError, TypeError, IndexError):
            # ValueError: name unknown to webcolors; TypeError/IndexError:
            # rgb_tuple is not a sequence of at least three numbers.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            continue
        if distance < best_distance:
            best_distance = distance
            best_name = name
    return best_name or "BLUE"

def rgb_or_hex_to_name(color_code):
    """Translate a CSS ``rgb(...)``/``rgba(...)`` or ``#hex`` value to a colour name.

    The code is trimmed and lower-cased before matching, so ``"RGB(...)"``
    and padded values are recognised too. For ``rgb`` forms only the first
    three numeric components are used; decimal components are rounded to
    the nearest integer rather than being split at the decimal point.

    Args:
        color_code: CSS colour string.

    Returns:
        The name chosen by closest_color(), or "BLUE" when the value is not
        a string, is in an unsupported notation, or cannot be parsed.
    """
    try:
        code = color_code.strip().lower()
        if code.startswith("rgb"):
            components = re.findall(r'\d+(?:\.\d+)?', code)
            if len(components) < 3:
                # Not enough channels to form an RGB triple.
                return "BLUE"
            rgb = tuple(round(float(part)) for part in components[:3])
            return closest_color(rgb)
        if code.startswith("#"):
            return closest_color(webcolors.hex_to_rgb(code))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: color_code is not a string;
        # ValueError: malformed hex or numeric component.
        return "BLUE"
    return "BLUE"

# ----------- Step 4: Extraction Functions -----------
def jeans_des(product):
    """Return the product title text from its ``<h2>`` heading.

    Looks up the ``h2`` element with class ``be6471 bf7846 d0e739`` inside
    ``product`` and returns its stripped text. Returns "N/A" when the lookup
    raises AttributeError (for example because no such heading exists).
    """
    try:
        heading = product.find("h2", class_="be6471 bf7846 d0e739")
        title = heading.get_text(strip=True)
    except AttributeError:
        title = "N/A"
    return title

def jeans_price(product):
    """Return the price text of a product tile, or "N/A" if none is found.

    Lookup order:
      1. a ``<del>`` element, with the "Normalpreis" label removed;
      2. a ``<span>`` with class ``b1cd4f b6e218``;
      3. the first ``<span>`` whose text contains a euro sign.

    Args:
        product: Parsed product element exposing ``find`` and ``find_all``.

    Returns:
        The stripped price string, or "N/A" when nothing matched or
        ``product`` does not support the lookups.
    """
    try:
        struck_price = product.find("del")
        if struck_price:
            return struck_price.get_text(strip=True).replace("Normalpreis", "").strip()

        labelled_price = product.find("span", class_="b1cd4f b6e218")
        if labelled_price:
            return labelled_price.get_text(strip=True)

        for span in product.find_all("span"):
            text = span.get_text(strip=True)
            if "€" in text:
                return text
    except AttributeError:
        # Same convention as jeans_des: a missing or unexpected element
        # yields "N/A"; unrelated errors are no longer swallowed.
        pass

    return "N/A"

def jeans_colors(product):
    """Collect the colour names offered for a product tile.

    Two sources are combined:
      1. the ``title`` attribute of ``a.c0d578`` elements (named colours);
      2. the inline ``background-color`` of ``span.ea044e`` swatches,
         converted to a name via rgb_or_hex_to_name().

    Every name is upper-cased and passed through COLOR_CORRECTIONS.

    Args:
        product: Parsed product element exposing ``select``.

    Returns:
        Sorted list of unique colour names. When no colour is found, a
        one-element list holding the "UNKNOWN" correction ("BLUE" if that
        entry is absent).
    """
    color_set = set()

    # 1. Named colors from title attribute (if available)
    for tag in product.select("a[title].c0d578"):
        raw = tag.get("title", "").strip().upper()
        if not raw:
            # A blank title would otherwise add "" as a colour and show up
            # as a stray leading ", " once the names are joined.
            continue
        color_set.add(COLOR_CORRECTIONS.get(raw, raw))

    # 2. Visual swatches from inline background-color
    for span in product.select("span.ea044e"):  # class name as recorded by the original author
        style = span.get("style", "")
        match = re.search(r'background-color:\s*([^;]+)', style)
        if match:
            code = match.group(1).strip()
            raw = rgb_or_hex_to_name(code).upper()
            color_set.add(COLOR_CORRECTIONS.get(raw, raw))

    # 3. Fallback
    if not color_set:
        return [COLOR_CORRECTIONS.get("UNKNOWN", "BLUE")]

    return sorted(color_set)


In [60]:
# Turn every scraped tile into a [name, price, colors, gender] row and echo it.
data = []
for prod, gender in all_products:
    name = jeans_des(prod)
    price = jeans_price(prod)
    colors = jeans_colors(prod)

    # Tiles without a usable name or price are left out of the dataset.
    if name == "N/A" or price == "N/A":
        continue

    formatted_colors = ", ".join(c.capitalize() for c in colors)
    data.append([name, price, formatted_colors, gender])

    print(f"🛍️ Name: {name}")
    print(f"💶 Price: {price}")
    print(f"🎨 Colors: {', '.join(colors)}")
    print(f"👕 Gender: {gender}")
    print("-" * 50)

🛍️ Name: Wide High Ankle Jeans
💶 Price: 34,99 €
🎨 Colors: BLUE, BROWN, WHITE
👕 Gender: Women
--------------------------------------------------
🛍️ Name: Wide High Jeans
💶 Price: 29,99 €
🎨 Colors: BLACK, BLUE
👕 Gender: Women
--------------------------------------------------
🛍️ Name: Wide High Jeans
💶 Price: 29,99 €
🎨 Colors: BLACK, BLUE, GRAY
👕 Gender: Women
--------------------------------------------------
🛍️ Name: Wide High Jeans
💶 Price: 29,99 €
🎨 Colors: BLACK, BLUE
👕 Gender: Women
--------------------------------------------------
🛍️ Name: Wide High Jeans
💶 Price: 34,99 €
🎨 Colors: BLACK, WHITE
👕 Gender: Women
--------------------------------------------------
🛍️ Name: Wide High Ankle Jeans
💶 Price: 34,99 €
🎨 Colors: BLUE, BROWN
👕 Gender: Women
--------------------------------------------------
🛍️ Name: Wide High Jeans
💶 Price: 29,99 €
🎨 Colors: BLACK, BLUE, GRAY
👕 Gender: Women
--------------------------------------------------
🛍️ Name: Skinny High Jeans
💶 Price: 24,99 €
🎨 Color

In [61]:
# Build the table from the collected rows and write it to disk.
columns = ["Name", "Price", "Colors", "Gender"]
df = pd.DataFrame(data, columns=columns)
df.to_csv("hm_jeans_with_colors.csv", index=False, encoding="utf-8")
# Report the rows actually written: tiles without a name or price were dropped
# while building `data`, so len(all_products) can overstate the count.
print(f"✅ CSV saved with {len(df)} entries")

✅ CSV saved with 180 entries


In [62]:
# NOTE(review): this cell repeats the last two statements of the row-building
# loop using the variables left over from its final iteration. Appending
# unconditionally would duplicate the last row (or add a row the loop had
# filtered out as "N/A"), so the append is guarded to keep re-runs harmless.
formatted_colors = ", ".join([c.capitalize() for c in colors])
row = [name, price, formatted_colors, gender]
if name != "N/A" and price != "N/A" and (not data or data[-1] != row):
    data.append(row)

In [63]:
# Rebuild the table from the collected rows and tag every row with the retailer.
columns = ["Name", "Price", "Colors", "Gender"]
df = pd.DataFrame(data, columns=columns).assign(Company="H&M")
df

Unnamed: 0,Name,Price,Colors,Gender,Company
0,Wide High Ankle Jeans,"34,99 €","Blue, Brown, White",Women,H&M
1,Wide High Jeans,"29,99 €","Black, Blue",Women,H&M
2,Wide High Jeans,"29,99 €","Black, Blue, Gray",Women,H&M
3,Wide High Jeans,"29,99 €","Black, Blue",Women,H&M
4,Wide High Jeans,"34,99 €","Black, White",Women,H&M
...,...,...,...,...,...
176,Wide Leg Cargo Jeans,"29,99 €","Blue, Gray",Kid,H&M
177,Baggy Fit Carpenter Jeans,"34,99 €","Black, Blue",Kid,H&M
178,Baggy Fit Bootcut Leg Jeans,"34,99 €",Blue,Kid,H&M
179,Jeans mit weitem Bein,"29,99 €","Blue, Gray",Kid,H&M
