# Web Scraper for Olive Young Products

This notebook scrapes product data from skincare category [Olive Young](https://global.oliveyoung.com/display/category?ctgrNo=1000000008) using Selenium. 
Scraped in batches in case there are errors

We extract:
- Full ingredients
- Skin types
- Content size
- Product name & URL

This data feeds into our broader skincare comparison project.

In [16]:
import undetected_chromedriver as uc
import pandas as pd
import time
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# -------- SCRAPING FUNCTION -------- #
def get_product_info(url):
    try:
        options = uc.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")

        driver = uc.Chrome(options=options)
        driver.get(url)

        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "prd-detail-page")))
        time.sleep(1)

        # Open the "Specific Item Info" modal
        driver.execute_script(
            'document.querySelector("#prd-detail-page > div.prd-banner-wrap.mo-only > ul > li:nth-child(1) > a").click();'
        )
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "modalSpecItemInformation")))
        time.sleep(2)

        rows = driver.find_elements(By.CSS_SELECTOR, "#descTbody > tr")

        ingredients, skin_type, amount = None, None, None
        for row in rows:
            th = row.find_element(By.TAG_NAME, "th").text.strip()
            td = row.find_element(By.TAG_NAME, "td").text.strip()
            if th == "Ingredients":
                ingredients = td
            elif th == "Ideal for":
                skin_type = td
            elif th == "Content volume or weight":
                amount = td

        driver.quit()
        return ingredients, skin_type, amount

    except Exception as e:
        return f"Error: {e}", "", ""

# -------- MAIN BATCH SCRAPER -------- #
def scrape_in_batches(input_csv, output_csv, batch_size=100):
    # Load data
    df = pd.read_csv(input_csv)

    # If output already exists, resume from last saved row
    if os.path.exists(output_csv):
        scraped_df = pd.read_csv(output_csv)
        scraped_urls = set(scraped_df['url'])
        print(f"🔄 Resuming from existing file. {len(scraped_urls)} URLs already scraped.")
    else:
        scraped_df = pd.DataFrame(columns=list(df.columns) + ['ingredients', 'skin_type', 'amount'])
        scraped_urls = set()

    # Filter URLs not yet scraped
    unsaved_df = df[~df['url'].isin(scraped_urls)]
    total = len(unsaved_df)
    print(f"🚀 Starting batch scrape for {total} remaining products...")

    for i in range(0, total, batch_size):
        batch = unsaved_df.iloc[i:i+batch_size].copy()

        # Scrape batch
        results = batch['url'].apply(lambda url: pd.Series(get_product_info(url)))
        results.columns = ['ingredients', 'skin_type', 'amount']

        # Append to batch and save
        batch = pd.concat([batch, results], axis=1)
        scraped_df = pd.concat([scraped_df, batch], ignore_index=True)
        scraped_df.to_csv(output_csv, index=False)

        print(f"✅ Batch {i // batch_size + 1} done: {len(scraped_df)} total scraped so far.")
        time.sleep(2) 

    print("🎉 All batches complete! File saved:", output_csv)

# -------- RUN -------- #
scrape_in_batches(
    input_csv="oliveyoung_products_extracted.csv",
    output_csv="oliveyoung_all_scraped_batches.csv",
    batch_size=5
)

🚀 Starting batch scrape for 3042 remaining products...


  scraped_df = pd.concat([scraped_df, batch], ignore_index=True)


✅ Batch 1 done: 5 total scraped so far.
✅ Batch 2 done: 10 total scraped so far.
✅ Batch 3 done: 15 total scraped so far.
✅ Batch 4 done: 20 total scraped so far.
✅ Batch 5 done: 25 total scraped so far.
✅ Batch 6 done: 30 total scraped so far.
✅ Batch 7 done: 35 total scraped so far.
✅ Batch 8 done: 40 total scraped so far.
✅ Batch 9 done: 45 total scraped so far.
✅ Batch 10 done: 50 total scraped so far.
✅ Batch 11 done: 55 total scraped so far.
✅ Batch 12 done: 60 total scraped so far.
✅ Batch 13 done: 65 total scraped so far.
✅ Batch 14 done: 70 total scraped so far.
✅ Batch 15 done: 75 total scraped so far.
✅ Batch 16 done: 80 total scraped so far.
✅ Batch 17 done: 85 total scraped so far.
✅ Batch 18 done: 90 total scraped so far.
✅ Batch 19 done: 95 total scraped so far.
✅ Batch 20 done: 100 total scraped so far.
✅ Batch 21 done: 105 total scraped so far.
✅ Batch 22 done: 110 total scraped so far.
✅ Batch 23 done: 115 total scraped so far.
✅ Batch 24 done: 120 total scraped so fa

In [19]:
#Added a column for product type based on the product name

import re
import pandas as pd

# =========================
# Settings
# =========================
INPUT_CSV   = "oliveyoung_all_scraped_batches.csv"  # your working file
SAMPLE_OUT  = "oliveyoung_product_type_SAMPLE.csv"
FULL_OUT    = "oliveyoung_with_product_type.csv"

SAMPLE_N        = 20      # how many rows for the test sample
RANDOM_SAMPLE   = False    # True=random sample, False=head()
RUN_FULL        = True   # flip to True after you're happy with the sample


# =========================
# Product-type inference rules
# (ordered: most specific -> general)
# =========================
TYPE_RULES = [
    ("Sun Stick",         r"\b(sun\s*stick|sunstick)\b"),
    ("Sun Cushion",       r"\b(sun\s*cushion|suncushion)\b"),
    ("Sunscreen",         r"\b(sun\s*(cream|gel|milk|essence)|uv\s*(gel|milk|essence)|sunblock)\b"),
    ("Cleansing Oil",     r"\b(cleansing\s*oil|oil\s*cleanser)\b"),
    ("Cleansing Balm",    r"\b(cleansing\s*balm|balm\s*cleanser)\b"),
    ("Cleansing Foam",    r"\b(cleansing\s*foam|foam\s*cleanser)\b"),
    ("Cleansing Gel",     r"\b(cleansing\s*gel|gel\s*cleanser)\b"),
    ("Cleanser",          r"\b(cleanser|face\s*wash)\b"),
    ("Exfoliating Toner", r"\b(exfoliating\s*toner|(?:^|\W)(aha|bha|pha)(?:$|\W)|peeling\s*toner)\b"),
    ("Toner Pad",         r"\b(toner\s*pad|peel\s*pad|exfoliating\s*pad)\b"),
    ("Toner",             r"\b(toner|skin\s*softener)\b"),
    ("Essence",           r"\b(essence|first\s*essence)\b"),
    ("Ampoule",           r"\b(ampoule)\b"),
    ("Serum",             r"\b(serum)\b"),
    ("Emulsion",          r"\b(emulsion)\b"),
    ("Gel Cream",         r"\b(gel\s*cream)\b"),
    ("Cream",             r"\b(cream)\b"),
    ("Lotion",            r"\b(lotion)\b"),
    ("Moisturizer",       r"\b(moisturizer|moisturiser)\b"),
    ("Eye Cream",         r"\b(eye\s*cream)\b"),
    ("Eye Serum",         r"\b(eye\s*serum)\b"),
    ("Sleeping Mask",     r"\b(sleeping\s*mask)\b"),
    ("Sheet Mask",        r"\b(sheet\s*mask|mask\s*sheet)\b"),
    ("Wash-Off Mask",     r"\b(wash[- ]?off\s*mask|clay\s*mask)\b"),
    ("Mask",              r"\b(mask)\b"),
    ("Spot Treatment",    r"\b(spot\s*treatment|acne\s*spot|blemish\s*spot)\b"),
    ("Pimple Patch",      r"\b(pimple\s*patch|acne\s*patch)\b"),
    ("Mist",              r"\b(mist|spray)\b"),
    ("Lip Balm",          r"\b(lip\s*balm)\b"),
    ("Lip Mask",          r"\b(lip\s*mask)\b"),
    ("Hand Cream",        r"\b(hand\s*cream)\b"),
    ("Body Lotion",       r"\b(body\s*lotion)\b"),
    ("Body Wash",         r"\b(body\s*wash|shower\s*gel)\b"),
    ("Shampoo",           r"\b(shampoo)\b"),
    ("Conditioner",       r"\b(conditioner)\b"),
    ("Scalp Treatment",   r"\b(scalp\s*(tonic|treatment|essence))\b"),
    ("Deodorant",         r"\b(deodorant)\b"),
    ("Set",               r"\b(set|kit)\b"),
    ("Refill",            r"\b(refill)\b"),
]

COMPILED_RULES = [(label, re.compile(pat, re.IGNORECASE)) for label, pat in TYPE_RULES]

# Optional normalizers to reduce noise
SIZE_PAT = re.compile(r"(\d+\.?\d*\s*(ml|mL|ML|oz|fl\.?\s*oz|g|pcs|sheet[s]?|stick[s]?))", re.IGNORECASE)
MULTIPLIER_PAT = re.compile(r"\b(x|×)\s*\d+\b", re.IGNORECASE)
PARENS_PAT = re.compile(r"\s*[\(\[]\s*(mini|refill|set|limited|special|duo|trio|value|sample|gift)\s*[\)\]]", re.IGNORECASE)

def normalize_name(name: str) -> str:
    s = name or ""
    s = SIZE_PAT.sub(" ", s)
    s = MULTIPLIER_PAT.sub(" ", s)
    s = PARENS_PAT.sub(" ", s)
    return " ".join(s.split())

def infer_product_type(name: str) -> str:
    s = normalize_name(name)
    for label, rx in COMPILED_RULES:
        if rx.search(s):
            return label
    # Sunscreen fallback by SPF/PA
    if re.search(r"\bSPF\s*\d{2,3}\b", s, re.IGNORECASE) or re.search(r"\bPA\+{1,4}\b", s, re.IGNORECASE):
        return "Sunscreen"
    return "Other"


# Run: sample first, then full file
df = pd.read_csv(INPUT_CSV)

# Basic guard
if "prdtName" not in df.columns:
    raise KeyError("Column 'prdtName' not found. Update the code to the correct name column.")

if not RUN_FULL:
    # ----- SAMPLE -----
    sample = df.sample(SAMPLE_N, random_state=42).copy() if RANDOM_SAMPLE else df.head(SAMPLE_N).copy()
    sample["product_type"] = sample["prdtName"].apply(infer_product_type)
    sample.to_csv(SAMPLE_OUT, index=False)
    print(f"✅ Sample saved to {SAMPLE_OUT} (rows={len(sample)})")
    # quick peek
    print(sample[["prdtName", "product_type"]].head(SAMPLE_N).to_string(index=False))
else:
    # ----- FULL RUN -----
    df["product_type"] = df["prdtName"].apply(infer_product_type)
    df.to_csv(FULL_OUT, index=False)
    print(f"🎉 Full file saved to {FULL_OUT} (rows={len(df)})")

🎉 Full file saved to oliveyoung_with_product_type.csv (rows=3042)


In [3]:
#Separate scrape for sunscreen products after realizing it was on a different category
# https://global.oliveyoung.com/display/category?ctgrNo=1000000011
import undetected_chromedriver as uc
import pandas as pd
import time
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# -------- SCRAPING FUNCTION -------- #
def get_product_info(url):
    try:
        options = uc.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")

        driver = uc.Chrome(options=options)
        driver.get(url)

        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "prd-detail-page")))
        time.sleep(1)

        # Open the "Specific Item Info" modal
        driver.execute_script(
            'document.querySelector("#prd-detail-page > div.prd-banner-wrap.mo-only > ul > li:nth-child(1) > a").click();'
        )
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "modalSpecItemInformation")))
        time.sleep(2)

        rows = driver.find_elements(By.CSS_SELECTOR, "#descTbody > tr")

        ingredients, skin_type, amount = None, None, None
        for row in rows:
            th = row.find_element(By.TAG_NAME, "th").text.strip()
            td = row.find_element(By.TAG_NAME, "td").text.strip()
            if th == "Ingredients":
                ingredients = td
            elif th == "Ideal for":
                skin_type = td
            elif th == "Content volume or weight":
                amount = td

        driver.quit()
        return ingredients, skin_type, amount

    except Exception as e:
        return f"Error: {e}", "", ""

# -------- MAIN BATCH SCRAPER -------- #
def scrape_in_batches(input_csv, output_csv, batch_size=100):
    # Load data
    df = pd.read_csv(input_csv)

    # If output already exists, resume from last saved row
    if os.path.exists(output_csv):
        scraped_df = pd.read_csv(output_csv)
        scraped_urls = set(scraped_df['url'])
        print(f"🔄 Resuming from existing file. {len(scraped_urls)} URLs already scraped.")
    else:
        scraped_df = pd.DataFrame(columns=list(df.columns) + ['ingredients', 'skin_type', 'amount'])
        scraped_urls = set()

    # Filter URLs not yet scraped
    unsaved_df = df[~df['url'].isin(scraped_urls)]
    total = len(unsaved_df)
    print(f"🚀 Starting batch scrape for {total} remaining products...")

    for i in range(0, total, batch_size):
        batch = unsaved_df.iloc[i:i+batch_size].copy()

        # Scrape batch
        results = batch['url'].apply(lambda url: pd.Series(get_product_info(url)))
        results.columns = ['ingredients', 'skin_type', 'amount']

        # Append to batch and save
        batch = pd.concat([batch, results], axis=1)
        scraped_df = pd.concat([scraped_df, batch], ignore_index=True)
        scraped_df.to_csv(output_csv, index=False)

        print(f"✅ Batch {i // batch_size + 1} done: {len(scraped_df)} total scraped so far.")
        time.sleep(2) 

    print("🎉 All batches complete! File saved:", output_csv)

# -------- RUN -------- #
scrape_in_batches(
    input_csv="/Users/sarahhyun/skincare/data/olive/olive_sunscreens_parsed.csv",
    output_csv="/Users/sarahhyun/skincare/data/olive/oliveyoung_sunscreen_scraped.csv",
    batch_size=5
)

SyntaxError: invalid syntax (2362090840.py, line 2)