# 올리브영 상품 크롤러
이 노트북은 올리브영 상품 목록 및 상세정보(이미지, 제조국, 제조업자 등)를 크롤링합니다.
- 크롤링할 카테고리(상품목록) 페이지의 URL을 직접 입력하세요.
- 상품 목록, 상세정보, 결과 저장까지 단계별로 실행할 수 있습니다.
- 카테고리 페이지의 VIEW(24, 36, 48개씩 보기) 설정에 맞는 개수만큼, 1페이지만 크롤링합니다.
- 기본 sleep 값은 상품 목록 페이지 3초, 상세 페이지 2초입니다. 필요시 줄일 수 있지만 봇 탐지에 적발될 가능성이 높아집니다.

# 1. 라이브러리 임포트

In [None]:
import time
import json
import re
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# 2. 크롬 드라이버 셋업 함수

In [None]:
def setup_driver():
    """Create and return a headless Chrome WebDriver.

    The chromedriver binary path is obtained from
    ``ChromeDriverManager().install()``.

    Returns:
        A ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it when done.
    """
    options = Options()
    # Command-line switches handed to Chrome, added in this order.
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        options.add_argument(flag)
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

# 3. URL 입력

In [None]:
# Ask the user for the category (product list) page URL to crawl.
# The adjacent literals below concatenate into a single prompt string.
base_url = input(
    "크롤링할 올리브영 카테고리(상품목록) 페이지의 URL을 입력하세요:\n"
    "예시: https://www.oliveyoung.co.kr/store/display/getMCategoryList.do?...\n"
    "> "
)

# 4. 상품 목록 크롤링 함수

In [None]:
def get_product_list(url, max_items=48, sleep_seconds=3):
    """Crawl one category page and return basic data for each listed product.

    Args:
        url: URL of the category (product list) page to load.
        max_items: Highest ``data-number`` value to look for. Items are
            collected for ``data-number`` 1 through ``max_items`` in
            ascending order. Defaults to 48.
        sleep_seconds: Seconds to pause after loading the page before its
            HTML is read. Defaults to 3.

    Returns:
        A list of dicts, one per ``li[data-number]`` element that contains a
        ``div.prd_info``, with the keys:

        - ``name``: stripped text of ``p.tx_name`` or ``None``.
        - ``brand``: stripped text of ``span.tx_brand`` or ``None``.
        - ``price``: integer price, or 0 when missing or not numeric.
        - ``thumbnailUrls``: list holding the first ``img`` ``src``, or ``[]``.
        - ``detailUrl``: absolute detail page URL, or ``None``.
    """
    driver = setup_driver()
    try:
        driver.get(url)
        time.sleep(sleep_seconds)
        page_html = driver.page_source
    finally:
        # Always release the browser, even when loading the page raised.
        driver.quit()

    soup = BeautifulSoup(page_html, 'html.parser')

    # Collect list items by their data-number attribute, in numeric order.
    product_items = []
    for num in range(1, max_items + 1):
        product_items.extend(soup.select(f'li[data-number="{num}"]'))

    products = []
    for item in product_items:
        prd_info = item.select_one('div.prd_info')
        if prd_info is None:
            continue

        name_tag = prd_info.select_one('p.tx_name')
        brand_tag = prd_info.select_one('span.tx_brand')

        price = 0
        price_tag = prd_info.select_one('p.prd_price span.tx_cur span.tx_num')
        if price_tag is not None:
            price_text = price_tag.text.strip().replace(',', '').replace('원', '')
            try:
                price = int(price_text)
            except ValueError:
                # Price text is not a plain integer; keep 0 as the fallback.
                price = 0

        img = prd_info.select_one('img')
        has_src = img is not None and 'src' in img.attrs

        link = prd_info.select_one('a.prd_thumb')
        detail_url = None
        if link is not None and 'href' in link.attrs:
            detail_url = link['href']
        if detail_url:
            # Resolve relative links against the site root.
            detail_url = urljoin('https://www.oliveyoung.co.kr', detail_url)

        products.append({
            'name': name_tag.text.strip() if name_tag is not None else None,
            'brand': brand_tag.text.strip() if brand_tag is not None else None,
            'price': price,
            'thumbnailUrls': [img['src']] if has_src else [],
            'detailUrl': detail_url,
        })
    return products

# 5. 상세페이지 정보(이미지, 제조국, 제조업자) 크롤링 함수

In [None]:
def get_product_detail_info(detail_url, sleep_seconds=2):
    """Crawl a product detail page for images, manufacturer and origin.

    The page HTML is captured twice: once right after loading (default tab,
    used for the detail images) and once after trying to click the element
    with id ``buyInfo`` (used for manufacturer / country of origin).

    Args:
        detail_url: Absolute URL of the product detail page.
        sleep_seconds: Seconds to pause after loading the page and again
            after clicking the ``buyInfo`` tab. Defaults to 2.

    Returns:
        A dict with the keys:

        - ``detailImageUrls``: list of image URLs starting with ``http``
          found inside ``div.tabConts.prd_detail_cont.show``.
        - ``manufacturer``: manufacturer text or ``None``.
        - ``country0f0rigin``: country-of-origin text or ``None``.
    """
    driver = setup_driver()
    try:
        driver.get(detail_url)
        time.sleep(sleep_seconds)
        # Snapshot of the default tab, taken before any click.
        detail_html = driver.page_source

        try:
            buyinfo_tab = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.ID, "buyInfo"))
            )
            buyinfo_tab.click()
            time.sleep(sleep_seconds)
        except Exception as e:
            # Best effort: without the tab the manufacturer / origin fields
            # simply stay None.
            print("구매정보 탭 클릭 실패:", e)
        buyinfo_html = driver.page_source
    finally:
        # Always release the browser, even when the page failed to load.
        driver.quit()

    # 1. Detail images from the tab that was visible right after loading.
    image_urls = []
    detail_soup = BeautifulSoup(detail_html, 'html.parser')
    detail_div = detail_soup.select_one('div.tabConts.prd_detail_cont.show')
    if detail_div is not None:
        for img in detail_div.find_all('img'):
            src = img.get('src')
            data_src = img.get('data-src')
            # Prefer src; fall back to data-src when src is not an http URL.
            if src and src.startswith('http'):
                image_urls.append(src)
            elif data_src and data_src.startswith('http'):
                image_urls.append(data_src)

    # 2. Manufacturer / country of origin from the purchase-info tab.
    manufacturer = None
    country_of_origin = None
    info_lists = []
    buyinfo_soup = BeautifulSoup(buyinfo_html, 'html.parser')
    buyinfo_div = buyinfo_soup.select_one('div.tabConts.prd_detail_cont.show')
    if buyinfo_div is not None:
        artc_info = buyinfo_div.select_one('div#artcInfo')
        if artc_info is not None:
            info_lists = artc_info.select('dl.detail_info_list')

    for dl in info_lists:
        dt = dl.find('dt')
        dd = dl.find('dd')
        if not dt or not dd:
            continue
        dt_text = dt.get_text(strip=True)
        dd_text = dd.get_text(strip=True)
        if '화장품제조업자' in dt_text:
            # Single backslashes: in a raw string, "\\s" would demand a
            # literal backslash and the pattern could never match.
            m = re.search(r'화장품제조업자\s*[:：]\s*([^/]+)', dd_text)
            if m:
                manufacturer = m.group(1).strip()
            else:
                manufacturer = dd_text.strip()
        elif '제조국' in dt_text:
            country_of_origin = dd_text.strip()

    return {
        'detailImageUrls': image_urls,
        'manufacturer': manufacturer,
        # NOTE: this key is spelled with digit zeros ("0f0rigin"); it is kept
        # unchanged because consumers of the returned dict read this exact key.
        'country0f0rigin': country_of_origin
    }

# 6. 상품 목록 크롤링 실행

In [None]:
# Crawl the category page entered above and report how many products were found.
products = get_product_list(base_url)
print(f"총 {len(products)}개 상품 발견!")

# 7. 상세페이지 정보 크롤링 실행

In [None]:
# Visit each product's detail page and merge the result with the list data.
all_products = []
for idx, product in enumerate(products, 1):
    print(f'{idx}/{len(products)}: {product["name"]}')
    # Fallback record, used when there is no detail URL or the crawl fails.
    detail_info = {'detailImageUrls': [], 'manufacturer': None, 'country0f0rigin': None}
    if product['detailUrl']:
        try:
            detail_info = get_product_detail_info(product['detailUrl'])
        except Exception as e:
            # One broken detail page must not abort the remaining products;
            # report it and keep the fallback record for this item.
            print(f'상세정보 크롤링 실패 ({product["detailUrl"]}):', e)
    all_products.append({
        'productName': product['name'],
        'price': product['price'],
        'thumbnailUrls': product['thumbnailUrls'],
        'detailImageUrls': detail_info['detailImageUrls'],
        'manufacturer': detail_info['manufacturer'],
        'country0f0rigin': detail_info['country0f0rigin'],
        'productUrl': product['detailUrl']
    })

# 8. 결과를 JSON 파일로 저장

In [None]:
# Write every crawled product under a top-level "items" key.
# ensure_ascii=False keeps non-ASCII text as-is instead of \u escapes.
with open('oliveyoung_products.json', mode='w', encoding='utf-8') as f:
    json.dump(
        {'items': all_products},
        f,
        ensure_ascii=False,
        indent=2,
    )
print('크롤링 결과가 oliveyoung_products.json 파일로 저장되었습니다.')