In [1]:
pip install selenium webdriver-manager pandas

Collecting selenium
  Downloading selenium-4.36.0-py3-none-any.whl (9.6 MB)
     ---------------------------------------- 9.6/9.6 MB 10.4 MB/s eta 0:00:00
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Collecting certifi>=2025.6.15
  Downloading certifi-2026.1.4-py3-none-any.whl (152 kB)
     -------------------------------------- 152.9/152.9 kB 8.9 MB/s eta 0:00:00
Collecting urllib3[socks]<3.0,>=2.5.0
  Downloading urllib3-2.6.3-py3-none-any.whl (131 kB)
     -------------------------------------- 131.6/131.6 kB 8.1 MB/s eta 0:00:00
Collecting trio-websocket<1.0,>=0.12.2
  Downloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Collecting trio<1.0,>=0.30.0
  Downloading trio-0.31.0-py3-none-any.whl (512 kB)
     ------------------------------------- 512.7/512.7 kB 10.7 MB/s eta 0:00:00
Collecting websocket-client<2.0,>=1.8.0
  Downloading websocket_client-1.9.0-py3-none-any.whl (82 kB)
     ---------------------------------------- 8

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.14.1 requires torch==1.13.1, but you have torch 2.0.1 which is incompatible.
tensorflow-intel 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.24.3 which is incompatible.
pycaret 3.0.1 requires imbalanced-learn>=0.8.1, but you have imbalanced-learn 0.7.0 which is incompatible.
pycaret 3.0.1 requires numpy<1.24,>=1.21, but you have numpy 1.24.3 which is incompatible.
google-auth 2.19.1 requires urllib3<2.0, but you have urllib3 2.6.3 which is incompatible.
conda-repo-cli 1.0.20 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.20 requires nbformat==5.4.0, but you have nbformat 5.5.0 which is incompatible.
conda-repo-cli 1.0.20 requires requests==2.28.1, but you have requests 2.32.5 which is incompatible.
botocore 1.27.28 requires urllib3<1.27,>=1.25.4, 

In [2]:
import re
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service


BRAND_URL = "https://brand.naver.com/melkin"

def clean_int(text: str) -> int:
    if not text:
        return 0
    nums = re.sub(r"[^0-9]", "", text)
    return int(nums) if nums else 0

def clean_float(text: str) -> float:
    if not text:
        return 0.0
    m = re.search(r"(\d+(\.\d+)?)", text.replace(",", ""))
    return float(m.group(1)) if m else 0.0

def extract_product_id(url: str) -> str:
    # 보통 /products/1234567890 형태
    if not url:
        return ""
    m = re.search(r"/products/(\d+)", url)
    return m.group(1) if m else ""

def get_driver(headless: bool = False):
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless=new")

    # 네이버에서 자동화 탐지 완화용 옵션들(완전 해결은 아님)
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--window-size=1280,2000")
    chrome_options.add_argument("--lang=ko-KR")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.implicitly_wait(7)
    return driver

def scroll_to_load_all(driver, max_scrolls=50, pause=1.2):
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

def scrape_products_from_brandstore(headless=False, max_scrolls=60):
    driver = get_driver(headless=headless)
    try:
        driver.get(BRAND_URL)
        time.sleep(2)

        # 브랜드스토어에서 "전체상품" / "스토어" 탭 구성이 바뀔 수 있어
        # 가능한 경우 '전체상품' 링크를 찾아 클릭
        try:
            all_products = driver.find_elements(By.XPATH, "//a[contains(., '전체상품')]")
            if all_products:
                all_products[0].click()
                time.sleep(2)
        except Exception:
            pass

        # 스크롤로 상품 더 로드
        scroll_to_load_all(driver, max_scrolls=max_scrolls, pause=1.2)

        # 상품 카드(클래스명은 종종 바뀜) → 가장 안전한 건 "products/숫자" 링크를 모두 긁는 방식
        anchors = driver.find_elements(By.XPATH, "//a[contains(@href, '/products/')]")

        # 같은 상품 링크 중복 제거
        urls = []
        for a in anchors:
            href = a.get_attribute("href")
            if href and "/products/" in href:
                urls.append(href.split("?")[0])
        urls = list(dict.fromkeys(urls))  # preserve order, unique

        data = []
        for url in urls:
            # 카드에서 바로 name/price/rating/review 추출하는 게 제일 좋지만
            # 네이버는 카드 DOM이 자주 바뀌니, 1단계에서는 "상품 URL 목록+ID"를 확보하고
            # 2단계에서 상세페이지에서 정밀 추출하는 2-step이 더 안정적임.
            data.append({
                "product_id": extract_product_id(url),
                "name": "",          # 2단계에서 채움(상세페이지)
                "price": None,       # 2단계에서 채움
                "rating": None,      # 2단계에서 채움
                "review_count": None,# 2단계에서 채움
                "url": url
            })

        df = pd.DataFrame(data).drop_duplicates(subset=["product_id", "url"])
        return df

    finally:
        driver.quit()

if __name__ == "__main__":
    df = scrape_products_from_brandstore(headless=False, max_scrolls=70)
    print(df.head(10))
    print("rows:", len(df))

    df.to_csv("melkin_products_step1_urls.csv", index=False, encoding="utf-8-sig")
    print("Saved: melkin_products_step1_urls.csv")


    product_id name price rating review_count  \
0  12649505033       None   None         None   
1  12678957567       None   None         None   
2  11391657511       None   None         None   
3  10108843856       None   None         None   
4  11619404547       None   None         None   
5   9096073807       None   None         None   
6   4758250954       None   None         None   
7   7124664880       None   None         None   
8  10149813702       None   None         None   
9   6844556354       None   None         None   

                                                 url  
0  https://brand.naver.com/melkin/products/126495...  
1  https://brand.naver.com/melkin/products/126789...  
2  https://brand.naver.com/melkin/products/113916...  
3  https://brand.naver.com/melkin/products/101088...  
4  https://brand.naver.com/melkin/products/116194...  
5  https://brand.naver.com/melkin/products/909607...  
6  https://brand.naver.com/melkin/products/475825...  
7  https://brand.nav

In [3]:
import re
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service


INPUT_CSV = "melkin_products_step1_urls.csv"
OUTPUT_CSV = "melkin_products_master.csv"
CHECKPOINT_CSV = "melkin_products_master_checkpoint.csv"


def clean_int(text: str) -> int:
    if not text:
        return 0
    nums = re.sub(r"[^0-9]", "", text)
    return int(nums) if nums else 0

def clean_float(text: str) -> float:
    if not text:
        return 0.0
    m = re.search(r"(\d+(\.\d+)?)", text.replace(",", ""))
    return float(m.group(1)) if m else 0.0

def get_driver(headless: bool = False):
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--window-size=1280,2200")
    chrome_options.add_argument("--lang=ko-KR")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.implicitly_wait(6)
    return driver

def safe_get_text(driver, by, value) -> str:
    try:
        el = driver.find_element(by, value)
        return el.text.strip()
    except Exception:
        return ""

def extract_product_meta(driver):
    """
    네이버 상품 상세 DOM은 변동이 잦아서
    1) 제목(h1) 추출
    2) 가격/평점/리뷰수는 텍스트 패턴 기반 fallback 포함
    """
    # 1) 상품명: 가장 보편적으로 h1에 존재
    name = ""
    for xp in [
        "//h1",
        "//h2",
        "//*[self::h1 or self::h2][1]",
    ]:
        try:
            name = driver.find_element(By.XPATH, xp).text.strip()
            if name and len(name) >= 2:
                break
        except Exception:
            pass

    page_text = ""
    try:
        page_text = driver.find_element(By.TAG_NAME, "body").text
    except Exception:
        page_text = ""

    # 2) 가격: "원" 포함 숫자 패턴
    price = None
    price_candidates = []

    # DOM 기반 후보 (자주 바뀌므로 여러 개 시도)
    for xp in [
        "//*[contains(text(), '원') and (contains(@class,'price') or contains(@class,'Price'))]",
        "//*[contains(text(), '원')][1]",
    ]:
        t = safe_get_text(driver, By.XPATH, xp)
        if t:
            price_candidates.append(t)

    # 텍스트 기반 후보: 12,345원 형태
    if page_text:
        m_all = re.findall(r"(\d[\d,]{2,})\s*원", page_text)
        # 너무 많은 숫자가 나오면 상위에 등장한 값을 우선
        if m_all:
            # 맨 앞 5개만 후보로
            for v in m_all[:5]:
                price_candidates.append(v + "원")

    # 후보에서 가장 그럴듯한(최대값이 아니라, "할인가/판매가" 섞이므로 최소값이 더 현실적일 때도 있음)
    # 여기선 우선 "가장 작은 양의 값"을 가격으로 잡음(할인가가 흔함)
    parsed_prices = []
    for c in price_candidates:
        p = clean_int(c)
        if p > 0:
            parsed_prices.append(p)
    if parsed_prices:
        price = min(parsed_prices)

    # 3) 평점: 0~5 사이 소수 패턴
    rating = None
    # DOM 후보
    rating_text = ""
    for xp in [
        "//*[contains(text(),'평점')]/following::*[1]",
        "//*[contains(@class,'rating') or contains(@class,'score')][1]",
    ]:
        rating_text = safe_get_text(driver, By.XPATH, xp)
        r = clean_float(rating_text)
        if 0 < r <= 5:
            rating = r
            break
    # 텍스트 기반 fallback
    if rating is None and page_text:
        m = re.search(r"([0-5]\.\d)\s*(?:/)?\s*5", page_text)
        if m:
            r = clean_float(m.group(1))
            if 0 < r <= 5:
                rating = r

    # 4) 리뷰수: "리뷰 123" 또는 "(123)" 같은 패턴
    review_count = None
    # DOM 후보
    for xp in [
        "//*[contains(text(),'리뷰') and (contains(@class,'review') or contains(@class,'Review'))][1]",
        "//*[contains(text(),'리뷰')][1]",
    ]:
        t = safe_get_text(driver, By.XPATH, xp)
        rc = clean_int(t)
        if rc > 0:
            review_count = rc
            break
    # 텍스트 기반 fallback
    if (review_count is None or review_count == 0) and page_text:
        m = re.search(r"리뷰\s*([0-9,]+)", page_text)
        if m:
            review_count = clean_int(m.group(1))

    return {
        "name": name,
        "price": price,
        "rating": rating,
        "review_count": review_count
    }

def run_detail_scrape(headless=False, sleep_each=1.2, checkpoint_every=10):
    df = pd.read_csv(INPUT_CSV)

    # 재실행 시 체크포인트가 있으면 이어서
    try:
        ck = pd.read_csv(CHECKPOINT_CSV)
        # product_id 기준으로 merge해서 이미 채운 것은 유지
        df = df.merge(
            ck[["product_id", "name", "price", "rating", "review_count"]],
            on="product_id",
            how="left",
            suffixes=("", "_old")
        )
        for col in ["name", "price", "rating", "review_count"]:
            if f"{col}_old" in df.columns:
                df[col] = df[col].fillna(df[f"{col}_old"])
                df.drop(columns=[f"{col}_old"], inplace=True)
        print(f"[Resume] loaded checkpoint: {len(ck)} rows")
    except Exception:
        pass

    driver = get_driver(headless=headless)
    try:
        for i, row in df.iterrows():
            # 이미 채워진 행은 스킵
            if pd.notna(row.get("name")) and str(row.get("name")).strip():
                continue

            url = row["url"]
            try:
                driver.get(url)
                time.sleep(2.0)  # 페이지 렌더링 대기
                meta = extract_product_meta(driver)

                df.at[i, "name"] = meta["name"]
                df.at[i, "price"] = meta["price"]
                df.at[i, "rating"] = meta["rating"]
                df.at[i, "review_count"] = meta["review_count"]

                print(f"[{i+1}/{len(df)}] OK {row['product_id']} | {meta['name'][:30]} | {meta['price']}")

            except Exception as e:
                print(f"[{i+1}/{len(df)}] FAIL {row.get('product_id')} | {url} | {e}")

            time.sleep(sleep_each)

            if (i + 1) % checkpoint_every == 0:
                df.to_csv(CHECKPOINT_CSV, index=False, encoding="utf-8-sig")
                print(f"[Checkpoint] saved: {CHECKPOINT_CSV}")

        df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
        df.to_csv(CHECKPOINT_CSV, index=False, encoding="utf-8-sig")
        print(f"[Done] saved: {OUTPUT_CSV}")

    finally:
        driver.quit()

if __name__ == "__main__":
    run_detail_scrape(headless=False, sleep_each=1.0, checkpoint_every=10)


[1/72] OK 12649505033 |  | 14700
[2/72] OK 12678957567 |  | 7900
[3/72] OK 11391657511 |  | 12900
[4/72] OK 10108843856 |  | 9700
[5/72] OK 11619404547 |  | 30000
[6/72] OK 9096073807 |  | 5390
[7/72] OK 4758250954 |  | 4680
[8/72] OK 7124664880 |  | 398
[9/72] OK 10149813702 |  | 8000
[10/72] OK 6844556354 |  | 4760
[Checkpoint] saved: melkin_products_master_checkpoint.csv
[11/72] OK 13105695589 |  | 5900
[12/72] OK 13088334040 |  | 3780
[13/72] OK 12930766187 |  | 4190
[14/72] OK 12765439244 |  | 1318
[15/72] OK 12742800949 |  | 138
[16/72] OK 12964502133 |  | 1680
[17/72] OK 12337998446 |  | 1170
[18/72] OK 12265535531 |  | 3790
[19/72] OK 12199251361 |  | 1970
[20/72] OK 12115864618 |  | 4370
[Checkpoint] saved: melkin_products_master_checkpoint.csv
[21/72] OK 5416628605 |  | 106
[22/72] OK 581159952 |  | 322
[23/72] OK 581155411 |  | 218
[24/72] OK 5288609892 |  | 1340
[25/72] OK 639016947 |  | 203
[26/72] OK 5314685920 |  | 394
[27/72] OK 5414083196 |  | 172
[28/72] OK 614018823 

In [6]:
import re
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service


INPUT_CSV = "melkin_products_step1_urls.csv"
OUTPUT_CSV = "melkin_products_master.csv"
CHECKPOINT_CSV = "melkin_products_master_checkpoint.csv"

BRAND_TO_SMARTSTORE = True  # 핵심 스위치


def clean_int(text: str) -> int:
    if not text:
        return 0
    nums = re.sub(r"[^0-9]", "", text)
    return int(nums) if nums else 0

def clean_float(text: str) -> float:
    if not text:
        return 0.0
    m = re.search(r"(\d+(\.\d+)?)", text.replace(",", ""))
    return float(m.group(1)) if m else 0.0

def to_smartstore_url(url: str) -> str:
    # brand.naver.com/melkin/products/123 -> smartstore.naver.com/melkin/products/123
    if not url:
        return url
    u = url.split("?")[0]
    u = u.replace("https://brand.naver.com/", "https://smartstore.naver.com/")
    return u

def get_driver(headless: bool = False):
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--window-size=1280,2200")
    chrome_options.add_argument("--lang=ko-KR")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.implicitly_wait(10)
    return driver

def wait_body_text(driver, timeout_sec=10):
    # 렌더링이 느릴 때 대비: body 텍스트가 일정 길이 이상 될 때까지 대기
    t0 = time.time()
    while time.time() - t0 < timeout_sec:
        try:
            txt = driver.find_element(By.TAG_NAME, "body").text
            if txt and len(txt) > 50:
                return txt
        except Exception:
            pass
        time.sleep(0.3)
    try:
        return driver.find_element(By.TAG_NAME, "body").text
    except Exception:
        return ""

def extract_name(driver):
    # smartstore 상세에서 이름은 보통 h1 또는 og:title
    # 1) h1
    try:
        h1 = driver.find_element(By.XPATH, "//h1")
        if h1.text.strip():
            return h1.text.strip()
    except Exception:
        pass

    # 2) meta og:title
    try:
        og = driver.find_element(By.XPATH, "//meta[@property='og:title']")
        content = og.get_attribute("content") or ""
        if content.strip():
            return content.strip()
    except Exception:
        pass

    # 3) title 태그
    try:
        title = driver.title or ""
        title = title.replace(" : 네이버 스마트스토어", "").strip()
        if title:
            return title
    except Exception:
        pass

    return ""

def extract_price(driver, body_text: str):
    # 우선 meta product:price:amount 같은 게 있으면 좋지만 없을 수 있음
    # body 텍스트에서 "원" 패턴 후보를 잡아 가장 작은 값을 가격으로(할인가 기준)
    m_all = re.findall(r"(\d[\d,]{2,})\s*원", body_text or "")
    prices = [int(v.replace(",", "")) for v in m_all[:10] if v]
    prices = [p for p in prices if p > 0]
    return min(prices) if prices else None

def extract_rating_review(driver, body_text: str):
    rating = None
    review_count = None

    # rating: 0~5 사이
    m = re.search(r"([0-5]\.\d)\s*(?:/)?\s*5", body_text or "")
    if m:
        r = float(m.group(1))
        if 0 < r <= 5:
            rating = r

    # review_count: "리뷰 123"
    m2 = re.search(r"리뷰\s*([0-9,]+)", body_text or "")
    if m2:
        review_count = int(m2.group(1).replace(",", ""))
    else:
        review_count = 0

    return rating, review_count

def run(headless=False, sleep_each=0.8, checkpoint_every=10):
    df = pd.read_csv(INPUT_CSV)

    # checkpoint resume
    try:
        ck = pd.read_csv(CHECKPOINT_CSV)
        df = df.merge(
            ck[["product_id", "name", "price", "rating", "review_count"]],
            on="product_id",
            how="left",
            suffixes=("", "_old")
        )
        for col in ["name", "price", "rating", "review_count"]:
            if f"{col}_old" in df.columns:
                df[col] = df[col].fillna(df[f"{col}_old"])
                df.drop(columns=[f"{col}_old"], inplace=True)
        print(f"[Resume] checkpoint rows: {len(ck)}")
    except Exception:
        pass

    driver = get_driver(headless=headless)
    try:
        for i, row in df.iterrows():
            # name 비어있는 것만 채우자
            if pd.notna(row.get("name")) and str(row.get("name")).strip():
                continue

            url = row["url"]
            target = to_smartstore_url(url) if BRAND_TO_SMARTSTORE else url

            try:
                driver.get(target)
                body_text = wait_body_text(driver, timeout_sec=10)

                name = extract_name(driver)
                price = extract_price(driver, body_text)
                rating, review_count = extract_rating_review(driver, body_text)

                df.at[i, "name"] = name
                df.at[i, "price"] = price
                df.at[i, "rating"] = rating
                df.at[i, "review_count"] = review_count

                print(f"[{i+1}/{len(df)}] {row['product_id']} | {name[:25]} | {price} | {rating} | {review_count}")

            except Exception as e:
                print(f"[{i+1}/{len(df)}] FAIL {row.get('product_id')} | {target} | {e}")

            time.sleep(sleep_each)

            if (i + 1) % checkpoint_every == 0:
                df.to_csv(CHECKPOINT_CSV, index=False, encoding="utf-8-sig")
                print("[Checkpoint] saved")

        df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
        df.to_csv(CHECKPOINT_CSV, index=False, encoding="utf-8-sig")
        print(f"[Done] saved: {OUTPUT_CSV}")

    finally:
        driver.quit()

if __name__ == "__main__":
    run(headless=False)


[Resume] checkpoint rows: 72
[1/72] 12649505033 | NAVER | None | None | 0
[2/72] 12678957567 | NAVER | None | None | 0
[3/72] 11391657511 | NAVER | None | None | 0
[4/72] 10108843856 | NAVER | None | None | 0
[5/72] 11619404547 | NAVER | None | None | 0
[6/72] 9096073807 | NAVER | None | None | 0
[7/72] 4758250954 | NAVER | None | None | 0
[8/72] 7124664880 | NAVER | None | None | 0
[9/72] 10149813702 | NAVER | None | None | 0
[10/72] 6844556354 | NAVER | None | None | 0


PermissionError: [Errno 13] Permission denied: 'melkin_products_master_checkpoint.csv'

In [4]:
df1 = pd.read_csv('melkin_products_master.csv')

In [5]:
df1.isna().sum()

product_id       0
name            72
price            0
rating          10
review_count     0
url              0
dtype: int64

In [None]:
import os
import re
import time
import math
import requests
import pandas as pd
from datetime import datetime
from html import unescape

NAVER_CLIENT_ID = os.getenv("") # 클라이언트 아이디
NAVER_CLIENT_SECRET = os.getenv("") # 클라이언트 키

if not NAVER_CLIENT_ID or not NAVER_CLIENT_SECRET:
    raise RuntimeError("환경변수 NAVER_CLIENT_ID / NAVER_CLIENT_SECRET 를 설정해줘.")

BASE_URL = "https://openapi.naver.com/v1/search/shop.json"


def strip_html(s: str) -> str:
    if not s:
        return ""
    s = unescape(s)
    return re.sub(r"<[^>]+>", "", s).strip()

def to_int(x) -> int:
    try:
        return int(str(x).replace(",", "").strip())
    except:
        return 0

def naver_shop_search(query: str, display: int = 100, start: int = 1, sort: str = "sim"):
    """
    sort: sim(정확도), date(날짜), asc(가격오름), dsc(가격내림)
    display: 1~100
    start: 1~1000 (보통 API 제한/정책 확인 필요)
    """
    headers = {
        "X-Naver-Client-Id": NAVER_CLIENT_ID,
        "X-Naver-Client-Secret": NAVER_CLIENT_SECRET,
    }
    params = {"query": query, "display": display, "start": start, "sort": sort}
    r = requests.get(BASE_URL, headers=headers, params=params, timeout=15)
    r.raise_for_status()
    return r.json()

def collect_for_keyword(keyword: str, pages: int = 3, display: int = 100, sort: str = "sim", sleep_sec: float = 0.25):
    """
    keyword로 pages 만큼 페이지 수집 (page 1 = start=1, page2=start=101 ...)
    pages*display 개까지(최대) 수집
    """
    rows = []
    for p in range(pages):
        start = 1 + p * display
        js = naver_shop_search(keyword, display=display, start=start, sort=sort)
        items = js.get("items", [])
        for it in items:
            rows.append({
                "collected_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "keyword": keyword,
                "title": strip_html(it.get("title", "")),
                "mallName": it.get("mallName", ""),
                "brand": it.get("brand", ""),
                "maker": it.get("maker", ""),
                "category1": it.get("category1", ""),
                "category2": it.get("category2", ""),
                "category3": it.get("category3", ""),
                "category4": it.get("category4", ""),
                "lprice": to_int(it.get("lprice", 0)),
                "hprice": to_int(it.get("hprice", 0)),
                "productId": it.get("productId", ""),
                "productType": it.get("productType", ""),
                "link": it.get("link", ""),
                "image": it.get("image", ""),
            })
        time.sleep(sleep_sec)
    return pd.DataFrame(rows)

def add_brand_flags(df: pd.DataFrame, target_brand_keywords=("멜킨", "MELKIN", "melkin")) -> pd.DataFrame:
    def is_target(row) -> int:
        text = " ".join([
            str(row.get("title","")),
            str(row.get("brand","")),
            str(row.get("mallName","")),
            str(row.get("maker","")),
        ]).lower()
        for kw in target_brand_keywords:
            if kw.lower() in text:
                return 1
        return 0
    df = df.copy()
    df["is_melkin"] = df.apply(is_target, axis=1)
    return df

def summarize(df: pd.DataFrame):
    # 키워드별 요약
    grp = df.groupby("keyword", dropna=False)
    summary = grp.agg(
        n=("title", "count"),
        melkin_n=("is_melkin", "sum"),
        melkin_share=("is_melkin", "mean"),
        price_mean=("lprice", "mean"),
        price_median=("lprice", "median"),
        price_p10=("lprice", lambda x: x.quantile(0.10)),
        price_p90=("lprice", lambda x: x.quantile(0.90)),
    ).reset_index()
    summary["melkin_share"] = (summary["melkin_share"] * 100).round(1)
    summary["price_mean"] = summary["price_mean"].round(0)
    summary["price_median"] = summary["price_median"].round(0)
    summary["price_p10"] = summary["price_p10"].round(0)
    summary["price_p90"] = summary["price_p90"].round(0)
    return summary

def main():
    # 멜킨 제품군/시장 키워드
    keywords = [
        "덤벨",
        "케틀벨",
        "치닝디핑",
        "철봉",
        "실내자전거",
        "요가매트",
        "마사지건",
        "홈트 기구",
        # 브랜드 포함 검색도 같이 (브랜드 자사 포지션 확인)
        "멜킨 덤벨",
        "멜킨 치닝디핑",
    ]

    # 각 키워드당 2~3페이지면 포폴 충분 (200~300개/키워드)
    pages = 2
    display = 100
    sort = "sim"

    all_df = []
    for kw in keywords:
        print(f"Collecting: {kw}")
        df_kw = collect_for_keyword(kw, pages=pages, display=display, sort=sort)
        all_df.append(df_kw)

    df = pd.concat(all_df, ignore_index=True).drop_duplicates(subset=["keyword", "link", "productId"], keep="first")
    df = add_brand_flags(df, target_brand_keywords=("멜킨", "MELKIN", "melkin"))

    # 저장
    df.to_csv("naver_shop_raw.csv", index=False, encoding="utf-8-sig")
    print("Saved: naver_shop_raw.csv")

    summary = summarize(df)
    summary.to_csv("naver_shop_summary.csv", index=False, encoding="utf-8-sig")
    print("Saved: naver_shop_summary.csv")

    # 콘솔 미리보기
    print("\n=== Summary (keyword-level) ===")
    print(summary.sort_values("melkin_share", ascending=False).to_string(index=False))

    # 멜킨만 따로
    df_melkin = df[df["is_melkin"] == 1].copy()
    df_melkin.to_csv("naver_shop_melkin_only.csv", index=False, encoding="utf-8-sig")
    print("Saved: naver_shop_melkin_only.csv")

if __name__ == "__main__":
    main()


RuntimeError: 환경변수 NAVER_CLIENT_ID / NAVER_CLIENT_SECRET 를 설정해줘.

In [17]:
import requests

NAVER_CLIENT_ID = "ilHfeQvRIGp1ApaBoTjO"
NAVER_CLIENT_SECRET = "TJyTqemEde"

url = "https://openapi.naver.com/v1/search/shop.json"
headers = {
    "X-Naver-Client-Id": NAVER_CLIENT_ID,
    "X-Naver-Client-Secret": NAVER_CLIENT_SECRET,
}

params = {
    "query": "덤벨",
    "display": 5
}

res = requests.get(url, headers=headers, params=params)
print(res.status_code)
print(res.json())


200
{'lastBuildDate': 'Mon, 16 Feb 2026 16:13:28 +0900', 'total': 440722, 'start': 1, 'display': 5, 'items': [{'title': '헬스럽 무게조절 조립 <b>덤벨</b> 세트 20kg', 'link': 'https://search.shopping.naver.com/catalog/52187278625', 'image': 'https://shopping-phinf.pstatic.net/main_5218727/52187278625.20241230111537.jpg', 'lprice': '32800', 'hprice': '', 'mallName': '네이버', 'productId': '52187278625', 'productType': '1', 'brand': '헬스럽', 'maker': '', 'category1': '스포츠/레저', 'category2': '헬스', 'category3': '아령', 'category4': ''}, {'title': '멜킨 롤튼 무게조절 <b>덤벨</b> 프레스 24kg 2kg 단위 중량조절 조립식 세트', 'link': 'https://smartstore.naver.com/main/products/6996108419', 'image': 'https://shopping-phinf.pstatic.net/main_8454060/84540608741.2.jpg', 'lprice': '159000', 'hprice': '', 'mallName': '멜킨스포츠', 'productId': '84540608741', 'productType': '2', 'brand': '멜킨스포츠', 'maker': '멜킨스포츠', 'category1': '스포츠/레저', 'category2': '헬스', 'category3': '아령', 'category4': ''}, {'title': '홈투더짐 클래식 무게조절 <b>덤벨</b> <b>아령</b> 세트 조립 조립식 바벨 20

In [19]:
import time
import re
import requests
import pandas as pd
from datetime import datetime
from html import unescape

NAVER_CLIENT_ID = "ilHfeQvRIGp1ApaBoTjO"
NAVER_CLIENT_SECRET = "TJyTqemEde"

BASE_URL = "https://openapi.naver.com/v1/search/shop.json"

def strip_html(s: str) -> str:
    if not s:
        return ""
    s = unescape(s)
    return re.sub(r"<[^>]+>", "", s).strip()

def to_int(x) -> int:
    try:
        return int(str(x).replace(",", "").strip())
    except:
        return 0

def naver_shop_search(query: str, display: int = 100, start: int = 1, sort: str = "sim"):
    headers = {
        "X-Naver-Client-Id": NAVER_CLIENT_ID,
        "X-Naver-Client-Secret": NAVER_CLIENT_SECRET,
    }
    params = {"query": query, "display": display, "start": start, "sort": sort}
    r = requests.get(BASE_URL, headers=headers, params=params, timeout=15)
    r.raise_for_status()
    return r.json()

def collect_keyword(keyword: str, pages: int = 3, display: int = 100, sort: str = "sim", sleep_sec: float = 0.25):
    rows = []
    for p in range(pages):
        start = 1 + p * display
        js = naver_shop_search(keyword, display=display, start=start, sort=sort)
        for it in js.get("items", []):
            rows.append({
                "collected_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "keyword": keyword,
                "title": strip_html(it.get("title", "")),
                "lprice": to_int(it.get("lprice", 0)),
                "mallName": it.get("mallName", ""),
                "brand": it.get("brand", ""),
                "maker": it.get("maker", ""),
                "category1": it.get("category1", ""),
                "category2": it.get("category2", ""),
                "category3": it.get("category3", ""),
                "category4": it.get("category4", ""),
                "productId": it.get("productId", ""),
                "productType": it.get("productType", ""),
                "link": it.get("link", ""),
                "image": it.get("image", ""),
            })
        time.sleep(sleep_sec)
    return pd.DataFrame(rows)

def is_melkin_row(row) -> bool:
    text = " ".join([
        str(row.get("title","")),
        str(row.get("brand","")),
        str(row.get("mallName","")),
        str(row.get("maker","")),
    ]).lower()
    return ("멜킨" in text) or ("melkin" in text)

def main():
    keywords = [
        "덤벨", "케틀벨", "바벨", "아령",
        "치닝디핑", "철봉", "풀업바",
        "실내자전거", "스핀바이크",
        "요가매트", "폼롤러", "마사지건",
        "홈트 기구"
    ]

    pages = 5      # 키워드당 500개(100*5) 표본
    display = 100
    sort = "sim"

    dfs = []
    for kw in keywords:
        print("collect:", kw)
        dfs.append(collect_keyword(kw, pages=pages, display=display, sort=sort))

    raw = pd.concat(dfs, ignore_index=True)
    raw = raw.drop_duplicates(subset=["keyword", "productId", "link"])

    raw.to_csv("naver_shop_raw_all.csv", index=False, encoding="utf-8-sig")

    melkin = raw[raw.apply(is_melkin_row, axis=1)].copy()
    melkin.to_csv("naver_shop_melkin_only.csv", index=False, encoding="utf-8-sig")

    print("ALL:", len(raw), "| MELKIN:", len(melkin))
    print(melkin[["keyword","title","lprice","mallName","brand","productId"]].head(10).to_string(index=False))

if __name__ == "__main__":
    main()


collect: 덤벨
collect: 케틀벨
collect: 바벨
collect: 아령
collect: 치닝디핑
collect: 철봉
collect: 풀업바
collect: 실내자전거
collect: 스핀바이크
collect: 요가매트
collect: 폼롤러
collect: 마사지건
collect: 홈트 기구
ALL: 6472 | MELKIN: 335
keyword                                                 title  lprice mallName brand   productId
     덤벨             멜킨 롤튼 무게조절 덤벨 프레스 24kg 2kg 단위 중량조절 조립식 세트  159000    멜킨스포츠 멜킨스포츠 84540608741
     덤벨 멜킨 육각덤벨 1kg 2kg 3kg 4kg 5kg 6kg 7kg 8kg 9kg 10kg 20kg    2500    멜킨스포츠 멜킨스포츠 18384787916
     덤벨                                    멜킨스포츠 무게조절 덤벨 24kg   89000      네이버 멜킨스포츠 52203981620
     덤벨                                       멜킨스포츠 육각 덤벨 5kg   12000      네이버 멜킨스포츠 52190642645
     덤벨                                    멜킨스포츠 무게조절 덤벨 40kg  159000      네이버 멜킨스포츠 52204183620
     덤벨                          멜킨스포츠 롤튼 무게조절 덤벨 24kg 단위 4kg  119000      네이버 멜킨스포츠 52340836618
     덤벨                          멜킨스포츠 롤튼 무게조절 덤벨 36kg 단위 4kg  149000      네이버 멜킨스포츠 52378604618
     덤벨                   

In [43]:
import pandas as pd

raw = pd.read_csv("naver_shop_raw_all.csv")
melkin = pd.read_csv("naver_shop_melkin_only.csv")

# 키워드별 멜킨 노출 점유율
raw["is_melkin"] = raw.apply(lambda r: ("멜킨" in (str(r["brand"])+str(r["mallName"])+str(r["maker"])+str(r["title"]))), axis=1)

summary = raw.groupby("keyword").agg(
    n=("title","count"),
    melkin_n=("is_melkin","sum"),
    melkin_share=("is_melkin","mean"),
    price_mean=("lprice","mean"),
    price_median=("lprice","median"),
).reset_index()

summary["melkin_share"] = (summary["melkin_share"]*100).round(1)
summary["price_mean"] = summary["price_mean"].round(0)
summary["price_median"] = summary["price_median"].round(0)

summary.to_csv("naver_shop_keyword_summary.csv", index=False, encoding="utf-8-sig")
print(summary.sort_values("melkin_share", ascending=False).to_string(index=False))


keyword   n  melkin_n  melkin_share  price_mean  price_median
     덤벨 500        53          10.6     41015.0       23000.0
     아령 499        53          10.6     41483.0       23100.0
  실내자전거 500        52          10.4    272402.0      213380.0
   요가매트 500        29           5.8     42626.0       31635.0
  스핀바이크 498        27           5.4    376010.0      269650.0
     바벨 500        24           4.8     44398.0       29650.0
  홈트 기구 500        23           4.6     93793.0       51300.0
    폼롤러 500        19           3.8     21313.0       16500.0
     철봉 481        14           2.9     64021.0       34840.0
    풀업바 497        14           2.8     79682.0       47140.0
    케틀벨 497        13           2.6     56556.0       33480.0
   마사지건 500         7           1.4     79241.0       47075.0
   치닝디핑 500         7           1.4    144574.0       90780.0


In [44]:
import pandas as pd

raw = pd.read_csv("naver_shop_raw_all.csv")
raw["is_melkin"] = raw.apply(lambda r: ("멜킨" in (str(r["brand"])+str(r["mallName"])+str(r["maker"])+str(r["title"]))), axis=1)

summary = raw.groupby("keyword").agg(
    total=("title","count"),
    melkin_n=("is_melkin","sum"),
).reset_index()

summary["melkin_share(%)"] = (summary["melkin_n"] / summary["total"] * 100).round(2)

print(summary.sort_values("melkin_share(%)", ascending=False).to_string(index=False))


keyword  total  melkin_n  melkin_share(%)
     아령    499        53            10.62
     덤벨    500        53            10.60
  실내자전거    500        52            10.40
   요가매트    500        29             5.80
  스핀바이크    498        27             5.42
     바벨    500        24             4.80
  홈트 기구    500        23             4.60
    폼롤러    500        19             3.80
     철봉    481        14             2.91
    풀업바    497        14             2.82
    케틀벨    497        13             2.62
   마사지건    500         7             1.40
   치닝디핑    500         7             1.40


In [45]:
melkin = raw[raw["is_melkin"] == True]
market_price = raw["lprice"].median()
melkin_price = melkin["lprice"].median()

print("시장 중앙값:", market_price)
print("멜킨 중앙값:", melkin_price)
print("가격 포지션 차이:", melkin_price - market_price)


시장 중앙값: 41450.0
멜킨 중앙값: 39000.0
가격 포지션 차이: -2450.0


In [46]:
melkin_cat = melkin.groupby("category3").size().reset_index(name="count")
melkin_cat = melkin_cat.sort_values("count", ascending=False)
print(melkin_cat.to_string(index=False))


category3  count
       아령    106
    헬스사이클     79
    웨이트기구     75
     요가매트     29
     헬스소품     19
      케틀벨     13
     마사지건      7
      스텝퍼      6
     로잉머신      1


In [47]:
market_median = raw["lprice"].median()
melkin_median = raw[raw["is_melkin"] == True]["lprice"].median()

print("시장 중앙값:", market_median)
print("멜킨 중앙값:", melkin_median)
print("차이:", melkin_median - market_median)


시장 중앙값: 41450.0
멜킨 중앙값: 39000.0
차이: -2450.0
