# 카카오 쇼핑 크롤링

목표
- 정적 페이지, 동적 페이지 구분하기
- `스크롤`을 활용하여 300개의 상품 정보 추출
- 상품 정보
  1. 상품명
  2. 상세 상품 페이지 링크
  3. 판매가격
  4. 판매처
  5. 배송정보 e.g. 무료배송, 유료배송
  6. 상품 이미지

Chrome DevTools의 Settings → Preferences → Disable JavaScript 옵션으로 JavaScript를 끈 뒤 페이지를 확인해 보니, 이 페이지는 `동적` 페이지이다.


In [89]:
from dataclasses import dataclass
import random
from time import sleep
from typing import ClassVar, Optional

import pandas as pd
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.remote.webelement import WebElement
from bs4 import BeautifulSoup


# Origin and path of the Kakao shopping listing page; they are combined with
# urljoin when the driver is started and when product links are resolved.
KAKAO_SHOPPING_ROOT = "https://shoppinghow.kakao.com"
KAKAO_SHOPPING_PATH = "/siso/p/sale/mall/talkstore"
# Workbook that receives the scraped table and, later, the embedded images.
EXCEL_FILE_PATH = "./outputs/kakao-shopping.xlsx"

@dataclass
class DriverController:
  """Small convenience wrapper around a Selenium Chrome driver.

  If ``start_url`` is provided, the page is opened right after construction
  and the constructor blocks until ``document.readyState`` is ``"complete"``.
  """

  # ClassVar marks these as class-level constants. Without it they would be
  # dataclass fields with defaults declared before the non-default ``driver``
  # field, which dataclass rejects.
  DEFAULT_DELAY: ClassVar[int] = 5
  RANDOM_SLEEP_MIN: ClassVar[float] = 0.8
  RANDOM_SLEEP_MAX: ClassVar[float] = 1.5
  driver: webdriver.Chrome
  start_url: Optional[str] = None

  def __post_init__(self) -> None:
    """Open ``start_url`` (when set) and wait for the document to finish loading."""
    if not self.start_url:
      return

    self.driver.get(self.start_url)

    # Wait up to 10 seconds for the browser to report a fully loaded document.
    page_load_wait = WebDriverWait(self.driver, 10)
    page_load_wait.until(
      lambda drv: drv.execute_script("return document.readyState") == "complete"
    )

  def find_element(
    self,
    selector: str,
    driver_selector=By.CSS_SELECTOR,
    delay=DEFAULT_DELAY,
  ) -> WebElement:
    """Return the element matching ``selector`` using an explicit wait.

    Waits up to ``delay`` seconds for the element to be present in the DOM.
    ``driver_selector`` is the locator strategy (CSS selector by default).
    """
    locator = (driver_selector, selector)
    presence_wait = WebDriverWait(self.driver, delay)
    return presence_wait.until(EC.presence_of_element_located(locator))

  def click_element(self, selector: str) -> None:
    """Click the element matching ``selector``, then pause for a random interval."""
    self.find_element(selector).click()
    self._random_sleep()

  def key_input(self, selector: str, keys: list[str]) -> None:
    """Send ``keys`` to the element matching ``selector``, then pause randomly."""
    target = self.find_element(selector)
    target.send_keys(*keys)
    self._random_sleep()

  def _random_sleep(self) -> None:
    """Sleep for a random duration between RANDOM_SLEEP_MIN and RANDOM_SLEEP_MAX.

    Acting at a perfectly regular interval may get the crawler blocked, so the
    pause length is randomized.
    """
    pause_seconds = random.uniform(self.RANDOM_SLEEP_MIN, self.RANDOM_SLEEP_MAX)
    sleep(pause_seconds)
  
  
# Launch Chrome and open the Kakao shopping listing page through the controller.
driver = webdriver.Chrome()
dc = DriverController(
  driver=driver,
  start_url=urljoin(KAKAO_SHOPPING_ROOT, KAKAO_SHOPPING_PATH),
)


# 원하는 조건까지 무한 스크롤

In [90]:
# Scroll until at least LIMIT_ITEMS_COUNT product items are present in the DOM.
LIMIT_ITEMS_COUNT = 300
# Stop after this many consecutive scrolls that load no new items; otherwise a
# listing with fewer than LIMIT_ITEMS_COUNT products would loop forever.
MAX_STALLED_SCROLLS = 10
items_wrapper_selector = "#SaleSectionProductsWrap > li"

# Counts the items currently rendered, scrolls down, and returns that count.
# The selector is passed in as arguments[0] by execute_script.
js_script = """
itemsLength = document.querySelectorAll(arguments[0]).length;
window.scrollBy(0, 2800);

return itemsLength;
"""

previous_items_length = -1
stalled_scrolls = 0

while True:
  items_length = driver.execute_script(js_script, items_wrapper_selector)
  # Give the page time to load the next batch of items.
  sleep(1)

  if items_length >= LIMIT_ITEMS_COUNT:
    break

  # Track progress so the loop can end when the page has nothing more to load.
  if items_length > previous_items_length:
    stalled_scrolls = 0
  else:
    stalled_scrolls += 1
  previous_items_length = items_length

  if stalled_scrolls >= MAX_STALLED_SCROLLS:
    print(
      f"Stopped scrolling: only {items_length} items loaded "
      f"(target was {LIMIT_ITEMS_COUNT})."
    )
    break


# 원하는 정보 추출

In [91]:
# Parse the scrolled page once and read every product item from the static HTML.
page_soup = BeautifulSoup(driver.page_source, "html.parser")
items = page_soup.select(items_wrapper_selector)
result = []

for item in items:
  product_name = item.select_one(".info_name").get_text(strip=True)

  # Resolve the item's first anchor href against the site root.
  detail_href = item.select_one("a")["href"]
  product_link = urljoin(KAKAO_SHOPPING_ROOT, detail_href)

  # Strip the thousands separators and the "원" suffix from the price text.
  price_text = item.select_one(".info_sales").get_text(strip=True)
  product_price = price_text.replace(",", "").replace("원", "")

  product_seller = item.select_one(".txt_shop").text

  # The last child node of ".info_etc" is taken as the shipping information.
  shipping_nodes = item.select_one(".info_etc").contents
  product_shipping = shipping_nodes[-1]

  # The src is prefixed with "https:" (presumably it is protocol-relative,
  # i.e. starts with "//" - verify against the page).
  thumbnail_src = item.select_one(".wrap_thumb > img")["src"]
  product_image_link = "https:" + thumbnail_src

  # Column order must match the DataFrame columns used when exporting.
  row = [
    product_name,
    product_link,
    product_price,
    product_seller,
    product_shipping,
    product_image_link,
  ]
  result.append(row)



# 엑셀에 저장

In [92]:
from pathlib import Path

import pandas as pd


# One row per scraped product; the column order matches the rows in `result`.
df = pd.DataFrame(result, columns=["상품명", "상품 링크", "가격", "판매처", "배송정보", "상품 이미지"])

# to_excel does not create missing directories, so make sure the output
# folder exists before writing.
Path(EXCEL_FILE_PATH).parent.mkdir(parents=True, exist_ok=True)
df.to_excel(EXCEL_FILE_PATH, index=False)


# 실제 이미지 저장하기


In [93]:
from io import BytesIO

import requests
from openpyxl import load_workbook
from openpyxl.drawing.image import Image as ExcelImage
from PIL import Image as PILImage


def get_excel_format_image_from_url(
  url: str,
  *,
  width: Optional[int] = 100,
  height: Optional[int] = 100,
  timeout: float = 10.0,
) -> ExcelImage:
  """Download an image and return it as a resized openpyxl image object.

  Args:
    url: Address of the image to download.
    width: Target width in pixels; ``None`` keeps the downloaded image's width.
    height: Target height in pixels; ``None`` keeps the downloaded image's height.
    timeout: Seconds to wait for the server before giving up, so that one
      unresponsive host cannot hang the whole run.

  Returns:
    An openpyxl image built from the resized picture re-encoded as PNG.

  Raises:
    requests.RequestException: If the download fails, times out, or the
      server answers with an error status.
  """
  response = requests.get(url, timeout=timeout)
  response.raise_for_status()

  # Decode the downloaded bytes and resize; the source image is closed as soon
  # as the resized copy exists.
  with PILImage.open(BytesIO(response.content)) as source_image:
    target_size = (
      source_image.width if width is None else width,
      source_image.height if height is None else height,
    )
    resized_image = source_image.resize(target_size, PILImage.Resampling.LANCZOS)

  # Re-encode as PNG into an in-memory buffer that openpyxl can read from.
  png_buffer = BytesIO()
  resized_image.save(png_buffer, format="PNG", optimize=True)
  png_buffer.seek(0)

  return ExcelImage(png_buffer)

def insert_image_to_cell(ws, column_letter: str, row_index: int, excel_image: ExcelImage) -> None:
  """Place ``excel_image`` at the given cell and size the cell to fit it.

  The cell's existing value (the image link) is cleared, the column width and
  row height are derived from the image's dimensions, and the image is then
  anchored at that cell.
  """
  anchor = f"{column_letter}{row_index}"

  # The cell still holds the image link, so clear it before adding the picture.
  ws[anchor] = None

  # Resize the cell to the image using the same scale factors for every image.
  column_dimension = ws.column_dimensions[column_letter]
  column_dimension.width = excel_image.width / 7
  row_dimension = ws.row_dimensions[row_index]
  row_dimension.height = excel_image.height * 0.8

  ws.add_image(excel_image, anchor)



wb = load_workbook(EXCEL_FILE_PATH)
ws = wb.active

# Map each header text in the first row to its column letter.
column_name_letter_map = {cell.value: cell.column_letter for cell in ws[1]}
image_column_letter = column_name_letter_map["상품 이미지"]

# Replace every image link in the image column with the actual picture.
for row_index in range(2, ws.max_row + 1):
  image_url = ws[f"{image_column_letter}{row_index}"].value

  # Nothing to download for an empty cell.
  if not image_url:
    continue

  try:
    excel_image = get_excel_format_image_from_url(image_url)
  except (requests.RequestException, OSError) as error:
    # A single broken image must not abort the run before wb.save and discard
    # the images already embedded; keep the link in the cell and report it.
    print(f"Row {row_index}: could not embed image from {image_url!r}: {error}")
    continue

  insert_image_to_cell(
    ws,
    image_column_letter,
    row_index,
    excel_image,
  )

wb.save(EXCEL_FILE_PATH)
