In [22]:
import re
import time
import pandas as pd
import datetime as dt
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from tqdm.notebook import tqdm

In [23]:
# Prepare chromedriver through chromedriver_autoinstaller
# (presumably fetches a build matching the local Chrome — confirm in its docs).
chromedriver_autoinstaller.install()
# Start the Chrome session that the scraping cell drives via `driver`.
driver = webdriver.Chrome()
# Implicit wait of 5 (seconds, per Selenium's API) for element lookups.
driver.implicitly_wait(5)

In [24]:
# Production Musinsa ranking URL; "{}" is the page-number placeholder
# filled in with str.format().
musinsaUrl = (
    "https://www.musinsa.com/ranking/best"
    "?period=now&age=ALL"
    "&mainCategory=&subCategory=&leafCategory="
    "&price=&golf=false&kids=false"
    "&newProduct=false&exclusive=false&discount=false&soldOut=false"
    "&page={}&viewType=small&priceMin=&priceMax="
)
# Ranking pages 1 through 5 (the upper bound of range() is exclusive).
pageNumber = range(1, 6)

In [25]:
# Test URL: the same ranking query with the page fixed to 1.
testUrl = (
    "https://www.musinsa.com/ranking/best"
    "?period=now&age=ALL"
    "&mainCategory=&subCategory=&leafCategory="
    "&price=&golf=false&kids=false"
    "&newProduct=false&exclusive=false&discount=false&soldOut=false"
    "&page=1&viewType=small&priceMin=&priceMax="
)

In [26]:
# Helper that strips Korean text
def remove_korean(text):
    """Return ``text`` with every run of Hangul syllables (가-힣) removed."""
    return re.sub('[가-힣]+', '', text)

In [28]:
# Parser is pinned so parsing does not depend on which optional parsers are
# installed; without an explicit choice BeautifulSoup guesses one and warns.
HTML_PARSER = "html.parser"
# Seconds to pause after each navigation before reading driver.page_source.
PAGE_LOAD_DELAY = 0.3


def _joined_text(nodes, separator=' '):
    """Join the stripped text of every node in ``nodes`` with ``separator``."""
    return separator.join(node.get_text(strip=True) for node in nodes)


def _clean_category(anchor):
    """Return a breadcrumb anchor's label without '(...)' groups or '>' marks."""
    label = re.sub(r'\([^()]*\)', '', anchor.text.strip())
    return label.replace('>', '').strip()


def _parse_count(text):
    """Convert a counter string such as '1,234', '3천' or '1.2만' to an int.

    천 multiplies by 1,000 and 만 by 10,000. Returns 0 when ``text`` holds no
    number, so a missing or unexpected counter never aborts the crawl.
    """
    if "만" in text:
        multiplier = 10000
    elif "천" in text:
        multiplier = 1000
    else:
        multiplier = 1
    # Thousands separators are dropped first; float('1,234') would raise.
    number = re.search(r'\d+(?:\.\d+)?', remove_korean(text).replace(',', ''))
    if number is None:
        return 0
    # round() rather than int(): a float product can land just below the
    # intended integer, and truncation would then be off by one.
    return round(float(number.group()) * multiplier)


musinsaData = []

# NOTE(review): this loop only prints the production list-page URLs; the
# crawl below still reads the single test page (testUrl). Confirm whether
# every page in pageNumber is meant to be scraped.
for p in pageNumber:
    url = musinsaUrl.format(p)
    print(url)

driver.get(testUrl)
time.sleep(PAGE_LOAD_DELAY)

soup1 = BeautifulSoup(driver.page_source, HTML_PARSER)
itemList = soup1.select(".li_box")

for li in itemList:
    # Skip tiles without a goods number instead of raising KeyError.
    itemNo = li.get('data-goods-no')
    if not itemNo:
        continue

    itemUrl = f'https://www.musinsa.com/app/goods/{itemNo}?loc=goods_rank'
    driver.get(itemUrl)
    time.sleep(PAGE_LOAD_DELAY)

    # Parse the product detail page that is currently loaded.
    soup2 = BeautifulSoup(driver.page_source, HTML_PARSER)

    # Drop the English title and the reserve/delay badges so that only the
    # main product name remains inside .product_title.
    for elem in soup2.select('.product_title .product_title_eng, .product_title .txt_reserve, .product_title .txt_delay'):
        elem.decompose()

    # Product name
    title = soup2.select('.product_title')
    if title:
        titleValue = title[0].text.strip().replace('[무료반품]', '').strip()
    else:
        titleValue = ""

    # Breadcrumb anchors: the first is the category, the second the type.
    # Length is checked before indexing so a page without the breadcrumb
    # yields "" rather than IndexError. The href=True keyword formerly passed
    # to select() is dropped: select() filters with the CSS selector only.
    categoryLinks = soup2.select('.item_categories a')
    categoryValue = _clean_category(categoryLinks[0]) if len(categoryLinks) > 0 else ""
    typesValue = _clean_category(categoryLinks[1]) if len(categoryLinks) > 1 else ""

    # Target gender; '남 여' (both listed) is reported as '공용'.
    genderValue = _joined_text(soup2.select('.txt_gender span')).replace('남 여', '공용').strip()

    # Page views
    viewsValue = _parse_count(_joined_text(soup2.select('.product_article_contents #pageview_1m')))

    # Cumulative sales
    salesValue = _parse_count(_joined_text(soup2.select('#li_sales_1y #sales_1y_qty')))

    # Preferred age groups: every '~' in the graph text becomes a space.
    # Falls back to "" (not 0) so the field is always a string.
    agesValue = ' '.join(
        part
        for node in soup2.select('#product_order_info .graph_age')
        for part in node.get_text(strip=True).split('~')
    )

    # Preferred gender split (entries separated by two spaces)
    sexValue = _joined_text(soup2.select('.graph_sex_text'), '  ')

    # Product price: for a range 'a~b' keep the last value. Stays None when
    # no decimal number is present, so the field is never a string.
    priceValue = None
    for priceNode in soup2.select('#list_price'):
        priceText = priceNode.get_text(strip=True)
        lastPrice = priceText.replace('원', '').replace(',', '').strip().split('~')[-1].strip()
        if lastPrice.isdecimal():
            priceValue = int(lastPrice)

    # NOTE(review): the first breadcrumb entry (category) is stored under
    # "종류" and the second (type) under "분류", which looks swapped relative
    # to the variable names. Mapping kept as-is to preserve the output
    # schema; confirm the intended labels.
    mydict = {
        "상품명": titleValue,
        "종류": categoryValue,
        "분류": typesValue,
        "성별": genderValue,
        "조회수": viewsValue,
        "누적판매": salesValue,
        "선호연령" : agesValue,
        "선호성별" : sexValue,
        "상품가격" : priceValue
    }

    musinsaData.append(mydict)

print("수집된 상품 갯수: ",len(musinsaData))

수집된 상품 갯수:  90


In [None]:
# copy = musinsaData.copy()
# fname = dt.datetime.now().strftime("MusinsaRank_%y%m%d_%H.csv")

# with open(fname,"w", encoding="utf-8")as f:
#   for i, v in enumerate(copy):
#     if i == 0:
#       title ="%s\n"%",".join(list(v.keys()))
#       # print(title)
#       f.write(title)
    
#     detail = "%s\n" % ",".join(list(v.values()))
#     # print(detail)
#     f.write(detail)

# print("fin :)")

In [29]:
# Work on a shallow copy so the scraped list itself is left untouched.
copy = list(musinsaData)

# The file name carries the date and hour of the export.
excel_file_path = dt.datetime.now().strftime("MusinsaRank_%y%m%d_%H.xlsx")

# One row per scraped product, written without the index column.
df = pd.DataFrame(copy)
df.to_excel(excel_file_path, index=False)

print("fin :)")

fin :)
