# 무신사 크롤링 모듈 생성

In [None]:
%%writefile musinsa.py
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook

# Module-level accumulator that get_musinsa_product() appends to and returns.
# NOTE: nothing in this module clears it, so calling the crawler more than
# once in the same process keeps the items from earlier calls in the list.
all_page_products = []

def get_total_page(category="002"):
    """Return the number of pages shown for a category's ranking list.

    The ranking page for ``category`` is downloaded and the text of its
    ``span.totalPagingNum`` element is converted to an integer.

    Args:
        category: Category code placed in the ``u_cat_cd`` query parameter.

    Returns:
        The total page count as an ``int``.
    """
    url_template = "https://search.musinsa.com/ranking/best?u_cat_cd={}"
    response = requests.get(url_template.format(category))
    document = BeautifulSoup(response.content, 'html.parser')

    # Full CSS path from the ranking form down to the paging counter.
    paging_selector = (
        "#goodsRankForm > div.right_contents.hover_box"
        " > div.boxed-list-wrapper.rank-shop"
        " > div.thumbType_box.box"
        " > span.pagingNumber"
        " > span.totalPagingNum"
    )
    paging_element = document.select_one(paging_selector)
    return int(paging_element.text)

def get_musinsa_product(category="002"):
    """Crawl every ranking page of a category and collect its products.

    For each page of the ranking list, every ``#goodsRankList > li`` element
    is parsed into a dict and appended to the module-level
    ``all_page_products`` list.

    Args:
        category: Category code used both to look up the page count and in
            the ``mainCategory`` query parameter of each page request.

    Returns:
        The module-level ``all_page_products`` list. Each item is a dict with
        the keys ``product_id``, ``img_url``, ``product_brand``,
        ``product_detail_url``, ``product_name``, ``product_price`` (float)
        and ``star_count`` (int, 0 when the page shows no count).

    Note:
        The returned list is shared module state and is not cleared here, so
        repeated calls in one process accumulate items from earlier calls.
    """
    # Fix: forward the requested category. The original called
    # get_total_page() with no argument, so the page count always came from
    # the default category regardless of what the caller asked for.
    total_page = get_total_page(category)
    for page in tqdm_notebook(range(1, total_page + 1)):
        url = "https://search.musinsa.com/ranking/best?mainCategory={}&page={}".format(category, page)

        # The URL is already fully formatted above; no second .format() call.
        html_page = requests.get(url).content
        soup = BeautifulSoup(html_page, 'html.parser')
        products = soup.select("#goodsRankList > li")

        for product in products:
            # Product id
            product_id = product['data-goods-no']
            # The image URL is read from the lazy-load attribute, not "src".
            img_url = product.select_one("img.lazyload").get("data-original")
            product_brand = product.select_one("p.item_title").text
            product_detail_url = product.select_one("p.list_info > a").get("href")
            product_name = product.select_one("p.list_info").text.strip()

            # Only the last whitespace-separated token of the price text is
            # used; thousands separators and the "원" suffix are stripped.
            price_tokens = product.select_one("p.price").text.split()
            product_price = float(price_tokens[-1].replace(",", "").replace("원", ""))

            # The count element is optional; fall back to 0 when it is absent.
            star_count_elem = product.select_one("p.point > span.count")
            if star_count_elem:
                star_count = int(star_count_elem.text.replace(",", ""))
            else:
                star_count = 0

            item = {
                "product_id": product_id,
                "img_url": img_url,
                "product_brand": product_brand.strip(),
                "product_detail_url": product_detail_url,
                "product_name": product_name.strip(),
                "product_price": product_price,
                "star_count": star_count,
            }

            all_page_products.append(item)

    return all_page_products

Overwriting musinsa.py


# 무신사 모듈 로딩 및 크롤링 함수 호출

In [None]:
# Import the crawler from the musinsa.py module written by the cell above.
from musinsa import get_musinsa_product

# Crawl with the default category; the result is a list of product dicts.
all_products = get_musinsa_product()

  0%|          | 0/100 [00:00<?, ?it/s]

# mongodb 연결 및 db, collection 연결

In [None]:
import pymongo
# Connect to the MongoDB server on port 27017.
# NOTE(review): the server address is hard-coded and the URI carries no
# credentials - consider reading it from configuration instead.
client = pymongo.MongoClient("mongodb://3.37.184.54:27017/")
# Bare expression so the notebook displays the client object.
client

MongoClient('3.37.184.54', 27017)

기존 collection 삭제 (crawling db의 musinsa collection)

In [None]:
# Drop the "musinsa" collection of the "crawling" database so the insert
# below starts from an empty collection.
client.crawling.musinsa.drop()

insert

In [None]:
# Handle to the "musinsa" collection in the "crawling" database.
musinsa = client.crawling.musinsa
# Collection.insert() is deprecated in PyMongo 3 and removed in PyMongo 4;
# insert_many() is the supported bulk insert. The generated ids are available
# as insert_result.inserted_ids.
insert_result = musinsa.insert_many(all_products)

# 크롤링 데이터로 데이터 프레임 생성

In [None]:
import pandas as pd
# Build a DataFrame directly from the crawled list of product dicts.
# The displayed output also has an `_id` column - presumably added to the
# dicts by the MongoDB insert in the earlier cell.
df_musinsa = pd.DataFrame(all_products)
# Show the first rows as a quick sanity check.
df_musinsa.head()

Unnamed: 0,product_id,img_url,product_brand,product_detail_url,product_name,product_price,star_count,_id
0,2081554,https://image.msscdn.net/images/goods_img/2021...,노스페이스,https://store.musinsa.com/app/goods/2081554?lo...,NJ1DM62A 남성 1996 에코 눕시 자켓,299000.0,1932,61b041c5c28c300318f4fbf3
1,865862,https://image.msscdn.net/images/goods_img/2018...,무신사 스탠다드,https://store.musinsa.com/app/goods/865862?loc...,캐시미어 블렌드 오버사이즈 싱글 코트 [블랙],139900.0,16135,61b041c5c28c300318f4fbf4
2,2014490,https://image.msscdn.net/images/goods_img/2021...,후아유,https://store.musinsa.com/app/goods/2014490?lo...,리버서블 후드 웜업 [아메리카 블랙] WHJJB4T02U,79920.0,1642,61b041c5c28c300318f4fbf5
3,1575618,https://image.msscdn.net/images/goods_img/2020...,쿠어,https://store.musinsa.com/app/goods/1575618?lo...,MTR 발마칸 코트 (딥브라운),279650.0,422,61b041c5c28c300318f4fbf6
4,2105483,https://image.msscdn.net/images/goods_img/2021...,노스페이스,https://store.musinsa.com/app/goods/2105483?lo...,NC1DM73A 고 프리 하이브리드 다운 코트 EX,269100.0,285,61b041c5c28c300318f4fbf7


In [None]:
# Mean of the product_price column; used as the query threshold below.
product_mean = df_musinsa['product_price'].mean()
product_mean

195757.43633333335

# 평균 가격 이하인 상품만 MongoDB에서 다시 조회하여 데이터 프레임 생성하기

In [None]:
# Select documents whose product_price is less than or equal to the mean.
QUERY = {"product_price" : {"$lte": product_mean}}
results = musinsa.find(QUERY)

# Load the query results into a DataFrame and preview the first rows.
result_df = pd.DataFrame(results)
result_df.head()

Unnamed: 0,_id,product_id,img_url,product_brand,product_detail_url,product_name,product_price,star_count
0,61b041c5c28c300318f4fbf4,865862,https://image.msscdn.net/images/goods_img/2018...,무신사 스탠다드,https://store.musinsa.com/app/goods/865862?loc...,캐시미어 블렌드 오버사이즈 싱글 코트 [블랙],139900.0,16135
1,61b041c5c28c300318f4fbf5,2014490,https://image.msscdn.net/images/goods_img/2021...,후아유,https://store.musinsa.com/app/goods/2014490?lo...,리버서블 후드 웜업 [아메리카 블랙] WHJJB4T02U,79920.0,1642
2,61b041c5c28c300318f4fbfa,2037171,https://image.msscdn.net/images/goods_img/2021...,스파오,https://store.musinsa.com/app/goods/2037171?lo...,[허니푸퍼] 리버서블 푸퍼_SPJDB4VC14,69900.0,4207
3,61b041c5c28c300318f4fbfb,1535337,https://image.msscdn.net/images/goods_img/2020...,코드그라피,https://store.musinsa.com/app/goods/1535337?lo...,[PRAUDEN] 유틸리티 덕다운 푸파 숏패딩_쿨그레이,124000.0,4317
4,61b041c5c28c300318f4fbfd,1611891,https://image.msscdn.net/images/goods_img/2020...,라퍼지 포 우먼,https://store.musinsa.com/app/goods/1611891?lo...,울리치 트렌치 맥코트_Black,108000.0,2693
