In [1]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from tqdm import tqdm


In [5]:

def setup_driver():
    """Create and return a Chrome WebDriver configured for scraping.

    Chrome is started with the ``--disable-dev-shm-usage`` and ``--no-sandbox``
    flags, and the browser window is resized to 1800x1800 pixels.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--disable-dev-shm-usage', "--no-sandbox"):
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(options=chrome_options)
    browser.set_window_size(1800, 1800)
    return browser

def load_progress(filename='Items_Info.csv'):
    """Read the resume point (last brand and page) from a progress CSV.

    The CSV is expected to contain ``Brand`` and ``Page_Number`` columns; the
    last row is taken as the most recently saved position.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(brand, page_number)`` tuple. ``(None, 1)`` is returned when the
        file does not exist, is a zero-byte file, or contains no data rows.

    Raises:
        KeyError: If the file has rows but lacks the expected columns.
    """
    try:
        df = pd.read_csv(filename)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # A missing file and a zero-byte file both mean "nothing saved yet".
        return None, 1

    if df.empty:
        return None, 1

    # Take the brand and page number of the most recently saved row.
    last_row = df.iloc[-1]
    # Coerce to a plain int so callers can do arithmetic / range() on it.
    return last_row['Brand'], int(last_row['Page_Number'])

def scrape_brand_data(driver, brand, start_page=1, end_page=100, save_interval=10):
    """Scrape listing pages of one brand and save the results in batches.

    Pages ``start_page`` .. ``end_page`` are visited in order. Scraping stops
    early at the first page that ``is_page_empty`` reports as empty. Collected
    rows are handed to ``save_data`` every ``save_interval`` pages, on the last
    page, and once more after the loop so that rows gathered since the last
    flush are not lost when the brand runs out of pages.

    Args:
        driver: Selenium WebDriver used for navigation.
        brand: Brand identifier inserted into the listing URL.
        start_page: First page number to scrape (inclusive).
        end_page: Last page number to scrape (inclusive).
        save_interval: Number of pages between intermediate saves.
    """
    all_data = []  # rows accumulated since the last save
    last_scraped_page = None  # most recent page whose rows are in all_data
    for page_num in range(start_page, end_page + 1):
        page_url = f'https://www.musinsa.com/brands/{brand}?page={page_num}&size=90'
        driver.get(page_url)

        if is_page_empty(driver):
            print(f"No items found for brand {brand} on page {page_num}.")
            break

        # The listing URL requests 90 items per page, so positions 1..90 are probed.
        for item in tqdm(range(1, 91), desc=f"Scraping {brand} - Page {page_num}"):
            item_details = scrape_product_info(driver, brand, item, div_nums)
            if item_details:
                all_data.append(item_details)
        last_scraped_page = page_num

        # Save at the configured interval or on the final page.
        if page_num % save_interval == 0 or page_num == end_page:
            save_data(all_data, brand, page_num)
            all_data.clear()  # start a fresh batch after saving

    # Flush rows collected after the last interval save (e.g. the loop ended
    # early on an empty page); previously these rows were silently dropped.
    if all_data:
        save_data(all_data, brand, last_scraped_page)















# def load_progress(filename='completed_brands.csv'):
#     try:
#         df = pd.read_csv(filename)
#         if df.empty:
#             return None, 1
#         last_row = df.iloc[-1]
#         brand = last_row['Brand']
#         last_completed_page = last_row['Page_Number']
#         return brand, last_completed_page
#     except FileNotFoundError:
#         return None, 1

# def save_progress(completed_brands, filename='completed_brands.csv'):
#     pd.DataFrame(completed_brands, columns=['Brand', 'Page_Number']).to_csv(filename, index=False)

# def save_data(data, brand, page_num, brand_page_info='Items_Info.csv'):
#     if data:
#         new_data = pd.DataFrame(data)
#         brand_page_info = f"./Datasets/{brand}_page_{page_num}.csv"
#         if not os.path.exists("./Datasets"):
#             os.makedirs("./Datasets")
#         if not os.path.exists(brand_page_info):
#             new_data.to_csv(brand_page_info, index=False)
#         else:
#             new_data.to_csv(brand_page_info, mode='a', header=False, index=False)
#     else:
#         print("No data to save.")


### Resume logic: read the last brand + page from a single file and restart from the point after it




def is_page_empty(driver):
    """Tell whether the currently loaded brand listing page has no products.

    Waits up to 10 seconds for the first entry of the ``searchList`` element.
    If it shows up the page is not empty. If the wait times out, the page is
    reported as empty only when the ``result-none-area`` paragraph contains the
    site's "no registered products" message.

    Args:
        driver: Selenium WebDriver positioned on a listing page.

    Returns:
        True when the "no products" message is present, False otherwise
        (including when neither the first item nor the message is found).
    """
    first_item_locator = (By.XPATH, '//*[@id="searchList"]/li[1]')
    try:
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(first_item_locator))
    except TimeoutException:
        pass
    else:
        return False

    # No item appeared in time: look for the explicit "no products" notice.
    try:
        notice = driver.find_element(By.XPATH, '//*[@id="result-none-area"]/p').text
    except NoSuchElementException:
        return False
    return "등록된 상품이 없습니다" in notice


# Candidate 1-based <div> positions substituted into the item XPaths, tried in
# order; scrape_brand_data passes this list to scrape_product_info.
div_nums = [1,2,3,4]

def _wait_for_first_visible(driver, xpaths, timeout=10):
    """Wait until any of ``xpaths`` is visible; return that element or None on timeout."""
    conditions = [EC.visibility_of_element_located((By.XPATH, xpath)) for xpath in xpaths]
    if not conditions:
        return None
    try:
        # any_of polls every candidate on each tick, so the total wait is bounded
        # by ``timeout`` instead of ``timeout`` per candidate.
        return WebDriverWait(driver, timeout).until(EC.any_of(*conditions))
    except TimeoutException:
        return None


def _detail_element(driver, xpath):
    """Return the element at ``xpath`` on the current page, or None if it is absent."""
    try:
        return driver.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return None


def scrape_product_info(driver, brand, item, div_nums):
    """Scrape one product from the listing page by opening its detail page.

    The price is read from the listing entry, then the product link is clicked
    and category, image URL, name, product number and brand are read from the
    detail page. Afterwards the browser is sent back to the listing page.

    Args:
        driver: Selenium WebDriver positioned on a brand listing page.
        brand: Brand identifier being scraped; stored in the result.
        item: 1-based position of the product inside ``searchList``.
        div_nums: Candidate ``div`` positions to try inside the XPaths.

    Returns:
        A dict with keys ``brand``, ``item_category``, ``item_img``,
        ``item_name``, ``item_num``, ``item_brand`` and ``item_price``
        (fields that could not be found stay ``None``), or ``None`` when the
        product link at position ``item`` could not be found at all.
    """
    item_details = {
        "brand": brand,
        "item_category": None,
        "item_img": None,
        "item_name": None,
        "item_num": None,
        "item_brand": None,
        "item_price": None
    }
    list_item = f'//*[@id="searchList"]/li[{item}]'
    detail_root = '//*[@id="root"]/div[1]'

    # WebDriverWait signals failure with TimeoutException (not
    # NoSuchElementException), so the helper converts a timeout into None.
    price_element = _wait_for_first_visible(
        driver, [f'{list_item}/div[{div_num}]/div[2]/p[3]' for div_num in div_nums])
    if price_element is None:
        print("가격 못찾음")
    else:
        item_details["item_price"] = price_element.text

    link_element = _wait_for_first_visible(
        driver, [f'{list_item}/div[{div_num}]/div[2]/p[2]/a' for div_num in div_nums])
    if link_element is None:
        # Nothing was clicked, so we are still on the listing page: do not
        # navigate back, and let the caller skip this position.
        print("클릭못찾음")
        return None

    list_url = driver.current_url
    driver.execute_script("arguments[0].click();", link_element)

    category_xpath = f'{detail_root}/div[1]/div[1]/a[1]'
    if _wait_for_first_visible(driver, [category_xpath]) is not None:
        category = _detail_element(driver, category_xpath)
        image = _detail_element(driver, f'{detail_root}/div[2]/div[1]/div[1]/div/img')
        name = _detail_element(driver, f'{detail_root}/div[1]/div[3]/div[2]/h3')
        if category is not None:
            item_details["item_category"] = category.text
        if image is not None:
            item_details["item_img"] = image.get_attribute('src')
        if name is not None:
            item_details["item_name"] = name.text

        # Product number and brand live in a block whose div position varies.
        for div_num in div_nums:
            number = _detail_element(
                driver, f'{detail_root}/div[2]/div[2]/div[{div_num}]/ul/li[1]/div[2]/span[2]')
            if number is not None:
                item_details["item_num"] = number.text
                break
        for div_num in div_nums:
            brand_link = _detail_element(
                driver, f'{detail_root}/div[2]/div[2]/div[{div_num}]/ul/li[1]/div[2]/a')
            if brand_link is not None:
                item_details["item_brand"] = brand_link.text
                break

    detail_keys = ("item_category", "item_img", "item_name", "item_num")
    if any(item_details[key] is None for key in detail_keys):
        print("카테고리,이미지,이름,제품번호중 못찾음")
    if item_details["item_brand"] is None:
        # Report only; the driver stays alive because the caller keeps using it.
        print("브랜드나 가격 오류")

    # Only go back if the click actually left the listing page.
    if driver.current_url != list_url:
        driver.back()
    return item_details



def scrape_brand_data(driver, brand, start_page=1, end_page=100):
    """Scrape a brand's listing pages and save each page's rows separately.

    Pages ``start_page`` .. ``end_page`` are loaded in order; the first page
    that ``is_page_empty`` reports as empty ends the run. For every other page
    the 90 listing positions are passed to ``scrape_product_info`` and the
    truthy results are handed to ``save_data``.

    Args:
        driver: Selenium WebDriver used for navigation.
        brand: Brand identifier inserted into the listing URL.
        start_page: First page number to scrape (inclusive).
        end_page: Last page number to scrape (inclusive).
    """
    for page_num in range(start_page, end_page + 1):
        driver.get(f'https://www.musinsa.com/brands/{brand}?page={page_num}&size=90')

        if is_page_empty(driver):
            print(f"No items found for brand {brand} on page {page_num}.")
            return

        positions = tqdm(range(1, 91), desc=f"Scraping {brand} - Page {page_num}")
        scraped = (scrape_product_info(driver, brand, position, div_nums) for position in positions)
        page_rows = [details for details in scraped if details]

        save_data(page_rows, brand, page_num)

def main():
    """Scrape every brand listed in ``Brand_Name.csv``, resuming saved progress.

    Brand names come from the ``Brand_ENG`` column. If ``load_progress``
    reports a last saved brand that is present in the list, brands before it
    are skipped and that brand continues from the page after the saved one;
    all later brands start at page 1. The WebDriver is always quit, even when
    scraping is interrupted or fails.
    """
    df = pd.read_csv('Brand_Name.csv')
    brand_names = df['Brand_ENG'].tolist()
    last_completed_brand, last_completed_page = load_progress()

    # Index of the brand to resume from; 0 means "start from the beginning".
    resume_index = 0
    if last_completed_brand in brand_names:
        resume_index = brand_names.index(last_completed_brand)

    driver = setup_driver()
    try:
        pending = tqdm(brand_names[resume_index:], desc="Overall Progress",
                       initial=resume_index, total=len(brand_names))
        for brand in pending:
            if brand == last_completed_brand:
                # Continue the last saved brand from the page after the saved one.
                start_page = last_completed_page + 1
            else:
                # Any other brand starts from its first page.
                start_page = 1
            print(f"Starting scraping for brand: {brand} from page {start_page}")
            scrape_brand_data(driver, brand, start_page=start_page)
    finally:
        # Close the browser even on KeyboardInterrupt or scraping errors.
        driver.quit()

# Start the scraper when this code is executed directly (not imported).
if __name__ == "__main__":
    main()

Overall Progress:   0%|          | 0/2905 [00:00<?, ?it/s]

Starting scraping for brand: DISCOVERYEXPEDITION from page 1


Scraping DISCOVERYEXPEDITION - Page 1:   0%|          | 0/90 [00:09<?, ?it/s]
Overall Progress:   0%|          | 0/2905 [00:11<?, ?it/s]


KeyboardInterrupt: 

In [10]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from tqdm import tqdm

def setup_driver():
    """Build the Chrome WebDriver used by the scraper.

    The browser is launched with ``--disable-dev-shm-usage`` and
    ``--no-sandbox`` and its window is set to 1800x1800 pixels.

    Returns:
        The ready-to-use ``webdriver.Chrome`` instance.
    """
    launch_flags = ['--disable-dev-shm-usage', "--no-sandbox"]

    chrome_opts = webdriver.ChromeOptions()
    for launch_flag in launch_flags:
        chrome_opts.add_argument(launch_flag)

    chrome = webdriver.Chrome(options=chrome_opts)
    chrome.set_window_size(1800, 1800)
    return chrome

def load_progress(filename='completed_brands.csv'):
    """Read the last completed brand and page from the progress CSV.

    The CSV is expected to contain ``Brand`` and ``Page_Number`` columns (the
    layout written by ``save_progress``); the last row is the resume point.

    Args:
        filename: Path of the progress CSV.

    Returns:
        A ``(brand, last_completed_page)`` tuple. ``(None, 1)`` is returned
        when the file does not exist, is a zero-byte file, or has no data rows.

    Raises:
        KeyError: If the file has rows but lacks the expected columns.
    """
    try:
        df = pd.read_csv(filename)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # A missing file and a zero-byte file both mean "no progress yet".
        return None, 1

    if df.empty:
        return None, 1

    last_row = df.iloc[-1]
    # Coerce to a plain int so callers can do arithmetic / range() on it.
    return last_row['Brand'], int(last_row['Page_Number'])

def save_progress(completed_brands, filename='completed_brands.csv'):
    """Write the completed brand/page records to a CSV, replacing the file.

    Args:
        completed_brands: Records providing ``Brand`` and ``Page_Number``
            values, one per completed page.
        filename: Destination CSV path; any existing file is overwritten.
    """
    progress_frame = pd.DataFrame(completed_brands, columns=['Brand', 'Page_Number'])
    progress_frame.to_csv(filename, index=False)

def is_page_empty(driver):
    """Report whether the loaded brand listing page shows no products.

    The first ``searchList`` entry is awaited for up to 10 seconds. When it
    appears the page has items. When the wait times out, the page counts as
    empty only if the ``result-none-area`` paragraph carries the site's
    "no registered products" text.

    Args:
        driver: Selenium WebDriver positioned on a listing page.

    Returns:
        True if the "no products" text is found, otherwise False.
    """
    has_items = True
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="searchList"]/li[1]'))
        )
    except TimeoutException:
        has_items = False

    if has_items:
        return False

    # Timed out waiting for an item: check for the explicit empty-result text.
    try:
        message = driver.find_element(By.XPATH, '//*[@id="result-none-area"]/p').text
    except NoSuchElementException:
        message = ""
    return "등록된 상품이 없습니다" in message


def _find_or_none(driver, xpath):
    """Return the element matching ``xpath`` on the current page, or None if absent."""
    try:
        return driver.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return None


def _find_first(driver, xpath_template, div_candidates):
    """Return the first element found when ``{div}`` is replaced by each candidate in turn."""
    for div_num in div_candidates:
        element = _find_or_none(driver, xpath_template.format(div=div_num))
        if element is not None:
            return element
    return None


def scrape_product_info(driver, brand, item):
    """Scrape one product from the listing page by opening its detail page.

    The price is read from the listing entry, the product link is clicked, and
    category, image URL, name, product number and brand are read from the
    detail page. The browser is then sent back to the listing page.

    Args:
        driver: Selenium WebDriver positioned on a brand listing page.
        brand: Brand identifier being scraped; stored in the result.
        item: 1-based position of the product inside ``searchList``.

    Returns:
        A dict with keys ``brand``, ``item_category``, ``item_img``,
        ``item_name``, ``item_num``, ``item_brand`` and ``item_price``
        (fields that could not be found stay ``None``), or ``None`` when no
        product link exists at position ``item`` so the caller can report it.
    """
    item_details = {
        "brand": brand,
        "item_category": None,
        "item_img": None,
        "item_name": None,
        "item_num": None,
        "item_brand": None,
        "item_price": None
    }
    list_item = f'//*[@id="searchList"]/li[{item}]'
    detail_root = '//*[@id="root"]/div[1]'

    # Short pause before reading the listing, kept from the original flow
    # (presumably lets the page settle after the previous back() — TODO confirm).
    time.sleep(0.5)

    # Price candidates are probed from div[4] down to div[2], as before.
    price_element = _find_first(driver, list_item + '/div[{div}]/div[2]/p[3]', range(4, 1, -1))
    if price_element is not None:
        # Keep the price even if the brand link is later missing on the detail page.
        item_details["item_price"] = price_element.text

    link_element = _find_first(driver, list_item + '/div[{div}]/div[2]/p[2]/a', range(1, 5))
    if link_element is None:
        # Nothing was clicked, so we are still on the listing page: calling
        # back() here would leave it and break every following position.
        return None

    list_url = driver.current_url
    driver.execute_script("arguments[0].click();", link_element)

    # Wait for the detail page instead of sleeping a fixed time; on timeout the
    # lookups below still run best-effort and simply leave fields as None.
    category_xpath = detail_root + '/div[1]/div[1]/a[1]'
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, category_xpath)))
    except TimeoutException:
        pass

    # Each field is looked up on its own so one missing element does not
    # discard the others.
    category = _find_or_none(driver, category_xpath)
    if category is not None:
        item_details["item_category"] = category.text

    image = _find_or_none(driver, detail_root + '/div[2]/div[1]/div[1]/div/img')
    if image is not None:
        item_details["item_img"] = image.get_attribute('src')

    name = _find_or_none(driver, detail_root + '/div[1]/div[3]/div[2]/h3')
    if name is not None:
        item_details["item_name"] = name.text

    info_row = detail_root + '/div[2]/div[2]/div[{div}]/ul/li[1]/div[2]'
    number = _find_first(driver, info_row + '/span[2]', range(1, 5))
    if number is not None:
        item_details["item_num"] = number.text

    brand_link = _find_first(driver, info_row + '/a', range(1, 5))
    if brand_link is not None:
        item_details["item_brand"] = brand_link.text

    # Only go back if the click actually left the listing page.
    if driver.current_url != list_url:
        driver.back()

    return item_details

def save_data(data, brand, page_num, brand_page_info='Items_Info.csv'):
    """Save one page of scraped rows to ``./Datasets/{brand}_page_{page_num}.csv``.

    If the target file already holds data rows, nothing is written, so
    re-scraping a page does not duplicate it. A zero-byte file is written with
    a header; a header-only file gets the rows appended without a header.

    Args:
        data: List of row dicts; an empty list only prints a notice.
        brand: Brand identifier used in the output file name.
        page_num: Page number used in the output file name.
        brand_page_info: Accepted for backward compatibility; its value is
            not used because the path is always derived from ``brand`` and
            ``page_num``.
    """
    if not data:
        print("No data to save.")
        return

    new_data = pd.DataFrame(data)
    brand_page_info = f"./Datasets/{brand}_page_{page_num}.csv"
    os.makedirs("./Datasets", exist_ok=True)

    write_header = True
    if os.path.exists(brand_page_info):
        try:
            existing_data = pd.read_csv(brand_page_info)
        except pd.errors.EmptyDataError:
            # Zero-byte file (e.g. an interrupted earlier write): treat it as new.
            existing_data = None
        if existing_data is not None:
            if not existing_data.empty:
                # Rows are already stored for this page; do not append duplicates.
                return
            # Header-only file: append rows beneath the existing header.
            write_header = False

    new_data.to_csv(brand_page_info, mode='a', header=write_header, index=False)

def scrape_brand_data(driver, brand, start_page=1, end_page=100):
    """Scrape a brand page by page, saving rows and progress after each page.

    Pages ``start_page`` .. ``end_page`` are loaded in order and the run ends
    at the first page that ``is_page_empty`` reports as empty. After each
    scraped page the rows go to ``save_data`` and the list of pages completed
    during this call is written with ``save_progress``.

    Args:
        driver: Selenium WebDriver used for navigation.
        brand: Brand identifier inserted into the listing URL.
        start_page: First page number to scrape (inclusive).
        end_page: Last page number to scrape (inclusive).
    """
    finished_pages = []
    for page_num in range(start_page, end_page + 1):
        driver.get(f'https://www.musinsa.com/brands/{brand}?page={page_num}&size=90')

        if is_page_empty(driver):
            print(f"No items found for brand {brand} on page {page_num}.")
            return

        page_rows = []
        for position in tqdm(range(1, 91), desc=f"Scraping {brand} - Page {page_num}"):
            details = scrape_product_info(driver, brand, position)
            if not details:
                print(f"No item found at position {position} for brand {brand} on page {page_num}.")
                continue
            page_rows.append(details)

        save_data(page_rows, brand, page_num)
        page_rows.clear()

        # Record this page as done and persist the progress list so far.
        finished_pages.append({'Brand': brand, 'Page_Number': page_num})
        save_progress(finished_pages)

def main():
    """Scrape every brand listed in ``Brand_Name.csv``, resuming saved progress.

    Brand names come from the ``Brand_ENG`` column. If ``load_progress``
    reports a last completed brand that is present in the list, brands before
    it are skipped and that brand continues from the page after the last
    completed one; all later brands start at page 1. The WebDriver is always
    quit, even when scraping is interrupted or fails.
    """
    df = pd.read_csv('Brand_Name.csv')
    brand_names = df['Brand_ENG'].tolist()
    last_completed_brand, last_completed_page = load_progress()

    # Index of the brand to resume from; 0 means "start from the beginning".
    resume_index = 0
    if last_completed_brand in brand_names:
        resume_index = brand_names.index(last_completed_brand)

    driver = setup_driver()
    try:
        pending = tqdm(brand_names[resume_index:], desc="Overall Progress",
                       initial=resume_index, total=len(brand_names))
        for brand in pending:
            if brand == last_completed_brand:
                # Continue the last completed brand from its next page.
                start_page = last_completed_page + 1
            else:
                # Any other brand starts from its first page.
                start_page = 1
            print(f"Starting scraping for brand: {brand} from page {start_page}")
            scrape_brand_data(driver, brand, start_page=start_page)
    finally:
        # Close the browser even on KeyboardInterrupt or scraping errors.
        driver.quit()




# Start the scraper when this code is executed directly (not imported).
if __name__ == "__main__":
    main()


Overall Progress:   0%|          | 0/2891 [00:00<?, ?it/s]

Starting scraping for brand: SCULPTOR from page 4


Scraping SCULPTOR - Page 4: 100%|██████████| 90/90 [04:05<00:00,  2.73s/it]
Scraping SCULPTOR - Page 5: 100%|██████████| 90/90 [04:06<00:00,  2.74s/it]
Scraping SCULPTOR - Page 6: 100%|██████████| 90/90 [04:03<00:00,  2.71s/it]
Scraping SCULPTOR - Page 7: 100%|██████████| 90/90 [04:05<00:00,  2.73s/it]
Overall Progress:   0%|          | 1/2891 [16:38<801:25:13, 998.31s/it]

No items found for brand SCULPTOR on page 8.
Starting scraping for brand: GLOWNY from page 1


Scraping GLOWNY - Page 1: 100%|██████████| 90/90 [04:01<00:00,  2.68s/it]
Scraping GLOWNY - Page 2: 100%|██████████| 90/90 [04:07<00:00,  2.75s/it]
Scraping GLOWNY - Page 3: 100%|██████████| 90/90 [04:13<00:00,  2.82s/it]
Scraping GLOWNY - Page 4: 100%|██████████| 90/90 [04:08<00:00,  2.76s/it]
Overall Progress:   0%|          | 2/2891 [33:24<804:38:06, 1002.66s/it]

No items found for brand GLOWNY on page 5.
Starting scraping for brand: PLACESTUDIO from page 1


Scraping PLACESTUDIO - Page 1: 100%|██████████| 90/90 [04:28<00:00,  2.98s/it]
Scraping PLACESTUDIO - Page 2: 100%|██████████| 90/90 [04:30<00:00,  3.00s/it]
Scraping PLACESTUDIO - Page 3: 100%|██████████| 90/90 [04:32<00:00,  3.03s/it]
Scraping PLACESTUDIO - Page 4: 100%|██████████| 90/90 [04:34<00:00,  3.05s/it]
Overall Progress:   0%|          | 3/2891 [51:44<840:11:24, 1047.33s/it]

No items found for brand PLACESTUDIO on page 5.
Starting scraping for brand: SIGNATURE from page 1


Scraping SIGNATURE - Page 1: 100%|██████████| 90/90 [04:47<00:00,  3.20s/it]
Scraping SIGNATURE - Page 2: 100%|██████████| 90/90 [04:45<00:00,  3.18s/it]
Scraping SIGNATURE - Page 3: 100%|██████████| 90/90 [04:43<00:00,  3.15s/it]
Overall Progress:   0%|          | 4/2891 [1:06:15<784:04:55, 977.73s/it]

No items found for brand SIGNATURE on page 4.
Starting scraping for brand: FABREGAT from page 1


Scraping FABREGAT - Page 1: 100%|██████████| 90/90 [04:52<00:00,  3.25s/it]
Scraping FABREGAT - Page 2: 100%|██████████| 90/90 [04:48<00:00,  3.20s/it]
Overall Progress:   0%|          | 5/2891 [1:16:10<673:09:45, 839.70s/it]

No items found for brand FABREGAT on page 3.
Starting scraping for brand: MARDIMERCREDI from page 1


Scraping MARDIMERCREDI - Page 1: 100%|██████████| 90/90 [05:00<00:00,  3.34s/it]
Scraping MARDIMERCREDI - Page 2: 100%|██████████| 90/90 [05:04<00:00,  3.39s/it]
Scraping MARDIMERCREDI - Page 3: 100%|██████████| 90/90 [05:03<00:00,  3.37s/it]
Scraping MARDIMERCREDI - Page 4: 100%|██████████| 90/90 [05:02<00:00,  3.37s/it]
Scraping MARDIMERCREDI - Page 5: 100%|██████████| 90/90 [05:13<00:00,  3.48s/it]
Scraping MARDIMERCREDI - Page 6: 100%|██████████| 90/90 [05:18<00:00,  3.54s/it]
Scraping MARDIMERCREDI - Page 7: 100%|██████████| 90/90 [05:22<00:00,  3.58s/it]
Scraping MARDIMERCREDI - Page 8: 100%|██████████| 90/90 [05:24<00:00,  3.60s/it]
Overall Progress:   0%|          | 6/2891 [1:58:06<1129:37:28, 1409.58s/it]

No items found for brand MARDIMERCREDI on page 9.
Starting scraping for brand: SLOWACID from page 1


Scraping SLOWACID - Page 1: 100%|██████████| 90/90 [05:49<00:00,  3.88s/it]
Scraping SLOWACID - Page 2: 100%|██████████| 90/90 [05:50<00:00,  3.89s/it]
Scraping SLOWACID - Page 3: 100%|██████████| 90/90 [05:46<00:00,  3.85s/it]
Scraping SLOWACID - Page 4: 100%|██████████| 90/90 [05:44<00:00,  3.83s/it]
Overall Progress:   0%|          | 7/2891 [2:21:36<1129:17:17, 1409.65s/it]

No items found for brand SLOWACID on page 5.
Starting scraping for brand: BROWNBREATH from page 1


Scraping BROWNBREATH - Page 1: 100%|██████████| 90/90 [05:57<00:00,  3.97s/it]
Scraping BROWNBREATH - Page 2: 100%|██████████| 90/90 [06:03<00:00,  4.04s/it]
Scraping BROWNBREATH - Page 3: 100%|██████████| 90/90 [06:13<00:00,  4.15s/it]
Scraping BROWNBREATH - Page 4: 100%|██████████| 90/90 [06:13<00:00,  4.15s/it]
Overall Progress:   0%|          | 7/2891 [2:46:17<1141:50:49, 1425.33s/it]


KeyboardInterrupt: 