In [4]:
# Step 0: 구글 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

# 작업 디렉토리 설정
import os
project_path = '/content/drive/MyDrive/aladin-project'

# 폴더가 없으면 생성
os.makedirs(project_path, exist_ok=True)
os.makedirs(f'{project_path}/data/raw', exist_ok=True)

%cd {project_path}
print(f"현재 작업 디렉토리: {os.getcwd()}")

# 필요 라이브러리
import re
import time, datetime, ssl
import pandas as pd
import urllib.request
from itertools import count
from bs4 import BeautifulSoup
from random import uniform

class AladinBestSeller():
    myencoding = 'utf-8'

    def getSoup(self):
        if self.soup == None:
            return None
        else:
            return BeautifulSoup(self.soup, 'html.parser')

    def get_request_url(self):
        request = urllib.request.Request(self.url)
        try:
            context = ssl._create_unverified_context()
            response = urllib.request.urlopen(request, context=context)
            if response.getcode() == 200:
                return response.read().decode(self.myencoding)
        except Exception as err:
            print(err)
            now = datetime.datetime.now()
            msg = '[%s] error for url %s' % (now, self.url)
            print(msg)
            return None

    def save2Csv(self, result):
        data = pd.DataFrame(result, columns=self.mycolumns)
        save_path = f'data/raw/{self.siteName}.csv'
        data.to_csv(save_path, encoding='utf-8-sig', index=True)
        print(f"파일 저장 완료: {save_path}")

    def __init__(self, siteName, url):
        self.siteName = siteName
        self.url = url
        self.mycolumns = ['year', 'month', 'rank', 'category', 'title', 'author', 'price', 'star_score', 'page_count', 'item_id']
        self.soup = self.get_request_url()

####################################################
siteName = 'aladin'
base_url = 'https://www.aladin.co.kr/shop/common/wbest.aspx'
####################################################

def getData():
    savedData = []
    for year in range(2020, 2026):
        last_month = 11 if year == 2025 else 12
        for month in range(1, last_month + 1):
            url = base_url
            url += '?BranchType=1&CID=0&Year=' + str(year)
            url += '&Month=' + str(month)
            url += '&Week=1&BestType=MonthlyBest&SearchSubBarcode='
            print(url)

            aladin = AladinBestSeller(siteName, url)
            soup = aladin.getSoup()

            if soup is None:
                break

            for rank, item in enumerate(soup.select("div.ss_book_box"), start=1):
                try:
                    category_tag = item.select_one("span.tit_catrgory")
                    category = category_tag.get_text(strip=True).strip('[]') if category_tag else "N/A"

                    title_tag = item.select_one("a.bo3")
                    title = title_tag.get_text(strip=True) if title_tag else "N/A"

                    item_id = "N/A"
                    if title_tag and title_tag.has_attr('href'):
                        match = re.search(r'ItemId=(\d+)', title_tag['href'])
                        if match:
                            item_id = match.group(1)

                    author = "N/A"
                    book_list_ul = item.select_one("div.ss_book_list ul")
                    if book_list_ul:
                        author_link = book_list_ul.select_one("a[href*='AuthorSearch']")
                        if author_link:
                            author = author_link.get_text(strip=True)

                    price_tag = item.select_one("span.ss_p2")
                    price_text = price_tag.get_text(strip=True).split('원')[0] if price_tag else "0"
                    price = int(price_text.replace(",", ""))

                    star_score_tag = item.select_one("span.star_score")
                    star_score = float(star_score_tag.get_text(strip=True)) if star_score_tag else 0.0

                    page_count = 0

                    savedData.append([year, month, rank, category, title, author, price, star_score, page_count, item_id])
                except Exception as err:
                    print(err)
                    continue

            time.sleep(uniform(1, 2))

    aladin.save2Csv(savedData)
    print('=' * 50)
    print(f"총 {len(savedData)}개 데이터 수집 완료")
    return len(savedData)

####################################################
print(siteName + ' 베스트셀러 크롤링 시작')
total_count = getData()
print(siteName + ' 베스트셀러 크롤링 끝')
print(f"data/raw/aladin.csv 파일이 구글 드라이브에 저장되었습니다.")

In [6]:
from google.colab import drive
import os
import pandas as pd
import numpy as np

# 구글 드라이브 마운트
drive.mount('/content/drive')

# 프로젝트 폴더 설정
project_path = '/content/drive/MyDrive/aladin-project'

# 해당 폴더로 이동
%cd {project_path}
print(f"현재 작업 디렉토리: {os.getcwd()}")

# CSV 로드
df = pd.read_csv('data/raw/aladin.csv')
df.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/aladin-project
현재 작업 디렉토리: /content/drive/MyDrive/aladin-project


Unnamed: 0.1,Unnamed: 0,year,month,rank,category,title,author,price,star_score,page_count,item_id
0,0,2020,1,1,국내도서,흔한남매 3,흔한남매,10800,9.9,0,223274669
1,1,2020,1,2,국내도서,지적 대화를 위한 넓고 얕은 지식 : 제로 편,채사장,20700,8.5,0,219841523
2,2,2020,1,3,국내도서,마도조사 4 박스판,묵향동후,16650,8.3,0,227151626
3,3,2020,1,4,국내도서,추리 천재 엉덩이 탐정 9,트롤,10800,9.8,0,227129549
4,4,2020,1,5,국내도서,에이트,이지성,17100,6.4,0,212657645


In [None]:
# Step 0: 구글 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

# 작업 디렉토리 설정
import os
project_path = '/content/drive/MyDrive/aladin-project'
%cd {project_path}
print(f"현재 작업 디렉토리: {os.getcwd()}")

# Step 1: 라이브러리 import
import time
import datetime
import ssl
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import re
from random import uniform
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Step 2: 카테고리 + 페이지 수 크롤링 클래스 정의
class AladinDetailCrawler():
    myencoding = 'utf-8'

    def get_detail_info(self, detail_url, retry=3):
        """상세 페이지에서 카테고리와 페이지 수 추출"""
        for attempt in range(retry):
            try:
                request = urllib.request.Request(detail_url)
                request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
                context = ssl._create_unverified_context()
                response = urllib.request.urlopen(request, context=context, timeout=15)

                if response.getcode() == 200:
                    html = response.read().decode(self.myencoding)
                    soup = BeautifulSoup(html, 'html.parser')

                    # 1. 카테고리 추출
                    category = "N/A"
                    category_links = soup.select('ul#ulCategory li a')
                    if len(category_links) >= 2:
                        category = category_links[1].get_text(strip=True)

                    # 2. 페이지 수 추출
                    page_count = 0
                    info_list = soup.select('div.conts_info_list1 ul li')
                    for li in info_list:
                        text = li.get_text(strip=True)
                        if '쪽' in text or '페이지' in text:
                            match = re.search(r'(\d+)\s*쪽', text)
                            if not match:
                                match = re.search(r'(\d+)\s*페이지', text)
                            if match:
                                page_count = int(match.group(1))
                                break

                    return (category, page_count)

            except Exception as err:
                if attempt < retry - 1:
                    wait_time = (attempt + 1) * 2
                    time.sleep(wait_time)
                else:
                    return ("N/A", 0)
        return ("N/A", 0)

    def save2Csv(self, result):
        data = pd.DataFrame(result, columns=self.mycolumns)
        save_path = f'data/raw/{self.siteName}.csv'
        data.to_csv(save_path, encoding='utf-8-sig', index=False)
        print(f"파일 저장 완료: {save_path}")

    def __init__(self, siteName):
        self.siteName = siteName
        self.mycolumns = ['item_id', 'real_category', 'page_count']

# Step 3: 크롤링 함수
def getDetailData(csv_file='data/raw/aladin.csv', max_workers=15):
    """기존 CSV에서 고유 ItemId의 카테고리와 페이지 수를 병렬로 크롤링"""

    try:
        df = pd.read_csv(csv_file, index_col=0)
        if 'item_id' not in df.columns:
            print(f"오류: {csv_file}에 'item_id' 컬럼이 없습니다.")
            return
    except FileNotFoundError:
        print(f"오류: {csv_file} 파일을 찾을 수 없습니다.")
        return

    print(f"원본 데이터: {len(df)}개 행")

    # 고유 ItemId 추출
    df_valid = df[pd.to_numeric(df['item_id'], errors='coerce').notnull()]
    unique_item_ids = df_valid['item_id'].unique()

    total_count = len(unique_item_ids)
    if total_count == 0:
        print("크롤링할 유효한 item_id가 없습니다.")
        return

    print(f"고유 ItemID 수: {total_count}개")

    crawler = AladinDetailCrawler('detail_mapping')
    savedData = []

    def fetch_detail(item_id):
        """단일 ItemID에 대해 카테고리와 페이지 수를 가져오는 함수"""
        try:
            detail_url = f'https://www.aladin.co.kr/shop/wproduct.aspx?ItemId={item_id}'
            category, page_count = crawler.get_detail_info(detail_url)
            time.sleep(uniform(0.5, 1.5))
            return (item_id, category, page_count, None)
        except Exception as err:
            return (item_id, 'N/A', 0, str(err))

    print(f"병렬 크롤링 시작 (최대 {max_workers}개 동시 작업)...")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(fetch_detail, item_id): item_id for item_id in unique_item_ids}

        for future in tqdm(as_completed(futures), total=total_count, desc="상세정보 크롤링"):
            item_id, category, page_count, error = future.result()
            savedData.append([item_id, category, page_count])

            if error:
                tqdm.write(f"  [오류] ItemID {item_id}: {error}")

    crawler.save2Csv(savedData)

    print("=" * 60)
    print(f"총 {len(savedData)}개 ItemID 처리 완료")
    success_category = len([x for x in savedData if x[1] != 'N/A'])
    success_page = len([x for x in savedData if x[2] > 0])
    print(f"카테고리 추출 성공: {success_category}개")
    print(f"페이지 수 추출 성공: {success_page}개")
    print(f"모두 실패: {len(savedData) - max(success_category, success_page)}개")

# Step 4: 크롤링 실행
print("detail_mapping 크롤링 시작")
getDetailData('data/raw/aladin.csv', max_workers=15)
print("detail_mapping 크롤링 끝")

# Step 5: 결과 확인
df_detail = pd.read_csv('data/raw/detail_mapping.csv')
print(f"\n상세정보 데이터: {len(df_detail)}개 행")
print(f"\n통계:")
print(f"  - 카테고리 추출 성공: {(df_detail['real_category'] != 'N/A').sum()}개")
print(f"  - 페이지 수 추출 성공: {(df_detail['page_count'] > 0).sum()}개")
print(f"\n샘플 데이터:")
df_detail.head(10)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/aladin-project
현재 작업 디렉토리: /content/drive/MyDrive/aladin-project
detail_mapping 크롤링 시작
원본 데이터: 3539개 행
고유 ItemID 수: 1958개
병렬 크롤링 시작 (최대 15개 동시 작업)...


상세정보 크롤링: 100%|██████████| 1958/1958 [12:01<00:00,  2.71it/s]


파일 저장 완료: data/raw/detail_mapping.csv
총 1958개 ItemID 처리 완료
카테고리 추출 성공: 1937개
페이지 수 추출 성공: 1937개
모두 실패: 21개
detail_mapping 크롤링 끝

상세정보 데이터: 1958개 행

통계:
  - 카테고리 추출 성공: 1958개
  - 페이지 수 추출 성공: 1937개

샘플 데이터:


Unnamed: 0,item_id,real_category,page_count
0,227151626,,0
1,223726675,에세이,216
2,227151653,,0
3,220642182,자기계발,304
4,221601434,소설/시/희곡,352
5,219841523,인문학,556
6,227693700,만화,288
7,226928892,어린이,208
8,870950,과학,719
9,212657645,자기계발,308
