In [7]:
# under19

import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Crawl each Kakao Page content page and flag whether its "동일작" (same work)
# section lists a "웹소설" (web novel) entry. Results are written to one CSV per
# chunk of `chunk_size` title ids.

# Load the full dataset; only the `title_id` column is used below.
file_path = "kakao_작가 연재 수, 원작 여부 병합 전 최종_under19.csv"
df = pd.read_csv(file_path, encoding="utf-8-sig")
title_ids = df['title_id'].tolist()

total = len(title_ids)
chunk_size = 100

# Dedicated folder that receives one CSV file per chunk.
output_folder = "kakao_novel_original_chunks"
os.makedirs(output_folder, exist_ok=True)

# Selenium setup: Chrome started with the headless / no-sandbox flags below.
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# XPath expressions hoisted out of the loop; they do not depend on the title.
# NOTE(review): these match exact class strings, so they presumably break if the
# site's markup changes — confirm periodically against the live page.
header_xpath = "//div[@class='font-medium1-bold w-full self-center overflow-hidden text-ellipsis text-el-70 line-clamp-1' and text()='동일작']"
section_xpath = "./ancestor::div[contains(@class, 'bg-bg-a-20')]"
webnovel_xpath = ".//span[@class='break-all align-middle' and text()='웹소설']"

try:
    for i in range(0, total, chunk_size):
        chunk = title_ids[i:i+chunk_size]
        results = []

        for j, title_id in enumerate(chunk):
            url = f"https://page.kakao.com/content/{title_id}"
            driver.get(url)
            # Fixed pause after navigation, presumably to let the page render.
            time.sleep(3)

            try:
                # Locate the "동일작" header, climb to its enclosing section,
                # then look for a "웹소설" label inside that section.
                header_element = driver.find_element(By.XPATH, header_xpath)
                same_work_section = header_element.find_element(By.XPATH, section_xpath)
                webnovel_elements = same_work_section.find_elements(By.XPATH, webnovel_xpath)
                novel_original = 1 if webnovel_elements else 0
            except NoSuchElementException:
                # Expected case: the page has no "동일작" section at all.
                novel_original = 0
            except WebDriverException as exc:
                # Unexpected browser error: keep the crawl going and keep the
                # previous fallback value, but make the failure visible instead
                # of hiding it. KeyboardInterrupt / SystemExit are no longer
                # swallowed here.
                print(f"⚠️ title_id={title_id} 조회 중 오류 발생: {exc!r} → novel_original=0 으로 기록")
                novel_original = 0

            results.append({"title_id": title_id, "novel_original": novel_original})

            # Progress report for the current title.
            current_idx = i + j + 1
            print(f"[{current_idx}/{total}] title_id={title_id} novel_original={novel_original}")

        # Save this chunk's results into the dedicated folder.
        file_num = (i // chunk_size) + 1
        output_path = os.path.join(output_folder, f"kakao_novel_original_{file_num}.csv")
        df_chunk = pd.DataFrame(results)
        df_chunk.to_csv(output_path, index=False, encoding="utf-8-sig")
        # Number of titles processed so far, computed explicitly rather than
        # read from the inner loop's leftover variable.
        processed = i + len(chunk)
        print(f"✅ 저장 완료: {output_path} (진행률: {processed/total:.2%})")
finally:
    # Always release the browser process, even if the crawl is interrupted or
    # an unexpected error propagates.
    driver.quit()

print("🎉 전체 크롤링 완료!")


[1/2366] title_id=50866481 novel_original=1
[2/2366] title_id=56976992 novel_original=1
[3/2366] title_id=56271898 novel_original=1
[4/2366] title_id=55566760 novel_original=1
[5/2366] title_id=57260192 novel_original=1
[6/2366] title_id=58800646 novel_original=1
[7/2366] title_id=54189843 novel_original=1
[8/2366] title_id=50242834 novel_original=1
[9/2366] title_id=53190884 novel_original=1
[10/2366] title_id=55021766 novel_original=1
[11/2366] title_id=59999668 novel_original=1
[12/2366] title_id=54688834 novel_original=1
[13/2366] title_id=53297664 novel_original=1
[14/2366] title_id=60626628 novel_original=1
[15/2366] title_id=58410245 novel_original=1
[16/2366] title_id=56657309 novel_original=1
[17/2366] title_id=55872683 novel_original=1
[18/2366] title_id=56556599 novel_original=1
[19/2366] title_id=50289296 novel_original=1
[20/2366] title_id=51526841 novel_original=1
[21/2366] title_id=62346531 novel_original=0
[22/2366] title_id=52226628 novel_original=1
[23/2366] title_id=

In [1]:
import pandas as pd
import glob

# Merge the per-chunk CSV files produced by the crawl into one final CSV.

# 1) Folder that holds the per-chunk CSV files (adjust to the actual folder name).
folder_path = "kakao_원작 여부_under19"
file_pattern = f"{folder_path}/kakao_novel_original_*.csv"

# 2) Collect the matching files, ordered by the integer between the last "_"
#    and the first following "." so that chunk 10 sorts after chunk 9.
file_list = sorted(
    glob.glob(file_pattern),
    key=lambda x: int(x.split('_')[-1].split('.')[0]),
)

# Fail loudly instead of silently writing an empty "final" file when the
# folder name or pattern is wrong.
if not file_list:
    raise FileNotFoundError(f"No chunk files match pattern: {file_pattern}")

# Read every chunk first and concatenate once at the end; concatenating inside
# the loop re-copies the growing frame on each iteration.
frames = []
row_count = 0
for file in file_list:
    df = pd.read_csv(file, encoding="utf-8-sig")
    frames.append(df)
    row_count += len(df)
    print(f"✅ {file} 병합 완료 (현재 행 개수: {row_count})")

merged_df = pd.concat(frames, ignore_index=True)

# 3) Save the merged result.
output_file = "kakao_novel_original_final.csv"
merged_df.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"🎉 최종 병합 파일 저장 완료: {output_file}")


✅ kakao_원작 여부_under19\kakao_novel_original_1.csv 병합 완료 (현재 행 개수: 100)
✅ kakao_원작 여부_under19\kakao_novel_original_2.csv 병합 완료 (현재 행 개수: 200)
✅ kakao_원작 여부_under19\kakao_novel_original_3.csv 병합 완료 (현재 행 개수: 300)
✅ kakao_원작 여부_under19\kakao_novel_original_4.csv 병합 완료 (현재 행 개수: 400)
✅ kakao_원작 여부_under19\kakao_novel_original_5.csv 병합 완료 (현재 행 개수: 500)
✅ kakao_원작 여부_under19\kakao_novel_original_6.csv 병합 완료 (현재 행 개수: 600)
✅ kakao_원작 여부_under19\kakao_novel_original_7.csv 병합 완료 (현재 행 개수: 700)
✅ kakao_원작 여부_under19\kakao_novel_original_8.csv 병합 완료 (현재 행 개수: 800)
✅ kakao_원작 여부_under19\kakao_novel_original_9.csv 병합 완료 (현재 행 개수: 900)
✅ kakao_원작 여부_under19\kakao_novel_original_10.csv 병합 완료 (현재 행 개수: 1000)
✅ kakao_원작 여부_under19\kakao_novel_original_11.csv 병합 완료 (현재 행 개수: 1100)
✅ kakao_원작 여부_under19\kakao_novel_original_12.csv 병합 완료 (현재 행 개수: 1200)
✅ kakao_원작 여부_under19\kakao_novel_original_13.csv 병합 완료 (현재 행 개수: 1300)
✅ kakao_원작 여부_under19\kakao_novel_original_14.csv 병합 완료 (현재 행 개수: 1400)
✅ kakao_원작