# In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import json
from datetime import datetime, timedelta

# Maps the category code found at the end of a section URL to its display name.
# NOTE(review): '산업/제계' looks like a typo for '산업/재계'. It is left unchanged
# here because the label ends up in the exported CSV - confirm before correcting.
category_mapping = {
    '259': '금융',
    '258': '증권',
    '261': '산업/제계',
    '771': '중기/벤처',
    '260': '부동산',
    '262': '글로벌 경제',
    '310': '생활경제',
    '263': '경제일반'
}


# Extract the category code from a URL and return the category name.
def get_category_from_url(url):
    """Return the category name for a section URL.

    The category code is the last path segment of `url`.  The query string
    and fragment are removed *before* the path is split, and a trailing
    slash is ignored, so URLs such as ".../101/258/", ".../101/258?date=..."
    and ".../101/258?next=/a/b" all resolve to code '258'.

    Args:
        url: Section URL whose final path segment is a category code.

    Returns:
        The name from `category_mapping`, or '기타' when the code is unknown.
    """
    # Cut off "#fragment" and "?query" first so a "/" inside them cannot be
    # mistaken for a path separator.
    path = url.split('#', 1)[0].split('?', 1)[0]
    category_id = path.rstrip('/').rsplit('/', 1)[-1]
    return category_mapping.get(category_id, '기타')

# Scroll to the end of the page so that further content gets loaded.
def scroll_to_bottom(driver, pause_time=1):
    """Keep scrolling `driver`'s page down until nothing more can be loaded.

    After every scroll the function sleeps `pause_time` seconds and re-reads
    document.body.scrollHeight.  While the height keeps changing it simply
    scrolls again.  Once the height is unchanged it waits up to 3 seconds for
    the "div > div.section_more > a" link to become clickable and clicks it;
    any exception raised by that wait or click is printed and ends the loop.

    Args:
        driver: Selenium WebDriver showing the page to scroll.
        pause_time: Seconds to sleep after each scroll and after each click.
    """
    height_query = "return document.body.scrollHeight"
    known_height = driver.execute_script(height_query)

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_time)
        current_height = driver.execute_script(height_query)

        if current_height != known_height:
            # The page height changed, so scroll again before trying the button.
            known_height = current_height
            continue

        try:
            locator = (By.CSS_SELECTOR, "div > div.section_more > a")
            button_ready = EC.element_to_be_clickable(locator)
            WebDriverWait(driver, 3).until(button_ready).click()
            time.sleep(pause_time)
        except Exception as e:
            print(f"더보기 버튼 클릭 오류: {e}")
            break

def _first_text(soup, name, attrs=None):
    """Return the text of the first tag matching `name`/`attrs`, or None."""
    tag = soup.find(name, attrs or {})
    return tag.text if tag is not None else None


# Extract the details of a single article.
def extract_article_details(url, timeout=10):
    """Download an article page and pull out its main fields.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the HTTP request.  Without a timeout
            `requests.get` can block indefinitely on a stalled connection.

    Returns:
        A 5-tuple `(title, author, content, date, mod_date)` of strings.
        A field whose element is not found is replaced by its placeholder
        ('제목 오류', '기자명 오류', '본문 오류', '날짜 오류', '수정 날짜 없음').
        If anything raises (including a request timeout), the error is
        printed and every field is '오류'.
    """
    try:
        print(f"기사 정보 추출 중: {url}")
        res = requests.get(url, timeout=timeout)
        soup_in = BeautifulSoup(res.text, 'html.parser')

        # Each element is looked up once; `is None` (not truthiness) is used
        # so that an element with empty text is still returned as ''.
        title = _first_text(soup_in, 'h2')
        if title is None:
            title = '제목 오류'

        author = _first_text(soup_in, 'em', {"class": "media_end_head_journalist_name"})
        if author is None:
            byline = _first_text(soup_in, 'span', {"class": "byline_s"})
            # Fallback byline: keep its first three characters and append " 기자".
            author = byline[:3] + " 기자" if byline is not None else '기자명 오류'

        content = _first_text(soup_in, 'article', {"id": "dic_area"})
        if content is None:
            content = '본문 오류'

        date = _first_text(
            soup_in, 'span',
            {"class": "media_end_head_info_datestamp_time _ARTICLE_DATE_TIME"},
        )
        if date is None:
            date = '날짜 오류'

        mod_date = _first_text(
            soup_in, 'span',
            {"class": "media_end_head_info_datestamp_time _ARTICLE_MODIFY_DATE_TIME"},
        )
        if mod_date is None:
            mod_date = '수정 날짜 없음'

        return title, author, content, date, mod_date

    except Exception as e:
        # Best-effort: one broken article must not stop the whole crawl.
        print(f"기사 세부 정보 추출 중 오류 발생: {e}")
        return '오류', '오류', '오류', '오류', '오류'

# Extract the comments of a single article.
def extract_comments(article_url, timeout=10):
    """Fetch the comments of an article from the comment-box JSONP endpoint.

    The press id and article id are taken from the last two path segments of
    `article_url` (query string, fragment and trailing slash are ignored so
    they cannot leak into the `objectId` parameter).  The request asks for
    page 1 with pageSize=100 sorted by FAVORITE, so comments beyond that
    first page are not requested.

    Args:
        article_url: Article address ending in ".../<press_id>/<news_id>".
        timeout: Seconds to wait for the HTTP request.

    Returns:
        A list of dicts with the keys "user_unique", "comment",
        "comment_date" and "update_date".  An empty list is returned when the
        request fails, the response is not valid JSON, or it has no comments.

    Raises:
        IndexError: If `article_url` has fewer than two "/"-separated parts.
    """
    path = article_url.split('#', 1)[0].split('?', 1)[0].rstrip('/')
    segments = path.split("/")
    news_id = segments[-1]
    press_id = segments[-2]
    news_object = f"news{press_id}%2C{news_id}"

    url = f"https://apis.naver.com/commentBox/cbox/web_naver_list_jsonp.json?ticket=news&templateId=view_economy_m1&pool=cbox5&_cv=20240826125754&lang=ko&country=KR&objectId={news_object}&categoryId=&pageSize=100&indexSize=10&groupId=&listType=OBJECT&pageType=more&page=1&initialize=true&followSize=5&userType=&useAltSort=true&replyPageSize=20&sort=FAVORITE&includeAllStatus=true&_=1725273489605"

    custom_header = {
        "Referer": article_url,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=custom_header, timeout=timeout)
    except requests.RequestException as e:
        # A failed comment request should not abort the crawl of the articles.
        print(f"댓글 요청 오류: {e}")
        return []

    # Unwrap the JSONP envelope "..._callback(<json>);".  Whitespace is
    # stripped first so a trailing newline does not hide the closing ");".
    json_string = response.text.split("_callback(")[-1].strip().rstrip(');')

    try:
        json_data = json.loads(json_string)
    except json.JSONDecodeError as e:
        print(f"JSON 파싱 오류: {e}")
        return []

    # Tolerate a missing or null "result"/"commentList" instead of raising.
    result = json_data.get('result') if isinstance(json_data, dict) else None
    comment_list = result.get('commentList') if isinstance(result, dict) else None

    comments = []
    for comment in comment_list or []:
        # Only the first five characters of the id are kept.  The placeholder
        # is applied after slicing so it is never truncated to "unkno".
        user_id_no = comment.get("userIdNo")
        user_unique = user_id_no[:5] if user_id_no else "unknown"

        comments.append({
            "user_unique": user_unique,
            "comment": comment.get("contents"),
            "comment_date": comment.get("regTime"),
            "update_date": comment.get("modTime", "수정되지 않음")
        })
    return comments

# Crawl the news articles and their comments.
def scrape_news(date_range, base_url):
    """Crawl every article listed under `base_url` for each date.

    For each date the listing page "<base_url>?date=YYYYMMDD" is opened with
    the module-level Selenium `driver`, scrolled to the bottom, and parsed.
    Every listed article is then fetched with `extract_article_details` and
    `extract_comments`.

    Args:
        date_range: Iterable of datetime objects; it is iterated exactly once.
        base_url: Section URL whose last path segment is the category code.

    Returns:
        A dict of parallel lists under the keys "title", "author", "content",
        "publication_date", "update_date", "article_url", "publisher" and
        "category" (one entry per article), plus "comments", a flat list of
        the comment dicts of all articles.
    """
    title_list, content_list, author_list = [], [], []
    date_list, update_date_list, publisher_list = [], [], []
    url_list, comments_list, category_list = [], [], []

    category = get_category_from_url(base_url)

    for day in date_range:
        url = f"{base_url}?date={day.strftime('%Y%m%d')}"
        print(f"크롤링 중: {url}")
        driver.get(url)
        time.sleep(3)  # presumably lets the listing render before scrolling

        scroll_to_bottom(driver)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        links = soup.select("div.sa_text > a")
        publishers = soup.select("div.sa_text_press")
        if len(links) != len(publishers):
            # zip() below stops at the shorter list, so a mismatch may drop or
            # mispair entries; report it instead of hiding it.
            print(f"경고: 링크 수({len(links)})와 언론사 수({len(publishers)})가 일치하지 않습니다: {url}")

        for link, publisher in zip(links, publishers):
            article_url = link.get('href')
            if not article_url:
                # Without an address there is nothing to fetch, and
                # extract_comments() would fail on a missing URL.
                continue

            publisher_name = publisher.text.strip()
            url_list.append(article_url)
            publisher_list.append(publisher_name)
            category_list.append(category)

            # `published` is deliberately not called `date`, so the loop
            # variable holding the listing date is never overwritten.
            title, author, content, published, mod_date = extract_article_details(article_url)
            title_list.append(title)
            author_list.append(author)
            content_list.append(content)
            date_list.append(published)
            update_date_list.append(mod_date)

            comments = extract_comments(article_url)
            comments_list.extend(comments)

            print(f"기사 URL: {article_url}, 발행자: {publisher_name}, 카테고리: {category}")

    return {
        "title": title_list,
        "author": author_list,
        "content": content_list,
        "publication_date": date_list,
        "update_date": update_date_list,
        "article_url": url_list,
        "publisher": publisher_list,
        "category": category_list,
        "comments": comments_list
    }

# Generate the range of dates to crawl.
def generate_date_range(start_date, end_date):
    """Yield one datetime per day from `start_date` to `end_date`, inclusive.

    Args:
        start_date: First day as a 'YYYYMMDD' string.
        end_date: Last day as a 'YYYYMMDD' string.

    Yields:
        datetime objects at midnight, one per day in ascending order.
        Nothing is yielded when `start_date` is later than `end_date`.
    """
    first_day = datetime.strptime(start_date, '%Y%m%d')
    last_day = datetime.strptime(end_date, '%Y%m%d')
    day_count = (last_day - first_day).days + 1
    for offset in range(day_count):
        yield first_day + timedelta(days=offset)

# Save the collected data as CSV files.
def save_to_csv(start_date, end_date, all_scraped_data):
    """Write the scraped articles and comments to two CSV files.

    The files are named "<start_date>-<end_date>_articles.csv" and
    "<start_date>-<end_date>_comments.csv" and are written without the
    DataFrame index.

    Args:
        start_date: Start of the crawled period, used in the file names.
        end_date: End of the crawled period, used in the file names.
        all_scraped_data: Dict holding the per-article lists and, under
            "comments", a list of comment dicts.
    """
    article_columns = (
        'category', 'publisher', 'publication_date', 'title',
        'content', 'author', 'article_url', 'update_date',
    )
    comment_columns = ('user_unique', 'comment', 'comment_date', 'update_date')

    prefix = f"{start_date}-{end_date}"
    file_name_articles = f"{prefix}_articles.csv"
    file_name_comments = f"{prefix}_comments.csv"

    # Article table: a constant platform column followed by the scraped lists.
    article_table = {'platform': '네이버'}
    for column in article_columns:
        article_table[column] = all_scraped_data[column]
    df_articles = pd.DataFrame(article_table)

    # Comment table: one column per field of the comment dicts.
    comment_rows = all_scraped_data['comments']
    df_comments = pd.DataFrame({
        column: [row[column] for row in comment_rows]
        for column in comment_columns
    })

    df_articles.to_csv(file_name_articles, index=False)
    df_comments.to_csv(file_name_comments, index=False)

    print(f"기사 데이터가 '{file_name_articles}' 파일로 저장되었습니다.")
    print(f"댓글 데이터가 '{file_name_comments}' 파일로 저장되었습니다.")

# Configure a headless Chrome driver and run the crawl.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
driver = webdriver.Chrome(options=chrome_options)

start_date = '20240801'
end_date = '20240831'
# generate_date_range() is a generator function and can be consumed only once,
# but the same date_range is passed to scrape_news() for every category URL.
# Materialize it so that categories after the first still see every date.
date_range = list(generate_date_range(start_date, end_date))

# Section URL per category; uncomment a line to include that category.
base_urls = [
    "https://news.naver.com/breakingnews/section/101/258",  # Securities
#     "https://news.naver.com/breakingnews/section/101/259",  # Finance
#     "https://news.naver.com/breakingnews/section/101/261",  # Industry/Business
#     "https://news.naver.com/breakingnews/section/101/771",  # SMEs/Ventures
#     "https://news.naver.com/breakingnews/section/101/260",  # Real estate
#     "https://news.naver.com/breakingnews/section/101/262",  # Global economy
#     "https://news.naver.com/breakingnews/section/101/310",  # Consumer economy
#     "https://news.naver.com/breakingnews/section/101/263"   # General economy
]

# Accumulator with the same keys as the dict returned by scrape_news().
all_scraped_data = {
    "title": [],
    "author": [],
    "content": [],
    "publication_date": [],
    "update_date": [],
    "article_url": [],
    "publisher": [],
    "category": [],
    "comments": []
}

try:
    # Crawl each category and merge its results into the accumulator.
    for base_url in base_urls:
        scraped_data = scrape_news(date_range, base_url)

        for key in all_scraped_data:
            all_scraped_data[key].extend(scraped_data[key])
finally:
    # Always shut the browser down, even when the crawl raises, so that no
    # Chrome process is left running.
    driver.quit()

# Save the collected data as CSV files.
save_to_csv(start_date, end_date, all_scraped_data)
