In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
from concurrent.futures import ThreadPoolExecutor

In [None]:
# Request headers sent with every HTTP call below; the User-Agent is a
# desktop Chrome string.
hdr = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

# Category slugs inserted into the listing URL
# (https://news.daum.net/breakingnews/economic/<category>).
categories = [
    "finance",
    "industry",
    "employ",
    "autos",
    "stock",
    "estate",
    "consumer",
    "worldeconomy",
    "coin",
    "pension",
    "policy",
    "startup",
]

# Most recent day to crawl (YYYYMMDD); the day lists are built backwards
# in time from this date.
start_date = "20240731"
date_obj = dt.datetime.strptime(start_date, "%Y%m%d")

In [None]:
def get_last_page_num(category: str, date: str, timeout: float = 10.0) -> int:
    """Find the number of the last listing page for a category on one day.

    Listing pages are probed in steps of ten (page 1, 11, 21, ...) until a
    response contains no ``a.btn_page.btn_next`` element. On that final page
    the text of the last ``a.num_page`` element is read as the last page
    number.

    Args:
        category: Category slug inserted into the listing URL.
        date: Day to query, formatted ``YYYYMMDD`` (sent as ``regDate``).
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking the crawl forever.

    Returns:
        The last page number, never smaller than the page that was probed
        last.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    pgn = 1
    while True:
        url = f"https://news.daum.net/breakingnews/economic/{category}?page={pgn}&regDate={date}"
        res = requests.get(url, headers=hdr, timeout=timeout)
        soup = BeautifulSoup(res.text, 'html.parser')
        if not soup.select("a.btn_page.btn_next"):
            break
        pgn += 10

    page_links = soup.select("a.num_page")
    if not page_links:
        # No numbered links on the final probe. The loop only reaches `pgn`
        # by following "next" links, so `pgn` itself is a known page; falling
        # back to 1 here would drop every page before it.
        # NOTE(review): presumably the current page is not rendered as an
        # `a.num_page` link - confirm against the live markup.
        return pgn
    return max(pgn, int(page_links[-1].get_text()))

In [None]:
def make_monthly_day_list(date_obj) -> list:
    """List the days of ``date_obj``'s month from ``date_obj`` back to the 1st.

    Args:
        date_obj: The latest day to include (a ``datetime``-like object with
            ``month``/``day`` attributes and ``strftime``).

    Returns:
        ``YYYYMMDD`` strings in descending order: the first entry is
        ``date_obj`` itself and the last is the first day of the same month.
    """
    # Stepping back 0 .. day-1 days covers exactly the days that are still in
    # the same month as date_obj.
    return [
        (date_obj - dt.timedelta(days=offset)).strftime("%Y%m%d")
        for offset in range(date_obj.day)
    ]

In [None]:
def source_from_news(news_url: str, timeout: float = 10.0):
    """Download one article page and extract its fields.

    Args:
        news_url: URL of the article page.
        timeout: Seconds passed to ``requests.get``. A timeout is handled
            like any other failure (see Returns), so one stalled article
            cannot block a worker thread indefinitely.

    Returns:
        A ``(title, publisher, reporter, content)`` tuple:

        * title: text of ``h3.tit_view``.
        * publisher: stripped text of ``h1.doc-title``.
        * reporter: text of the last ``[dmcf-ptype="general"]`` element.
        * content: texts of all ``[dmcf-ptype="general"]`` elements joined
          with single spaces.

        A field is ``None`` when its element is missing. If anything raises,
        the error is printed and ``(None, None, None, None)`` is returned.
    """
    try:
        res = requests.get(news_url, headers=hdr, timeout=timeout)
        soup = BeautifulSoup(res.text, 'html.parser')

        # Query each selector once and reuse the result.
        title_tag = soup.select_one("h3.tit_view")
        publisher_tag = soup.select_one("h1.doc-title")
        paragraphs = soup.select('[dmcf-ptype="general"]')

        title = title_tag.get_text() if title_tag is not None else None
        publisher = publisher_tag.text.strip() if publisher_tag is not None else None
        reporter = paragraphs[-1].get_text() if paragraphs else None
        content = " ".join(p.get_text() for p in paragraphs) if paragraphs else None

        return title, publisher, reporter, content
    except Exception as e:
        # Best effort: report the failure and let the caller carry on with
        # the remaining articles.
        print(f"Error fetching news data from {news_url}: {e}")
        return None, None, None, None

In [None]:
def get_news_links(category: str, date: str, timeout: float = 10.0):
    """Collect the article URLs listed for a category on one day.

    Every listing page from 1 up to ``get_last_page_num(category, date)`` is
    requested, and the ``href`` of each
    ``ul.list_news2.list_allnews li strong.tit_thumb a.link_txt`` anchor is
    gathered in page order.

    Args:
        category: Category slug inserted into the listing URL.
        date: Day to query, formatted ``YYYYMMDD`` (sent as ``regDate``).
        timeout: Seconds passed to ``requests.get`` for each listing page so
            that a stalled connection raises instead of hanging the crawl.

    Returns:
        List of article URLs; empty when no page lists an article.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    last_page_num = get_last_page_num(category, date)
    news_links = []

    for page_num in range(1, last_page_num + 1):
        page_url = f"https://news.daum.net/breakingnews/economic/{category}?page={page_num}&regDate={date}"
        res = requests.get(page_url, headers=hdr, timeout=timeout)
        soup = BeautifulSoup(res.text, 'html.parser')
        anchors = soup.select("ul.list_news2.list_allnews li strong.tit_thumb a.link_txt")
        # An anchor without an href cannot be fetched later; skip it rather
        # than let a KeyError abort the whole day.
        news_links.extend(a.get("href") for a in anchors if a.get("href"))

    return news_links

In [None]:
num_months = 12  # number of calendar months to generate, counting back from start_date

# Seed the walk from start_date rather than from whatever date_obj currently
# holds. The loop below moves date_obj backwards, so seeding from it would
# make a second run of this cell continue further into the past instead of
# rebuilding the same window.
date_obj = dt.datetime.strptime(start_date, "%Y%m%d")

# One entry per month; each entry is that month's list of YYYYMMDD strings,
# newest day first.
month_list = []

for _ in range(num_months):
    monthly_day_list = make_monthly_day_list(date_obj)
    month_list.append(monthly_day_list)

    # Jump to the last day of the previous month: first of this month minus
    # one day.
    date_obj = date_obj.replace(day=1) - dt.timedelta(days=1)

In [None]:
def collect_data(
    output_dir: str = "C:/Users/SesacPython/Desktop/dataset/뉴스추천시스템",
    target_categories=None,
    target_months=None,
    max_workers: int = 10,
):
    """Crawl every category/month pair and write one CSV file per pair.

    For each category and month, the article links of every day in the month
    are fetched with ``get_news_links``, the articles are downloaded
    concurrently with ``source_from_news``, and the rows are saved as
    ``news_data_<category>_<YYYYMM>.csv`` (UTF-8 with BOM, no index column).

    Args:
        output_dir: Directory that receives the CSV files. It is created if
            it does not exist. Defaults to the location used originally.
        target_categories: Category slugs to crawl; defaults to the
            module-level ``categories``.
        target_months: Lists of ``YYYYMMDD`` day strings, one list per month;
            defaults to the module-level ``month_list``.
        max_workers: Thread count used to download the articles of one day.

    Returns:
        None. Progress messages are printed and the results go to disk.
    """
    import os  # local import so this cell does not depend on the imports cell

    if target_categories is None:
        target_categories = categories
    if target_months is None:
        target_months = month_list

    columns = ["title", "category", "article_url", "date", "publisher", "reporter", "content"]

    # Create the destination first: a missing folder should surface before
    # any scraping starts, not when the first month is about to be written.
    os.makedirs(output_dir, exist_ok=True)

    for category in target_categories:
        for month in target_months:
            if not month:
                # No days, so nothing to crawl and no month label to derive.
                continue

            rows = []

            for date in month:
                news_links = get_news_links(category, date)
                if not news_links:
                    continue

                with ThreadPoolExecutor(max_workers=max_workers) as executor:
                    # map() keeps input order, so results line up with news_links.
                    results = list(executor.map(source_from_news, news_links))

                for result, newslink in zip(results, news_links):
                    if result is None:
                        continue
                    # A failed fetch is reported by source_from_news as a
                    # tuple of Nones, so the article is still recorded with
                    # its URL and empty fields.
                    title, publisher, reporter, content = result
                    rows.append({
                        "title": title,
                        "category": category,
                        "article_url": newslink,
                        "date": date,
                        "publisher": publisher,
                        "reporter": reporter,
                        "content": content,
                    })

            # Save once the whole month has been collected.
            month_str = month[-1][:6]  # "YYYYMM" prefix of a day string
            # Passing `columns` keeps the header row even when no article was found.
            df = pd.DataFrame(rows, columns=columns)
            df.to_csv(
                os.path.join(output_dir, f"news_data_{category}_{month_str}.csv"),
                index=False,
                encoding="utf-8-sig",
            )
            print(f"{month_str} 데이터 저장 완료")

        print(f"{category} 완료")
    print("모든 데이터 수집 및 저장 완료")

# Start the full crawl only when this code runs as the main program (a
# notebook cell or a script), not when it is imported as a module.
if __name__ == "__main__":
    collect_data()