In [None]:
# Install the third-party packages imported by the crawler cell below.
# NOTE(review): that cell also imports `dateutil`, which is not installed
# here — confirm python-dateutil is already available in the environment.
%pip install requests
%pip install bs4

In [None]:
# Code with the <br> tag bug fixed

import csv
import requests
import json
import re
import os
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from bs4 import BeautifulSoup as bs
from time import sleep
from concurrent.futures import ThreadPoolExecutor, as_completed

# Module-level cache: maps an article's contents_id to its parsed
# BeautifulSoup document (filled and read by process_article).
cached_data = {}

# Split a date span into consecutive chunks of at most one month
def get_monthly_date_ranges(start_date, end_date):
    """Split the inclusive span [start_date, end_date] into month-long chunks.

    Each chunk starts the day after the previous chunk ends and runs for one
    month minus one day (e.g. 2024-01-01 .. 2024-01-31); the last chunk is
    truncated at ``end_date``.

    Args:
        start_date: First day of the span (main passes ``datetime`` objects).
        end_date: Last day of the span, inclusive.

    Returns:
        A list of ``(chunk_start, chunk_end)`` tuples in chronological order;
        an empty list when ``start_date`` is after ``end_date``.
    """
    date_ranges = []
    current_start = start_date
    # `<=` rather than `<`: end_date is inclusive, so a chunk that starts
    # exactly on end_date (including a single-day span) must still be emitted.
    while current_start <= end_date:
        current_end = min(current_start + relativedelta(months=1) - timedelta(days=1), end_date)
        date_ranges.append((current_start, current_end))
        current_start = current_end + timedelta(days=1)
    return date_ranges

# Replace characters that cannot be used in file names
def clean_filename(filename):
    """Return *filename* with characters unsafe for file names replaced by ``_``.

    Path separators (``/`` and ``\\``), the characters ``: * ? " < > | .`` and
    ASCII control characters (such as newlines) are each replaced by an
    underscore. Runs of underscores are then collapsed into one, and leading
    and trailing underscores are stripped.

    Args:
        filename: Raw text (e.g. an article title) to turn into a file name part.

    Returns:
        The sanitized string; may be empty if nothing usable remains.
    """
    # The backslash is written as `\\` inside the class: a lone `\/` is just an
    # escaped "/" and would leave literal backslashes in the name.
    filename = re.sub(r'[\\/:*?"<>|.\x00-\x1f]', '_', filename)
    filename = re.sub(r'_+', '_', filename)
    return filename.strip('_')

# Perform an HTTP GET request, retrying on failure
def fetch_url_with_retries(url, headers, retries=10, timeout=10, retry_delay=2):
    """GET *url*, retrying until a 200 response arrives or attempts run out.

    Args:
        url: Address to request.
        headers: HTTP headers passed to ``requests.get``.
        retries: Maximum number of attempts.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        The ``requests`` response whose status code is 200, or ``None`` when
        every attempt failed (request exception or non-200 status).
    """
    for attempt in range(1, retries + 1):
        try:
            res = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"{datetime.now()} - Request failed ({attempt}/{retries}): {e}")
        else:
            if res.status_code == 200:
                return res
            print(f"Unexpected status code {res.status_code} for URL: {url}")
        # Pause before the next attempt for both failure modes (exception and
        # non-200 status) so the server is not hit back-to-back; skip the
        # pause after the final attempt since nothing follows it.
        if attempt < retries:
            sleep(retry_delay)
    print(f"{datetime.now()} - Failed to retrieve JSON data from {url}")
    return None

# Flatten an HTML fragment to text (<br> tags become line breaks)
def clean_html_content(contents):
    """Return the text of *contents* with ``<br>`` tags turned into newlines.

    Args:
        contents: A parsed HTML element exposing ``find_all`` and ``get_text``
            (a BeautifulSoup tag). It is modified in place: every ``<br>``
            inside it is replaced by a newline string.

    Returns:
        The element's text, with pieces joined by newlines and surrounding
        whitespace stripped.
    """
    line_breaks = contents.find_all("br")
    for tag in line_breaks:
        tag.replace_with("\n")
    text = contents.get_text(separator="\n")
    return text.strip()

# Collect one article's data and save it to a CSV file
def process_article(contents_id, headers, directory):
    """Download one article page, extract its fields and write them to a CSV.

    The parsed page is taken from ``cached_data`` when present; otherwise it
    is fetched with ``fetch_url_with_retries``, parsed and stored there.

    Args:
        contents_id: Article identifier used to build the article URL.
        headers: HTTP headers forwarded to ``fetch_url_with_retries``.
        directory: Directory in which the CSV file is created.

    Returns:
        None. On success a file named ``<date>_<sanitized title>.csv`` holding
        a header row and one data row is written. If the page cannot be
        fetched, or an expected element is missing (``AttributeError``), a
        message is printed and nothing is written.
    """
    base_url = f'https://www.yna.co.kr/view/{contents_id}?section=search'

    # Download and parse the page only when it is not cached yet.
    if contents_id not in cached_data:
        response = fetch_url_with_retries(base_url, headers)
        if response is None:
            print(f"Failed to retrieve the article {base_url}")
            return
        soup = bs(response.text, 'html.parser')
        cached_data[contents_id] = soup
    else:
        print(f"Using cached data for article {base_url}")
        soup = cached_data[contents_id]

    try:
        # A missing element makes select_one/find return None, so the
        # attribute access below raises AttributeError, handled at the bottom.
        title_node = soup.select_one('h1.tit')
        news_title = title_node.text.strip()

        date_node = soup.select_one('.txt-copyright > span.date')
        # Keep the first 10 characters of the date text and use "." instead of "/".
        news_date = date_node.text.strip()[:10].replace("/", ".")

        story = soup.find("article", class_="story-news")
        paragraphs = story.find_all('p')

        # Drop only the final <p> when there are more than two paragraphs.
        # NOTE(review): presumably a trailing footer line — confirm whether
        # one or two trailing paragraphs should be removed.
        if len(paragraphs) > 2:
            paragraphs = paragraphs[:-1]

        # Join the stripped text of the remaining paragraphs with single spaces.
        contents = " ".join(p.get_text(strip=True) for p in paragraphs)

        csv_file_path = f'{directory}/{news_date}_{clean_filename(news_title)}.csv'

        with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows([
                ['날짜', '제목', '내용', 'url'],
                [news_date, news_title, contents, base_url],
            ])
        print(f"Saved CSV: {csv_file_path}")
    except AttributeError as e:
        print(f"Failed to extract data for article {base_url}: {e}")

# Fetch one page of search results and process every article listed on it
def process_page(page_no, from_date, to_date, headers, directory):
    """Query one search-result page and process each article it lists.

    The search API is called with a hard-coded, percent-encoded query term
    and a page size of 10. Its response is JSONP wrapped in
    ``Search.SearchPreCallback(...)``; the wrapper is stripped before parsing.

    Args:
        page_no: Result page number to request.
        from_date: Start of the search period, inserted into the URL as given.
        to_date: End of the search period, inserted into the URL as given.
        headers: HTTP headers forwarded to the fetch helper and to
            ``process_article``.
        directory: Output directory forwarded to ``process_article``.

    Returns:
        None. Fetch or parse failures are printed and end the call early;
        exceptions raised by individual article tasks are printed as well.
    """
    url = f'http://ars.yna.co.kr/api/v2/search.asis?callback=Search.SearchPreCallback&query=%EA%B8%88%EB%A6%AC&page_no={page_no}&period=diy&from={from_date}&to={to_date}&ctype=A&page_size=10&channel=basic_kr'

    res = fetch_url_with_retries(url, headers)
    if res is None:
        print(f"Failed to retrieve data for page {page_no} in range {from_date} ~ {to_date}")
        return

    try:
        # No match leaves jsonp_match as None, so .group raises AttributeError.
        jsonp_match = re.search(r'Search\.SearchPreCallback\((.*)\)', res.text)
        datas = json.loads(jsonp_match.group(1))
    except (AttributeError, json.JSONDecodeError) as e:
        print(f"Failed to parse JSON for page {page_no}: {e}")
        return

    articles = datas.get('KR_ARTICLE', {}).get('result', [])
    contents_ids = [article['CONTENTS_ID'] for article in articles]

    # Process the page's articles concurrently, at most five at a time.
    with ThreadPoolExecutor(max_workers=5) as article_executor:
        futures = []
        for contents_id in contents_ids:
            futures.append(article_executor.submit(process_article, contents_id, headers, directory))
        for finished in as_completed(futures):
            try:
                finished.result()  # re-raises any exception from the worker
            except Exception as e:
                print(f"Error processing article: {e}")

# Main entry point
def main():
    """Crawl search results for 2024-01-01 through 2024-08-11 into ./dataset.

    The period is split into monthly chunks with ``get_monthly_date_ranges``.
    For each chunk, result pages 1 through 50 are submitted to a pool of ten
    worker threads running ``process_page``; all pages of a chunk finish
    before the next chunk is started. Exceptions raised by a page task are
    printed and do not stop the crawl.
    """
    range_start = datetime.strptime('20240101', '%Y%m%d')
    range_end = datetime.strptime('20240811', '%Y%m%d')
    date_ranges = get_monthly_date_ranges(range_start, range_end)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
        'Referer': 'https://www.yna.co.kr/'
    }

    directory = './dataset'
    os.makedirs(directory, exist_ok=True)  # create the output directory if missing

    with ThreadPoolExecutor(max_workers=10) as page_executor:
        for chunk_start, chunk_end in date_ranges:
            from_date = chunk_start.strftime('%Y%m%d')
            to_date = chunk_end.strftime('%Y%m%d')

            futures = []
            for page_no in range(1, 51):
                futures.append(page_executor.submit(process_page, page_no, from_date, to_date, headers, directory))

            for finished in as_completed(futures):
                try:
                    finished.result()  # re-raises any exception from the worker
                except Exception as e:
                    print(f"Error processing page: {e}")

# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()