In [1]:
%pip install requests
%pip install bs4

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [39]:
import csv
import requests
import json
import re
import os
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from bs4 import BeautifulSoup as bs
from time import sleep
from concurrent.futures import ThreadPoolExecutor, as_completed

# Helpers for splitting a date range into windows of at most one month.
def _add_one_month(moment):
    """Return *moment* shifted forward by one calendar month.

    The day of month is clipped to the length of the target month
    (e.g. Jan 31 -> Feb 28/29), and any time-of-day fields are preserved.
    Works for both ``date`` and ``datetime`` objects.
    """
    import calendar  # local import keeps this helper self-contained (stdlib only)

    year = moment.year + moment.month // 12   # roll over into next year after December
    month = moment.month % 12 + 1
    last_day = calendar.monthrange(year, month)[1]
    return moment.replace(year=year, month=month, day=min(moment.day, last_day))


def get_monthly_date_ranges(start_date, end_date):
    """Split ``[start_date, end_date]`` into consecutive windows of up to one month.

    Args:
        start_date: First day of the overall range (inclusive).
        end_date: Last day of the overall range (inclusive).

    Returns:
        A list of ``(window_start, window_end)`` tuples, both ends inclusive.
        Each window ends one day before the same day of the following month,
        or at ``end_date`` if that comes first. The list is empty when
        ``start_date`` is later than ``end_date``.
    """
    date_ranges = []
    current_start = start_date

    # `<=` (not `<`): a window that starts exactly on end_date is still a
    # valid one-day window and must not be dropped.
    while current_start <= end_date:
        current_end = min(_add_one_month(current_start) - timedelta(days=1), end_date)
        date_ranges.append((current_start, current_end))
        current_start = current_end + timedelta(days=1)

    return date_ranges

# Replace characters that cannot be used in file names.
def clean_filename(filename):
    """Return *filename* with characters unsafe for file names replaced by ``_``.

    The characters ``\\ / : * ? " < > | .`` and ASCII control characters
    (such as newlines and tabs) are each replaced with an underscore, runs of
    underscores are collapsed to one, and leading/trailing underscores are
    stripped.

    Args:
        filename: Raw text to sanitise (here, an article title).

    Returns:
        The sanitised string; may be empty if nothing usable remains.
    """
    # `\\` matches a literal backslash: the previous pattern's `\/` only
    # escaped the slash, so backslashes slipped through untouched.
    filename = re.sub(r'[\\/:*?"<>|.\x00-\x1f]', '_', filename)
    filename = re.sub(r'_+', '_', filename)
    return filename.strip('_')

# Perform an HTTP GET request, retrying when it fails.
def fetch_url_with_retries(url, headers, retries=3, timeout=10, delay=2):
    """GET *url* and return the response, retrying on errors and non-200 replies.

    Args:
        url: Address to request.
        headers: HTTP headers passed to ``requests.get``.
        retries: Maximum number of attempts.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The ``requests`` response object of the first attempt that returned
        status code 200, or ``None`` if every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            res = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"Request failed ({attempt}/{retries}): {e}")
        else:
            if res.status_code == 200:
                return res
            print(f"Unexpected status code {res.status_code} for URL: {url}")

        # Pause before the next attempt for both failure kinds, so a non-200
        # reply is not retried in a tight loop; skip the pointless wait after
        # the final attempt.
        if attempt < retries:
            sleep(delay)
    return None

# Download a single article and store it as a CSV file.
def process_article(contents_id, headers, directory):
    """Fetch the article page for *contents_id* and write it to a CSV file.

    The title is read from ``h1.tit``, the date from
    ``.txt-copyright > span.date`` (first 10 characters, ``/`` replaced by
    ``.``), and the body from the ``.story-news p`` elements with the first
    two and last two paragraphs left out. The file is written to
    ``{directory}/{date}_{sanitised title}.csv`` with a header row followed by
    one data row.

    Args:
        contents_id: Article identifier inserted into the article URL.
        headers: HTTP headers forwarded to ``fetch_url_with_retries``.
        directory: Directory in which the CSV file is created.

    Returns:
        None. Download failures and missing page elements are reported with
        ``print`` instead of being raised.
    """
    base_url = f'https://www.yna.co.kr/view/{contents_id}?section=search'
    response = fetch_url_with_retries(base_url, headers)
    if response is None:
        print(f"Failed to retrieve the article {base_url}")
        return

    soup = bs(response.text, 'html.parser')
    try:
        # select_one() returns None for a missing element, so the attribute
        # access below raises AttributeError, which is handled at the bottom.
        title_node = soup.select_one('h1.tit')
        news_title = title_node.text.strip()
        date_node = soup.select_one('.txt-copyright > span.date')
        news_date = date_node.text.strip()[:10].replace("/", ".")

        # Leave out the first two and the last two paragraphs.
        paragraphs = soup.select('.story-news p')[2:-2]
        news_cont = ' '.join(paragraph.text.strip() for paragraph in paragraphs)

        csv_file_path = f'{directory}/{news_date}_{clean_filename(news_title)}.csv'
        rows = [
            ['날짜', '제목', '내용'],
            [news_date, news_title, news_cont],
        ]
        with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerows(rows)
        print(f"Saved CSV: {csv_file_path}")
    except AttributeError as e:
        print(f"Failed to extract data for article {base_url}: {e}")

# Fetch one page of search results and process every article listed on it.
def process_page(page_no, from_date, to_date, headers, directory):
    """Query one page of the search API and save each article it lists.

    The response is expected to be wrapped as
    ``Search.SearchPreCallback(<json>)``; the JSON part is extracted and the
    ``CONTENTS_ID`` of every entry under ``KR_ARTICLE -> result`` is handed to
    ``process_article`` on a pool of five worker threads.

    Args:
        page_no: Page number placed in the ``page_no`` query parameter.
        from_date: Start of the search period, placed in ``from``.
        to_date: End of the search period, placed in ``to``.
        headers: HTTP headers forwarded to the fetch helper.
        directory: Output directory forwarded to ``process_article``.

    Returns:
        None. Fetch, parse and per-article errors are reported with ``print``.
    """
    # The `query` value is the percent-encoded UTF-8 form of the search term "금리".
    url = f'http://ars.yna.co.kr/api/v2/search.asis?callback=Search.SearchPreCallback&query=%EA%B8%88%EB%A6%AC&page_no={page_no}&period=diy&from={from_date}&to={to_date}&ctype=A&page_size=10&channel=basic_kr'

    res = fetch_url_with_retries(url, headers)
    if res is None:
        print(f"Failed to retrieve data for page {page_no} in range {from_date} ~ {to_date}")
        return

    try:
        # A failed search yields None, whose .group() raises AttributeError.
        wrapped = re.search(r'Search\.SearchPreCallback\((.*)\)', res.text)
        datas = json.loads(wrapped.group(1))
    except (AttributeError, json.JSONDecodeError) as e:
        print(f"Failed to parse JSON for page {page_no}: {e}")
        return

    contents_ids = []
    for item in datas.get('KR_ARTICLE', {}).get('result', []):
        contents_ids.append(item['CONTENTS_ID'])

    with ThreadPoolExecutor(max_workers=5) as article_executor:
        pending = []
        for contents_id in contents_ids:
            pending.append(
                article_executor.submit(process_article, contents_id, headers, directory)
            )
        for finished in as_completed(pending):
            try:
                finished.result()  # re-raises any exception from the worker
            except Exception as e:
                print(f"Error processing article: {e}")

# Entry point of the scraper.
def main():
    """Scrape search results for 2016-01-01 through 2016-06-30 into ``./dataset``.

    The period is split into monthly windows. For each window, pages 1 to 50
    are submitted to a pool of ten worker threads, and all pages of a window
    are awaited before the next window is started. Errors raised by a page
    task are reported with ``print``.
    """
    start_date = datetime(2016, 1, 1)
    end_date = datetime(2016, 6, 30)

    # Work through the period in one-month windows.
    date_ranges = get_monthly_date_ranges(start_date, end_date)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
        'Referer': 'https://www.yna.co.kr/'
    }

    directory = './dataset'
    os.makedirs(directory, exist_ok=True)  # create the output directory if missing

    with ThreadPoolExecutor(max_workers=10) as page_executor:
        for window_start, window_end in date_ranges:
            from_date = window_start.strftime('%Y%m%d')
            to_date = window_end.strftime('%Y%m%d')

            futures = []
            for page_no in range(1, 51):
                futures.append(
                    page_executor.submit(process_page, page_no, from_date, to_date, headers, directory)
                )

            for finished in as_completed(futures):
                try:
                    finished.result()  # re-raises any exception from the worker
                except Exception as e:
                    print(f"Error processing page: {e}")

if __name__ == "__main__":
    main()


Saved CSV: ./dataset/2016.01.30_대만도 금리인하 태세…초단기자금 이율 인하.csv
Saved CSV: ./dataset/2016.01.29_일본 2_8%·중국 3_1%…아시아증시 나란히 올라.csv
Saved CSV: ./dataset/2016.01.29_유럽 이어 일본도…'마이너스 금리'로 경기부양 안간힘(종합).csv
Saved CSV: ./dataset/2016.01.31_깐깐해질 대출심사에 놀랐나…주택대출 증가세 확 꺾였다.csv
Saved CSV: ./dataset/2016.01.29_日銀, 마이너스금리 첫 도입…量·質·금리 '3박자 완화'(종합2보).csv
Saved CSV: ./dataset/2016.01.31_글로벌 경제상식 곳곳서 깨진다…유례없는 현상·조치 잇따라.csv
Saved CSV: ./dataset/2016.01.29_연합뉴스 이 시각 헤드라인_ - 16_00.csv
Saved CSV: ./dataset/2016.01.31_세계 경제 가라앉는다_…각국, 위기해결 가능수단 총동원.csv
Saved CSV: ./dataset/2016.01.29_일본은행, 금융위기 이후 통화정책 일지.csv
Saved CSV: ./dataset/2016.01.31_연합시론_ 통화정책도 '샌드위치' 신세 된 한국.csv
Saved CSV: ./dataset/2016.01.29_한은 제주본부, 폭설 피해 업체 특별운전자금 지원.csv
Saved CSV: ./dataset/2016.01.29_일본은행, 기준금리 -0_1% 채택(속보).csv
Saved CSV: ./dataset/2016.01.31_수도권 주택대출 심사 내일부터 깐깐해진다.csv
Saved CSV: ./dataset/2016.01.31_O2O는 배달·숙박, 핀테크는 간편결제·송금이 가장 친밀.csv
Saved CSV: ./dataset/2016.01.29_유럽증시, 日 마이너스 금리 도입에 상승 출발.csv
Saved CSV: ./dataset/2016.01.31_인터넷전