In [2]:
import requests
from bs4 import BeautifulSoup
import json
import os
from datetime import datetime, timedelta
import re
import time

# Listing-page URL template for the Naver Finance news list; it is filled in with
# `target_date` and `target_page_num` each time a listing page is requested.
BASE_URL = "https://finance.naver.com/news/news_list.naver?mode=LSS3D&section_id=101&section_id2=258&section_id3=401&date={target_date}&page={target_page_num}"
# First and last calendar days of the crawl window (the crawl loop includes both).
start_date = datetime(2024, 11, 9)
end_date = datetime(2025, 2, 6)

def create_directory(path):
    """Create the directory ``path``, including missing parents, if needed.

    ``exist_ok=True`` makes the call a no-op when the directory is already
    there and avoids the race between checking ``os.path.exists`` and then
    calling ``os.makedirs``.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

def save_articles_to_file(articles, path):
    """Write ``articles`` to ``path`` as UTF-8 JSON indented by four spaces.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.
    An existing file at ``path`` is overwritten.

    Args:
        articles: JSON-serializable object to store.
        path: Destination file path.
    """
    dump_options = {'ensure_ascii': False, 'indent': 4}
    with open(path, mode='w', encoding='utf-8') as destination:
        json.dump(articles, destination, **dump_options)

def fetch_url(url, retries=3, delay=0.3, timeout=10):
    """GET ``url``, retrying when the request raises a ``requests`` error.

    Args:
        url: Address to request.
        retries: Maximum number of attempts.
        delay: Seconds to wait between a failed attempt and the next one.
        timeout: Seconds passed to ``requests.get`` as its timeout, so that a
            stalled connection raises instead of blocking the crawl forever.

    Returns:
        The response object when an attempt succeeds and ``raise_for_status``
        does not raise; ``None`` once every attempt has failed.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            print(f"Error fetching URL {url}: {e}. Retrying ({attempt}/{retries})...")
            # Only wait when another attempt will actually follow.
            if attempt < retries:
                time.sleep(delay)
    return None

# Captures the article id and office id from a listing link's query string.
_ARTICLE_LINK_RE = re.compile(r'article_id=(\d+)&office_id=(\d+)')

# Placeholder stored when an expected element is absent from an article page.
_MISSING_TEXT = '정보를 찾을 수 없음'


def _extract_text(soup, tag, css_class):
    """Return the stripped text of the first ``tag`` with ``css_class``, or the placeholder."""
    node = soup.find(tag, class_=css_class)
    if node is None:
        return _MISSING_TEXT
    return node.text.strip()


def fetch_articles_for_date(target_date):
    """Collect title, date and body text of every article listed for one day.

    Listing pages are requested one after another, starting at page 1, until a
    page yields no article links or a listing page cannot be fetched. Each
    listing link whose URL carries ``article_id`` and ``office_id`` is followed
    to its ``n.news.naver.com`` article page; links without those ids, and
    article pages that cannot be fetched, are skipped.

    Args:
        target_date: Day to crawl, in the form expected by ``BASE_URL``
            (the crawl loop passes ``YYYYMMDD`` strings).

    Returns:
        A list of dicts with the keys ``'title'``, ``'date'`` and ``'content'``.
        A field whose element is missing from the page holds a Korean
        placeholder string instead.
    """
    articles = []
    page_num = 1

    while True:
        url = BASE_URL.format(target_date=target_date, target_page_num=page_num)
        response = fetch_url(url)

        if response is None:
            print(f"Failed to fetch articles for date {target_date} after multiple attempts.")
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        news_list = soup.select('li.newsList > dl > dd > a')

        if not news_list:
            break

        for news in news_list:
            # An anchor without an href is treated as a non-article link
            # instead of raising KeyError.
            match = _ARTICLE_LINK_RE.search(news.get('href', ''))
            if not match:
                continue

            article_id, office_id = match.groups()
            article_url = f'https://n.news.naver.com/mnews/article/{office_id}/{article_id}'
            article_response = fetch_url(article_url)

            if article_response is None:
                print(f"Failed to fetch article {article_url} after multiple attempts.")
                continue

            article_soup = BeautifulSoup(article_response.content, 'html.parser')

            articles.append({
                'title': _extract_text(article_soup, 'h2', 'media_end_head_headline'),
                'date': _extract_text(
                    article_soup,
                    'span',
                    'media_end_head_info_datestamp_time _ARTICLE_DATE_TIME',
                ),
                'content': _extract_text(article_soup, 'article', '_article_content'),
            })

        page_num += 1

    return articles

def crawl(start=None, end=None, output_root='./news_2'):
    """Crawl every day in a date range and save one JSON file per day.

    For each day from ``start`` to ``end`` (both included) the articles are
    fetched and, when any were found, written to
    ``{output_root}/{YYYY}/{MM}/{YYYYMMDD}.json``. The month directory is
    created before fetching, so it exists even for days without articles.
    Progress is printed after each day.

    Args:
        start: First day to crawl; defaults to the module-level ``start_date``.
        end: Last day to crawl; defaults to the module-level ``end_date``.
        output_root: Directory under which the year/month folders are created.
    """
    if start is None:
        start = start_date
    if end is None:
        end = end_date

    total_days = (end - start).days + 1
    completed_days = 0
    current_date = start

    while current_date <= end:
        year = current_date.strftime('%Y')
        month = current_date.strftime('%m')

        directory_path = f'{output_root}/{year}/{month}'
        create_directory(directory_path)

        target_date = current_date.strftime('%Y%m%d')
        articles = fetch_articles_for_date(target_date)

        if articles:
            file_path = f'{directory_path}/{target_date}.json'
            save_articles_to_file(articles, file_path)
            print(f"Saved {len(articles)} articles to {file_path}")
        else:
            print(f"No articles found for date {target_date}")

        completed_days += 1
        progress = (completed_days / total_days) * 100
        print(f"Crawling done for {target_date}. Progress: {progress:.2f}%")

        current_date += timedelta(days=1)

# if __name__ == '__main__':
#     main()

# Run the crawl
crawl()

Saved 19 articles to ./news_2/2024/11/20241109.json
Crawling done for 20241109. Progress: 1.11%
Saved 37 articles to ./news_2/2024/11/20241110.json
Crawling done for 20241110. Progress: 2.22%
Saved 193 articles to ./news_2/2024/11/20241111.json
Crawling done for 20241111. Progress: 3.33%
Saved 198 articles to ./news_2/2024/11/20241112.json
Crawling done for 20241112. Progress: 4.44%
Saved 283 articles to ./news_2/2024/11/20241113.json
Crawling done for 20241113. Progress: 5.56%
Saved 198 articles to ./news_2/2024/11/20241114.json
Crawling done for 20241114. Progress: 6.67%
Saved 188 articles to ./news_2/2024/11/20241115.json
Crawling done for 20241115. Progress: 7.78%
Saved 18 articles to ./news_2/2024/11/20241116.json
Crawling done for 20241116. Progress: 8.89%
Saved 49 articles to ./news_2/2024/11/20241117.json
Crawling done for 20241117. Progress: 10.00%
Saved 166 articles to ./news_2/2024/11/20241118.json
Crawling done for 20241118. Progress: 11.11%
Saved 151 articles to ./news_2/2

In [4]:
from pykrx import stock

def get_stock_name(ticker):
    """Look up the stock name for ``ticker`` through pykrx.

    Args:
        ticker: Ticker code to resolve.

    Returns:
        Whatever ``stock.get_market_ticker_name`` returns for the ticker, or
        the Korean message ``"티커가 유효하지 않습니다."`` when the lookup raises.
    """
    try:
        return stock.get_market_ticker_name(ticker)
    except Exception:
        # Catch only ordinary errors: a bare ``except`` would also swallow
        # KeyboardInterrupt/SystemExit and make a long lookup loop uninterruptible.
        return "티커가 유효하지 않습니다."
    
def get_all_stock_names():
    """Return a ``(ticker, stock_name)`` tuple for every ticker pykrx lists.

    The ticker list comes from ``stock.get_market_ticker_list()`` and each name
    is resolved with ``get_stock_name``, in the order the tickers are returned.
    """
    return [
        (code, get_stock_name(code))
        for code in stock.get_market_ticker_list()
    ]

In [5]:
# Resolve every listed ticker to its name once; the result is a list of (ticker, name) tuples.
stock_names = get_all_stock_names()

import os
import json
import shutil

# Build a dictionary keyed by stock name with the ticker as its value.
# If two tickers share a name, the later one in `stock_names` overwrites the earlier.
stock_dict = {name: ticker for ticker, name in stock_names}

def add_tickers_to_json(src_folder_path, dest_folder_path, name_to_ticker=None):
    """Copy a tree of article JSON files, tagging each article with tickers.

    Every ``.json`` file under ``src_folder_path`` is loaded, and each article
    whose ``"content"`` contains one or more known stock names gets a
    ``"ticker"`` list holding the matching tickers. The result is written to the
    same relative path under ``dest_folder_path``. Other files are ignored.
    Matching is a plain substring test, so a name that is part of a longer
    name also matches.

    Args:
        src_folder_path: Root folder holding the crawled JSON files. Each file
            is expected to contain a list of article dicts.
        dest_folder_path: Root folder for the tagged copies; created if missing.
        name_to_ticker: Mapping of stock name to ticker. Defaults to the
            module-level ``stock_dict``.
    """
    if name_to_ticker is None:
        name_to_ticker = stock_dict

    # exist_ok avoids the race between an exists() check and makedirs().
    os.makedirs(dest_folder_path, exist_ok=True)

    for root, _dirs, files in os.walk(src_folder_path):
        for file in files:
            if not file.endswith(".json"):
                continue

            src_file_path = os.path.join(root, file)
            relative_path = os.path.relpath(src_file_path, src_folder_path)
            dest_file_path = os.path.join(dest_folder_path, relative_path)

            os.makedirs(os.path.dirname(dest_file_path), exist_ok=True)

            with open(src_file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            for item in data:
                # A missing or null content is treated as empty text.
                content = item.get("content") or ""
                # Skip empty names: "" is a substring of every string and would
                # tag every article.
                tickers = [
                    ticker
                    for name, ticker in name_to_ticker.items()
                    if name and name in content
                ]

                if tickers:
                    item["ticker"] = tickers

            with open(dest_file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)

In [7]:
# Source tree of article JSON files and destination tree for the ticker-tagged copies.
# NOTE(review): the crawl cell writes its output under ./news_2, not ./news_temp —
# confirm that ./news_temp is populated as intended before running this cell.
src_folder_path = './news_temp'
dest_folder_path = './news'

add_tickers_to_json(src_folder_path, dest_folder_path)