# 뉴스 한 건 크롤링

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Crawl the first headline of one Naver Finance "main news" listing page.
BASE_URL = "https://finance.naver.com/"
url = "https://finance.naver.com/news/mainnews.naver?date=2025-10-30&page=2"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as news.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")

# The headline link carries both the title text and the detail-page href,
# so it is looked up once and reused.
subject_link = soup.select_one(".articleSubject > a")

# get subject
subject = subject_link.text

# get detail article url (urljoin resolves the href against the site root)
article_path_with_query = subject_link.get("href")
detail_article_url = urljoin(BASE_URL, article_path_with_query)

# get content: the first child node of .articleSummary is the teaser text;
# the press name and the timestamp are read from nested elements.
content_tag = soup.select_one(".articleSummary")
content = content_tag.contents[0].strip()
press = content_tag.select_one(".press").text
article_date = content_tag.select_one(".wdate").text

print(content)
print(press)
print(article_date)


글로벌 투자은행(IB)이 ‘인공지능(AI) 붐’에 올라탄 SK하이닉스의 목표주가를 앞다퉈 높이고 있다. 고대역폭메모리(HBM) 반도체를..
한국경제 
2025-10-30 17:26:09


# 페이지에 존재하는 모든 뉴스 크롤링

In [None]:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlencode
from datetime import datetime

ROOT = "https://finance.naver.com/"
PATH = "news/mainnews.naver"


def get_news_page_url(date, page):
  """Return the main-news listing URL for the given date and page.

  PATH is resolved against ROOT, then ``date`` and ``page`` are appended
  (in that order) as a URL-encoded query string.
  """
  params = urlencode({"date": date, "page": page})
  listing_url = urljoin(ROOT, PATH)

  return listing_url + "?" + params

def get_detail_news_url(path):
  """Resolve an article href taken from the listing page against ROOT."""
  detail_url = urljoin(ROOT, path)
  return detail_url

def get_current_date(format: str = "%Y-%m-%d") -> str:
  """Return the current local date/time rendered with ``format``.

  ``format`` is a ``strftime`` pattern; the default yields ``YYYY-MM-DD``.
  """
  return datetime.now().strftime(format)


# NOTE(review): PAGE_COUNT is not referenced anywhere in this cell; the page
# number below is passed as a literal.
PAGE_COUNT = 2

# Fetch one listing page. The timeout prevents an indefinite hang and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(get_news_page_url("2025-10-30", 2), timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Each ".block1" element is treated as one article card.
news_per_page = soup.select(".block1")


for article_card in news_per_page:
  # The headline link holds both the title text and the detail-page href,
  # so it is selected once per card.
  subject_link = article_card.select_one(".articleSubject > a")

  # subject
  subject = subject_link.text

  # detail article link
  detail_article_url = get_detail_news_url(subject_link.get("href"))

  # content: teaser text first, then the press name and timestamp elements
  content_tag = article_card.select_one(".articleSummary")
  content = content_tag.contents[0].strip()
  press = content_tag.select_one(".press").text
  article_date = content_tag.select_one(".wdate").text

  print({
    "subject": subject,
    "detail_article_url": detail_article_url,
    "content": content,
    "press": press,
    "article_date": article_date
  })

{'subject': '엔비디아 시총 5조弗, 독일 GDP도 넘어섰다', 'detail_article_url': 'https://finance.naver.com/news/news_read.naver?article_id=0005204743&office_id=015&mode=mainnews&type=&date=2025-10-30&page=2', 'content': '이 기사는 국내 최대 해외 투자정보 플랫폼 한경 글로벌마켓에 게재된 기사입니다. 엔비디아의 시가총액이 29일(현지시간) 사상 처음으로 ..', 'press': '한국경제 ', 'article_date': '2025-10-30 17:39:16'}
{'subject': "AI투자 부담 … MS·메타 역대급 실적에도 '우울'", 'detail_article_url': 'https://finance.naver.com/news/news_read.naver?article_id=0005582100&office_id=009&mode=mainnews&type=&date=2025-10-30&page=2', 'content': '3분기 매출 큰폭 증가 불구 막대한 투자비용에 발목잡혀 AI 수익화 앞선 구글은 급등 글로벌 빅테크 기업인 구글, 마이크로소프트(MS)..', 'press': '매일경제 ', 'article_date': '2025-10-30 17:38:09'}
{'subject': '글로벌IB "K반도체 사라"…하이닉스 목표가 70만원', 'detail_article_url': 'https://finance.naver.com/news/news_read.naver?article_id=0005204717&office_id=015&mode=mainnews&type=&date=2025-10-30&page=2', 'content': '글로벌 투자은행(IB)이 ‘인공지능(AI) 붐’에 올라탄 SK하이닉스의 목표주가를 앞다퉈 높이고 있다. 고대역폭메모리(HBM) 반도체를..', 'press': '한국경제 ', 'article

# 페이지네이션을 따라 여러 페이지의 뉴스 크롤링

In [None]:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlencode
from datetime import datetime
from pydantic import BaseModel

ROOT = "https://finance.naver.com/"
PATH = "news/mainnews.naver"


def get_news_page_url(page: int, date: str):
  """Build the listing URL for one page of the main news of ``date``.

  The query string always lists ``date`` before ``page``.
  """
  endpoint = urljoin(ROOT, PATH)
  query = urlencode({"date": date, "page": page})

  return "?".join((endpoint, query))

def get_detail_news_url(path):
  """Turn an article href from the listing page into a URL based on ROOT."""
  absolute_url = urljoin(ROOT, path)
  return absolute_url

def get_current_date(format: str = "%Y-%m-%d") -> str:
  """Format the current local date/time with the ``strftime`` pattern ``format``.

  With the default pattern the result looks like ``2025-10-31``.
  """
  return datetime.now().strftime(format)


def crawl_news_per_page(soup: BeautifulSoup):
  """Extract every news card found in a parsed listing page.

  Each ``.block1`` element becomes a dict with the keys ``subject``,
  ``detail_article_url``, ``content``, ``press`` and ``article_date``.
  The dicts are returned in the order ``soup.select`` yields the cards.
  """
  def parse_card(card):
    # The headline anchor supplies both the title and the detail-page href.
    headline = card.select_one(".articleSubject > a")
    title = headline.text
    link = get_detail_news_url(headline.get("href"))

    # The summary block's first child is the teaser text; the press name
    # and the timestamp are read from nested elements.
    summary = card.select_one(".articleSummary")
    return {
      "subject": title,
      "detail_article_url": link,
      "content": summary.contents[0].strip(),
      "press": summary.select_one(".press").text,
      "article_date": summary.select_one(".wdate").text,
    }

  return [parse_card(card) for card in soup.select(".block1")]



def crawl_all_news(page_total_count: int, date: str):
  """Crawl listing pages 1..page_total_count for ``date`` and merge the items.

  Args:
    page_total_count: Upper bound on the number of listing pages to fetch.
    date: Listing date forwarded to ``get_news_page_url``.

  Returns:
    A list holding the items of every crawled page, in page order.

  Raises:
    requests.RequestException: If a page request fails, times out, or is
      answered with a 4xx/5xx status.
  """
  result = []

  for page in range(1, page_total_count + 1):
    request_url = get_news_page_url(page, date)
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(request_url, timeout=10)
    # Fail loudly on HTTP errors rather than parsing an error page as an
    # empty (and seemingly final) listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    result.extend(crawl_news_per_page(soup))

    # Last-page check: stop once the page has no ".pgRR" pager element.
    if not soup.select_one(".pgRR"):
      break

  return result



# Crawl at most 20 listing pages of today's news.
all_news = crawl_all_news(page_total_count=20, date=get_current_date())


94
{'subject': '“경쟁 통해 투자자 편익 증대… ETF 내년 상반기까지 지원”', 'detail_article_url': 'https://finance.naver.com/news/news_read.naver?article_id=0001811305&office_id=005&mode=mainnews&type=&date=2025-10-31&page=5', 'content': '대체거래소(ATS) 넥스트레이드의 성장 속도는 쉽게 예상하기 어려웠다. 출범 7개월 만에 첫 거래일(3월 4일) 대비 거래량이 386배..', 'press': '국민일보 ', 'article_date': '2025-10-31 00:03:11'}


# 검증(최종)

In [None]:

from typing import Annotated
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlencode
from datetime import datetime
from pydantic import AfterValidator, BaseModel, HttpUrl, field_validator


def non_empty_str(v: str) -> str:
  """Return ``v`` with surrounding whitespace removed.

  Raises:
    ValueError: If ``v`` is falsy or consists only of whitespace.
  """
  trimmed = v.strip() if v else ""
  if trimmed:
    return trimmed

  raise ValueError("빈 문자열입니다.")

def valid_url(url: str) -> str:
  """Return ``url`` unchanged if a GET request to it succeeds.

  Raises:
    ValueError: If the request fails, times out, or is answered with a
      4xx/5xx status code. The original error is attached as the cause.
  """
  try:
    # Bound the wait so one unresponsive link cannot stall validation.
    response = requests.get(url, timeout=10)
    # throw error if status 4xx, 5xx
    response.raise_for_status()
  except Exception as err:
    # Deliberately broad: any failure is reported as a ValueError, as before;
    # chaining keeps the underlying reason visible in the traceback.
    raise ValueError("유효하지 않은 URL입니다.") from err

  return url

def valid_date(date: str) -> str:
  """Validate a ``YYYY-MM-DD HH:MM:SS`` timestamp string.

  Surrounding whitespace is ignored during the check and removed from the
  returned value, matching how ``non_empty_str`` normalizes its input.

  Raises:
    ValueError: If the value is empty or does not match the format.
  """
  v = (date or "").strip()

  if not v:
    raise ValueError("빈 문자열입니다.")

  try:
    datetime.strptime(v, "%Y-%m-%d %H:%M:%S")
  except ValueError as err:
    raise ValueError("유효하지 않은 날짜 형식입니다.") from err

  # Return the stripped value that was actually validated, not the raw input.
  return v

# Reusable field types: a plain str that is post-processed by one of the
# validator functions defined in this cell.
_NonEmptyText = Annotated[str, AfterValidator(non_empty_str)]
_ReachableUrl = Annotated[str, AfterValidator(valid_url)]
_Timestamp = Annotated[str, AfterValidator(valid_date)]


class NewsItem(BaseModel):
  """One crawled news entry; each field runs through its validator function."""

  subject: _NonEmptyText
  detail_article_url: _ReachableUrl
  content: _NonEmptyText
  press: _NonEmptyText
  article_date: _Timestamp


ROOT = "https://finance.naver.com/"
PATH = "news/mainnews.naver"


def get_news_page_url(page: int, date: str):
  """Compose the main-news listing URL for ``date`` and ``page``.

  ``date`` and ``page`` are URL-encoded, in that order, and appended to the
  listing endpoint (PATH resolved against ROOT).
  """
  encoded_query = urlencode({"date": date, "page": page})

  return "{}?{}".format(urljoin(ROOT, PATH), encoded_query)

def get_detail_news_url(path):
  """Join an article href from the listing page onto ROOT."""
  full_url = urljoin(ROOT, path)
  return full_url

def get_current_date(format: str = "%Y-%m-%d") -> str:
  """Return "now" (local time) as text, using the ``strftime`` pattern ``format``.

  The default pattern produces a ``YYYY-MM-DD`` date string.
  """
  return datetime.now().strftime(format)


def crawl_news_per_page(soup: BeautifulSoup):
  """Collect validated news entries from a parsed listing page.

  Every ``.block1`` card is parsed, passed through ``NewsItem`` for
  validation, and added to the returned list as a plain dict
  (``model_dump()``).
  """
  validated_items = []

  for card in soup.select(".block1"):
    # The headline anchor supplies both the title and the detail-page href.
    headline = card.select_one(".articleSubject > a")
    title = headline.text
    link = get_detail_news_url(headline.get("href"))

    # The summary block: teaser text first, then press name and timestamp.
    summary = card.select_one(".articleSummary")
    fields = {
      "subject": title,
      "detail_article_url": link,
      "content": summary.contents[0].strip(),
      "press": summary.select_one(".press").text.strip(),
      "article_date": summary.select_one(".wdate").text.strip(),
    }

    validated_items.append(NewsItem(**fields).model_dump())

  return validated_items



def crawl_all_news(page_total_count: int, date: str):
  """Crawl listing pages 1..page_total_count for ``date`` and merge the items.

  Args:
    page_total_count: Upper bound on the number of listing pages to fetch.
    date: Listing date forwarded to ``get_news_page_url``.

  Returns:
    A list holding the items of every crawled page, in page order.

  Raises:
    requests.RequestException: If a page request fails, times out, or is
      answered with a 4xx/5xx status.
  """
  result = []

  for page in range(1, page_total_count + 1):
    request_url = get_news_page_url(page, date)
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(request_url, timeout=10)
    # Fail loudly on HTTP errors rather than parsing an error page as an
    # empty (and seemingly final) listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    result.extend(crawl_news_per_page(soup))

    # Last-page check (per the original author's note, the Naver Finance
    # pager has no "next" navigation button on the last page).
    if not soup.select_one(".pgRR"):
      break

  return result



# Crawl at most 20 listing pages of today's news and show the collected items.
all_news = crawl_all_news(page_total_count=20, date=get_current_date())
print(all_news)



########## 검증 테스트 ##########
# subject= '“경쟁 통해 투자자 편익 증대… ETF 내년 상반기까지 지원”'
# detail_article_url= 'https://finance.naver.com/news/news_read.naver?article_id=0001811305&office_id=005&mode=mainnews&type=&date=2025-10-31&page=5'
# content= '대체거래소(ATS) 넥스트레이드의 성장 속도는 쉽게 예상하기 어려웠다. 출범 7개월 만에 첫 거래일(3월 4일) 대비 거래량이 386배..'
# press= '국민일보 '
# article_date= '2025-10-31 00:03:11'

# result = NewsItem(subject=subject, detail_article_url=detail_article_url, content=content, press=press, article_date=article_date)
# print(result.model_dump())

# print(type(result.model_dump()))


[{'subject': '코스피, 4100선 뚫어 또 최고치…"대형주 실적 모멘텀 견조"[마감]', 'detail_article_url': 'https://finance.naver.com/news/news_read.naver?article_id=0006152889&office_id=018&mode=mainnews&type=&date=2025-10-31&page=1', 'content': '코스피가 4100선도 뚫으며 종가 기준 사상 최고치를 또 갈아치웠다. 미국 주식시장 하락에도 한국 대형주 실적 모멘텀이 견조해 지수 상..', 'press': '이데일리', 'article_date': '2025-10-31 15:52:14'}, {'subject': '‘거래인가, 사면인가’… 트럼프, 자오창펑 손잡고 1.4조 ‘잭팟’', 'detail_article_url': 'https://finance.naver.com/news/news_read.naver?article_id=0005582540&office_id=009&mode=mainnews&type=&date=2025-10-31&page=1', 'content': 'WSJ “바이낸스, 트럼프 일가 활용 2조원대 빅딜” UAE 국부펀드 투자금 ‘트럼프 코인’ 결제 요청 ‘사면 로비’ 의혹 자금세탁방지..', 'press': '매일경제', 'article_date': '2025-10-31 14:35:10'}, {'subject': '출혈 경쟁속 부상한 \'AI 버블\'…"미 증시 조정 경고"', 'detail_article_url': 'https://finance.naver.com/news/news_read.naver?article_id=0001229097&office_id=215&mode=mainnews&type=&date=2025-10-31&page=1', 'content': '미국 빅테크들이 실적 발표 이후 명암이 엇갈리고 있습니다. 때되면 다시 불거지는 AI 거품론이 시장을 흔들고 있는데요. 핵심은 AI 투..',

# 엑셀 저장하기

In [3]:
import pandas as pd

from pathlib import Path

# Flatten each news dict into a row whose order matches the sheet columns.
all_news_2d = [
  [item["subject"], item["detail_article_url"], item["content"], item["press"], item["article_date"]]
  for item in all_news
]
df = pd.DataFrame(all_news_2d, columns=["제목", "기사 링크", "내용", "언론사", "날짜"])

# to_excel() does not create missing directories, so make sure the output
# folder exists before writing.
output_path = Path("./outputs/naver_finance_news.xlsx")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_excel(output_path, index=False)
