In [1]:
!pip install selenium
!pip install beautifulsoup4
!pip install feedparser newspaper3k waybackpy pandas
!pip install requests
!pip install "lxml[html_clean]"

Collecting selenium
  Downloading selenium-4.31.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.31.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post0-py2.py3-

In [2]:
import feedparser
from newspaper import Article
import pandas as pd
import time
from datetime import datetime, timedelta
import os
import urllib.parse
import re
import requests
import tempfile

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Magnificent-7 tickers mapped to the company name used to build the search query.
m7_stocks = dict(
    AAPL="Apple",
    MSFT="Microsoft",
    NVDA="Nvidia",
    TSLA="Tesla",
    AMZN="Amazon",
    META="Meta",
    GOOGL="Google",
)

# Collection period -- edit these two dates to change what gets scraped.
start_date = datetime(year=2020, month=1, day=1)
end_date = datetime(year=2025, month=4, day=27)

# Accumulator that receives one dict per collected article.
all_articles = list()

# Extract the original news-article URLs from a Google News RSS search.
def fetch_rss_links(query, from_date, to_date, max_links=20):
    """Query the Google News RSS search feed and return the article links it lists.

    Args:
        query: Free-text search terms.
        from_date: Lower bound, appended to the query as ``after:<from_date>``.
        to_date: Upper bound, appended to the query as ``before:<to_date>``.
        max_links: Maximum number of feed entries to inspect. Entries that are
            skipped still count against this limit, so fewer links may come back.

    Returns:
        A list of ``(url, published, title)`` tuples in feed order. When the
        feed link carries the target in a ``url=`` parameter, that target is
        extracted and percent-decoded; otherwise the feed link is used as-is.
    """
    query_full = f"{query} after:{from_date} before:{to_date}"
    query_encoded = urllib.parse.quote_plus(query_full)
    query_url = f"https://news.google.com/rss/search?q={query_encoded}&hl=en-US&gl=US&ceid=US:en"
    feed = feedparser.parse(query_url)

    links = []
    for entry in feed.entries[:max_links]:
        try:
            google_url = entry.link
            # Skip consent-wall redirects; they never lead to an article.
            if "consent.google.com" in google_url:
                continue
            match = re.search(r"url=(https?[^&]+)", google_url)
            if match:
                # The embedded target is percent-encoded inside the query
                # string; decode it so it can be opened directly.
                real_url = urllib.parse.unquote(match.group(1))
            else:
                real_url = google_url
            links.append((real_url, entry.published, entry.title))
        except (AttributeError, KeyError, TypeError):
            # Best effort: an entry missing link/published/title is skipped,
            # but unrelated errors (and Ctrl-C) are no longer swallowed.
            continue
    return links

# Extract the page text with Selenium, using a unique Chrome user-data directory per call.
def extract_article_with_selenium(url):
    """Load ``url`` in headless Chrome and return the text of its ``<body>``.

    A fresh temporary directory is created for every call and passed to Chrome
    as ``--user-data-dir``; it is removed when the call finishes.

    Args:
        url: Address of the page to load.

    Returns:
        The ``<body>`` element's text, or ``None`` when starting the browser,
        loading the page, or reading the body raises. The error is printed.
    """
    # A new temporary profile folder for every call.
    with tempfile.TemporaryDirectory() as tmpdirname:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument(f"--user-data-dir={tmpdirname}")  # unique folder per call

        driver = None
        try:
            # Start Chrome inside the try block so a start-up failure is
            # reported like any other per-URL error instead of aborting the run.
            driver = webdriver.Chrome(options=chrome_options)
            driver.set_page_load_timeout(30)

            driver.get(url)

            # Wait (up to 10 s) until the <body> element exists.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            time.sleep(2)  # extra settle time for late-rendered content

            return driver.find_element(By.TAG_NAME, "body").text

        except Exception as e:
            print(f"❌ Selenium error for URL: {url}\n{e}")
            return None

        finally:
            # Only quit a browser that actually started.
            if driver is not None:
                driver.quit()



# Collection loop: for every ticker, walk the period in 30-day windows,
# fetch the RSS links for each window and scrape every linked page.
for ticker, name in m7_stocks.items():
    print(f"\n🔍 [{ticker}] Collecting news...")

    current_date = start_date
    while current_date < end_date:
        # Clamp the last window so the query never reaches past end_date.
        window_end = min(current_date + timedelta(days=30), end_date)
        from_str = current_date.strftime("%Y-%m-%d")
        to_str = window_end.strftime("%Y-%m-%d")

        links = fetch_rss_links(f"{name} stock", from_str, to_str, max_links=20)  # at most 20 feed entries per window
        print(f"🗓️  {from_str} ~ {to_str}: {len(links)} links found")

        for url, pub_date, title in links:
            try:
                # Only the leading "Wed, 01 Jan 2020" part of the timestamp is parsed.
                pub_dt = datetime.strptime(pub_date[:16], "%a, %d %b %Y")
            except (TypeError, ValueError):
                # Unparseable or missing publication date: skip this link.
                continue
            pub_str = pub_dt.strftime("%Y-%m-%d")

            article_text = extract_article_with_selenium(url)
            # Very short bodies are treated as failed extractions.
            if not article_text or len(article_text) < 200:
                print(f"⚠️ Skipped: {url}")
                continue

            all_articles.append({
                "date": pub_str,
                "ticker": ticker,
                "company": name,
                "title": title,
                "text": article_text,
                "original_url": url
            })

            time.sleep(1)  # pause between successfully scraped pages

        current_date += timedelta(days=30)

# Gather every collected article into a single DataFrame.
# Explicit columns keep the schema (and the later CSV header) intact even
# when nothing was collected. The CSV itself is written in the final cell.
df = pd.DataFrame(
    all_articles,
    columns=["date", "ticker", "company", "title", "text", "original_url"],
)
os.makedirs("output", exist_ok=True)
print(f"\n✅ Total articles collected: {len(df)}")
df.head()


🔍 [AAPL] Collecting news...
🗓️  2020-01-01 ~ 2020-01-31: 20 links found


KeyboardInterrupt: 

In [None]:
df # inspect the DataFrame first, before saving it

In [None]:
# Final step: write the inspected DataFrame to a single CSV file.
output_path = "output/M7_news_2020_2025.csv"
# Create the target folder here as well, so this cell does not depend on the
# directory having been created by the end of the long collection cell.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print(f"📁 Saved to: {output_path}")