In [None]:
# Install the scraping dependencies.
# `%pip` (instead of `!pip`) installs into the environment of the running
# kernel, and `-q` keeps the long installer log out of the notebook.
# feedparser / newspaper3k are pinned to the versions that a previous run of
# this cell resolved; "lxml[html_clean]" also covers the plain `lxml` install.
%pip install -q feedparser==6.0.11 newspaper3k==0.2.8 beautifulsoup4 "lxml[html_clean]"
%pip install -q pandas requests selenium

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspap

In [None]:
# Install the Chromium WebDriver that Selenium drives.
# `apt-get` is used because plain `apt` warns that its CLI is not stable for
# scripted use; `-y` answers the confirmation prompt up front, since a notebook
# cell has nobody to type "Y".
!apt-get update -qq
!apt-get install -y -qq chromium-chromedriver
# Copy the driver onto PATH only when it exists at this location, so the cell
# does not end with a spurious `cp` error on images that lay it out differently.
!test -f /usr/lib/chromium-browser/chromedriver && cp /usr/lib/chromium-browser/chromedriver /usr/bin

[33m0% [Working][0m            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
[33m0% [Waiting for headers] [Connecting to security.ubuntu.com (91.189.91.81)] [Co[0m                                                                               Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
[33m0% [2 InRelease 54.7 kB/128 kB 43%] [Connecting to security.ubuntu.com (91.189.[0m                                                                               Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:7 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Hi

In [None]:
import feedparser
from newspaper import Article
import pandas as pd
import time
from datetime import datetime, timedelta
import os
import urllib.parse
import re
import requests
import tempfile

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# M7 tickers mapped to the company name that goes into the search query
m7_stocks = dict(
    AAPL="Apple",
    MSFT="Microsoft",
    NVDA="Nvidia",
    TSLA="Tesla",
    AMZN="Amazon",
    META="Meta",
    GOOGL="Google",
)

# Collection period -> edit the dates here (2022-09-01 through 2023-12-31)
start_date = datetime(year=2022, month=9, day=1)
end_date = datetime(year=2023, month=12, day=31)

# Accumulator for the collected article records
all_articles = list()

def _unwrap_redirect(link):
    """Return the target of a ``url=`` redirect parameter, or ``link`` unchanged."""
    query = urllib.parse.urlsplit(link).query
    # parse_qs percent-decodes the value, so an encoded target such as
    # "https%3A%2F%2Fsite%2Fpage" comes back as a usable URL.
    for target in urllib.parse.parse_qs(query).get("url", []):
        if target.startswith(("http://", "https://")):
            return target
    return link


# Extract the original news-article URLs from a Google News RSS search
def fetch_rss_links(query, from_date, to_date, max_links=20):
    """Query the Google News RSS search feed and collect article links.

    Args:
        query: Free-text search terms.
        from_date: Lower date bound, placed after the ``after:`` search operator.
        to_date: Upper date bound, placed after the ``before:`` search operator.
        max_links: Maximum number of feed entries to inspect.

    Returns:
        A list of ``(url, published, title)`` tuples. ``published`` is the raw
        date string carried by the feed entry. Entries lacking a link, a
        publication date or a title are skipped, as are links that point at
        ``consent.google.com``.
    """
    query_full = f"{query} after:{from_date} before:{to_date}"
    query_encoded = urllib.parse.quote_plus(query_full)
    query_url = f"https://news.google.com/rss/search?q={query_encoded}&hl=en-US&gl=US&ceid=US:en"
    feed = feedparser.parse(query_url)

    links = []
    for entry in feed.entries[:max_links]:
        google_url = getattr(entry, "link", None)
        published = getattr(entry, "published", None)
        title = getattr(entry, "title", None)
        # An incomplete entry is skipped explicitly instead of being swallowed
        # by a catch-all handler that would also hide genuine bugs.
        if not google_url or published is None or title is None:
            continue
        # Skip links that lead to Google's consent page
        if "consent.google.com" in google_url:
            continue
        links.append((_unwrap_redirect(google_url), published, title))
    return links

# Extract the article body with Selenium (every call gets its own user-data directory)
def extract_article_with_selenium(url):
    """Load ``url`` in headless Chrome and return the visible text of ``<body>``.

    Args:
        url: Address of the page to open.

    Returns:
        The text of the page's ``<body>`` element, or ``None`` when loading
        fails or when the browser ends up on Google's consent page instead of
        the article.
    """
    # A fresh temporary profile directory per call, removed when the block exits
    with tempfile.TemporaryDirectory() as tmpdirname:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument(f"--user-data-dir={tmpdirname}")  # <- unique profile folder

        # Start Chrome. A failure here is left to propagate: it signals a broken
        # driver installation rather than a problem with one particular URL.
        driver = webdriver.Chrome(options=chrome_options)

        try:
            # Inside the try block so the browser is still shut down if
            # configuring the timeout raises.
            driver.set_page_load_timeout(30)
            driver.get(url)

            # Wait for the page to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            time.sleep(2)  # give late-loading content time to appear

            # Google News links can redirect to a consent interstitial; its
            # "Before you continue to Google" text must not be stored as the
            # article body.
            if "consent.google.com" in driver.current_url:
                print(f"⚠️ Consent page instead of article: {url}")
                return None

            text = driver.find_element(By.TAG_NAME, "body").text
            return text

        except Exception as e:
            print(f"❌ Selenium error for URL: {url}\n{e}")
            return None

        finally:
            driver.quit()



# Start the collection loop: for every ticker, walk the period in 30-day windows
for ticker, name in m7_stocks.items():
    print(f"\n🔍 [{ticker}] Collecting news...")
    # URLs already handled for this ticker, so an article returned by more than
    # one search window is scraped and stored only once.
    seen_urls = set()

    current_date = start_date
    while current_date < end_date:
        # Clamp the window so the last one does not reach past end_date
        window_end = min(current_date + timedelta(days=30), end_date)
        from_str = current_date.strftime("%Y-%m-%d")
        to_str = window_end.strftime("%Y-%m-%d")

        links = fetch_rss_links(f"{name} stock", from_str, to_str, max_links=20)  # ⬅️ at most 20 entries per window
        print(f"🗓️  {from_str} ~ {to_str}: {len(links)} links found")

        for url, pub_date, title in links:
            if url in seen_urls:
                continue
            seen_urls.add(url)

            # Only the leading "Thu, 13 Mar 2025" part of the date string is
            # parsed; entries whose date does not fit that layout are skipped.
            try:
                pub_dt = datetime.strptime(pub_date[:16], "%a, %d %b %Y")
            except (ValueError, TypeError):
                continue
            pub_str = pub_dt.strftime("%Y-%m-%d")

            article_text = extract_article_with_selenium(url)
            # Missing or very short text (< 200 characters) is discarded
            if not article_text or len(article_text) < 200:
                print(f"⚠️ Skipped: {url}")
                continue

            all_articles.append({
                "date": pub_str,
                "ticker": ticker,
                "company": name,
                "title": title,
                "text": article_text,
                "original_url": url
            })

            time.sleep(1)  # short pause between page loads

        current_date += timedelta(days=30)

# Gather the results into a single DataFrame and make sure the output folder
# exists; the CSV itself is written by a later cell.
df = pd.DataFrame(all_articles)
os.makedirs("output", exist_ok=True)
print(f"\n✅ Total articles collected: {len(df)}")


🔍 [AAPL] Collecting news...
🗓️  2025-03-01 ~ 2025-03-31: 20 links found
🗓️  2025-03-31 ~ 2025-04-30: 20 links found

🔍 [MSFT] Collecting news...
🗓️  2025-03-01 ~ 2025-03-31: 20 links found
🗓️  2025-03-31 ~ 2025-04-30: 20 links found

🔍 [NVDA] Collecting news...
🗓️  2025-03-01 ~ 2025-03-31: 20 links found
🗓️  2025-03-31 ~ 2025-04-30: 20 links found

🔍 [TSLA] Collecting news...
🗓️  2025-03-01 ~ 2025-03-31: 20 links found
🗓️  2025-03-31 ~ 2025-04-30: 20 links found

🔍 [AMZN] Collecting news...
🗓️  2025-03-01 ~ 2025-03-31: 20 links found
🗓️  2025-03-31 ~ 2025-04-30: 20 links found

🔍 [META] Collecting news...
🗓️  2025-03-01 ~ 2025-03-31: 20 links found
🗓️  2025-03-31 ~ 2025-04-30: 20 links found

🔍 [GOOGL] Collecting news...
🗓️  2025-03-01 ~ 2025-03-31: 20 links found
🗓️  2025-03-31 ~ 2025-04-30: 20 links found

✅ Total articles collected: 280
📁 Saved to: output/M7_news_2020_2025.csv


Unnamed: 0,date,ticker,company,title,text,original_url
0,2025-03-13,AAPL,Apple,Apple Stock Bulls Are Losing Confidence In iPh...,EN\nSign in\nBefore you continue to Google\nWe...,https://news.google.com/rss/articles/CBMioAFBV...
1,2025-03-18,AAPL,Apple,Apple stock rout shaves nearly $700 billion fr...,EN\nSign in\nBefore you continue to Google\nWe...,https://news.google.com/rss/articles/CBMiwAFBV...
2,2025-03-12,AAPL,Apple,Watch These Apple Stock Price Levels as Sell-O...,EN\nSign in\nBefore you continue to Google\nWe...,https://news.google.com/rss/articles/CBMiowFBV...
3,2025-03-19,AAPL,Apple,"‘Bears Beware,’ Says Daniel Ives About Apple S...",EN\nSign in\nBefore you continue to Google\nWe...,https://news.google.com/rss/articles/CBMiqgFBV...
4,2025-03-17,AAPL,Apple,Apple AI Delay May Be an Opportunity for the S...,EN\nSign in\nBefore you continue to Google\nWe...,https://news.google.com/rss/articles/CBMieEFVX...
...,...,...,...,...,...,...
275,2025-04-25,GOOGL,Google,Did Google Earnings Results Put Search Worries...,EN\nSign in\nBefore you continue to Google\nWe...,https://news.google.com/rss/articles/CBMinwFBV...
276,2025-04-17,GOOGL,Google,Federal Judge Rules Against Google In Advertis...,EN\nSign in\nBefore you continue to Google\nWe...,https://news.google.com/rss/articles/CBMiuwFBV...
277,2025-04-24,GOOGL,Google,Google Stock: Google Earnings Beat As Internet...,EN\nSign in\nBefore you continue to Google\nWe...,https://news.google.com/rss/articles/CBMiiwFBV...
278,2025-04-28,GOOGL,Google,Here's How Waymo's Robotaxi Momentum Will Pay ...,EN\nSign in\nBefore you continue to Google\nWe...,https://news.google.com/rss/articles/CBMikAFBV...


In [None]:
# Display the DataFrame of collected articles as this cell's output
df

In [None]:
# Write the collected articles to a single CSV file (row index left out)
df.to_csv(path_or_buf="output/M7_news_2020_2025.csv", index=False)
print("📁 Saved to: output/M7_news_2020_2025.csv")