In [None]:
# !pip install finnhub-python

In [1]:
import os
from datetime import datetime, timedelta

import finnhub
import numpy as np
import pandas as pd
import yfinance as yf

# from google.colab import userdata

In [None]:
np.random.seed(42)
ticker = "^GSPC"
# end = datetime.today() - timedelta(days=1)
# start = end - timedelta(days=30)
start_str, end_str = "2024-07-25", "2025-07-25"

In [3]:
stock_df = yf.download(
    ticker, start=start_str, end=end_str, auto_adjust=True, progress=False
)[["Close"]].reset_index()
stock_df.columns = ["date", "price"]
stock_df

Unnamed: 0,date,price
0,2024-07-22,123.500984
1,2024-07-23,122.551285
2,2024-07-24,114.213913
3,2024-07-25,112.244537
4,2024-07-26,113.024284
...,...,...
245,2025-07-15,170.699997
246,2025-07-16,171.369995
247,2025-07-17,173.000000
248,2025-07-18,172.410004


In [9]:
finnhub_client = finnhub.Client(api_key=os.getenv("FINNHUB_API_KEY"))

In [10]:
def fetch_news_in_chunks(ticker, start_date, end_date, client, chunk_days=7):
    all_news = []
    current = start_date
    while current <= end_date:
        chunk_end = min(current + timedelta(days=chunk_days - 1), end_date)
        news_chunk = client.company_news(
            ticker,
            _from=current.strftime("%Y-%m-%d"),
            to=chunk_end.strftime("%Y-%m-%d"),
        )
        all_news.extend(news_chunk)
        current = chunk_end + timedelta(days=1)
    return pd.DataFrame(all_news)

In [11]:
start_dt = datetime.strptime(start_str, "%Y-%m-%d").date() - timedelta(days=2)
end_dt = (stock_df["date"].max()).date()
news = fetch_news_in_chunks(ticker, start_dt, end_dt, finnhub_client, 7)

In [12]:
news_df = news[["datetime", "headline", "summary"]].copy()

news_df = news_df[
    news_df["datetime"].apply(lambda x: isinstance(x, (int, float)) and x > 0)
]
news_df["datetime"] = pd.to_datetime(news_df["datetime"], unit="s")
news_df["assigned_date"] = news_df["datetime"].apply(
    lambda x: (x + timedelta(days=1)).date() if x.hour >= 16 else x.date()
)

In [14]:
news_df.head()

Unnamed: 0,datetime,headline,summary,assigned_date
0,2024-08-01 23:02:34,Nvidia Stock Sank Today -- Is It Time to Buy t...,What's up next for Nvidia stock after an incre...,2024-08-02
1,2024-08-01 22:36:09,US Justice Dept. is investigating Nvidia's acq...,Nvidia announced the acquisition of the Israel...,2024-08-02
2,2024-08-01 22:13:28,US launches Nvidia antitrust probe after rival...,The U.S. Department of Justice has launched an...,2024-08-02
3,2024-08-01 21:36:34,"NVIDIA, Maplebear And Two Other Stocks Insider...","The Nasdaq Composite jumped 2.64% at 17,599.40...",2024-08-02
4,2024-08-01 20:59:38,Nvidia faces US DOJ probe over complaints from...,Antitrust officials at the U.S.Department of J...,2024-08-02


In [20]:
news_df = news_df[
    (news_df["assigned_date"] >= stock_df["date"].min().date())
    & (news_df["assigned_date"] <= stock_df["date"].max().date())
].copy()
news_df["assigned_date"].value_counts().sort_values().head(10)

assigned_date
2024-08-05    1
2024-10-28    1
2025-06-28    1
2024-10-25    2
2025-01-19    2
2025-03-09    2
2025-02-09    3
2025-05-23    3
2025-03-02    3
2025-01-18    4
Name: count, dtype: int64

In [24]:
news_df.sort_values(by="assigned_date", inplace=True)
news_df

Unnamed: 0,datetime,headline,summary,assigned_date
227,2024-07-30 14:00:00,Down Between 17% and 35% From Their 52-Week Hi...,There are plenty of different ways to invest i...,2024-07-30
226,2024-07-30 14:15:00,1 Top Artificial Intelligence (AI) Stock Billi...,Some hedge funds have been selling Nvidia and ...,2024-07-30
225,2024-07-30 14:16:15,"Stock market news today: Nasdaq sinks, Nvidia ...",A packed day of earnings and the start of the ...,2024-07-30
224,2024-07-30 14:26:00,More Big Tech Earnings Are Coming. What’s Next...,Microsoft will kick off this week’s Big Tech e...,2024-07-30
223,2024-07-30 14:54:00,"Sensata (ST) Q2 Earnings Meet Estimates, Reven...",Sensata (ST) second-quarter revenues are drive...,2024-07-30
...,...,...,...,...
10958,2025-07-21 13:05:27,Digi Power X to Raise US$15 Million in Direct ...,"Digi Power X (Nasdaq: DGXX and TSXV: DGX), an ...",2025-07-21
10957,2025-07-21 13:22:00,WeRide Teams Up With Lenovo to Launch 100% Aut...,"GUANGZHOU, China, July 21, 2025 (GLOBE NEWSWIR...",2025-07-21
10956,2025-07-21 13:39:19,Amazon Investors Search for Signs of AI Lift W...,(Bloomberg) -- Aggressive spending on artifici...,2025-07-21
10984,2025-07-21 01:30:00,Nvidia and Broadcom: Here's How These Top AI S...,Nvidia and Broadcom both have reported soaring...,2025-07-21


In [28]:
# remove stock price data before 7-30-2024
stock_df["date"] = pd.to_datetime(stock_df["date"]).dt.date
stock_df = stock_df[
    (stock_df["date"] >= news_df["assigned_date"].min())
    & (stock_df["date"] <= news_df["assigned_date"].max())
].copy()
stock_df.head()

Unnamed: 0,date,price
6,2024-07-30,103.697243
7,2024-07-31,116.98304
8,2024-08-01,109.175514
9,2024-08-02,107.236115
10,2024-08-05,100.418274


In [30]:
news_df.to_csv("../../data/processed/7-30-2024_to_7-21-2025_nvda_news.csv", index=False)
stock_df.to_csv(
    "../../data/processed/7-30-2024_to_7-21-2025_nvda_stock.csv", index=False
)