# 1. Import

In [39]:
import re
import string
from pathlib import Path

import joblib
import nltk
import numpy as np
import openpyxl
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

# Directory the notebook reads its input files from and writes derived files to;
# the path is relative, so it resolves against the kernel's working directory.
DATA_PATH = Path("../data/")

In [None]:
# Fetch each NLTK resource by name, in a fixed order.
for resource in ('punkt', 'wordnet', 'stopwords', 'omw-1.4', 'punkt_tab'):
    nltk.download(resource)

# 2. Load Data

In [7]:
# Both source files live under DATA_PATH.
headlines_path = DATA_PATH / "NVIDIA_NewsHeadlines_20241101-Present.csv"
stock_path = DATA_PATH / "NVIDIA_StockPriceDaily_20241101-Present.xlsx"

# Headlines: read the CSV and discard its first column by position.
headlines_df = pd.read_csv(headlines_path).iloc[:, 1:]

# Prices: the sheet is read without a header row, so the columns are named by hand.
stock_df = pd.read_excel(stock_path, engine='openpyxl', header=None)
stock_df.columns = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']

# Give both frames a calendar-day `date` column holding datetime.date objects.
headlines_df["date"] = pd.to_datetime(headlines_df["seendate"]).dt.date
stock_df["date"] = pd.to_datetime(stock_df["date"]).dt.date

In [8]:
# Preview the first five price rows.
stock_df.head(n=5)

Unnamed: 0,date,open,high,low,close,adj_close,volume
0,2025-11-17,185.97,189.0,184.32,186.6,186.6,172821100
1,2025-11-14,182.86,191.01,180.58,190.17,190.17,186591900
2,2025-11-13,191.05,191.44,183.85,186.86,186.86,207423100
3,2025-11-12,195.72,195.89,191.13,193.8,193.8,154935300
4,2025-11-11,195.16,195.42,191.3,193.16,193.16,176483300


In [9]:
# Preview the first five headline rows.
headlines_df.head(n=5)

Unnamed: 0,title,seendate,url,sourcecountry,language,domain,date
0,Why Nvidia stock is sinking today,2024-11-01 01:30:00+00:00,https://www.fool.com.au/2024/11/01/why-nvidia-...,Australia,English,fool.com.au,2024-11-01
1,Stock market suffers a Halloween selloff as te...,2024-11-01 01:30:00+00:00,https://www.morningstar.com/news/marketwatch/2...,China,English,morningstar.com,2024-11-01
2,US close : Stocks sharply lower following tech...,2024-11-01 01:45:00+00:00,https://www.sharecast.com/news/market-report-u...,United Kingdom,English,sharecast.com,2024-11-01
3,Why Nvidia Stock Is Sinking Today | The Motley...,2024-11-01 02:15:00+00:00,https://www.fool.com/investing/2024/10/31/why-...,United States,English,fool.com,2024-11-01
4,Billionaire Philippe Laffont of Coatue Is Dump...,2024-11-01 03:00:00+00:00,https://finance.yahoo.com/news/billionaire-phi...,United States,English,finance.yahoo.com,2024-11-01


In [15]:
# Column layout for the merged frame: headline fields first, then price fields.
desired_order = [
    'language', 'sourcecountry', 'seendate', 'date', 'url', 'title', 'domain',
    'open', 'high', 'low', 'close', 'adj_close', 'volume',
]

# Inner join on `date`: only dates present in both frames are kept.
merged_df = headlines_df.merge(stock_df, how='inner', on='date')[desired_order]

merged_df.head()

Unnamed: 0,language,sourcecountry,seendate,date,url,title,domain,open,high,low,close,adj_close,volume
0,English,Australia,2024-11-18 03:45:00+00:00,2024-11-18,https://www.fool.com.au/2024/11/18/prediction-...,Prediction : Nvidia stock is going to soar aft...,fool.com.au,139.5,141.55,137.15,140.15,140.11,221205300
1,English,Cyprus,2024-11-18 04:00:00+00:00,2024-11-18,https://cyprus-mail.com/2024/11/18/softbank-fi...,SoftBank first to receive new Nvidia chips for...,cyprus-mail.com,139.5,141.55,137.15,140.15,140.11,221205300
2,English,United States,2024-11-18 06:30:00+00:00,2024-11-18,https://247wallst.com/market-news/2024/11/17/n...,Nasdaq Futures Up Sunday Night : NVIDIA Earnin...,247wallst.com,139.5,141.55,137.15,140.15,140.11,221205300
3,English,United States,2024-11-18 11:00:00+00:00,2024-11-18,https://www.benzinga.com/24/11/42029943/dow-tu...,Dow Tumbles Over 300 Points Following Economic...,benzinga.com,139.5,141.55,137.15,140.15,140.11,221205300
4,English,United States,2024-11-18 11:30:00+00:00,2024-11-18,https://www.fool.com/investing/2024/11/18/nvid...,Prediction : Nvidia Stock Will Soar After Nov ...,fool.com,139.5,141.55,137.15,140.15,140.11,221205300


In [None]:
# Write the merged headline/price table to disk without the row index.
merged_csv_path = DATA_PATH / "NVIDIA_Merged_20241101-Present.csv"
merged_df.to_csv(merged_csv_path, index=False)

# 3. Split Data

In [59]:
# -------------------------------------------------------------------
# Train / test split by date (training = pre 2025-11-01)
# -------------------------------------------------------------------
SPLIT_DATE = pd.Timestamp('2025-11-01')

# `date` holds datetime.date objects (it was built with `.dt.date`). pandas >= 2.0
# raises TypeError when a date is ordered against a Timestamp, so convert the
# column to datetime64 first and compare like with like.
headline_dates = pd.to_datetime(merged_df['date'])

train_df = merged_df[headline_dates < SPLIT_DATE].copy()
test_df  = merged_df[headline_dates >= SPLIT_DATE].copy()

# 4. Text Cleaning for TF-IDF

In [62]:
# -------------------------------------------------------------------
# Text cleaning
# -------------------------------------------------------------------

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Add nvidia variants to custom stopwords
custom_stop = {"nvidia", "nvidias", "nvdia"}

# Built once rather than on every call: deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalise one headline into a space-joined string of lemmatised tokens.

    Steps, in order: lowercase, delete ASCII punctuation, delete digit runs,
    tokenise with ``nltk.word_tokenize``, drop tokens found in ``stop_words``
    or ``custom_stop``, then lemmatise the remaining tokens.

    Parameters
    ----------
    text : str or missing value
        Raw headline. ``None``/``NaN`` (e.g. an empty CSV field) yields ``''``
        instead of raising; any other non-string value is converted with
        ``str`` first.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces (possibly empty).
    """
    if not isinstance(text, str):
        # Missing titles arrive as float NaN / None and have no `.lower()`.
        if pd.isna(text):
            return ''
        text = str(text)

    text = text.lower().translate(_PUNCT_TABLE)
    text = re.sub(r'\d+', '', text)

    tokens = nltk.word_tokenize(text)

    # Stop words are filtered before lemmatising, so matching is on surface forms.
    kept = [w for w in tokens if w not in stop_words and w not in custom_stop]
    return ' '.join(lemmatizer.lemmatize(w) for w in kept)

# Clean every headline in both splits; the result goes into a new column.
for split_df in (train_df, test_df):
    split_df["cleaned_title"] = split_df["title"].map(preprocess_text)

In [63]:
def _join_titles_per_day(split_df):
    """Concatenate all cleaned titles that share a date into one document.

    Returns a frame with columns ``date`` and ``cleaned_title``, one row per
    date, sorted by date.
    """
    per_day = split_df.groupby('date')['cleaned_title'].apply(" ".join)
    return per_day.reset_index().sort_values('date')


train_daily = _join_titles_per_day(train_df)
test_daily = _join_titles_per_day(test_df)

In [68]:
# Show the first rows of the per-day documents for each split, train first.
display(train_daily.head(), test_daily.head())

Unnamed: 0,date,cleaned_title
0,2024-11-18,prediction stock going soar november softbank ...
1,2024-11-19,investor wait earnings nasdaq gain ground tesl...
2,2024-11-20,idia nvda stock rose nearly tuesday q earnings...
3,2024-11-21,third quarter earnings top expectation dow cli...
4,2024-11-22,stock despite amazing earnings history say com...


Unnamed: 0,date,cleaned_title
0,2025-11-03,cramer mag much market get money losing narrat...
1,2025-11-04,kospi due profit taking tuesday palantir ceo a...
2,2025-11-05,ai reality check palantir plunge post earnings...
3,2025-11-06,october mark best ever month etf flow jensen h...
4,2025-11-07,could first company reach trillion buy nvda st...


# 5. TF-IDF on Training Data

In [65]:
# -------------------------------------------------------------------
# TF-IDF on TRAIN, then transform TEST with the same vectorizer
# -------------------------------------------------------------------
vectorizer = TfidfVectorizer(
    max_features=800,      # vocabulary cap; you can tune this
    ngram_range=(1, 2),    # unigrams + bigrams
    min_df=2               # ignore terms that occur in fewer than 2 daily documents
)

# Fit on the training days only; the test days are projected onto that vocabulary.
X_train_text = vectorizer.fit_transform(train_daily['cleaned_title'])
X_test_text  = vectorizer.transform(test_daily['cleaned_title'])

# Persist the matrices and the fitted vectorizer. The reload cell further down
# reads exactly these three files, and nothing else in the notebook writes them.
TFIDF_DIR = DATA_PATH / "tfidf"
TFIDF_DIR.mkdir(parents=True, exist_ok=True)
sparse.save_npz(str(TFIDF_DIR / "X_train_tfidf.npz"), X_train_text)
sparse.save_npz(str(TFIDF_DIR / "X_test_tfidf.npz"), X_test_text)
joblib.dump(vectorizer, TFIDF_DIR / "tfidf_vectorizer.pkl")

['..\\data/tfidf/tfidf_vectorizer.pkl']

In [72]:
# Reload the TF-IDF matrices and the fitted vectorizer from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that this project produced itself.
TFIDF_DIR = DATA_PATH / "tfidf"

X_train_text = sparse.load_npz(TFIDF_DIR / "X_train_tfidf.npz")
X_test_text  = sparse.load_npz(TFIDF_DIR / "X_test_tfidf.npz")
vectorizer   = joblib.load(TFIDF_DIR / "tfidf_vectorizer.pkl")

In [69]:
# One closing price per date, in ascending date order. Each date group of
# merged_df contributes its first `close` value.
close_by_date = merged_df.groupby('date')['close']
daily_close = close_by_date.first().sort_index()

display(daily_close.head())

date
2024-11-18    140.15
2024-11-19    147.01
2024-11-20    145.89
2024-11-21    146.67
2024-11-22    141.95
Name: close, dtype: float64