# 1. Imports

In [None]:
import pandas as pd
from pathlib import Path
import openpyxl
import string

import re
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.sparse import hstack
from sklearn.linear_model import Ridge

from scipy import sparse
import joblib

DATA_PATH = Path("../data/")
DATA_OUTPUT_PATH = Path("../output/")

# 2. Load Data

In [9]:
# Load sparse TF-IDF matrices and vectorizer
X_train_text = sparse.load_npz(f"{DATA_PATH}/tfidf/X_train_tfidf.npz") # Load training TF-IDF matrix
X_test_text  = sparse.load_npz(f"{DATA_PATH}/tfidf/X_test_tfidf.npz") # Load testing TF-IDF matrix
vectorizer   = joblib.load(f"{DATA_PATH}/tfidf/tfidf_vectorizer.pkl") # Load TF-IDF vectorizer

In [10]:
df_nvidia = pd.read_csv(DATA_PATH / "NVIDIA_MergedData_20241101-Present.csv")

df_nvidia['date'] = pd.to_datetime(df_nvidia['date'])
display(df_nvidia.head())

Unnamed: 0,language,sourcecountry,seendate,date,url,title,domain,open,high,low,close,adj_close,volume
0,English,Australia,2024-11-18 03:45:00+00:00,2024-11-18,https://www.fool.com.au/2024/11/18/prediction-...,Prediction : Nvidia stock is going to soar aft...,fool.com.au,139.5,141.55,137.15,140.15,140.11,221205300
1,English,Cyprus,2024-11-18 04:00:00+00:00,2024-11-18,https://cyprus-mail.com/2024/11/18/softbank-fi...,SoftBank first to receive new Nvidia chips for...,cyprus-mail.com,139.5,141.55,137.15,140.15,140.11,221205300
2,English,United States,2024-11-18 06:30:00+00:00,2024-11-18,https://247wallst.com/market-news/2024/11/17/n...,Nasdaq Futures Up Sunday Night : NVIDIA Earnin...,247wallst.com,139.5,141.55,137.15,140.15,140.11,221205300
3,English,United States,2024-11-18 11:00:00+00:00,2024-11-18,https://www.benzinga.com/24/11/42029943/dow-tu...,Dow Tumbles Over 300 Points Following Economic...,benzinga.com,139.5,141.55,137.15,140.15,140.11,221205300
4,English,United States,2024-11-18 11:30:00+00:00,2024-11-18,https://www.fool.com/investing/2024/11/18/nvid...,Prediction : Nvidia Stock Will Soar After Nov ...,fool.com,139.5,141.55,137.15,140.15,140.11,221205300


In [None]:
SPLIT_DATE = pd.Timestamp("2025-11-01")

train_df = df_nvidia[df_nvidia["date"] < SPLIT_DATE].copy()
test_df  = df_nvidia[df_nvidia["date"] >= SPLIT_DATE].copy()

# One close per day (you can use .first(), .last(), or .mean())
train_daily = (
    train_df.groupby("date")["close"]
    .first()
    .reset_index()
    .sort_values("date")
)

test_daily = (
    test_df.groupby("date")["close"]
    .first()
    .reset_index()
    .sort_values("date")
)

train_dates = train_daily["date"].values
test_dates  = test_daily["date"].values

y_train_all = train_daily["close"].values   # one close per train day
y_test_all  = test_daily["close"].values    # one close per test day

# Checking number of days must match TF-IDF rows
print("X_train_text shape:", X_train_text.shape)
print("train_daily days:", len(train_dates))
print("X_test_text shape:", X_test_text.shape)
print("test_daily days:", len(test_dates))

X_train_text shape: (228, 800)
train_daily days: 228
X_test_text shape: (5, 800)
test_daily days: 5


# 3. Model Logic

To predict close(t), you must use only information available at the end of day (t-1).

- TFIDF_day1 & closing_day1_price → predicts closing_day2_price
- TFIDF_day2 & closing_day2_price → predicts closing_day3_price

For example: If our data looks like this. 
| Day | Date       | TF-IDF Vector (simplified) | Close |
| --- | ---------- | -------------------------- | ----- |
| 1   | 2025-10-01 | `[0.2, 0.1, 0.0]`          | 100   |
| 2   | 2025-10-02 | `[0.4, 0.0, 0.1]`          | 102   |
| 3   | 2025-10-03 | `[0.3, 0.2, 0.1]`          | 101   |
| 4   | 2025-10-04 | `[0.1, 0.4, 0.3]`          | 103   |


We first need to get every TFIDF except the last day (since last day's TFIDF isn't used for anything in terms of training.) Last day's of training TFIDF should be used as the TFIDF going into predicting the test.
| Row | Source Day | TF-IDF(t−1)       |
| --- | ---------- | ----------------- |
| 0   | Day 1      | `[0.2, 0.1, 0.0]` |
| 1   | Day 2      | `[0.4, 0.0, 0.1]` |
| 2   | Day 3      | `[0.3, 0.2, 0.1]` |

Get the values of the close price on that TFIDF day (`prev_close_train`)
| Row | Close(t−1) |
| --- | ---------- |
| 0   | 100        |
| 1   | 102        |
| 2   | 101        |

Get the value that we are predicting for (`y_train`)
| Row | Close(t) |
| --- | -------- |
| 0   | 102      |
| 1   | 101      |
| 2   | 103      |

Merge (`X_train_iter`)
| Row | TF-IDF(t−1)       | Close(t−1) | → Predict |
| --- | ----------------- | ---------- | --------- |
| 0   | `[0.2, 0.1, 0.0]` | `100`      | `102`     |
| 1   | `[0.4, 0.0, 0.1]` | `102`      | `101`     |
| 2   | `[0.3, 0.2, 0.1]` | `101`      | `103`     |





In [13]:
# TF-IDF(t-1): drop the last TF-IDF row
X_train_tfidf_prev = X_train_text[:-1]          # days 0..N-2

# Close(t-1): drop the last close
prev_close_train = y_train_all[:-1].reshape(-1, 1)

# Target Close(t): drop the first close
y_train = y_train_all[1:]                  # days 1..N-1

# Final training features: [TFIDF(t-1), Close(t-1)]
X_train_iter = hstack([X_train_tfidf_prev, prev_close_train])

In [14]:
model = Ridge(alpha=1.0)
model.fit(X_train_iter, y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


# 4. Prediction (Ridge Regression)

In [23]:
predicted_closes = []

# Start with last *actual* training close
prev_close = float(y_train_all[-1])

for i in range(len(test_dates)):
    prev_close_feat = np.array([[prev_close]])  # shape (1,1)
    X_i = hstack([X_test_text[i, :], prev_close_feat])  # (1, n_features+1)

    y_pred_i = model.predict(X_i)[0]
    predicted_closes.append(y_pred_i)

    # For next day, feed in today's predicted close
    prev_close = y_pred_i

pred_df = pd.DataFrame({
    "date": test_dates,
    # label mon, tues, wed.
    "dayofweek": [pd.Timestamp(d).day_name() for d in test_dates],
    "predicted_close": predicted_closes,
    "actual_close": y_test_all,
}).set_index("date")

pred_df.to_csv(DATA_OUTPUT_PATH / "nvidia_price_predictions.csv")

In [24]:
pred_df.head(10)

Unnamed: 0_level_0,dayofweek,predicted_close,actual_close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-11-03,Monday,203.059916,206.88
2025-11-04,Tuesday,202.920672,198.69
2025-11-05,Wednesday,202.687083,195.21
2025-11-06,Thursday,202.038753,188.08
2025-11-07,Friday,202.39416,188.15
