# 1. Imports

In [34]:
import pandas as pd
from pathlib import Path

import numpy as np

from scipy.sparse import hstack
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor

from scipy import sparse
import joblib

DATA_PATH = Path("../data/")
DATA_OUTPUT_PATH = Path("../output/")

# 2. Load Data

In [35]:
# Load sparse TF-IDF matrices and vectorizer
X_train_text = sparse.load_npz(f"{DATA_PATH}/tfidf/X_train_tfidf.npz") # Load training TF-IDF matrix
X_test_text  = sparse.load_npz(f"{DATA_PATH}/tfidf/X_test_tfidf.npz") # Load testing TF-IDF matrix
vectorizer   = joblib.load(f"{DATA_PATH}/tfidf/tfidf_vectorizer.pkl") # Load TF-IDF vectorizer

In [36]:
df_nvidia = pd.read_csv(DATA_PATH / "NVIDIA_Merged_20241101-Present.csv")

df_nvidia['date'] = pd.to_datetime(df_nvidia['date'])
display(df_nvidia.head())

Unnamed: 0,language,sourcecountry,seendate,date,url,title,domain,open,high,low,close,adj_close,volume
0,English,Australia,2024-11-18 03:45:00+00:00,2024-11-18,https://www.fool.com.au/2024/11/18/prediction-...,Prediction : Nvidia stock is going to soar aft...,fool.com.au,139.5,141.55,137.15,140.15,140.11,221205300
1,English,Cyprus,2024-11-18 04:00:00+00:00,2024-11-18,https://cyprus-mail.com/2024/11/18/softbank-fi...,SoftBank first to receive new Nvidia chips for...,cyprus-mail.com,139.5,141.55,137.15,140.15,140.11,221205300
2,English,China,2024-11-18 04:00:00+00:00,2024-11-18,https://www.morningstar.com/markets/this-unlov...,Why Small - Cap Value Stocks Look Attractive R...,morningstar.com,139.5,141.55,137.15,140.15,140.11,221205300
3,English,United States,2024-11-18 06:30:00+00:00,2024-11-18,https://247wallst.com/market-news/2024/11/17/n...,Nasdaq Futures Up Sunday Night : NVIDIA Earnin...,247wallst.com,139.5,141.55,137.15,140.15,140.11,221205300
4,English,United States,2024-11-18 11:00:00+00:00,2024-11-18,https://www.benzinga.com/24/11/42029943/dow-tu...,Dow Tumbles Over 300 Points Following Economic...,benzinga.com,139.5,141.55,137.15,140.15,140.11,221205300


In [48]:
SPLIT_DATE = pd.Timestamp("2025-11-01")

train_df = df_nvidia[df_nvidia["date"] < SPLIT_DATE].copy()
test_df  = df_nvidia[df_nvidia["date"] >= SPLIT_DATE].copy()

# One close per day (you can use .first(), .last(), or .mean())
train_daily = (
    train_df.groupby("date")["close"]
    .first()
    .reset_index()
    .sort_values("date")
)

test_daily = (
    test_df.groupby("date")["close"]
    .first()
    .reset_index()
    .sort_values("date")
)

train_dates = train_daily["date"].values
test_dates  = test_daily["date"].values

y_train_all = train_daily["close"].values   # one close per train day
y_test_all  = test_daily["close"].values    # one close per test day

# Checking number of days must match TF-IDF rows
print("X_train_text shape:", X_train_text.shape)
print("train_daily days:", len(train_dates))
print("X_test_text shape:", X_test_text.shape)
print("test_daily days:", len(test_dates))

X_train_text shape: (226, 50)
train_daily days: 226
X_test_text shape: (15, 50)
test_daily days: 15


# 2.1. Iterative forecast function

In [49]:
def iterative_forecast(model, X_test_text, y_train_all, test_dates):
    """
    model: trained on [TFIDF(t), Close(t)] → Close(t+1)
    X_test_text: TFIDF for each test day, in order
    y_train_all: array of all *train* closes (used to seed prev_close)
    test_dates: array of dates for the test period
    """
    predicted_closes = []

    # Start with last *actual* training close
    prev_close = float(y_train_all[-1])

    for i in range(len(test_dates)):
        # (1, 1) dense feature for prev_close
        prev_close_feat = np.array([[prev_close]])

        # (1, n_features+1): [TFIDF(test_day_i), prev_close]
        X_i = hstack([X_test_text[i, :], prev_close_feat])

        # Predict close for this test day
        y_pred_i = float(model.predict(X_i)[0])
        predicted_closes.append(y_pred_i)

        # Next day uses this day's predicted close
        prev_close = y_pred_i

    return np.array(predicted_closes)

# 2.2. Evaluation metric function

In [63]:
def build_forecast_df(preds, test_dates, y_test_all, model_name="Model", add_summary_row=True):
    """
    preds        : array-like, predicted closes for each test date
    test_dates   : array-like of datetime-like values
    y_test_all   : array-like, actual closes for each test date
    model_name   : name for printing/logging
    add_summary_row : if True, appends a final 'METRICS' row
    """
    preds = np.asarray(preds)
    y_test_all = np.asarray(y_test_all)

    # Base prediction table
    pred_df = pd.DataFrame({
        "date": test_dates,
        "dayofweek": [pd.Timestamp(d).day_name() for d in test_dates],
        "predicted_close": preds,
        "actual_close": y_test_all,
    }).set_index("date")

    # Metrics
    rmse = mean_squared_error(y_test_all, preds)
    mae  = mean_absolute_error(y_test_all, preds)
    r2   = r2_score(y_test_all, preds)

    metrics = {
        "model": model_name,
        "rmse": rmse,
        "mae": mae,
        "r2": r2
    }

    # Pretty print
    print(f"\n=== {model_name} (iterative forecast) ===")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE : {mae:.4f}")
    print(f"R^2 : {r2:.4f}")

    return pred_df, metrics

# 3. Further Data Preprocessing for Iterative Prediction

To predict close(t), you must use only information available at the end of day (t-1).

- TFIDF_day1 & closing_day1_price → predicts closing_day2_price
- TFIDF_day2 & closing_day2_price → predicts closing_day3_price

For example: If our data looks like this. 
| Day | Date       | TF-IDF Vector (simplified) | Close |
| --- | ---------- | -------------------------- | ----- |
| 1   | 2025-10-01 | `[0.2, 0.1, 0.0]`          | 100   |
| 2   | 2025-10-02 | `[0.4, 0.0, 0.1]`          | 102   |
| 3   | 2025-10-03 | `[0.3, 0.2, 0.1]`          | 101   |
| 4   | 2025-10-04 | `[0.1, 0.4, 0.3]`          | 103   |


We first need to get every TFIDF except the last day (since last day's TFIDF isn't used for anything in terms of training.) Last day's of training TFIDF should be used as the TFIDF going into predicting the test.
| Row | Source Day | TF-IDF(t−1)       |
| --- | ---------- | ----------------- |
| 0   | Day 1      | `[0.2, 0.1, 0.0]` |
| 1   | Day 2      | `[0.4, 0.0, 0.1]` |
| 2   | Day 3      | `[0.3, 0.2, 0.1]` |

Get the values of the close price on that TFIDF day (`prev_close_train`)
| Row | Close(t−1) |
| --- | ---------- |
| 0   | 100        |
| 1   | 102        |
| 2   | 101        |

Get the value that we are predicting for (`y_train`)
| Row | Close(t) |
| --- | -------- |
| 0   | 102      |
| 1   | 101      |
| 2   | 103      |

Merge (`X_train_iter`)
| Row | TF-IDF(t−1)       | Close(t−1) | → Predict |
| --- | ----------------- | ---------- | --------- |
| 0   | `[0.2, 0.1, 0.0]` | `100`      | `102`     |
| 1   | `[0.4, 0.0, 0.1]` | `102`      | `101`     |
| 2   | `[0.3, 0.2, 0.1]` | `101`      | `103`     |





In [51]:
# TF-IDF(t-1): drop the last TF-IDF row
X_train_tfidf_prev = X_train_text[:-1]          # days 0..N-2

# Close(t-1): drop the last close
prev_close_train = y_train_all[:-1].reshape(-1, 1)

# Target Close(t): drop the first close
y_train = y_train_all[1:]                  # days 1..N-1

# Final training features: [TFIDF(t-1), Close(t-1)]
X_train_iter = hstack([X_train_tfidf_prev, prev_close_train])

# 4. Training & Prediction (Ridge Regression)

In [54]:
model = Ridge(alpha=1.0)
model.fit(X_train_iter, y_train)

ridge_pred = []

# Start with last *actual* training close
prev_close = float(y_train_all[-1])

for i in range(len(test_dates)):
    prev_close_feat = np.array([[prev_close]])  # shape (1,1)
    X_i = hstack([X_test_text[i, :], prev_close_feat])  # (1, n_features+1)

    y_pred_i = model.predict(X_i)[0]
    ridge_pred.append(y_pred_i)

    # For next day, feed in today's predicted close
    prev_close = y_pred_i

In [64]:
ridge_df, ridge_metrics = build_forecast_df(ridge_pred, test_dates, y_test_all, "Ridge")
display(ridge_df)


=== Ridge (iterative forecast) ===
RMSE: 227.5032
MAE : 13.3606
R^2 : -3.1646


Unnamed: 0_level_0,dayofweek,predicted_close,actual_close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-11-03,Monday,202.040318,206.88
2025-11-04,Tuesday,202.318493,198.69
2025-11-05,Wednesday,200.468542,195.21
2025-11-06,Thursday,200.70112,188.08
2025-11-07,Friday,201.901463,188.15
2025-11-10,Monday,202.459755,199.05
2025-11-11,Tuesday,203.19269,193.16
2025-11-12,Wednesday,203.453543,193.8
2025-11-13,Thursday,203.239027,186.86
2025-11-14,Friday,204.578482,190.17


# 5. Model Logic (Random Forest Regressor)

In [57]:
# Base model
rf = RandomForestRegressor(
    random_state=42,
    n_jobs=-1
)

# Hyperparameter grid (you can tweak these)
rf_param_grid = {
    "n_estimators": [100, 300, 500],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"]
}

# Grid search with 3-fold CV
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    verbose=2
)

# 6. Training & Prediction (Random Forest Regressor)

In [58]:
# Fit on training data
rf_grid.fit(X_train_iter, y_train)

print("Best RF params:", rf_grid.best_params_)
print("Best RF CV RMSE:", np.sqrt(-rf_grid.best_score_))

# Evaluate on test set
rf_best = rf_grid.best_estimator_
predicted_closes_rf = iterative_forecast(rf_best, X_test_text, y_train_all, test_dates)

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best RF params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best RF CV RMSE: 27.75241750548653


In [65]:
rf_df, rf_metrics = build_forecast_df(predicted_closes_rf, test_dates, y_test_all, "Random Forest")
display(rf_df)


=== Random Forest (iterative forecast) ===
RMSE: 526.5237
MAE : 21.1812
R^2 : -8.6385


Unnamed: 0_level_0,dayofweek,predicted_close,actual_close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-11-03,Monday,170.351753,206.88
2025-11-04,Tuesday,172.870205,198.69
2025-11-05,Wednesday,166.628992,195.21
2025-11-06,Thursday,165.002053,188.08
2025-11-07,Friday,173.345235,188.15
2025-11-10,Monday,168.080157,199.05
2025-11-11,Tuesday,167.268687,193.16
2025-11-12,Wednesday,166.54906,193.8
2025-11-13,Thursday,170.374483,186.86
2025-11-14,Friday,160.7088,190.17


# 7. Model Logic (XGBoost)

In [61]:
xgb = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    tree_method="hist"
)

xgb.fit(X_train_iter, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


# 8. Training & Prediction (Random Forest Regressor)

In [66]:
xgb_preds = iterative_forecast(xgb, X_test_text, y_train_all, test_dates)
xgb_df, xgb_metrics = build_forecast_df(xgb_preds, test_dates, y_test_all, "XGBoost")
display(xgb_df)


=== XGBoost (iterative forecast) ===
RMSE: 193.8659
MAE : 12.3486
R^2 : -2.5489


Unnamed: 0_level_0,dayofweek,predicted_close,actual_close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-11-03,Monday,192.193253,206.88
2025-11-04,Tuesday,194.536713,198.69
2025-11-05,Wednesday,189.752258,195.21
2025-11-06,Thursday,182.529251,188.08
2025-11-07,Friday,183.024368,188.15
2025-11-10,Monday,180.243988,199.05
2025-11-11,Tuesday,177.846405,193.16
2025-11-12,Wednesday,173.087875,193.8
2025-11-13,Thursday,170.423157,186.86
2025-11-14,Friday,167.461075,190.17


# 9. Results & Discussion

Across all three models (XGBoost, Random Forest, and Ridge Regression) the iterative next-day forecasting approach struggled to produce accurate predictions for the November 2025 test period. 

Although each model successfully learned a general downward trend from the training data, most  of them systematically underpredicted the actual NVIDIA closing prices (except for linear regression). This consistent downward drift occurred because the models had to repeatedly rely on their own previous predictions during test time, causing any small initial error to accumulate rapidly over the entire forecast window.

Furthermore, because the feature set included only TF-IDF news vectors (which contain very weak price-movement signal) and a single lagged close value, the models lacked the richer historical and market context needed to respond to the sharp price swings seen in early–mid November 2025. As a result, none of the three models managed to capture the true volatility or magnitude of the actual closing prices.

- XGBoost performed the best overall, achieving the lowest RMSE, but still produced a steadily declining prediction curve that diverged from reality.
- Ridge Regression was the most stable, showing smoother predictions but still unable to match actual price levels.
- Random Forest performed the worst, diverging quickly due to its difficulty with high-dimensional TF-IDF inputs and lack of sequential structure.