In [1]:
# imports
import pandas as pd
import numpy as np
from pathlib import Path
from scipy import sparse
import joblib
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix
)

In [2]:
# load data tfidf matrices, mapping vectorizer, merged dataset
DATA_PATH = Path("../data/")

X_train_text = sparse.load_npz(DATA_PATH / "tfidf/X_train_tfidf.npz")
X_test_text  = sparse.load_npz(DATA_PATH / "tfidf/X_test_tfidf.npz")
vectorizer   = joblib.load(DATA_PATH / "tfidf/tfidf_vectorizer.pkl")

df_nvidia = pd.read_csv(DATA_PATH / "NVIDIA_Merged_20241101-Present.csv")
df_nvidia["date"] = pd.to_datetime(df_nvidia["date"])

print("TF-IDF shapes:")
print("Train:", X_train_text.shape)
print("Test :", X_test_text.shape)
df_nvidia.head()

TF-IDF shapes:
Train: (226, 50)
Test : (15, 50)


Unnamed: 0,language,sourcecountry,seendate,date,url,title,domain,open,high,low,close,adj_close,volume
0,English,Australia,2024-11-18 03:45:00+00:00,2024-11-18,https://www.fool.com.au/2024/11/18/prediction-...,Prediction : Nvidia stock is going to soar aft...,fool.com.au,139.5,141.55,137.15,140.15,140.11,221205300
1,English,Cyprus,2024-11-18 04:00:00+00:00,2024-11-18,https://cyprus-mail.com/2024/11/18/softbank-fi...,SoftBank first to receive new Nvidia chips for...,cyprus-mail.com,139.5,141.55,137.15,140.15,140.11,221205300
2,English,China,2024-11-18 04:00:00+00:00,2024-11-18,https://www.morningstar.com/markets/this-unlov...,Why Small - Cap Value Stocks Look Attractive R...,morningstar.com,139.5,141.55,137.15,140.15,140.11,221205300
3,English,United States,2024-11-18 06:30:00+00:00,2024-11-18,https://247wallst.com/market-news/2024/11/17/n...,Nasdaq Futures Up Sunday Night : NVIDIA Earnin...,247wallst.com,139.5,141.55,137.15,140.15,140.11,221205300
4,English,United States,2024-11-18 11:00:00+00:00,2024-11-18,https://www.benzinga.com/24/11/42029943/dow-tu...,Dow Tumbles Over 300 Points Following Economic...,benzinga.com,139.5,141.55,137.15,140.15,140.11,221205300


In [3]:
# train/test split
SPLIT_DATE = pd.Timestamp("2025-11-01")

train_df = df_nvidia[df_nvidia["date"] < SPLIT_DATE]
test_df  = df_nvidia[df_nvidia["date"] >= SPLIT_DATE]

# collapse multiple headlines per day & one closing price per day.
train_daily = train_df.groupby("date")["close"].first().reset_index().sort_values("date")
test_daily  = test_df.groupby("date")["close"].first().reset_index().sort_values("date")

y_train_all = train_daily["close"].values
y_test_all  = test_daily["close"].values

print(f"Train days: {len(train_daily)}")
print(f"Test days : {len(test_daily)}")
train_daily.head(), test_daily.head()

Train days: 226
Test days : 15


(        date   close
 0 2024-11-18  140.15
 1 2024-11-19  147.01
 2 2024-11-20  145.89
 3 2024-11-21  146.67
 4 2024-11-22  141.95,
         date   close
 0 2025-11-03  206.88
 1 2025-11-04  198.69
 2 2025-11-05  195.21
 3 2025-11-06  188.08
 4 2025-11-07  188.15)

In [4]:
# daily returns
# r(t) = (close[t] − close[t−1]) / close[t−1]
train_returns = (y_train_all[1:] - y_train_all[:-1]) / y_train_all[:-1]
test_returns  = (y_test_all[1:]  - y_test_all[:-1])  / y_test_all[:-1]

# handle last-train and first-test boundary for walk-forward
first_test_return_prev = (y_test_all[0] - y_train_all[-1]) / y_train_all[-1]
test_returns = np.r_[first_test_return_prev, test_returns]

print("Train returns length:", len(train_returns))
print("Test returns length :", len(test_returns))
print("Sample train returns:", train_returns[:5])
print("Sample test returns :", test_returns[:5])

Train returns length: 225
Test returns length : 15
Sample train returns: [ 0.04894756 -0.00761853  0.00534649 -0.03218109 -0.04177527]
Sample test returns : [ 0.02168008 -0.03958817 -0.01751472 -0.03652477  0.00037218]


In [5]:
# window size 1 alignment (baseline "yesterday predicts tomorrow")
# feature: r(t-1)
# label: movement(t)
y_train_w1 = (train_returns[1:] > 0).astype(int)
prev_return_train_w1 = train_returns[:-1].reshape(-1, 1)

# align tfidf by trimming last 2 days.
X_train_tfidf_prev_w1 = X_train_text[:-2]

print("Window 1 shapes:")
print("TF-IDF:", X_train_tfidf_prev_w1.shape)
print("Returns:", prev_return_train_w1.shape)
print("Labels:", y_train_w1.shape)

Window 1 shapes:
TF-IDF: (224, 50)
Returns: (224, 1)
Labels: (224,)


In [6]:
# window size 3 alignment (short-term autocorrelation)
# features: [r(t-1), r(t-2), r(t-3)]
# label: movement(t)
def make_forward_window(arr, window):
    L = len(arr)
    feats = np.column_stack([arr[i : i + (L - window)] for i in range(window)])
    labels = arr[window:]
    return feats, labels

prev_return_train_w3, y_raw_w3 = make_forward_window(train_returns, 3)
y_train_w3 = (y_raw_w3 > 0).astype(int)

# drop last 4 days to align tfidf
X_train_tfidf_prev_w3 = X_train_text[:-4]

print("Window 3 shapes:")
print("TF-IDF:", X_train_tfidf_prev_w3.shape)
print("Returns:", prev_return_train_w3.shape)
print("Labels:", y_train_w3.shape)

Window 3 shapes:
TF-IDF: (222, 50)
Returns: (222, 3)
Labels: (222,)


In [7]:
# svd & pca
svd_dim = 50
svd = TruncatedSVD(n_components=svd_dim, random_state=5312)

# fit on window-1 tfidf (most data)
X_train_svd_w1 = svd.fit_transform(X_train_tfidf_prev_w1)
X_train_svd_w3 = svd.transform(X_train_tfidf_prev_w3)
X_test_svd     = svd.transform(X_test_text)

print("SVD output example (first row):")
print(X_train_svd_w1[0][:10])

# pca only for logistic regression
pca_dim = 10
pca = PCA(n_components=pca_dim, random_state=5312)

X_train_pca_w1 = pca.fit_transform(X_train_svd_w1)
X_train_pca_w3 = pca.transform(X_train_svd_w3)
X_test_pca     = pca.transform(X_test_svd)

print("\nPCA output example (first row):")
print(X_train_pca_w1[0])

SVD output example (first row):
[ 0.84020023  0.13477277 -0.20840343 -0.07820849 -0.10059298  0.17792114
  0.10120211 -0.0534392  -0.11416216 -0.20409693]

PCA output example (first row):
[ 0.12654047 -0.21365639 -0.08439016 -0.0955109   0.16770294 -0.1143939
  0.08081019  0.03170237 -0.21525048  0.19806781]


In [8]:
# scale returns for lr/xgb/pca models for better results
# svm uses its own internal scaling
scaler_w1 = StandardScaler()
prev_return_scaled_w1 = scaler_w1.fit_transform(prev_return_train_w1)

scaler_w3 = StandardScaler()
prev_return_scaled_w3 = scaler_w3.fit_transform(prev_return_train_w3)

print("Scaled return example (w1):", prev_return_scaled_w1[:3])
print("Scaled return example (w3):", prev_return_scaled_w3[:3])

Scaled return example (w1): [[ 1.42049155]
 [-0.29819081]
 [ 0.09573351]]
Scaled return example (w3): [[ 1.41857385 -0.29515325  0.10081444]
 [-0.29688575  0.09919488 -1.03969798]
 [ 0.09629991 -1.04225554 -1.33127789]]


In [9]:
# training matrices
feature_blocks = {
    # window 1 models
    "lr_xgb_w1": np.hstack([X_train_svd_w1, prev_return_scaled_w1]),
    "svm_w1":    np.hstack([X_train_svd_w1, prev_return_train_w1]),
    "pca_w1":    np.hstack([X_train_pca_w1, prev_return_scaled_w1]),

    # window 3 models
    "lr_xgb_w3": np.hstack([X_train_svd_w3, prev_return_scaled_w3]),
    "svm_w3":    np.hstack([X_train_svd_w3, prev_return_train_w3]),
    "pca_w3":    np.hstack([X_train_pca_w3, prev_return_scaled_w3]),
}

print("\nFeature block shapes:")
for name, mat in feature_blocks.items():
    print(name, mat.shape)


Feature block shapes:
lr_xgb_w1 (224, 51)
svm_w1 (224, 51)
pca_w1 (224, 11)
lr_xgb_w3 (222, 53)
svm_w3 (222, 53)
pca_w3 (222, 13)


In [10]:
# walk-forward prediction function
# simulates real-world forecasting since:
# - we predict day t using only info available up to day t-1
# - after predicting, we update the return window using the actual return
# - prevents lookahead bias
def walk_forward_predict(model, X_test_embed, test_returns, last_train_returns,scaler_prev=None, use_proba=True, window=1):
    scores = []

    # initialize rolling state
    if window == 1:
        state = float(last_train_returns)
    else:
        state = np.array(last_train_returns).astype(float)  # shape (3,)

    print("Initial return state:", state)

    for i in range(len(test_returns)):
        # build return feature
        if window == 1:
            ret_feat = np.array([[state]])
        else:
            ret_feat = state.reshape(1, -1)

        # scale returns if scaler provided
        if scaler_prev is not None:
            ret_feat = scaler_prev.transform(ret_feat)

        # build complete feature vector
        X_i = np.hstack([X_test_embed[i].reshape(1, -1), ret_feat])

        # predict using appropriate 'score'
        if use_proba:
            prob_up = model.predict_proba(X_i)[0][1]
            scores.append(prob_up)
        else:
            margin = model.decision_function(X_i)[0]
            scores.append(margin)

        # update return window using actual realized returns
        if window == 1:
            state = test_returns[i]
        else:
            state = np.roll(state, -1)
            state[-1] = test_returns[i]

    return np.array(scores)

In [11]:
# eval thresholds for each model
# models produce scores:
# - logistic regression / xgb produce probabilities
# - svm produce decision-function margins
# convert scores into 0/1 labels via thresholds
# sweeping thresholds reveals the best accuracy/precision/recall/F1 tradeoff
def evaluate_thresholds(raw_scores, actual, thresholds, model_name, svd_dim, window):
    rows = []

    for thr in thresholds:
        preds = (raw_scores >= thr).astype(int)

        rows.append({
            "model": model_name,
            "svd_dim": svd_dim,
            "window": window,
            "threshold": thr,
            "accuracy": accuracy_score(actual, preds),
            "precision": precision_score(actual, preds, zero_division=0),
            "recall": recall_score(actual, preds),
            "f1": f1_score(actual, preds),
            "confusion": confusion_matrix(actual, preds)
        })

    return rows

In [12]:
# models
# each model is defined with:
# - initialization
# - hyperparameter grid
# - whether it uses probabilities
# - which feature matrix to use
# - window size (1 or 3)
# - whether to scale the return window
models = {
    # logistic regression (svd)
    "logreg_w1": {
        "init": LogisticRegression(class_weight="balanced", solver="lbfgs"),
        "param_grid": {"C": [0.1, 0.5, 1, 5, 10], "penalty": ["l2"], "max_iter": [2000]},
        "use_proba": True,
        "thresholds": np.arange(0.40, 0.61, 0.025),
        "train_matrix": "lr_xgb_w1",
        "window": 1,
        "scale_returns": True,
    },
    "logreg_w3": {
        "init": LogisticRegression(class_weight="balanced", solver="lbfgs"),
        "param_grid": {"C": [0.1, 0.5, 1, 5, 10], "penalty": ["l2"], "max_iter": [2000]},
        "use_proba": True,
        "thresholds": np.arange(0.40, 0.61, 0.025),
        "train_matrix": "lr_xgb_w3",
        "window": 3,
        "scale_returns": True,
    },

    # xgboost
    "xgboost_w1": {
        "init": XGBClassifier(objective="binary:logistic", eval_metric="logloss", random_state=5312),
        "param_grid": {
            "n_estimators": [100, 200],
            "max_depth": [2, 3],
            "learning_rate": [0.01, 0.05, 0.1, 0.15],
        },
        "use_proba": True,
        "thresholds": np.arange(0.40, 0.71, 0.05),
        "train_matrix": "lr_xgb_w1",
        "window": 1,
        "scale_returns": True,
    },
    "xgboost_w3": {
        "init": XGBClassifier(objective="binary:logistic", eval_metric="logloss", random_state=5312),
        "param_grid": {
            "n_estimators": [100, 200],
            "max_depth": [2, 3],
            "learning_rate": [0.01, 0.05, 0.1, 0.15],
        },
        "use_proba": True,
        "thresholds": np.arange(0.40, 0.71, 0.05),
        "train_matrix": "lr_xgb_w3",
        "window": 3,
        "scale_returns": True,
    },

    # linear svm
    "svm_w1": {
        "init": Pipeline([("scaler", StandardScaler()), 
                          ("svm", LinearSVC(class_weight="balanced", random_state=5312))]),
        "param_grid": {"svm__C": [0.01, 0.05, 0.1, 1, 5], "svm__loss": ["squared_hinge"], "svm__max_iter": [2000]},
        "use_proba": False,
        "thresholds": np.arange(-1.0, 0.1, 0.1),
        "train_matrix": "svm_w1",
        "window": 1,
        "scale_returns": False,
    },
    "svm_w3": {
        "init": Pipeline([("scaler", StandardScaler()), 
                          ("svm", LinearSVC(class_weight="balanced", random_state=5312))]),
        "param_grid": {"svm__C": [0.01, 0.05, 0.1, 1, 5], "svm__loss": ["squared_hinge"], "svm__max_iter": [2000]},
        "use_proba": False,
        "thresholds": np.arange(-1.0, 0.1, 0.1),
        "train_matrix": "svm_w3",
        "window": 3,
        "scale_returns": False,
    },

    # logistic regression (pca)
    "logreg_pca_w1": {
        "init": LogisticRegression(class_weight="balanced", solver="lbfgs"),
        "param_grid": {"C": [0.1, 0.5, 1, 5, 10], "penalty": ["l2"], "max_iter": [2000]},
        "use_proba": True,
        "thresholds": np.arange(0.40, 0.61, 0.025),
        "train_matrix": "pca_w1",
        "window": 1,
        "scale_returns": True,
    },
    "logreg_pca_w3": {
        "init": LogisticRegression(class_weight="balanced", solver="lbfgs"),
        "param_grid": {"C": [0.1, 0.5, 1, 5, 10], "penalty": ["l2"], "max_iter": [2000]},
        "use_proba": True,
        "thresholds": np.arange(0.40, 0.61, 0.025),
        "train_matrix": "pca_w3",
        "window": 3,
        "scale_returns": True,
    },
}

In [13]:
# train
master_results = []

for name, cfg in models.items():
    print(f"\n========== Training {name.upper()} ==========")

    tm = cfg["train_matrix"]
    window = cfg["window"]

    # select train matrix
    X_train_model = feature_blocks[tm]

    # select labels based on window size
    y_train_model = y_train_w1 if window == 1 else y_train_w3

    print("Train matrix shape:", X_train_model.shape)
    print("Label shape:", y_train_model.shape)

    # determine SVD vs PCA embedding for test set
    X_test_embed = X_test_pca if tm.startswith("pca_") else X_test_svd

    # gridsearch
    grid = GridSearchCV(
        cfg["init"], 
        cfg["param_grid"], 
        scoring="f1",
        cv=5,
        n_jobs=-1,
        verbose=1
    )
    grid.fit(X_train_model, y_train_model)
    best_model = grid.best_estimator_

    print("Best parameters:", grid.best_params_)

    # select return scaling
    scaler_prev = None
    if cfg["scale_returns"]:
        scaler_prev = scaler_w1 if window == 1 else scaler_w3

    # select initial return window
    last_train_returns = train_returns[-window:] if window == 3 else train_returns[-1]

    # walk-forward prediction
    raw_scores = walk_forward_predict(
        best_model,
        X_test_embed,
        test_returns,
        last_train_returns,
        scaler_prev=scaler_prev,
        use_proba=cfg["use_proba"],
        window=window
    )

    print("Sample prediction scores:", raw_scores[:5])

    # convert test returns to binary labels
    actual = (test_returns > 0).astype(int)

    # evaluate thresholds
    rows = evaluate_thresholds(
        raw_scores,
        actual,
        cfg["thresholds"],
        name,
        svd_dim=50,
        window=window
    )
    master_results.extend(rows)


Train matrix shape: (224, 51)
Label shape: (224,)
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best parameters: {'C': 0.5, 'max_iter': 2000, 'penalty': 'l2'}
Initial return state: -0.0019715116565625574
Sample prediction scores: [0.45594738 0.42206082 0.49578346 0.46101798 0.51137637]

Train matrix shape: (222, 53)
Label shape: (222,)
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best parameters: {'C': 0.1, 'max_iter': 2000, 'penalty': 'l2'}
Initial return state: [ 0.02989604 -0.02004444 -0.00197151]
Sample prediction scores: [0.41766518 0.53955615 0.55663472 0.40850523 0.58031411]

Train matrix shape: (224, 51)
Label shape: (224,)
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Initial return state: -0.0019715116565625574
Sample prediction scores: [0.42610776 0.6413468  0.59130985 0.47720867 0.6418469 ]

Train matrix shape: (222, 53)
Label shape: (222,)
Fitting 5 fo

In [14]:
# results
results_df = pd.DataFrame(master_results)
results_df_sorted = results_df.sort_values("f1", ascending=False)

print("\nTop 15 model threshold combinations:")
results_df_sorted.head(15)


Top 15 model threshold combinations:


Unnamed: 0,model,svd_dim,window,threshold,accuracy,precision,recall,f1,confusion
2,logreg_w1,50,1,0.45,0.8,0.666667,1.0,0.8,"[[6, 3], [0, 6]]"
56,logreg_pca_w1,50,1,0.45,0.866667,1.0,0.666667,0.8,"[[9, 0], [2, 4]]"
55,logreg_pca_w1,50,1,0.425,0.8,0.714286,0.833333,0.769231,"[[7, 2], [1, 5]]"
3,logreg_w1,50,1,0.475,0.8,0.8,0.666667,0.727273,"[[8, 1], [2, 4]]"
40,svm_w1,50,1,-0.2,0.733333,0.625,0.833333,0.714286,"[[6, 3], [1, 5]]"
57,logreg_pca_w1,50,1,0.475,0.8,1.0,0.5,0.666667,"[[9, 0], [3, 3]]"
39,svm_w1,50,1,-0.3,0.666667,0.555556,0.833333,0.666667,"[[5, 4], [1, 5]]"
1,logreg_w1,50,1,0.425,0.6,0.5,1.0,0.666667,"[[3, 6], [0, 6]]"
36,svm_w1,50,1,-0.6,0.533333,0.461538,1.0,0.631579,"[[2, 7], [0, 6]]"
34,svm_w1,50,1,-0.8,0.533333,0.461538,1.0,0.631579,"[[2, 7], [0, 6]]"


In [15]:
# best config per model
best_by_model = (
    results_df
    .sort_values("f1", ascending=False)
    .groupby("model")
    .head(1)
    .reset_index(drop=True)
)

print("\nBest configuration per model:")
best_by_model


Best configuration per model:


Unnamed: 0,model,svd_dim,window,threshold,accuracy,precision,recall,f1,confusion
0,logreg_w1,50,1,0.45,0.8,0.666667,1.0,0.8,"[[6, 3], [0, 6]]"
1,logreg_pca_w1,50,1,0.45,0.866667,1.0,0.666667,0.8,"[[9, 0], [2, 4]]"
2,svm_w1,50,1,-0.2,0.733333,0.625,0.833333,0.714286,"[[6, 3], [1, 5]]"
3,xgboost_w1,50,1,0.4,0.533333,0.461538,1.0,0.631579,"[[2, 7], [0, 6]]"
4,logreg_pca_w3,50,3,0.4,0.466667,0.428571,1.0,0.6,"[[1, 8], [0, 6]]"
5,logreg_w3,50,3,0.4,0.466667,0.428571,1.0,0.6,"[[1, 8], [0, 6]]"
6,svm_w3,50,3,-0.3,0.533333,0.454545,0.833333,0.588235,"[[3, 6], [1, 5]]"
7,xgboost_w3,50,3,0.4,0.4,0.333333,0.5,0.4,"[[3, 6], [3, 3]]"


# Interpretation

Overall, logistic regression with a window size of 1 (both with and without pca) performed the best, achieving an f1 score of 0.80, differing only in their precision and recall. This indicates that:
- although both models share the same F1 score, the svd + pca version achieves higher accuracy because it makes fewer positive predictions (higher precision), whereas the svd-only version achieves perfect recall by predicting ‘up’ more frequently
- the relationship between svd-encoded headlines and previous day return is approximately linear to next day direction
- the dataset size of ~224 training days is way too small for a complex model like xgboost to generalize accurately

We also see that window size 1 outperforms window size 3 consistently across all three models. This may by due to:
- financial return autocorrelation decays extremely fast, so including return of 2 or 3 days prior adds more noise than useful signals (especially in small datasets)
- window size 3 reduces the number of training samples (not by a lot), and adds unnecessary dimensionality\
This suggest shallow momentum in nvidia's short term returns

Logistic Regression:
- achieved the best f1 scores for both svd and pca version indicating that it benefits from a smooth linear decision boundary, proper scaling of return features, and the geometry of svd embeddings, which create linearly separable representations
- logisitc regression's performance also suggests that the predicitve structure of our data is close to linear

Linear SVM:
- selected small C values indicating heavy regularization indiciating high noise in the features
- performs well (f1 = 0.73) but not as well as logistic regression
- margins become unstable with small datasets, leading to more sensitive support vector placements than logistic regression
- reinforces that simple linear models behave best in low sample and low signal data

XGBoost:
- worst of the three models
- poor training and generalization due to small dataset
- dense svd features removed sparsity, which tree models normally exploit
- parameters also indcate overfitting with low learning rate (0.01), shallow trees (depth of 2 or 3), and many trees (100-200). these are classic signs that the model is attempting not to overfit but failing to extract meaningful structure\
XGBoost is too expressive for this setup and overfits the noise and not the signal

## Design choices
Why pca after svd and why only for logistic regression? Why not pca directly on tfidf?
- tfidf is very high dimensional and sparse
- pca requires computing a dense covariance matrix, which is computationally expensive

Why pca after svd?
- svd compresses tfidf into 50 dimensional dense representations, which pca optimally reduces to 10
- this way, we test whether logistic regression benefits from a more compact low-rank structure or not

Why PCA only for Logistic Regression?
- pca removes variance that may be useful for nonlinear models like xgboost

The modeling results imply:
- headline news contains weak but non-zero predictive information, especially when distilled via svd
- previous day return is the strongest consistent feature, and adding longer return windows dilutes signal
- simple linear models generalize best
- nonlinear learners fail due to low sample size and noisy market structure
- dimensionality reduction is essential given raw TF-IDF is too high dimensional, but SVD captures coherent semantics

Overall, the market is difficult to predict, but simple linear models combined with single-day momentum can extract the limited predictive signal present in this dataset

Possible future work:
- Add sentiment lexicon scores (e.g., Loughran–McDonald, VADER) to complement TF-IDF
- incorporate other market features: volatility, volume, sector index movements
- explore topic models (LDA) or transformer embeddings (FinBERT, headline embeddings)