<a href="https://colab.research.google.com/github/submarinejuice/CP322-Final-Project-Group-9/blob/main/cp322_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Downloading dependencies
%pip install --upgrade pip
%pip install yfinance pandas numpy scikit-learn matplotlib seaborn shap tensorflow

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Attention, Input
from tensorflow.keras.optimizers import Adam

# Project Overview
#
# Neuro + Fintech + Financial Text Sentiment Predictor
#
# Goal: Predict Buy / Sell / Hold decisions by integrating multiple data modalities:
# - Stock price data: historical OHLC, returns, moving averages, volatility indicators
# - Financial news sentiment: daily sentiment scores derived from news headlines or articles
# - Simulated cognitive features: attention, stress, risk appetite, confidence (used until a real dataset is available)
#
# This project demonstrates multi-modal machine learning by combining:
# 1. Market numeric data (stocks)
# 2. Textual data (financial news sentiment)
# 3. Neuro-inspired cognitive signals
#
# Key Features of the Project:
# - Temporal modeling: cognitive features and lagged news sentiment are sequence-dependent
# - Multi-modal integration: numeric, textual, and simulated cognitive features feed into a single model
# - Evaluation & Explainability: model performance measured via accuracy and F1-score, with feature importance explored using SHAP
#
# Objectives:
# 1. Build a sequence-aware model (LSTM/GRU with attention) to predict trading actions
# 2. Demonstrate non-obvious patterns by including temporal and multi-modal dependencies
# 3. Perform ablation studies to quantify the contribution of cognitive and sentiment features
# 4. Provide interpretable insights into feature importance and model behavior

def get_stock_data(tickers, start_date, end_date):
    """Download real stock price data"""
    data = yf.download(tickers, start=start_date, end=end_date, auto_adjust=True)
    return data

TICKERS = ['AAPL', 'TSLA', 'GOOGL']
START_DATE = '2018-01-01'
END_DATE = '2024-01-01'

price_data = get_stock_data(TICKERS, START_DATE, END_DATE)
print(f"Data shape: {price_data.shape}")
print(f"Columns: {price_data.columns.tolist()}")
print(f"Date range: {price_data.index[0]} to {price_data.index[-1]}")

[*********************100%***********************]  3 of 3 completed

Data shape: (1509, 15)
Columns: [('Close', 'AAPL'), ('Close', 'GOOGL'), ('Close', 'TSLA'), ('High', 'AAPL'), ('High', 'GOOGL'), ('High', 'TSLA'), ('Low', 'AAPL'), ('Low', 'GOOGL'), ('Low', 'TSLA'), ('Open', 'AAPL'), ('Open', 'GOOGL'), ('Open', 'TSLA'), ('Volume', 'AAPL'), ('Volume', 'GOOGL'), ('Volume', 'TSLA')]
Date range: 2018-01-02 00:00:00 to 2023-12-29 00:00:00





# Uploading Data CSV from repo so we always have it and dont have to manually import
Michelle's addition

In [2]:

import os

REPO_URL = "https://github.com/submarinejuice/CP322-Final-Project-Group-9"
REPO_NAME = "CP322-Final-Project-Group-9"

if not os.path.exists(REPO_NAME):
    # First time in this Colab session: clone the repo
    !git clone {REPO_URL}
else:
    # Repo already there in this runtime: pull latest changes
    %cd {REPO_NAME}
    !git pull
    %cd /content

# Move into repo so relative paths work
%cd /content/{REPO_NAME}


Cloning into 'CP322-Final-Project-Group-9'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 35 (delta 8), reused 12 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (35/35), 345.71 KiB | 4.49 MiB/s, done.
Resolving deltas: 100% (8/8), done.
[Errno 2] No such file or directory: '/content/CP322-Final-Project-Group-9'
/Users/jaypatel/Desktop/Projects/CP322-Final-Project-Group-9


In [3]:
import pandas as pd
import re

print("Current directory:", os.getcwd())
print("Repo contents:", os.listdir())
print("DATASET contents:", os.listdir("DATASET"))

df = pd.read_csv("DATASET/AE_investment_dataset.csv")
df.head()
df.info()
df.isna().mean().sort_values().head(20)
df.columns.tolist()
for c in df.columns:
    print(c)




Current directory: /Users/jaypatel/Desktop/Projects/CP322-Final-Project-Group-9
Repo contents: ['.DS_Store', 'cp322_FINAL.ipynb', 'DATASET', 'README.md', '.git']
DATASET contents: ['AE_investment_dataset.csv']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Columns: 364 entries, Participant_code to SCR_AnticipatoryS4_T10
dtypes: float64(356), int64(5), object(3)
memory usage: 85.4+ KB
Participant_code
Age
Gender
Nationality
Ethnicity
Played_stock_market
Played_in_years
Played_how_often
Stock_amount_S1_T1
Stock_amount_S1_T2
Stock_amount_S1_T3
Stock_amount_S1_T4
Stock_amount_S1_T5
Stock_amount_S1_T6
Stock_amount_S1_T7
Stock_amount_S1_T8
Stock_amount_S1_T9
Stock_amount_S1_T10
Stock_amount_S2_T1
Stock_amount_S2_T2
Stock_amount_S2_T3
Stock_amount_S2_T4
Stock_amount_S2_T5
Stock_amount_S2_T6
Stock_amount_S2_T7
Stock_amount_S2_T8
Stock_amount_S2_T9
Stock_amount_S2_T10
Stock_amount_S3_T1
Stock_amount_S3_T2
Stock_amount_S3_T3
Stock_amount_S3_T4
Stock_amount_S3_T5
Stock_amou

##Quick note bc I didn't know what PANAS meant:
- PANAS refers to the Positive and Negative Affect Schedule, a widely used psychological scale that measures an individual's mood by assessing both positive and negative emotions. Developed in 1988 by Watson, Clark, and Tellegen, it's a 20-item self-report measure used in research and clinical settings to gauge how frequently someone experiences emotions like interest, joy, enthusiasm (positive affect) versus feelings of distress, sadness, and nervousness (negative affect).
## How it works
- 20 items: The scale consists of 20 words that describe different feelings and emotions.
- Two dimensions: These items are separated into two subscales: one for positive affect (PA) and one for negative affect (NA).
- Rating scale: Participants rate how they felt about each item over a specific time frame (e.g., "right now," "today," "over the past few weeks") on a 5-point scale.
- Scoring: Each positive and negative item is scored individually. The total positive score and total negative score are then calculated. A higher positive score indicates more positive affect, while a higher negative score indicates more negative affect.

Building a per-trial table with:
1. inputs per step:
  - money_in_stocks
  - mean_return
  - stock_fluctuation
  - scr_anticipatory
2. Static inputs:
3. Target
  - Whether they invested in the stock (money_in_stocks > 0 -> 1 else 0)

In [4]:
import re
import pandas as pd

# 1. ID & static columns we carry along
id_cols = ["Participant_code", "Age", "Gender", "Nationality", "Ethnicity", "Played_stock_market"]

# 2. Find all trial-level columns by prefix
stock_cols = [c for c in df.columns if c.startswith("Money_in_stocks_S")]
scr_cols   = [c for c in df.columns if c.startswith("SCR_AnticipatoryS")]
ret_cols   = [c for c in df.columns if c.startswith("Mean_Return_S")]
fluc_cols  = [c for c in df.columns if c.startswith("stock_fluctuation_S")]

print("n_stock_cols:", len(stock_cols))
print("n_scr_cols:", len(scr_cols))
print("n_return_cols:", len(ret_cols))
print("n_fluctuation_cols:", len(fluc_cols))


n_stock_cols: 40
n_scr_cols: 40
n_return_cols: 36
n_fluctuation_cols: 36


In [5]:
print("Example stock cols:", stock_cols[:5])
print("Example SCR cols:", scr_cols[:5])
print("Example return cols:", ret_cols[:5])
print("Example fluctuation cols:", fluc_cols[:5])


Example stock cols: ['Money_in_stocks_S1_T1', 'Money_in_stocks_S1_T2', 'Money_in_stocks_S1_T3', 'Money_in_stocks_S1_T4', 'Money_in_stocks_S1_T5']
Example SCR cols: ['SCR_AnticipatoryS1_T1', 'SCR_AnticipatoryS1_T2', 'SCR_AnticipatoryS1_T3', 'SCR_AnticipatoryS1_T4', 'SCR_AnticipatoryS1_T5']
Example return cols: ['Mean_Return_S1_T2', 'Mean_Return_S1_T3', 'Mean_Return_S1_T4', 'Mean_Return_S1_T5', 'Mean_Return_S1_T6']
Example fluctuation cols: ['stock_fluctuation_S1_T2', 'stock_fluctuation_S1_T3', 'stock_fluctuation_S1_T4', 'stock_fluctuation_S1_T5', 'stock_fluctuation_S1_T6']


## Converting the Wide Table -> Long Table
Wide format, each participant has 
- 40 stock invesment columns
- 40 anticipatroy SCR columns
- Matching retunr and fluctiation columns

not the best for our ml model

since the model expects one row = one sample, columns = features, one target lable per row

wide -> long
each row = one trail decision 
paricipant attributes are copied over (age, gender, etc.)
trial-levels features are included

so the dataset goes from 30 rows to 1200 rows, giving us enough samples to train the model. 

In [6]:
import numpy as np
import re

records = []

for _, row in df.iterrows():
    # static participant info
    base = row[id_cols].to_dict()
    
    for stock_col in stock_cols:
        # match the S1_T1 part AFTER "Money_in_stocks_"
        m = re.search(r"Money_in_stocks_(S\d+_T\d+)", stock_col)
        if m is None:
            continue
        
        tag = m.group(1)              # e.g. "S1_T1"
        game = int(tag.split("_")[0][1:])   # "S1" -> 1
        trial = int(tag.split("_")[1][1:])  # "T1" -> 1
        
        money_in_stocks = row[stock_col]
        
        # build matching column names for other signals using the same tag
        scr_col  = f"SCR_Anticipatory{tag}"       # SCR_AnticipatoryS1_T1
        ret_col  = f"Mean_Return_{tag}"           # Mean_Return_S1_T1
        fluc_col = f"stock_fluctuation_{tag}"     # stock_fluctuation_S1_T1
        
        rec = {
            **base,
            "game": game,
            "trial": trial,
            "money_in_stocks": money_in_stocks,
            "scr": row.get(scr_col, np.nan),
            "mean_return": row.get(ret_col, np.nan),
            "stock_fluctuation": row.get(fluc_col, np.nan),
        }
        records.append(rec)

long_df = pd.DataFrame(records)
print(long_df.head())
print(long_df.shape)


   Participant_code  Age  Gender  Nationality  Ethnicity Played_stock_market  \
0                 1   22       2            4          6                   1   
1                 1   22       2            4          6                   1   
2                 1   22       2            4          6                   1   
3                 1   22       2            4          6                   1   
4                 1   22       2            4          6                   1   

   game  trial  money_in_stocks    scr  mean_return  stock_fluctuation  
0     1      1          16884.0  0.806          NaN                NaN  
1     1      2           7623.2  0.028        0.045                2.0  
2     1      3           5941.6  0.232       -0.046                1.0  
3     1      4          31265.0  0.954        0.370                2.0  
4     1      5           3494.8  0.000        0.046                2.0  
(1200, 12)


High vs Low Investment 
1 = high invesment
0 = low invesment
splits the resulits in a balance data set

In [7]:
threshold = long_df["money_in_stocks"].median()
long_df["high_invest"] = (long_df["money_in_stocks"] >= threshold).astype(int)

long_df["high_invest"].value_counts(normalize=True)


high_invest
1    0.5
0    0.5
Name: proportion, dtype: float64

### Baseline Model 1 Logistic Regression
Predicts whether a patricipant makes a hihg vs. low invsment on each trial

We use a smalls et of market and physiology-based features
- game - four stock-market games the trial belongs to
- trial - number withing the game
- scr 
- mean_return - expected return of the stock
- stock_fluctuation - volatitlity condition of the trial

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.impute import SimpleImputer

feature_cols = ["game", "trial", "scr", "mean_return", "stock_fluctuation"]

# 1. Drop rows where the label itself is missing (shouldn’t be, but just in case)
long_df_clean = long_df.dropna(subset=["high_invest"])

X = long_df_clean[feature_cols]
y = long_df_clean["high_invest"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2. Preprocessing: impute missing values, then scale
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, feature_cols),
    ],
    remainder="drop",
)

# 3. Logistic Regression pipeline
logreg = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", LogisticRegression(max_iter=1000)),
])

logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
y_prob = logreg.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_prob))


Accuracy: 0.6375
F1: 0.6533864541832669
AUC: 0.710902777777778


### Baseline Model 2 Random Forest

Random Forest can capture non-linear interactions between market
variables and physiological signals. This gives us a stronger,
tree-based baseline to compare against our later neuroscience-inspired
feature engineering.

In [9]:
from sklearn.ensemble import RandomForestClassifier

rf = Pipeline(steps=[
    ("preprocess", preprocessor),  # same imputer + scaler as before
    ("model", RandomForestClassifier(
        n_estimators=300,
        random_state=42
    ))
])

rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:, 1]

print("RF Accuracy:", accuracy_score(y_test, y_pred_rf))
print("RF F1:", f1_score(y_test, y_pred_rf))
print("RF AUC:", roc_auc_score(y_test, y_prob_rf))


RF Accuracy: 0.9333333333333333
RF F1: 0.9316239316239316
RF AUC: 0.9875694444444445


##  Random Tree vs Logistic Regression
Logistic (Linear Model)
Random Tree (Non-Linear, tree-based model)

Both Models use the same feature set
- game 
- trial
- scr
- mean_return
- stock_fluctuation

Metrics
- Accuracy
- F1 Score
- AUC (Area Under ROC Curve) - ability to seperate high vs low investments

Interpertation
- Random Forest outperforms Logistic 
Which shows
- Decisions are influenced by non-linear interactions
- SCR interacts with the market in more complex ways


In [4]:
# ---- Build temporal_df with SCR history features ----

# Start from the cleaned long_df (so label isn't missing)
temporal_df = long_df_clean.sort_values(
    ["Participant_code", "game", "trial"]
).copy()

# Group by participant so lags/rolling stay within a person
grp = temporal_df.groupby("Participant_code")

# 1-step lag of SCR
temporal_df["scr_lag1"] = grp["scr"].shift(1)

# Change from previous trial
temporal_df["scr_delta1"] = temporal_df["scr"] - temporal_df["scr_lag1"]

# Rolling mean of SCR over last 3 trials (short-term trend)
temporal_df["scr_roll_mean3"] = grp["scr"].transform(
    lambda x: x.rolling(window=3, min_periods=1).mean()
)

# Replace NaNs from first trials with 0.0
for col in ["scr_lag1", "scr_delta1", "scr_roll_mean3"]:
    temporal_df[col] = temporal_df[col].fillna(0.0)

print("temporal_df shape:", temporal_df.shape)
print(temporal_df[["Participant_code", "game", "trial",
                   "scr", "scr_lag1", "scr_delta1", "scr_roll_mean3",
                   "high_invest"]].head())


NameError: name 'long_df_clean' is not defined

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Same feature sets as before
feature_sets = {
    "Market only": ["mean_return", "stock_fluctuation"],
    "SCR dynamics only": ["scr", "scr_lag1", "scr_delta1", "scr_roll_mean3"],
    "Market + SCR dynamics": [
        "mean_return", "stock_fluctuation",
        "scr", "scr_lag1", "scr_delta1", "scr_roll_mean3"
    ],
}

y = temporal_df["high_invest"].values

def run_rf(feature_cols, name):
    X = temporal_df[feature_cols].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scaling isn't strictly needed for trees, but keeping pipeline structure is okay
    rf = Pipeline([
        ("scaler", StandardScaler()),
        ("rf", RandomForestClassifier(
            n_estimators=300,
            random_state=42
        ))
    ])

    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    y_prob = rf.predict_proba(X_test)[:, 1]

    print(f"\n=== Random Forest – {name} ===")
    print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
    print("F1:",       round(f1_score(y_test, y_pred), 3))
    print("AUC:",      round(roc_auc_score(y_test, y_prob), 3))


for name, cols in feature_sets.items():
    run_rf(cols, name)


NameError: name 'temporal_df' is not defined