In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *

In [2]:
import os
import random
import tensorflow as tf

seed_value = 42

# Exported for reproducibility. Note that a running interpreter reads
# PYTHONHASHSEED only at startup, so this assignment is seen by processes
# launched from this kernel rather than by the kernel itself.
os.environ['PYTHONHASHSEED'] = str(seed_value)

# Seed the Python, NumPy and TensorFlow global generators with the same value.
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(seed_value)

# Ask TensorFlow to use deterministic op implementations.
tf.config.experimental.enable_op_determinism()

In [5]:
# Load the price/sentiment table and index it by parsed trading date.
df = pd.read_csv('fnspid_prices_title_sentiment.csv')
df = df.set_index('date')
df.index = pd.to_datetime(df.index)

# Symbols included in this experiment.
stocks = ['ENB','GS','WFC','GME','D','EA','CMCSA','DHI','CRM','VRTX',
        'SPWR','GILD','WDC','BX','AAL']

# .copy() makes the filtered frame independent of the full table, so adding
# columns or dropping rows on it later does not trigger SettingWithCopyWarning.
df = df[df['Stock_symbol'].isin(stocks)].copy()

In [6]:
# Binary direction of each row's own movement: 1 = up or flat, 0 = down.
direction = (df['movement_percent'] >= 0).astype(float)
# `NaN >= 0` evaluates to False; keep unknown movements as NaN so they are
# dropped below instead of being silently labelled as "down".
df['target_binary'] = direction.where(df['movement_percent'].notna())

# Move the next row's direction onto the current row *within each symbol*.
# A plain shift over the whole frame would give the last row of one stock the
# label of a row that belongs to a different stock.
df['target_binary'] = df.groupby('Stock_symbol')['target_binary'].shift(-1).to_numpy()

# Removes rows without a label (the last row of every symbol) as well as rows
# with a missing value in any other column.
df.dropna(inplace=True)

In [7]:
def sequences(df: DataFrame, window_size: int, feature_cols: list, target: str):
    """Build sliding-window samples from a row-ordered frame.

    Window ``k`` holds the feature rows ``k .. k + window_size - 1`` and is
    paired with the value of ``target`` stored at row ``k + window_size``,
    i.e. the row immediately after the window.

    Args:
        df: Frame whose rows are already in the order the windows should follow.
        window_size: Number of consecutive rows per window.
        feature_cols: Columns used as per-row features.
        target: Column supplying the label.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n, window_size, len(feature_cols))``
        and ``y`` has shape ``(n,)`` with ``n = len(df) - window_size``. When
        the frame is too short for a single window, ``n`` is 0 and both arrays
        are returned empty but with the shapes above, rather than as a bare
        ``(0,)`` array.
    """
    features = df[feature_cols].to_numpy()
    y_vals = df[target].to_numpy()

    n_windows = len(df) - window_size
    if n_windows <= 0:
        # Keep the rank callers expect even when there is nothing to emit.
        empty_X = np.empty((0, window_size, features.shape[1]), dtype=features.dtype)
        empty_y = np.empty((0,), dtype=y_vals.dtype)
        return empty_X, empty_y

    X = np.stack([features[start:start + window_size] for start in range(n_windows)])
    # Labels are the targets of rows window_size .. len(df) - 1, one per window.
    y = y_vals[window_size:window_size + n_windows].copy()

    return X, y

In [15]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, matthews_corrcoef, confusion_matrix, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv1D, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

# --- Experiment configuration -------------------------------------------------
window_size = 20  # rows per input sequence
feature_cols = ['open_logret', 'high_logret', 'low_logret', 'close_logret', 'volume_logret']
target = 'target_binary'
l2_value = 1e-4
batch_size = 8
epochs = 50
learning_rate = 0.001

# Raw columns that are converted to log-returns for every symbol.
price_cols = ['open', 'high', 'low', 'close', 'volume']


def _safe_auc(y_true, y_score):
    """ROC AUC, or NaN when y_true holds a single class (roc_auc_score would raise)."""
    if np.unique(y_true).size < 2:
        return float('nan')
    return roc_auc_score(y_true, y_score)


results = []

for symbol in df['Stock_symbol'].unique():
    print(f"\n{'='*40}\nTraining model for: {symbol}\n{'='*40}")

    # One model is built per symbol; clear Keras' global state so it does not
    # accumulate across iterations of this loop.
    tf.keras.backend.clear_session()

    df_symbol = df[df['Stock_symbol'] == symbol].copy()
    for col in price_cols:
        df_symbol[f'{col}_logret'] = np.log(df_symbol[col]) - np.log(df_symbol[col].shift(1))
    # The first row has no previous value, so its log-returns are NaN.
    df_symbol.dropna(inplace=True)

    # Chronological split by calendar date.
    train = df_symbol.loc['2014-01-01':'2021-03-29']
    val = df_symbol.loc['2021-03-30':'2022-07-28']
    test = df_symbol.loc['2022-07-29':'2023-12-01']

    # sequences() emits len(split) - window_size samples, so validation and test
    # each need strictly more than window_size rows to produce at least one.
    if len(train) < window_size * 2 or len(val) <= window_size or len(test) <= window_size:
        print(f"Not enough data for {symbol}, skipping.")
        continue

    # NOTE(review): sequences() labels a window with the target of the row right
    # after it, and target_binary was already shifted one row forward when it was
    # created. The label therefore describes the movement two rows after the
    # window's last feature row -- confirm this horizon is intended.
    X_train, y_train = sequences(train, window_size=window_size, feature_cols=feature_cols, target=target)
    X_val, y_val = sequences(val, window_size=window_size, feature_cols=feature_cols, target=target)
    X_test, y_test = sequences(test, window_size=window_size, feature_cols=feature_cols, target=target)

    # Replace NaN and +/-inf in place (e.g. the log of a zero value is -inf).
    for split_X in (X_train, X_val, X_test):
        np.nan_to_num(split_X, nan=0.0, posinf=0.0, neginf=0.0, copy=False)

    # Features are passed to the network unscaled; no normalisation is applied.

    pct_0 = 100 * np.mean(y_test == 0)
    pct_1 = 100 * np.mean(y_test == 1)
    print(f"Test class distribution: 0s={pct_0:.2f}%, 1s={pct_1:.2f}%")

    model = Sequential([
        Input(shape=(window_size, len(feature_cols))),
        Conv1D(filters=16, kernel_size=3, strides=1, activation='relu', padding='same'),
        BatchNormalization(),
        Conv1D(filters=8, kernel_size=3, strides=1, activation='relu', padding='same'),
        BatchNormalization(),
        LSTM(16),
        Dropout(0.1),
        Dense(16, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid', kernel_regularizer=l2(l2_value))
    ])

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['AUC', 'accuracy'])
    # Stop once validation loss has not improved for 10 epochs and roll back to
    # the best weights seen.
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stop],
        verbose=0
    )

    # predict() returns shape (n, 1); flatten so predictions line up with the
    # 1-D label arrays in every metric below.
    y_val_pred_prob = model.predict(X_val).ravel()
    y_val_pred = (y_val_pred_prob > 0.5).astype(int)

    y_test_pred_prob = model.predict(X_test).ravel()
    y_test_pred = (y_test_pred_prob > 0.5).astype(int)

    val_auc = _safe_auc(y_val, y_val_pred_prob)
    val_f1 = f1_score(y_val, y_val_pred)
    val_mcc = matthews_corrcoef(y_val, y_val_pred)

    test_auc = _safe_auc(y_test, y_test_pred_prob)
    test_f1 = f1_score(y_test, y_test_pred)
    test_mcc = matthews_corrcoef(y_test, y_test_pred)
    test_acc = np.mean(y_test_pred == y_test)
    test_cm = confusion_matrix(y_test, y_test_pred)

    print(f"[{symbol}] Test AUC: {test_auc:.4f} | F1: {test_f1:.4f} | MCC: {test_mcc:.4f} | ACC: {test_acc:.4f}")
    print(f"Confusion matrix:\n{test_cm}\n")

    results.append({
        'Symbol': symbol,
        'Val_AUC': val_auc,
        'Val_F1': val_f1,
        'Val_MCC': val_mcc,
        'Test_AUC': test_auc,
        'Test_F1': test_f1,
        'Test_MCC': test_mcc,
        'Test_ACC': test_acc,
        'Test_0%': pct_0,
        'Test_1%': pct_1
    })

results_df = pd.DataFrame(results)
print("\n=== Summary of all stocks ===")
print(results_df.sort_values('Test_AUC', ascending=False))


Training model for: AAL
Test class distribution: 0s=48.28%, 1s=51.72%
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[AAL] Test AUC: 0.4764 | F1: 0.6173 | MCC: 0.0125 | ACC: 0.5141
Confusion matrix:
[[ 39 115]
 [ 40 125]]


Training model for: BX
Test class distribution: 0s=47.34%, 1s=52.66%
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[BX] Test AUC: 0.5274 | F1: 0.6134 | MCC: 0.0426 | ACC: 0.5298
Confusion matrix:
[[ 50 101]
 [ 49 119]]


Training model for: CMCSA
Test class distribution: 0s=48.59%, 1s=51.41%
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[CMCSA] Test AUC: 0.5524 | F1: 0.6608 | MCC: 0.0198 | ACC: 0.5172
Confusion matrix:
[[ 15 140]
 [ 14 150]]


Training model for: CR

In [16]:
# Mean test-set accuracy, AUC and MCC over all symbols in results_df.
for metric_name in ('Test_ACC', 'Test_AUC', 'Test_MCC'):
    print(results_df[metric_name].mean())

0.525508172157489
0.5330584730216387
0.030780408905092024
