In [426]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score, confusion_matrix
from scipy.special import expit, logit

In [427]:
# Read the input table into `df`. Judging by the file name it holds FNSPID
# prices with news-title sentiment; later cells read the 'date',
# 'Stock_symbol', 'movement_percent', 'open', 'high', 'low', 'close' and
# 'volume' columns from it.
csv_path = 'fnspid/fnspid_prices_title_sentiment.csv'
df = pd.read_csv(csv_path)

In [428]:
# Index the frame by date and drop the symbols excluded from the study.
# The cell is written to be idempotent: re-running it on an already indexed
# `df` must not fail, so 'date' is only moved into the index when it is still
# a column.
if 'date' in df.columns:
    df = df.set_index('date')
elif df.index.name != 'date':
    # Neither a 'date' column nor a 'date' index: fail loudly instead of
    # converting an unrelated index to timestamps.
    raise KeyError("expected a 'date' column (or an index named 'date') in df")
df.index = pd.to_datetime(df.index)

# Individual tickers removed from the universe.
stocks_to_remove = ['MMM', 'SBUX', 'SIRI', 'SLB', 'SLV']
# Second exclusion list (kept under its original name); 'SLV' appears in both
# lists, which is harmless because the two are applied as one membership test.
indexes = ['EEM' , 'EWJ', 'GDX', 'GLD', 'QQQ', 'SLV', 'USO', 'XLF', 'XLK', 'XLP', 'XLU', 'XLY']

# Single pass over the frame instead of two successive boolean filters.
excluded_symbols = stocks_to_remove + indexes
df = df[~df['Stock_symbol'].isin(excluded_symbols)]

In [429]:
from sklearn.preprocessing import StandardScaler

# Columns handed to the model as exogenous regressors. 'volatility' is derived
# inside the loop; the others are read from the input frame as-is.
features = ['open', 'high', 'low', 'close', 'volatility', 'volume']
results = []

# Clip margin that keeps logit() finite when it is applied to the 0/1 labels.
eps = 1e-6
# Smallest training split we try to fit. ARIMA(1,0,0) with six regressors
# estimates nine parameters, so much shorter series cannot be fitted
# meaningfully; such symbols are skipped instead of aborting the whole loop.
MIN_TRAIN_ROWS = 30

for stock, df_stock in df.groupby('Stock_symbol'):
    print(f"\n--- Processing {stock} ---")

    # shift() and pct_change() below are positional, so the rows must be in
    # chronological order. sort_index() also returns a new frame, so the
    # column assignments below do not touch `df`.
    df_stock = df_stock.sort_index(kind='stable')

    # Label every row with the direction of the NEXT row's move (1 = up or
    # flat, 0 = down). Rows whose next move is unknown stay NaN so that
    # dropna() removes them; comparing NaN with `>= 0` would otherwise label
    # them as class 0.
    next_move = df_stock['movement_percent'].shift(-1)
    df_stock['target_binary'] = (next_move >= 0).astype(float).where(next_move.notna())
    # Rolling 20-row standard deviation of close-to-close returns.
    df_stock['volatility'] = df_stock['close'].pct_change().rolling(window=20).std()

    # Drops every row with a missing value in ANY column: the rolling warm-up
    # rows, the last row (no next move), and rows with gaps in other columns.
    df_stock = df_stock.dropna()

    # First 80% of the rows train the model, the remainder is the hold-out.
    train_size = int(len(df_stock) * 0.8)
    if train_size < MIN_TRAIN_ROWS:
        print(f"Skipping {stock}: only {train_size} training rows after cleaning")
        continue

    y = df_stock['target_binary'].values
    # The model is fitted on the log-odds of the clipped labels, so a
    # prediction above 0 (probability above 0.5) is read as class 1.
    y_logodds = logit(np.clip(y, eps, 1 - eps))

    X = df_stock[features].values

    X_train, X_test = X[:train_size], X[train_size:]
    # Scaling statistics come from the training split only, so the hold-out
    # rows do not leak into the fit.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    y_train, y_test = y_logodds[:train_size], y_logodds[train_size:]

    model = ARIMA(y_train, exog=X_train, order=(1,0,0))
    fit = model.fit()

    # Out-of-sample forecast covering exactly the hold-out rows.
    y_pred_logodds = fit.predict(start=train_size, end=len(df_stock)-1, exog=X_test)
    y_pred_prob = expit(y_pred_logodds)
    y_pred_class = (y_pred_prob > 0.5).astype(int)

    # Integer labels so that y_true and y_pred_class share one label type.
    y_true = y[train_size:].astype(int)

    acc = accuracy_score(y_true, y_pred_class)
    # zero_division=0 keeps the score at 0 (the library default value) without
    # emitting a warning when no positive class is predicted.
    f1 = f1_score(y_true, y_pred_class, zero_division=0)
    mcc = matthews_corrcoef(y_true, y_pred_class)
    # Fixing the labels guarantees a 2x2 matrix even when the hold-out split
    # and the predictions contain a single class.
    cm = confusion_matrix(y_true, y_pred_class, labels=[0, 1])

    print(f"Accuracy: {acc:.4f}")
    print(f"MCC: {mcc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)

    results.append({
        'stock': stock,
        'accuracy': acc,
        'f1': f1,
        'mcc': mcc,
        'confusion_matrix': cm
    })

results_df = pd.DataFrame(results)
print("\nSummary of all stocks:")
print(results_df)

# Average of the numeric metrics across symbols. The confusion-matrix column
# is non-numeric and is left out by numeric_only; errors="ignore" keeps the
# summary from failing when every symbol was skipped and the frame is empty.
mean_metrics = results_df.drop(columns=["stock"], errors="ignore").mean(numeric_only=True).to_dict()
for metric_name, value in mean_metrics.items():
    print(f"{metric_name}: {value:.4f}")


--- Processing AAL ---
Accuracy: 0.5258
MCC: 0.0479
F1 Score: 0.6212
Confusion Matrix:
[[ 61 157]
 [ 54 173]]

--- Processing ABBV ---
Accuracy: 0.5213
MCC: -0.0122
F1 Score: 0.6778
Confusion Matrix:
[[  8 203]
 [ 10 224]]

--- Processing ACN ---
Accuracy: 0.5416
MCC: 0.0799
F1 Score: 0.6208
Confusion Matrix:
[[ 74 143]
 [ 61 167]]

--- Processing ADBE ---
Accuracy: 0.4989
MCC: -0.0597
F1 Score: 0.6585
Confusion Matrix:
[[  7 210]
 [ 13 215]]

--- Processing AEO ---
Accuracy: 0.5371
MCC: -0.0350
F1 Score: 0.6944
Confusion Matrix:
[[  5 197]
 [  9 234]]

--- Processing AIG ---
Accuracy: 0.5011
MCC: -0.0032
F1 Score: 0.5356
Confusion Matrix:
[[ 95 110]
 [112 128]]

--- Processing ALK ---
Accuracy: 0.5056
MCC: 0.0137
F1 Score: 0.6636
Confusion Matrix:
[[  8 213]
 [  7 217]]

--- Processing AMGN ---
Accuracy: 0.5079
MCC: 0.0042
F1 Score: 0.1205
Confusion Matrix:
[[211  15]
 [204  15]]

--- Processing AMT ---
Accuracy: 0.5056
MCC: 0.0205
F1 Score: 0.6259
Confusion Matrix:
[[ 41 183]
 [ 37 