In [57]:
import time
import numpy as np
import pandas as pd
import random
import neat
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import ElasticNetCV, SGDRegressor, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier  # CNN Stand-in
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
from sklearn.mixture import GaussianMixture  # Diffusion Model
from deap import base, creator, tools, algorithms  # GA-LR

In [59]:
# -----------------------------------------------
# 📌 STEP 1: PREPROCESS DATA
# -----------------------------------------------

def preprocess_data(df, missing_threshold=0.3, window=90, lags=(5, 10, 30),
                    stress_cols=("inflation", "Interest Rate", "interest rates"),
                    z_threshold=1):
    """Clean the raw frame and derive z-score, market-stress and lag features.

    Steps, in order:
      1. Drop columns holding fewer than ``missing_threshold * len(df)``
         non-null values.
      2. Fill remaining gaps in numeric columns with the column median
         (non-numeric gaps are left as NaN).
      3. Add a trailing rolling z-score column ``<col>_z`` per stress column.
      4. Add a binary ``market_stress`` column: 1 where *every* z-score is
         strictly greater than ``z_threshold``, else 0.
      5. Add ``<col>_lag<k>`` columns (``shift(k)``) per stress column and lag.
      6. Drop every row that still contains a NaN.

    Step 6 removes the first ``max(lags)`` rows, and also any row whose
    z-score is undefined (a single-observation window, or a window in which
    the column is constant so the rolling std is zero).

    Args:
        df: Raw input frame. It is not modified.
        missing_threshold: Minimum fraction of non-null values a column needs
            in order to be kept.
        window: Rolling window length (in rows) used for the z-scores.
        lags: Row offsets for which lagged copies are created.
        stress_cols: Columns used for z-scores, the stress flag and the lags.
        z_threshold: Cut-off every z-score must exceed for ``market_stress``.

    Returns:
        A new DataFrame (original row labels preserved) without NaNs.

    Raises:
        KeyError: If any of ``stress_cols`` is absent after step 1.
    """
    stress_cols = list(stress_cols)

    # dropna(thresh=...) expects an integer count; the ceiling keeps the same
    # "count >= fraction * rows" rule the float comparison expressed.
    min_non_null = int(np.ceil(missing_threshold * len(df)))
    df_cleaned = df.dropna(axis=1, thresh=min_non_null)
    df_cleaned = df_cleaned.fillna(df_cleaned.median(numeric_only=True))

    absent = [col for col in stress_cols if col not in df_cleaned.columns]
    if absent:
        raise KeyError(f"Required column(s) missing after cleaning: {absent}")

    # ✅ Compute rolling z-scores
    def compute_rolling_zscores(frame, cols, window=window):
        rolling = frame[cols].rolling(window=window, min_periods=1)
        zscores = (frame[cols] - rolling.mean()) / rolling.std()
        # A (near-)zero std can yield +/-inf, which dropna() would not remove;
        # map it to NaN so it is dropped like any other undefined z-score.
        return zscores.replace([np.inf, -np.inf], np.nan)

    df_zscores = compute_rolling_zscores(df_cleaned, stress_cols).add_suffix("_z")
    df_cleaned = pd.concat([df_cleaned, df_zscores], axis=1)

    # ✅ Define market stress periods (NaN z-scores compare as False -> 0)
    df_cleaned["market_stress"] = (df_zscores > z_threshold).all(axis=1).astype(int)

    # ✅ Create lagged features
    for feature in stress_cols:
        for lag in lags:
            df_cleaned[f"{feature}_lag{lag}"] = df_cleaned[feature].shift(lag)

    return df_cleaned.dropna()

In [61]:
################
### asserts: ###
################
def test_preprocess_data():
    """Smoke-test preprocess_data() on the project CSV.

    Verifies that the result has no NaNs, carries the three z-score columns,
    has a strictly 0/1 ``market_stress`` column, and contains every expected
    lag column. Prints a confirmation line when all checks pass.
    """
    base_features = ["inflation", "Interest Rate", "interest rates"]

    raw = pd.read_csv("data/financial_data_cleaned2.csv")
    processed = preprocess_data(raw)

    assert not processed.isna().any().any(), "NaNs found"

    z_columns = {f"{name}_z" for name in base_features}
    assert z_columns.issubset(processed.columns), "Missing z-score columns"

    assert processed["market_stress"].isin([0, 1]).all(), "market_stress not binary"

    # Same feature-major order as the pipeline creates the columns in.
    lag_columns = [f"{name}_lag{lag}" for name in base_features for lag in (5, 10, 30)]
    for column in lag_columns:
        assert column in processed.columns, f"Missing {column}"

    print("✅ preprocess_data() passed.")

# ✅ Run the test
test_preprocess_data()

✅ preprocess_data() passed.


In [63]:
# -----------------------------------------------
# 📌 STEP 2: SCALE FEATURES
# -----------------------------------------------

def scale_features(df_cleaned):
    """Standardise every numeric feature column, leaving the target untouched.

    Non-numeric columns are not carried over. The returned frame has a fresh
    default index; ``market_stress`` is re-attached positionally as the last
    column, unscaled.
    """
    features = df_cleaned.drop(columns=["market_stress"])
    num_cols = list(features.select_dtypes(include=[np.number]).columns)

    standardised = StandardScaler().fit_transform(df_cleaned[num_cols])

    # .values strips the original index so the target lines up row-by-row
    # with the freshly indexed scaled frame.
    return pd.DataFrame(standardised, columns=num_cols).assign(
        market_stress=df_cleaned["market_stress"].values
    )

In [65]:
def test_scale_features():
    """Smoke-test scale_features() on the project CSV.

    Checks that the target column survives, no NaNs appear, and every scaled
    numeric column has mean ~0 and standard deviation ~1.
    """
    raw = pd.read_csv("data/financial_data_cleaned2.csv")
    cleaned = preprocess_data(raw)
    scaled = scale_features(cleaned)

    # ✅ Numeric feature columns, i.e. everything that was scaled
    feature_part = cleaned.drop(columns=["market_stress"])
    num_cols = list(feature_part.select_dtypes(include=[np.number]).columns)

    # ✅ Expected properties
    assert "market_stress" in scaled.columns, "Missing target column"
    assert scaled.isna().sum().sum() == 0, "NaNs found after scaling"

    column_means = scaled[num_cols].mean()
    assert np.allclose(column_means, 0, atol=0.01), "Mean not close to zero"

    # ✅ pandas .std() uses the sample estimator, so values sit slightly
    #    above 1; hence the 0.1 tolerance.
    std_devs = scaled[num_cols].std()
    print(f"📊 Standard Deviations After Scaling:\n{std_devs}")
    assert np.allclose(std_devs, 1, atol=0.1), "Std dev not ~1"

    print("✅ scale_features() passed.")

# ✅ Run the test
test_scale_features()

📊 Standard Deviations After Scaling:
Adj Close_^GSPC         1.000083
Adj Close_^IXIC         1.000083
Adj Close_^VIX          1.000083
Bond Yields             1.000083
Inflation               1.000083
                          ...   
Interest Rate_lag5      1.000083
Interest Rate_lag10     1.000083
interest rates_lag5     1.000083
interest rates_lag10    1.000083
interest rates_lag30    1.000083
Length: 213, dtype: float64
✅ scale_features() passed.


In [67]:
# -----------------------------------------------
# 📌 STEP 3: APPLY PCA
# -----------------------------------------------

def apply_pca(df_scaled, n_components=50, random_state=42):
    """Reduce the feature columns to principal components.

    Args:
        df_scaled: Frame of numeric features plus a ``market_stress`` column.
        n_components: Requested number of components. It is capped at
            ``min(n_samples, n_features)``, the most PCA can extract.
        random_state: Seed forwarded to ``PCA`` so that results are
            reproducible when a randomized solver is selected.

    Returns:
        A new DataFrame with columns ``PC1..PCk`` followed by
        ``market_stress`` (copied positionally, default index).
    """
    df_pca_input = df_scaled.drop(columns=["market_stress"])

    # Capping by feature count alone fails when there are fewer rows than
    # features, so cap by the smaller of the two dimensions.
    n_effective = min(n_components, *df_pca_input.shape)
    pca = PCA(n_components=n_effective, random_state=random_state)
    principal_components = pca.fit_transform(df_pca_input)

    df_pca = pd.DataFrame(
        principal_components,
        columns=[f"PC{i + 1}" for i in range(pca.n_components_)],
    )
    df_pca["market_stress"] = df_scaled["market_stress"].values
    return df_pca

In [69]:
def test_apply_pca():
    """Smoke-test apply_pca() on the project CSV.

    Checks the output width, that the target column is kept, that every
    component column is numeric, and that an equivalent PCA fit retains more
    than 80% of the variance.
    """
    requested = 50

    raw = pd.read_csv("data/financial_data_cleaned2.csv")
    scaled = scale_features(preprocess_data(raw))
    reduced = apply_pca(scaled, n_components=requested)

    # ✅ Expected component count (one column of `scaled` is the target)
    feature_count = scaled.shape[1] - 1
    expected_n_components = min(requested, feature_count)
    assert reduced.shape[1] == expected_n_components + 1, f"Unexpected PCA shape: {reduced.shape}"

    # ✅ Target column must survive the reduction
    assert "market_stress" in reduced.columns, "market_stress column missing after PCA"

    # ✅ Every principal component must be numeric
    numeric_components = reduced.drop(columns=["market_stress"]).select_dtypes(include=[np.number])
    assert numeric_components.shape[1] == expected_n_components, \
        "Non-numeric values found in PCA output"

    # ✅ Variance check on a separately fitted PCA of the same size
    reference = PCA(n_components=expected_n_components).fit(scaled.drop(columns=["market_stress"]))
    explained_variance = reference.explained_variance_ratio_.sum()
    print(f"📊 Explained Variance Retained: {explained_variance:.4f}")
    assert explained_variance > 0.8, "PCA did not retain enough variance (should be >80%)"

    print("✅ apply_pca() passed.")

# ✅ Run the test
test_apply_pca()

📊 Explained Variance Retained: 0.9990
✅ apply_pca() passed.


In [71]:
# -----------------------------------------------
# 📌 STEP 4: SPLIT DATA
# -----------------------------------------------

def split_data(df, test_size=0.2, random_state=42, shuffle=True):
    """Split a frame into train/test features and ``market_stress`` labels.

    Args:
        df: Frame containing feature columns and a ``market_stress`` column.
        test_size: Fraction (or count) of rows placed in the test set.
        random_state: Seed for the shuffle, so the split is reproducible.
        shuffle: Whether rows are shuffled before splitting. Pass ``False``
            to hold out the last rows instead.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.

    NOTE(review): the rows appear to be time-ordered and include lagged
    features, so a shuffled split may leak information between train and
    test — consider ``shuffle=False`` for evaluation; confirm with the owner.
    """
    X = df.drop(columns=["market_stress"])
    y = df["market_stress"]
    return train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=shuffle
    )

In [95]:
def test_split_data():
    """Smoke-test split_data() on the full pipeline output.

    Runs preprocessing, scaling and PCA on the project CSV, splits the
    result, and checks the split proportions, index disjointness and feature
    consistency.

    Returns:
        ``(cleaned, reduced)``: the preprocessed frame and the PCA-reduced
        frame produced along the way.
    """
    raw = pd.read_csv("data/financial_data_cleaned2.csv")
    cleaned = preprocess_data(raw)
    reduced = apply_pca(scale_features(cleaned), n_components=50)  # PCA happens before the split

    X_train, X_test, y_train, y_test = split_data(reduced)
    n_train, n_test = len(X_train), len(X_test)

    # ✅ Split sizes
    assert n_train > n_test, "Train set should be larger than test set"
    assert np.isclose(n_test / len(reduced), 0.2, atol=0.01), "Test set is not ~20% of total"

    # ✅ No row may appear on both sides
    assert set(X_train.index).isdisjoint(X_test.index), "Train and test sets overlap!"

    # ✅ Same feature columns on both sides
    assert X_train.shape[1] == X_test.shape[1], "Feature count mismatch between train and test"

    # ✅ Labels follow the same proportions
    assert len(y_train) > len(y_test), "Market stress labels should also follow 80-20 split"

    print("✅ split_data() passed.")

    return cleaned, reduced  # ✅ Hand back the cleaned & PCA data

# ✅ Store results globally
df_cleaned, df_pca = test_split_data()

✅ split_data() passed.


In [101]:
# Display the preprocessed frame. `df_cleaned` is the global assigned from
# the return value of test_split_data(), so that cell must have run first.
df_cleaned

Unnamed: 0,Date,Adj Close_^GSPC,Adj Close_^IXIC,Adj Close_^VIX,Bond Yields,Inflation,Unemployment,Interest Rate,Consumer Sentiment,GDP,...,interest rates_z,market_stress,inflation_lag5,inflation_lag10,inflation_lag30,Interest Rate_lag5,Interest Rate_lag10,interest rates_lag5,interest rates_lag10,interest rates_lag30
30,2004-02-11,1157.760010,2089.659912,15.39,4.05,186.700,5.6,1.01,94.4,11923.447,...,1.538048,0,45.0,49.0,49.0,1.01,1.00,62.0,61.0,61.0
31,2004-02-12,1152.109985,2073.610107,15.31,4.10,186.700,5.6,1.01,94.4,11923.447,...,1.459880,0,45.0,49.0,49.0,1.01,1.00,62.0,61.0,61.0
32,2004-02-13,1145.810059,2053.560059,15.58,4.05,186.700,5.6,1.01,94.4,11923.447,...,1.392621,0,45.0,45.0,49.0,1.01,1.01,62.0,62.0,61.0
33,2004-02-16,1145.810059,2053.560059,15.58,4.05,186.700,5.6,1.01,94.4,11923.447,...,1.333946,0,45.0,45.0,49.0,1.01,1.01,62.0,62.0,61.0
34,2004-02-17,1156.989990,2080.350098,15.40,4.05,186.700,5.6,1.01,94.4,11923.447,...,1.282168,0,45.0,45.0,49.0,1.01,1.01,62.0,62.0,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6586,2024-12-28,5970.839844,19722.029297,15.95,4.62,317.603,4.1,4.48,74.0,29719.647,...,-0.707416,0,55.0,55.0,65.0,4.48,4.48,60.0,60.0,73.0
6587,2024-12-29,5970.839844,19722.029297,15.95,4.62,317.603,4.1,4.48,74.0,29719.647,...,-0.703167,0,55.0,55.0,65.0,4.48,4.48,60.0,60.0,73.0
6588,2024-12-30,5906.939941,19486.789062,17.40,4.55,317.603,4.1,4.48,74.0,29719.647,...,-0.703167,0,55.0,55.0,65.0,4.48,4.48,60.0,60.0,73.0
6589,2024-12-31,5881.629883,19310.789062,17.35,4.58,317.603,4.1,4.48,74.0,29719.647,...,-0.703167,0,55.0,55.0,55.0,4.48,4.48,60.0,60.0,60.0


In [103]:
# Display the PCA-reduced frame. `df_pca` is the global assigned from the
# return value of test_split_data(), so that cell must have run first.
df_pca

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC42,PC43,PC44,PC45,PC46,PC47,PC48,PC49,PC50,market_stress
0,-14.591488,0.310102,1.752340,-2.946899,4.070255,-0.391509,1.782806,-2.284270,1.047295,-1.724905,...,-0.108505,-0.152618,0.039638,-0.056689,0.318432,0.094181,0.177238,-0.042865,0.232239,0
1,-14.593776,0.311596,1.730638,-2.966807,4.065651,-0.388979,1.772567,-2.306856,1.041687,-1.740596,...,-0.091011,-0.181287,0.061216,-0.058664,0.355477,0.088092,0.217783,-0.073155,0.229153,0
2,-14.592678,0.299524,1.751443,-2.955648,4.028171,-0.414735,1.783232,-2.315173,1.086930,-1.737431,...,-0.073916,-0.212737,0.062174,-0.080795,0.404537,0.159059,0.277002,-0.080901,0.196493,0
3,-14.594709,0.303836,1.828776,-2.941739,4.001649,-0.369615,1.831647,-2.310696,1.034411,-1.735413,...,-0.121360,-0.138682,0.033736,0.007805,0.531101,0.163311,0.195994,-0.082768,0.140365,0
4,-14.590163,0.302277,1.834932,-2.942902,3.987436,-0.361740,1.842137,-2.314977,1.025038,-1.743129,...,-0.109379,-0.124586,0.052337,0.022224,0.517847,0.156034,0.190963,-0.083200,0.149717,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6040,14.998970,17.561660,-12.668745,3.676247,-6.994872,1.267893,2.736531,-0.634197,-1.915302,-2.762897,...,0.008810,-0.159059,0.200905,-0.129192,-0.185057,-0.033725,0.086331,0.308817,-0.059409,0
6041,15.000222,17.562246,-12.669341,3.678872,-6.991588,1.273421,2.742801,-0.634198,-1.913198,-2.759337,...,0.008263,-0.158443,0.201808,-0.129646,-0.184313,-0.033876,0.086809,0.309123,-0.059443,0
6042,14.982775,17.575255,-12.687313,3.723585,-6.946546,1.271543,2.824882,-0.581554,-1.930565,-2.746445,...,0.042721,-0.154027,0.208125,-0.176637,-0.197892,-0.047641,0.039248,0.319253,0.007671,0
6043,14.924456,17.488550,-12.569573,3.827467,-7.197078,1.236287,2.865805,-0.467701,-1.734840,-2.637476,...,0.061141,-0.234291,0.273872,0.043883,-0.265446,-0.193993,0.129579,0.293453,0.003689,0


In [105]:
# ✅ Save Preprocessed Data
# Both frames are the globals assigned from test_split_data()'s return value.
# Paths are relative to the current working directory. index=False leaves the
# row index out of the files, so df_cleaned's original row labels are not saved.
df_cleaned.to_csv("data/financial_data_full.csv", index=False)
df_pca.to_csv("data/financial_data_pca.csv", index=False)

print("✅ Full and PCA-reduced datasets saved successfully!")

✅ Full and PCA-reduced datasets saved successfully!
