#### Ancestor/Parent Models
saved_models/Elastic Net.pkl
saved_models/SGD.pkl
saved_models/Gradient Boosting.pkl
saved_models/CNN (MLP).pkl
saved_models/Diffusion Model.pkl
saved_models/GA-Optimized LR.pkl
saved_models/NeuroEvolution (NEAT).pkl

#### Performance Metrics
saved_models/Model_Performance_Metrics.csv

#### Dataset
data/financial_data_full.csv
data/financial_data_pca.csv

In [7]:
import os
import pandas as pd
import pickle

# ✅ Define File Paths
SAVE_DIR = "saved_models"
DATA_DIR = "data"

# ✅ Model Files: display name -> pickle path.
# Every pickle is named after its model, so the mapping is derived from the
# names; dict insertion order (and therefore load order) is preserved.
model_files = {
    model_name: os.path.join(SAVE_DIR, f"{model_name}.pkl")
    for model_name in (
        "Elastic Net",
        "SGD",
        "Gradient Boosting",
        "CNN (MLP)",
        "Diffusion Model",
        "GA-Optimized LR",
        "NeuroEvolution (NEAT)",
    )
}

# ✅ Data Files
data_files = {
    "Performance Metrics": os.path.join(SAVE_DIR, "Model_Performance_Metrics.csv"),
    "Full Dataset": os.path.join(DATA_DIR, "financial_data_full.csv"),
    "PCA Dataset": os.path.join(DATA_DIR, "financial_data_pca.csv"),
}


def read_csv_or_empty(file_path):
    """Read a CSV into a DataFrame, falling back to an empty DataFrame.

    Shared by the metrics and dataset sections so both apply the same
    missing / empty / unreadable handling.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(frame, status, error)`` tuple. ``status`` is ``"loaded"`` when
        the file was parsed, ``"empty"`` for a zero-byte file, ``"missing"``
        when the path does not exist, and ``"error"`` when reading raised.
        ``error`` holds the caught exception for ``"error"`` and is ``None``
        otherwise. ``frame`` is an empty DataFrame unless ``status`` is
        ``"loaded"``.
    """
    if not os.path.exists(file_path):
        return pd.DataFrame(), "missing", None
    try:
        if os.path.getsize(file_path) == 0:  # Ensure file is not empty
            return pd.DataFrame(), "empty", None
        return pd.read_csv(file_path), "loaded", None
    except Exception as exc:  # best-effort: a bad file must not abort the cell
        return pd.DataFrame(), "error", exc


# Names of everything that was missing, empty, or failed to load; used by the
# closing banner so it does not claim success when something went wrong.
load_problems = []

# -----------------------------------------------
# 📌 Load Trained Models
# -----------------------------------------------
models = {}

for model_name, file_path in model_files.items():
    if not os.path.exists(file_path):
        print(f"⚠️ Warning: {model_name} not found at {file_path}")
        load_problems.append(model_name)
        continue
    try:
        # NOTE: unpickling can execute arbitrary code; only load pickle files
        # that were produced by a trusted source.
        with open(file_path, "rb") as f:
            models[model_name] = pickle.load(f)
    except Exception as e:  # best-effort: skip this model, keep loading the rest
        print(f"❌ Error loading {model_name}: {e}")
        load_problems.append(model_name)
    else:
        print(f"✅ Loaded Model: {model_name}")

# -----------------------------------------------
# 📌 Load Performance Metrics Safely
# -----------------------------------------------
df_metrics, status, error = read_csv_or_empty(data_files["Performance Metrics"])
if status == "loaded":
    print("✅ Loaded Performance Metrics CSV")
elif status == "empty":
    print("⚠️ Warning: Performance Metrics CSV is empty!")
elif status == "error":
    print(f"❌ Error loading Performance Metrics: {error}")
else:
    print("⚠️ Warning: Model_Performance_Metrics.csv not found!")
if status != "loaded":
    load_problems.append("Performance Metrics")

# -----------------------------------------------
# 📌 Load Datasets Safely
# -----------------------------------------------
datasets = {}

for name, file_path in data_files.items():
    if name == "Performance Metrics":  # Already handled above
        continue
    # Every dataset name gets an entry; failures leave an empty placeholder.
    datasets[name], status, error = read_csv_or_empty(file_path)
    if status == "loaded":
        print(f"✅ Loaded Dataset: {name}")
    elif status == "empty":
        print(f"⚠️ Warning: {name} CSV is empty!")
    elif status == "error":
        print(f"❌ Error loading {name}: {error}")
    else:
        print(f"⚠️ Warning: {name} not found at {file_path}")
    if status != "loaded":
        load_problems.append(name)

# -----------------------------------------------
# ✅ Summary of Loaded Data
# -----------------------------------------------
print("\n### 🚀 Summary of Loaded Files ###\n")
print(f"- Models Loaded: {list(models.keys())}")
print(f"- Performance Metrics Shape: {df_metrics.shape}")
for name, df in datasets.items():
    print(f"- {name} Shape: {df.shape}")

# -----------------------------------------------
# 📌 Ready for Experiment
# -----------------------------------------------
if load_problems:
    # Do not report success when something was missing, empty, or unreadable.
    print(f"\n⚠️ Loading finished with problems (missing, empty, or unreadable): {load_problems}")
else:
    print("\n✅ All available models, datasets, and metrics have been loaded successfully!")

✅ Loaded Model: Elastic Net
✅ Loaded Model: SGD
✅ Loaded Model: Gradient Boosting
✅ Loaded Model: CNN (MLP)
✅ Loaded Model: Diffusion Model
✅ Loaded Model: GA-Optimized LR
✅ Loaded Model: NeuroEvolution (NEAT)
✅ Loaded Performance Metrics CSV
✅ Loaded Dataset: Full Dataset
✅ Loaded Dataset: PCA Dataset

### 🚀 Summary of Loaded Files ###

- Models Loaded: ['Elastic Net', 'SGD', 'Gradient Boosting', 'CNN (MLP)', 'Diffusion Model', 'GA-Optimized LR', 'NeuroEvolution (NEAT)']
- Performance Metrics Shape: (7, 10)
- Full Dataset Shape: (6045, 215)
- PCA Dataset Shape: (6045, 51)

✅ All available models, datasets, and metrics have been loaded successfully!


#### Make synthetic data for model battling (betting)

In [15]:
# Show the full dataset; as the cell's last expression, the notebook renders
# it as a (truncated) table preview.
datasets["Full Dataset"]

Unnamed: 0,Date,Adj Close_^GSPC,Adj Close_^IXIC,Adj Close_^VIX,Bond Yields,Inflation,Unemployment,Interest Rate,Consumer Sentiment,GDP,...,interest rates_z,market_stress,inflation_lag5,inflation_lag10,inflation_lag30,Interest Rate_lag5,Interest Rate_lag10,interest rates_lag5,interest rates_lag10,interest rates_lag30
0,2004-02-11,1157.760010,2089.659912,15.39,4.05,186.700,5.6,1.01,94.4,11923.447,...,1.538048,0,45.0,49.0,49.0,1.01,1.00,62.0,61.0,61.0
1,2004-02-12,1152.109985,2073.610107,15.31,4.10,186.700,5.6,1.01,94.4,11923.447,...,1.459880,0,45.0,49.0,49.0,1.01,1.00,62.0,61.0,61.0
2,2004-02-13,1145.810059,2053.560059,15.58,4.05,186.700,5.6,1.01,94.4,11923.447,...,1.392621,0,45.0,45.0,49.0,1.01,1.01,62.0,62.0,61.0
3,2004-02-16,1145.810059,2053.560059,15.58,4.05,186.700,5.6,1.01,94.4,11923.447,...,1.333946,0,45.0,45.0,49.0,1.01,1.01,62.0,62.0,61.0
4,2004-02-17,1156.989990,2080.350098,15.40,4.05,186.700,5.6,1.01,94.4,11923.447,...,1.282168,0,45.0,45.0,49.0,1.01,1.01,62.0,62.0,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6040,2024-12-28,5970.839844,19722.029297,15.95,4.62,317.603,4.1,4.48,74.0,29719.647,...,-0.707416,0,55.0,55.0,65.0,4.48,4.48,60.0,60.0,73.0
6041,2024-12-29,5970.839844,19722.029297,15.95,4.62,317.603,4.1,4.48,74.0,29719.647,...,-0.703167,0,55.0,55.0,65.0,4.48,4.48,60.0,60.0,73.0
6042,2024-12-30,5906.939941,19486.789062,17.40,4.55,317.603,4.1,4.48,74.0,29719.647,...,-0.703167,0,55.0,55.0,65.0,4.48,4.48,60.0,60.0,73.0
6043,2024-12-31,5881.629883,19310.789062,17.35,4.58,317.603,4.1,4.48,74.0,29719.647,...,-0.703167,0,55.0,55.0,55.0,4.48,4.48,60.0,60.0,60.0


## 📚 Synthetic Financial Data Generation: Process & Reasoning

### 🎯 Objective

The original dataset had an imbalance in the `market_stress` target variable, which could lead to biased models favoring the majority class. To create a more robust dataset, we generate synthetic financial data with:

- **Balanced classes** (equal numbers of `market_stress = 0` and `market_stress = 1` rows)
- **Statistical consistency** with the original data
- **Natural variation** through noise injection
- **Logical constraints** to maintain economic plausibility

---

### ⚙️ Steps Taken

#### 1️⃣ Load and Explore the Data
- Read `financial_data_full.csv`
- Identify numerical columns for feature synthesis
- Ensure `market_stress` is present and assess class balance

#### 2️⃣ Synthetic Data Strategy
- **Oversampling the minority class** by resampling its rows with replacement (a simple **SMOTE-like** jitter approach; unlike SMOTE, it does not interpolate between neighbors)
- **Randomly perturbing** financial indicators with Gaussian noise (`μ = 0, σ = 0.05 * feature_std`)
- **Clipping features at zero** to prevent unrealistic values (e.g., inflation rates cannot be negative)

#### 3️⃣ Balancing Market Stress
- Compute the number of synthetic samples needed
- Generate new financial records with controlled variations
- Append synthetic rows to create a **balanced dataset**

#### 4️⃣ Final Adjustments & Export
- Ensure the dataset structure is consistent with the original
- Shuffle the rows and save as `data/synth_findata.csv` for further model training

---



In [22]:
#############################
### uncomment to run once ###
#############################

# Purpose: balance the `market_stress` classes by appending noisy copies of
# minority-class rows to the original data, then shuffle and save the result.
# The cell is kept commented out so the saved CSV is not regenerated on every
# notebook run.

# import pandas as pd
# import numpy as np

# # ✅ Load Original Data
# file_path = "data/financial_data_full.csv"
# df = pd.read_csv(file_path)

# # ✅ Identify Numerical Columns (excluding market_stress)
# # NOTE(review): the CSV preview shows an `Unnamed: 0` index column; it is
# # numeric, so it appears to end up in num_cols and be perturbed too — confirm.
# num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# num_cols.remove("market_stress")  # Ensure we don't perturb the target

# # ✅ Compute Class Distribution
# class_counts = df["market_stress"].value_counts()
# min_class, max_class = class_counts.idxmin(), class_counts.idxmax()

# # ✅ Define Synthetic Data Generation Function
# # NOTE(review): the function reads the module-level `min_class` and
# # `num_cols` defined above rather than taking them as parameters.
# def generate_synthetic_data(df, target_col="market_stress", n_samples=1000):
#     """Generates synthetic financial data for underrepresented class."""
#     df_minority = df[df[target_col] == min_class]
    
#     synthetic_samples = []
#     for _ in range(n_samples):
#         # Draw one minority-class row, then jitter each numeric column.
#         # NOTE(review): neither this sampling nor the noise below is seeded,
#         # so the generated rows differ from run to run.
#         sample = df_minority.sample(n=1, replace=True).copy()
#         for col in num_cols:
#             std_dev = df[col].std()
#             sample[col] += np.random.normal(0, 0.05 * std_dev)  # 5% Gaussian Noise
            
#             # ✅ Ensure no extreme negative values (e.g., inflation cannot be negative)
#             # NOTE(review): the clip is applied to every numeric column,
#             # including z-score columns such as `interest rates_z`, which
#             # holds negative values in the original data — confirm intended.
#             sample[col] = sample[col].clip(lower=0)  # Fixed Issue

#         synthetic_samples.append(sample)

#     return pd.concat(synthetic_samples, ignore_index=True)

# # ✅ Generate Synthetic Samples for Market Stress Balancing
# # The class-count difference is how many minority rows are needed to reach parity.
# n_synthetic = abs(class_counts[max_class] - class_counts[min_class])  # Balance count
# df_synthetic = generate_synthetic_data(df, n_samples=n_synthetic)

# # ✅ Combine Original and Synthetic Data
# df_balanced = pd.concat([df, df_synthetic], ignore_index=True)

# # ✅ Shuffle Data for Fair Training
# df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# # ✅ Save Synthetic Financial Data
# synth_file_path = "data/synth_findata.csv"
# df_balanced.to_csv(synth_file_path, index=False)
# print(f"✅ Synthetic dataset saved as: {synth_file_path}")

✅ Synthetic dataset saved as: data/synth_findata.csv
