In [98]:
# import os
# import json
# import pickle
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import StandardScaler

# # ✅ Define Paths
# SAVE_DIR = "saved_models"
# FEATURE_FILE = os.path.join(SAVE_DIR, "feature_names.json")

# # -----------------------------------------------
# # 📌 Data Preprocessing Function (Fixing String to Float Issue)
# # -----------------------------------------------
# def preprocess_data(df):
#     """Ensures all data is numeric by converting dates and categorical features."""
    
#     df = df.copy()
#     df = df.dropna()  # ✅ Remove missing values

#     for col in df.select_dtypes(include=["object"]):
#         try:
#             df[col] = pd.to_datetime(df[col]).astype(int) / 10**9  # ✅ Convert to Unix timestamp
#         except Exception:
#             df[col] = df[col].astype("category").cat.codes  # ✅ Convert categorical to numeric

#     return df

# # ✅ Load Feature Names
# if os.path.exists(FEATURE_FILE):
#     with open(FEATURE_FILE, "r") as f:
#         expected_features = json.load(f)
#     print(f"✅ Loaded Feature Names: {len(expected_features)} features")
# else:
#     raise FileNotFoundError("❌ Feature names file not found!")

# # ✅ Load Dataset and Standardize Features
# df = pd.read_csv("data/synth_findata.csv")
# df = preprocess_data(df)  # ✅ Apply Fix

# X = df.drop(columns=["market_stress"], errors="ignore")  # Ensure "market_stress" exists
# X = X[expected_features]  # ✅ Ensure feature alignment

# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# print("✅ Feature preprocessing and scaling completed successfully!")

✅ Loaded Feature Names: 214 features
✅ Feature preprocessing and scaling completed successfully!


In [110]:
import os
import json
import pickle
import random
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# ✅ Define Paths
SAVE_DIR = "saved_models"
FEATURE_FILE = os.path.join(SAVE_DIR, "feature_names.json")
DATA_FILE = "data/synth_findata.csv"
PERFORMANCE_FILE = os.path.join(SAVE_DIR, "Model_Performance_Metrics_Updated.csv")

# ✅ Load Feature Names
if not os.path.exists(FEATURE_FILE):
    raise FileNotFoundError("❌ Feature names file not found!")

with open(FEATURE_FILE, "r") as f:
    expected_features = json.load(f)

expected_features = [str(f) for f in expected_features]  # Ensure all feature names are strings
print(f"✅ Loaded Feature Names: {len(expected_features)} features")

# ✅ Load Dataset
df = pd.read_csv(DATA_FILE).dropna()
X = df.drop(columns=["market_stress"], errors="ignore")
y = df["market_stress"]

# ✅ Ensure Feature Alignment
def align_features(X, model):
    """Dynamically aligns feature count to match model expectations."""
    X = X.copy()
    if hasattr(model, "n_features_in_"):
        expected_count = model.n_features_in_
        current_features = list(X.columns.astype(str))

        if len(current_features) < expected_count:
            missing_cols = expected_count - len(current_features)
            print(f"⚠️ Adding {missing_cols} dummy columns to match model input size.")
            for i in range(missing_cols):
                X[f"dummy_feature_{i}"] = 0

        elif len(current_features) > expected_count:
            print(f"⚠️ Reducing {len(current_features) - expected_count} features to match model input size.")
            X = X.iloc[:, :expected_count]

        X.columns = X.columns.astype(str)

    return X

# ✅ Scale Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=expected_features)

# ✅ Load Models
model_files = {
    "Elastic Net": os.path.join(SAVE_DIR, "Elastic Net.pkl"),
    "SGD": os.path.join(SAVE_DIR, "SGD.pkl"),
    "Gradient Boosting": os.path.join(SAVE_DIR, "Gradient Boosting.pkl"),
    "CNN (MLP)": os.path.join(SAVE_DIR, "CNN (MLP).pkl"),
    "Diffusion Model": os.path.join(SAVE_DIR, "Diffusion Model.pkl"),
    "GA-Optimized LR": os.path.join(SAVE_DIR, "GA-Optimized LR.pkl"),
    "NeuroEvolution (NEAT)": os.path.join(SAVE_DIR, "NeuroEvolution (NEAT).pkl"),
}

models = {}
for name, path in model_files.items():
    if os.path.exists(path):
        try:
            with open(path, "rb") as f:
                model = pickle.load(f)
                if hasattr(model, "predict"):
                    models[name] = {"model": model, "lifespan": 3}  # 3 Gen lifespan
                    print(f"✅ Loaded Model: {name}")
                else:
                    print(f"⚠️ {name} does not have `predict()`. Skipping...")
        except Exception as e:
            print(f"❌ Error loading {name}: {e}")
    else:
        print(f"⚠️ {name} not found!")

if not models:
    raise RuntimeError("❌ No valid models loaded.")

# ✅ Load Performance Metrics
if os.path.exists(PERFORMANCE_FILE):
    df_performance = pd.read_csv(PERFORMANCE_FILE)
else:
    raise FileNotFoundError("❌ Performance Metrics CSV not found!")

# -----------------------------------------------
# 📌 Model Knockout Tournament
# -----------------------------------------------
def model_battle(model_1, model_2, X, y):
    """Models compete on predicting market_stress. Winners proceed."""
    try:
        X1 = align_features(pd.DataFrame(X), model_1["model"])
        X2 = align_features(pd.DataFrame(X), model_2["model"])

        pred_1 = model_1["model"].predict(X1)
        pred_2 = model_2["model"].predict(X2)

    except Exception as e:
        print(f"❌ Error during model battle: {e}")
        return None, None

    conf_1 = np.mean((pred_1 > 0.5) == y) * np.mean(np.abs(pred_1 - 0.5))
    conf_2 = np.mean((pred_2 > 0.5) == y) * np.mean(np.abs(pred_2 - 0.5))

    return (model_1, model_2) if conf_1 > conf_2 else (model_2, model_1)

# -----------------------------------------------
# 📌 Model Breeding Function
# -----------------------------------------------
def breed_models(parent_1, parent_2):
    """Hybrid model creation (favoring CNN & NEAT)."""
    p1_name, p2_name = parent_1["name"], parent_2["name"]
    
    if "CNN" in p1_name or "NEAT" in p1_name or "CNN" in p2_name or "NEAT" in p2_name:
        advantage = 1.2  # Hybrids get boosted performance

    hybrid_name = f"{p1_name}-{p2_name}_Hybrid"
    new_model = random.choice([parent_1["model"], parent_2["model"]])  # Basic cross-breeding
    return {"name": hybrid_name, "model": new_model, "lifespan": 3}  # Reset lifespan

# -----------------------------------------------
# 📌 Model Evolution Process
# -----------------------------------------------
def evolve_models(models, df_performance, X, y, generations=10):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    performance_metrics = []

    for gen in range(generations):
        print(f"\n🚀 **Generation {gen+1}: Running Evolution**")
        model_list = list(models.keys())
        random.shuffle(model_list)
        survivors = []
        
        # Tournament Battles
        for i in range(0, len(model_list), 2):
            if i + 1 < len(model_list):
                winner, loser = model_battle(models[model_list[i]], models[model_list[i+1]], X, y)
                if winner:
                    survivors.append(winner)
            else:
                survivors.append(models[model_list[i]])

        print(f"✅ Survivors: {len(survivors)} models advancing.")

        # Breeding Phase - Select Top Models
        top_models = df_performance.nlargest(2, "Fitness Score")  # Select highest-performing models
        breeding_pool = []
        for _, row in top_models.iterrows():
            if row["Model"] in models:
                breeding_pool.append({"name": row["Model"], "model": models[row["Model"]]["model"]})

        offspring = []
        for i in range(0, len(breeding_pool), 2):
            if i + 1 < len(breeding_pool):
                child = breed_models(breeding_pool[i], breeding_pool[i+1])
                offspring.append(child)

        # Reduce Lifespan for all models
        for model in models.values():
            model["lifespan"] -= 1
        models = {name: m for name, m in models.items() if m["lifespan"] > 0}  # Remove expired models

        models.update({f"Gen{gen+1}_Model{i}": m for i, m in enumerate(survivors + offspring)})

        # Save Models
        for name, model in models.items():
            file_path = f"saved_models/{name}_{timestamp}.pkl"
            with open(file_path, "wb") as f:
                pickle.dump(model["model"], f)
            print(f"✅ Saved Model: {file_path}")

    return models

# ✅ Run Evolution
evolved_models = evolve_models(models, df_performance, X_scaled, y, generations=10)

✅ Loaded Feature Names: 214 features


ValueError: could not convert string to float: '2023-01-09'

In [17]:
import pickle

model_path = "saved_models/Elastic Net.pkl"  # Example model
with open(model_path, "rb") as f:
    model = pickle.load(f)

print(type(model))  # What type is it?

<class 'dict'>


In [21]:
# Print the contents
print(model.keys())  # What keys does this dictionary have?

dict_keys(['Elastic Net Best Alpha', 'Elastic Net Best L1 Ratio', 'Mean Squared Error (MSE)', 'R² Score', 'ROC-AUC Score', 'Accuracy Score', 'Log Loss', 'Training Time (s)', 'Prediction Time (s)', 'Cross-Validation Stability', 'Fitness Score'])


In [25]:
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ✅ Define Paths
SAVE_DIR = "saved_models"
os.makedirs(SAVE_DIR, exist_ok=True)

# ✅ Load Data
df = pd.read_csv("data/financial_data_full.csv")  # Update with your real dataset

# ✅ Identify Date Columns (Assuming Dates are in 'YYYY-MM-DD' Format)
date_cols = df.select_dtypes(include=["object"]).columns  # Identify non-numeric columns

for col in date_cols:
    try:
        df[col] = pd.to_datetime(df[col])  # Convert to datetime format
        df[col + "_days_since_start"] = (df[col] - df[col].min()).dt.days  # Convert to numerical
        df[col + "_year"] = df[col].dt.year
        df[col + "_month"] = df[col].dt.month
        df[col + "_day"] = df[col].dt.day
    except Exception as e:
        print(f"⚠️ Skipping {col}: {e}")  # Some non-numeric columns may not be dates

# ✅ Drop Original Date Columns (Replaced by numerical versions)
df = df.drop(columns=date_cols, errors="ignore")

# ✅ Ensure Target Column Exists
if "market_stress" not in df.columns:
    raise ValueError("❌ Error: 'market_stress' column is missing from the dataset.")

# ✅ Split into Features & Target
X = df.drop(columns=["market_stress"])  # Features
y = df["market_stress"]  # Target variable

# ✅ Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Standardize Features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ✅ Train Elastic Net Model
elastic_net = ElasticNetCV(cv=5, l1_ratio=[0.3, 0.6], alphas=np.logspace(-2, 0.5, 5), random_state=42, max_iter=5000)
elastic_net.fit(X_train, y_train)

# ✅ Train Gradient Boosting Model
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbm.fit(X_train, y_train)

# ✅ Save Models Properly
with open(os.path.join(SAVE_DIR, "Elastic Net.pkl"), "wb") as f:
    pickle.dump(elastic_net, f)

with open(os.path.join(SAVE_DIR, "Gradient Boosting.pkl"), "wb") as f:
    pickle.dump(gbm, f)

print("\n✅ Models successfully trained and saved with dates converted to numerical features!")


✅ Models successfully trained and saved with dates converted to numerical features!


## now fix sgd...

In [33]:
import os
import pickle
import time
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (
    mean_squared_error, r2_score, roc_auc_score, accuracy_score, log_loss
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# ✅ Define Save Path
SAVE_DIR = "saved_models"
os.makedirs(SAVE_DIR, exist_ok=True)

# ✅ Load Data
df = pd.read_csv("data/financial_data_full.csv")

# ✅ Convert Date Columns to Numeric Features
date_cols = df.select_dtypes(include=["object"]).columns

for col in date_cols:
    try:
        df[col] = pd.to_datetime(df[col])
        df[col + "_days_since_start"] = (df[col] - df[col].min()).dt.days
        df[col + "_year"] = df[col].dt.year
        df[col + "_month"] = df[col].dt.month
        df[col + "_day"] = df[col].dt.day
    except Exception:
        print(f"⚠️ Skipping non-date column: {col}")

df = df.drop(columns=date_cols, errors="ignore")

# ✅ Ensure Target Column Exists
if "market_stress" not in df.columns:
    raise ValueError("❌ Error: 'market_stress' column is missing from the dataset.")

# ✅ Split Data
X = df.drop(columns=["market_stress"])
y = df["market_stress"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Standardize Features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ✅ Train SGDClassifier (Instead of SGDRegressor)
def train_sgd(X_train, X_test, y_train, y_test):
    """Trains an SGD model for classification and saves it as a pickle file."""
    
    # ✅ Define SGD Classifier (For Classification)
    sgd = SGDClassifier(
        max_iter=2000,
        tol=1e-4,
        random_state=42,
        penalty="l2",  # Ridge-style regularization
        alpha=0.01,  # Regularization strength
        loss="log_loss",  # Enables probability estimates
    )

    # ✅ Train Model
    start_time = time.time()
    sgd.fit(X_train, y_train)
    training_time = time.time() - start_time

    # ✅ Save Model
    model_path = os.path.join(SAVE_DIR, "SGD.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(sgd, f)

    print(f"✅ Model saved as: {model_path}")

    return sgd  # Return trained model for immediate use if needed

# ✅ Train & Save Model
sgd_model = train_sgd(X_train, X_test, y_train, y_test)

print("\n✅ SGDClassifier model is now trained and properly saved as a `.pkl` file!")

✅ Model saved as: saved_models/SGD.pkl

✅ SGDClassifier model is now trained and properly saved as a `.pkl` file!


### CNN

In [38]:
import os
import pickle
import time
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    mean_squared_error, r2_score, roc_auc_score, accuracy_score, log_loss
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# ✅ Define Save Path
SAVE_DIR = "saved_models"
os.makedirs(SAVE_DIR, exist_ok=True)

# ✅ Load Data
df = pd.read_csv("data/financial_data_full.csv")

# ✅ Convert Date Columns to Numeric Features
date_cols = df.select_dtypes(include=["object"]).columns

for col in date_cols:
    try:
        df[col] = pd.to_datetime(df[col])
        df[col + "_days_since_start"] = (df[col] - df[col].min()).dt.days
        df[col + "_year"] = df[col].dt.year
        df[col + "_month"] = df[col].dt.month
        df[col + "_day"] = df[col].dt.day
    except Exception:
        print(f"⚠️ Skipping non-date column: {col}")

df = df.drop(columns=date_cols, errors="ignore")

# ✅ Ensure Target Column Exists
if "market_stress" not in df.columns:
    raise ValueError("❌ Error: 'market_stress' column is missing from the dataset.")

# ✅ Split Data
X = df.drop(columns=["market_stress"])
y = df["market_stress"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Standardize Features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ✅ Train CNN (MLP)
def train_cnn(X_train, X_test, y_train, y_test):
    """Trains a Multi-Layer Perceptron (MLP) as a CNN stand-in and saves it."""

    # ✅ Define MLP Model
    cnn = MLPClassifier(
        hidden_layer_sizes=(100, 50),  # Two layers: 100 neurons & 50 neurons
        activation="relu",  # ReLU activation function
        solver="adam",  # Adam optimizer
        max_iter=500,  # Max training iterations
        random_state=42
    )

    # ✅ Train Model
    start_time = time.time()
    cnn.fit(X_train, y_train)
    training_time = time.time() - start_time

    # ✅ Save Model
    model_path = os.path.join(SAVE_DIR, "CNN (MLP).pkl")
    with open(model_path, "wb") as f:
        pickle.dump(cnn, f)

    print(f"✅ Model saved as: {model_path}")

    return cnn  # Return trained model for immediate use if needed

# ✅ Train & Save Model
cnn_model = train_cnn(X_train, X_test, y_train, y_test)

print("\n✅ MLPClassifier model is now trained and properly saved as a `.pkl` file!")

✅ Model saved as: saved_models/CNN (MLP).pkl

✅ MLPClassifier model is now trained and properly saved as a `.pkl` file!


# diffusion

In [42]:
import os
import pickle
import time
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.metrics import (
    mean_squared_error, r2_score, roc_auc_score, accuracy_score, log_loss
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# ✅ Define Save Path
SAVE_DIR = "saved_models"
os.makedirs(SAVE_DIR, exist_ok=True)

# ✅ Load Data
df = pd.read_csv("data/financial_data_full.csv")

# ✅ Convert Date Columns to Numeric Features
date_cols = df.select_dtypes(include=["object"]).columns

for col in date_cols:
    try:
        df[col] = pd.to_datetime(df[col])
        df[col + "_days_since_start"] = (df[col] - df[col].min()).dt.days
        df[col + "_year"] = df[col].dt.year
        df[col + "_month"] = df[col].dt.month
        df[col + "_day"] = df[col].dt.day
    except Exception:
        print(f"⚠️ Skipping non-date column: {col}")

df = df.drop(columns=date_cols, errors="ignore")

# ✅ Ensure Target Column Exists
if "market_stress" not in df.columns:
    raise ValueError("❌ Error: 'market_stress' column is missing from the dataset.")

# ✅ Split Data
X = df.drop(columns=["market_stress"])
y = df["market_stress"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Standardize Features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ✅ Train Diffusion Model (GMM)
def train_diffusion_model(X_train, X_test, y_train, y_test):
    """Trains a Gaussian Mixture Model (GMM) as a stand-in for a diffusion model and saves it."""

    # ✅ Define GMM Model
    gmm = GaussianMixture(
        n_components=5,  # Assume 5 mixture components
        covariance_type="full",
        random_state=42
    )

    # ✅ Train Model
    start_time = time.time()
    gmm.fit(X_train)
    training_time = time.time() - start_time

    # ✅ Save Model
    model_path = os.path.join(SAVE_DIR, "Diffusion Model.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(gmm, f)

    print(f"✅ Model saved as: {model_path}")

    return gmm  # Return trained model for immediate use if needed

# ✅ Train & Save Model
diffusion_model = train_diffusion_model(X_train, X_test, y_train, y_test)

print("\n✅ GaussianMixture model is now trained and properly saved as a `.pkl` file!")

✅ Model saved as: saved_models/Diffusion Model.pkl

✅ GaussianMixture model is now trained and properly saved as a `.pkl` file!


#ga-optimized lr

In [47]:
import os
import pickle
import time
import random
import numpy as np
import pandas as pd
from deap import base, creator, tools, algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error, r2_score, roc_auc_score, accuracy_score, log_loss
)

# ✅ Define Save Path
SAVE_DIR = "saved_models"
os.makedirs(SAVE_DIR, exist_ok=True)

# ✅ Load Data
df = pd.read_csv("data/financial_data_full.csv")

# ✅ Convert Date Columns to Numeric Features
date_cols = df.select_dtypes(include=["object"]).columns

for col in date_cols:
    try:
        df[col] = pd.to_datetime(df[col])
        df[col + "_days_since_start"] = (df[col] - df[col].min()).dt.days
        df[col + "_year"] = df[col].dt.year
        df[col + "_month"] = df[col].dt.month
        df[col + "_day"] = df[col].dt.day
    except Exception:
        print(f"⚠️ Skipping non-date column: {col}")

df = df.drop(columns=date_cols, errors="ignore")

# ✅ Ensure Target Column Exists
if "market_stress" not in df.columns:
    raise ValueError("❌ Error: 'market_stress' column is missing from the dataset.")

# ✅ Split Data
X = df.drop(columns=["market_stress"])
y = df["market_stress"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Standardize Features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ✅ Prevent Duplicate Class Definitions in DEAP
if "FitnessMax" not in creator.__dict__:
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if "Individual" not in creator.__dict__:
    creator.create("Individual", list, fitness=creator.FitnessMax)

def train_ga_lr(X_train, X_test, y_train, y_test):
    """Uses a Genetic Algorithm (GA) to optimize feature selection for Logistic Regression and saves the model."""

    num_features = X_train.shape[1]  # Number of features in dataset

    # ✅ Define GA Structure
    toolbox = base.Toolbox()
    toolbox.register("attr_bool", random.randint, 0, 1)  # Binary feature selection
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    def evaluate(individual):
        """Evaluates the fitness of an individual feature selection."""
        selected_features = [i for i, bit in enumerate(individual) if bit == 1]
        
        if not selected_features:
            return (0.0,)  # Prevent empty feature sets
        
        X_train_selected = X_train[:, selected_features]
        X_test_selected = X_test[:, selected_features]

        model = LogisticRegression(max_iter=2000, solver='liblinear', random_state=42)
        model.fit(X_train_selected, y_train)
        
        y_pred_prob = model.predict_proba(X_test_selected)[:, 1]
        y_pred = (y_pred_prob >= 0.5).astype(int)

        roc_auc = roc_auc_score(y_test, y_pred_prob)
        accuracy = accuracy_score(y_test, y_pred)
        stability = np.mean(cross_val_score(model, X_train_selected, y_train, cv=5))

        # ✅ Fitness Score (Maximize ROC-AUC, Accuracy, and Stability)
        fitness = (roc_auc + accuracy + stability) / 3

        return (fitness,)

    toolbox.register("evaluate", evaluate)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.1)
    toolbox.register("select", tools.selTournament, tournsize=3)

    # ✅ Create Initial Population & Run GA
    pop = toolbox.population(n=20)  # 20 individuals
    start_time = time.time()
    algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False)
    training_time = time.time() - start_time

    # ✅ Select Best Individual
    best_individual = tools.selBest(pop, k=1)[0]
    selected_features = [i for i, bit in enumerate(best_individual) if bit == 1]

    X_train_selected = X_train[:, selected_features]
    X_test_selected = X_test[:, selected_features]

    # ✅ Final Model with Best Features
    final_model = LogisticRegression(max_iter=2000, solver='liblinear', random_state=42)
    final_model.fit(X_train_selected, y_train)

    # ✅ Save Model
    model_path = os.path.join(SAVE_DIR, "GA-Optimized LR.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(final_model, f)

    print(f"✅ Model saved as: {model_path}")

    return final_model  # Return trained model for immediate use if needed

# ✅ Train & Save Model
ga_lr_model = train_ga_lr(X_train, X_test, y_train, y_test)

print("\n✅ GA-Optimized Logistic Regression model is now trained and properly saved as a `.pkl` file!")

✅ Model saved as: saved_models/GA-Optimized LR.pkl

✅ GA-Optimized Logistic Regression model is now trained and properly saved as a `.pkl` file!


# neat

In [76]:
import neat
import time
import numpy as np
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error, r2_score, roc_auc_score, accuracy_score, log_loss
)

# -----------------------------------------------
# 📌 Data Preprocessing Function
# -----------------------------------------------
def preprocess_data(df):
    """Preprocess dataset by handling missing values and encoding categorical data."""
    
    df = df.copy()
    df = df.dropna()  # Remove missing values

    # Convert categorical & date columns to numeric
    for col in df.select_dtypes(include=['object']):
        try:
            df[col] = pd.to_datetime(df[col]).astype(int) / 10**9  # Convert to timestamp
        except:
            df[col] = df[col].astype("category").cat.codes  

    return df

# -----------------------------------------------
# 📌 Custom NEAT Wrapper with Predict Methods
# -----------------------------------------------
class NEATModelWrapper:
    """Wrapper to make NEAT behave like a scikit-learn model."""
    def __init__(self, genome, config):
        self.genome = genome
        self.config = config
        self.net = neat.nn.FeedForwardNetwork.create(genome, config)

    def predict_proba(self, X):
        """Returns probability predictions (like logistic regression)."""
        X = np.array(X)
        return np.array([[1 - self.net.activate(xi)[0], self.net.activate(xi)[0]] for xi in X])  # [prob_0, prob_1]

    def predict(self, X):
        """Returns binary predictions (0 or 1)."""
        return np.argmax(self.predict_proba(X), axis=1)  # Take class with highest probability

# -----------------------------------------------
# 📌 TRAIN NEUROEVOLUTION (NEAT)
# -----------------------------------------------
def eval_genome(genome, config, X_train, y_train, valid_genomes):
    """Evaluates a single NEAT genome, computing fitness based on multiple metrics."""
    
    net = neat.nn.FeedForwardNetwork.create(genome, config)
    y_pred_prob = np.array([net.activate(xi)[0] for xi in X_train])  # **Continuous probabilities**
    y_pred = (y_pred_prob >= 0.5).astype(int)  # **Convert to binary**

    mse = mean_squared_error(y_train, y_pred_prob)  # ✅ Use probabilities
    r2 = r2_score(y_train, y_pred_prob)  # ✅ Use probabilities

    # **Fix: Convert `y_train` to Binary Labels**
    y_train_binary = (y_train >= np.median(y_train)).astype(int)

    # ✅ **Fix: Ensure roc_auc_score receives binary labels**
    if len(np.unique(y_train_binary)) > 1:
        auc = roc_auc_score(y_train_binary, y_pred_prob)  # ✅ **Use probabilities**
    else:
        auc = 0.5  # Default if only one class present

    accuracy = accuracy_score(y_train_binary, y_pred)

    eps = 1e-9  
    y_pred_prob = np.clip(y_pred_prob, eps, 1 - eps)
    log_loss_value = log_loss(y_train_binary, y_pred_prob)

    valid_fitness_values = [g.fitness for g in valid_genomes[-5:] if g.fitness is not None]
    fitness_variance = np.var(valid_fitness_values) if valid_fitness_values else 1.0
    cross_validation_stability = 1 / (1 + fitness_variance)  

    fitness_score = (
        (accuracy * 0.3) + 
        (auc * 0.25) + 
        (cross_validation_stability * 0.2) -  
        (mse * 0.15) -  
        (log_loss_value * 0.1)  
    )

    return max(fitness_score, 0), cross_validation_stability  

def eval_genomes(genomes, config, X_train, y_train):
    """Evaluates all genomes in the current NEAT generation."""
    valid_genomes = [g for _, g in genomes if g.fitness is not None]

    for genome_id, genome in genomes:
        genome.fitness, _ = eval_genome(genome, config, X_train, y_train, valid_genomes)

def train_neat(X_train_full, X_test_full, y_train_full, y_test_full):
    """Trains NEAT and saves the best model in a wrapper class with `.predict()`."""
    print("\n🚀 **Starting NEAT Training...**")

    config_path = "neat_config3.txt"
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation, config_path)

    pop = neat.Population(config)

    start_time = time.time()
    pop.run(lambda genomes, cfg: eval_genomes(genomes, cfg, X_train_full, y_train_full), 10)
    training_time = time.time() - start_time

    valid_genomes = [g for g in pop.population.values() if g.fitness is not None]
    if not valid_genomes:
        raise ValueError("❌ No valid genomes with assigned fitness scores.")

    best_genome = max(valid_genomes, key=lambda g: g.fitness)

    valid_fitness_values = [g.fitness for g in valid_genomes[-5:] if g.fitness is not None]
    fitness_variance = np.var(valid_fitness_values) if valid_fitness_values else 1.0
    cross_validation_stability = 1 / (1 + fitness_variance)

    neat_model = NEATModelWrapper(best_genome, config)

    with open("saved_models/NeuroEvolution (NEAT).pkl", "wb") as f:
        pickle.dump(neat_model, f)

    print(f"\n✅ NEAT Model Saved: `saved_models/NeuroEvolution (NEAT).pkl`")

    return neat_model

# -----------------------------------------------
# 📌 TEST NEUROEVOLUTION (NEAT) - RUN TRAINING
# -----------------------------------------------
import traceback  

def test_train_neat():
    """Ensures NEAT model runs, saves, and has `predict()`."""
    print("\n🚀 **Starting NEAT Test**...\n")

    try:
        df = pd.read_csv("data/financial_data_full.csv")
        df_cleaned = preprocess_data(df)
        df_scaled = StandardScaler().fit_transform(df_cleaned)  # ✅ Apply Scaling

        X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
            df_scaled[:, :-1], df_scaled[:, -1], test_size=0.2, random_state=42
        )

        print("\n🚀 **Starting NEAT Training...**")
        neat_model = train_neat(X_train_full, X_test_full, y_train_full, y_test_full)

        assert hasattr(neat_model, "predict"), "❌ NEAT model does not have `predict()`"
        assert hasattr(neat_model, "predict_proba"), "❌ NEAT model does not have `predict_proba()`"

        print("\n✅ NEAT model is now trained and properly saved with `predict()`.")

    except Exception as e:
        print("\n❌ **Error in test_train_neat()**")
        traceback.print_exc()  
        raise  

# ✅ Run the test
test_train_neat()


🚀 **Starting NEAT Test**...


🚀 **Starting NEAT Training...**

🚀 **Starting NEAT Training...**

✅ NEAT Model Saved: `saved_models/NeuroEvolution (NEAT).pkl`

✅ NEAT model is now trained and properly saved with `predict()`.
