In [2]:
import os
import pickle
import time
import random
import numpy as np
import pandas as pd
from deap import base, creator, tools, algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error, r2_score, roc_auc_score, accuracy_score, log_loss
)

# ✅ Define Save Path (created up front so the later pickle dump cannot fail on a missing folder)
SAVE_DIR = "saved_models"
os.makedirs(SAVE_DIR, exist_ok=True)

# ✅ Load Data (path is relative to the notebook's working directory)
DATA_PATH = "../../model_food/data/financial_data_full.csv"
df = pd.read_csv(DATA_PATH)

# ✅ Convert Date Columns to Numeric Features
# Every object-dtype column is tried as a date. Columns that parse gain four
# numeric features; columns that do not are reported. Either way, all of the
# original object columns are dropped at the end.
date_cols = df.select_dtypes(include=["object"]).columns

for col in date_cols:
    try:
        parsed = pd.to_datetime(df[col])
        df[col] = parsed
        # Whole days elapsed since the earliest value in this column.
        df[col + "_days_since_start"] = (parsed - parsed.min()).dt.days
        # Calendar components, added in the order year, month, day.
        for part in ("year", "month", "day"):
            df[col + "_" + part] = getattr(parsed.dt, part)
    except Exception:
        print(f"⚠️ Skipping non-date column: {col}")

df = df.drop(columns=date_cols, errors="ignore")

# ✅ Ensure Target Column Exists
TARGET_COL = "market_stress"
if TARGET_COL not in df.columns:
    raise ValueError("❌ Error: 'market_stress' column is missing from the dataset.")

# ✅ Split Data (80/20 hold-out with a fixed seed)
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ✅ Standardize Features
# The scaler is fitted on the training split only and then applied to both
# splits, so no test-set statistics reach the scaling step.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# ✅ Prevent Duplicate Class Definitions in DEAP
# Re-running this cell must not call creator.create() again for a name that
# already exists on the creator module, so each class is created only if absent.
if not hasattr(creator, "FitnessMax"):
    # Single-objective fitness; the positive weight means "maximise".
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    # An individual is a plain list (of 0/1 genes) that carries a FitnessMax.
    creator.create("Individual", list, fitness=creator.FitnessMax)

def train_ga_lr(X_train, X_test, y_train, y_test, *, pop_size=20, ngen=10,
                cxpb=0.5, mutpb=0.2, seed=None):
    """Select features for Logistic Regression with a Genetic Algorithm and save the model.

    Each GA individual is a 0/1 mask over the columns of ``X_train``. A mask's
    fitness is the mean of three scores from a Logistic Regression restricted
    to the selected columns: ROC-AUC and accuracy on ``X_test``/``y_test``,
    and the mean 5-fold cross-validation score on the training data. A final
    model is fitted on the best mask seen during the whole run and pickled to
    ``SAVE_DIR/"GA-Optimized LR.pkl"``.

    Args:
        X_train: 2-D training features supporting ``X[:, [i, j, ...]]`` indexing.
        X_test: 2-D evaluation features with the same column layout as ``X_train``.
        y_train: Training labels.
        y_test: Evaluation labels.
        pop_size: Number of individuals in the population (default 20).
        ngen: Number of generations to run (default 10).
        cxpb: Crossover probability passed to ``eaSimple`` (default 0.5).
        mutpb: Mutation probability passed to ``eaSimple`` (default 0.2).
        seed: If not ``None``, seeds Python's ``random`` module so the GA run
            is reproducible. ``None`` leaves the global RNG state untouched.

    Returns:
        The fitted ``LogisticRegression``. The column indices it was trained
        on are stored on it as ``selected_features_``, and therefore in the
        pickle too, so callers can apply the same column selection.

    Raises:
        ValueError: If ``X_train`` has no columns or ``pop_size`` is below 1.
    """
    num_features = X_train.shape[1]  # Number of features in dataset
    if num_features == 0:
        raise ValueError("❌ Error: X_train has no feature columns to select from.")
    if pop_size < 1:
        raise ValueError("❌ Error: pop_size must be at least 1.")

    if seed is not None:
        random.seed(seed)

    def make_model():
        """Build the (unfitted) classifier used for scoring and for the final fit."""
        return LogisticRegression(max_iter=2000, solver='liblinear', random_state=42)

    def mask_to_indices(individual):
        """Return the column indices whose gene is switched on."""
        return [i for i, bit in enumerate(individual) if bit == 1]

    # ✅ Define GA Structure
    toolbox = base.Toolbox()
    toolbox.register("attr_bool", random.randint, 0, 1)  # Binary feature selection
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    # The score depends only on the mask (fixed data, fixed random_state), so
    # identical masks produced in later generations reuse the stored fitness
    # instead of refitting the models.
    fitness_cache = {}

    def evaluate(individual):
        """Evaluates the fitness of an individual feature selection."""
        key = tuple(individual)
        if key in fitness_cache:
            return fitness_cache[key]

        selected = mask_to_indices(individual)
        if not selected:
            fitness_cache[key] = (0.0,)  # Prevent empty feature sets
            return fitness_cache[key]

        X_train_sel = X_train[:, selected]
        X_test_sel = X_test[:, selected]

        model = make_model()
        model.fit(X_train_sel, y_train)

        # Column 1 is the probability of the second class in model.classes_;
        # thresholding it at 0.5 assumes a binary 0/1 target — TODO confirm.
        y_pred_prob = model.predict_proba(X_test_sel)[:, 1]
        y_pred = (y_pred_prob >= 0.5).astype(int)

        # NOTE(review): these two scores use the test split, so the test data
        # steers feature selection and is no longer an unbiased hold-out.
        # Consider scoring on a separate validation split instead.
        roc_auc = roc_auc_score(y_test, y_pred_prob)
        accuracy = accuracy_score(y_test, y_pred)
        stability = np.mean(cross_val_score(model, X_train_sel, y_train, cv=5))

        # ✅ Fitness Score (Maximize ROC-AUC, Accuracy, and Stability)
        fitness_cache[key] = ((roc_auc + accuracy + stability) / 3,)
        return fitness_cache[key]

    toolbox.register("evaluate", evaluate)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.1)
    toolbox.register("select", tools.selTournament, tournsize=3)

    # ✅ Create Initial Population & Run GA
    pop = toolbox.population(n=pop_size)
    # eaSimple has no elitism: the best mask can be lost to crossover/mutation
    # before the last generation. The hall of fame keeps the best individual
    # seen in any generation.
    best_ever = tools.HallOfFame(1)
    start_time = time.perf_counter()
    algorithms.eaSimple(pop, toolbox, cxpb=cxpb, mutpb=mutpb, ngen=ngen,
                        halloffame=best_ever, verbose=False)
    training_time = time.perf_counter() - start_time

    # ✅ Select Best Individual
    best_individual = best_ever[0]
    selected_features = mask_to_indices(best_individual)
    if not selected_features:
        # Only reachable when every mask evaluated was empty; fitting on zero
        # columns would raise, so train on all columns instead.
        print("⚠️ GA selected no features; falling back to all features.")
        selected_features = list(range(num_features))

    print(f"⏱️ GA search took {training_time:.2f}s; "
          f"{len(selected_features)}/{num_features} features selected")

    # ✅ Final Model with Best Features
    final_model = make_model()
    final_model.fit(X_train[:, selected_features], y_train)
    # Record the column subset on the estimator so it is pickled with it;
    # without it the saved model cannot be applied to new data.
    final_model.selected_features_ = selected_features

    # ✅ Save Model
    model_path = os.path.join(SAVE_DIR, "GA-Optimized LR.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(final_model, f)

    print(f"✅ Model saved as: {model_path}")

    return final_model  # Return trained model for immediate use if needed

# ✅ Train & Save Model (uses the scaled splits prepared above)
ga_lr_model = train_ga_lr(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

print("\n✅ GA-Optimized Logistic Regression model is now trained and properly saved as a `.pkl` file!")

✅ Model saved as: saved_models/GA-Optimized LR.pkl

✅ GA-Optimized Logistic Regression model is now trained and properly saved as a `.pkl` file!


In [4]:
# NOTE(review): `results` is never assigned in the visible cells (the training
# cell only defines `ga_lr_model`), so evaluating it raises the NameError
# recorded below. Define it first or inspect `ga_lr_model` instead.
results

NameError: name 'results' is not defined