In [None]:
# Cell 1: Imports and Load Data for Classification

import pandas as pd
import numpy as np
import os
from pathlib import Path
import datetime as dt
from datetime import timezone, timedelta
import logging
import json # For saving feature_columns_list
import joblib # For saving the model and scaler

from sklearn.model_selection import train_test_split # We'll do a chronological split manually
from sklearn.linear_model import LogisticRegression # CHANGED: For classification
from sklearn.preprocessing import StandardScaler # For feature scaling
# CHANGED: Classification metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss, confusion_matrix 

# --- Logging Setup ---
# The logger name embeds a second-resolution timestamp taken when the cell runs.
_run_stamp = dt.datetime.now().strftime('%Y%m%d_%H%M%S')
logger_name = f"model_training_classifier_{_run_stamp}"
logger = logging.getLogger(logger_name)

# The level is INFO whether or not a handler still has to be attached.
logger.setLevel(logging.INFO)

# Attach the console handler only when this logger has none yet, so running
# the cell again for the same logger name cannot stack duplicate handlers.
if not logger.handlers:
    ch = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s.%(funcName)s:%(lineno)d - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

# --- Configuration ---
# The notebook is assumed to run from notebooks/train/, so `.parent` is
# notebooks/ and `.parent.parent` is the project root.
current_notebook_dir = Path.cwd()
# Primary location of the feature files: <project_root>/features.
# A fallback (<notebooks>/features) is tried below when this one is missing.
FEATURES_DIR = current_notebook_dir.parent.parent / "features"
# Feature files may still carry the 'v1' tag from the earlier regression task;
# adjust the pattern if newer files use a different name.
FEATURES_FILE_PATTERN = "kalshi_btc_features_target_v1_*.csv"

logger.info(f"Attempting to find feature files in: {FEATURES_DIR.resolve()}")

# Stays None unless a matching CSV is found; the loading step checks for that.
FEATURES_CSV_PATH = None
try:
    if not FEATURES_DIR.exists():
        # Try another common location before giving up, e.g. within notebooks/.
        alt_features_dir = current_notebook_dir.parent / "features"
        if alt_features_dir.exists():
            FEATURES_DIR = alt_features_dir
            logger.info(f"Primary FEATURES_DIR not found, using alternative: {FEATURES_DIR.resolve()}")
        else:
            raise FileNotFoundError(f"The directory {FEATURES_DIR.resolve()} (and {alt_features_dir.resolve()}) does not exist. Please check the path.")

    # Newest file first. Modification time is used rather than ctime because
    # ctime means "last metadata change" on Unix but "creation" on Windows,
    # which makes "most recent" platform-dependent.
    feature_files = sorted(
        FEATURES_DIR.glob(FEATURES_FILE_PATTERN),
        key=lambda candidate: candidate.stat().st_mtime,
        reverse=True,
    )
    if not feature_files:
        raise FileNotFoundError(f"No feature CSV files found in {FEATURES_DIR.resolve()} matching pattern '{FEATURES_FILE_PATTERN}'")
    FEATURES_CSV_PATH = feature_files[0]
    logger.info(f"Using features CSV: {FEATURES_CSV_PATH.resolve()}")
except FileNotFoundError as e:
    logger.critical(str(e))
except Exception as e:
    # Best effort: any other failure is logged and leaves FEATURES_CSV_PATH as None.
    logger.critical(f"Error finding features CSV: {e}")

# Output directory for trained classifier models
MODEL_OUTPUT_DIR = current_notebook_dir.parent / "trained_models/rf" # Keeps trained_models within notebooks/
MODEL_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
logger.info(f"Trained classifier models will be saved in: {MODEL_OUTPUT_DIR.resolve()}")


# --- Load the Features DataFrame ---
# Start from an empty frame so the `.empty` check at the end works on every path.
df_model_data = pd.DataFrame()

if not FEATURES_CSV_PATH:
    # File discovery failed earlier; there is nothing to read.
    logger.critical("FEATURES_CSV_PATH was not set (likely due to an error finding the file). Cannot load data.")
elif not FEATURES_CSV_PATH.exists():
    logger.critical(f"Features CSV file not found at the specified path: {FEATURES_CSV_PATH.resolve()}")
else:
    try:
        df_model_data = pd.read_csv(FEATURES_CSV_PATH)
        logger.info(f"Successfully loaded features data from: {FEATURES_CSV_PATH.resolve()}")
        logger.info(f"Shape of loaded data: {df_model_data.shape}")

        # Quick visual sanity check of the raw CSV contents.
        print("--- Data Head (Raw from CSV) ---")
        print(df_model_data.head())

        print("\n--- Data Info (Raw from CSV) ---")
        df_model_data.info()

        print("\n--- Data Description (Numerical, Raw from CSV) ---")
        print(df_model_data.describe().to_string())

    except Exception as e:
        logger.critical(f"Error loading features CSV {FEATURES_CSV_PATH.resolve()}: {e}")

if df_model_data.empty:
    logger.warning("DataFrame df_model_data is empty. Subsequent cells might fail.")

In [None]:
# Cell 2: Data Preprocessing, Target Transformation, Feature Selection, and Splitting

# Prepares the modelling data: chronological sort, binary target creation,
# NaN imputation, chronological train/test split, feature scaling, and
# persistence of the fitted scaler and feature-column list.
if df_model_data.empty:
    logger.error("df_model_data is empty. Cannot proceed with preprocessing and splitting. Please ensure Cell 1 ran correctly and loaded data.")
else:
    logger.info(f"Starting preprocessing for df_model_data with shape: {df_model_data.shape}")

    # --- 1. Ensure Chronological Order ---
    df_model_data.sort_values(by='decision_point_ts_utc', inplace=True)
    df_model_data.reset_index(drop=True, inplace=True)
    logger.info("Data sorted by 'decision_point_ts_utc'.")

    # --- 2. Define NEW Target Variable for Classification ---
    # Original target: 'TARGET_btc_diff_from_strike'
    # New target: 1 if the difference is strictly positive (YES wins),
    # 0 otherwise (NO wins or ties).
    original_target_col = 'TARGET_btc_diff_from_strike'
    classification_target_col = 'TARGET_market_resolves_yes' # New binary target

    if original_target_col not in df_model_data.columns:
        logger.critical(f"Original target column '{original_target_col}' not found in DataFrame. Cannot create classification target.")
        raise ValueError(f"Missing required column: {original_target_col}")

    # Rows with an unknown outcome must go BEFORE binarising: `NaN > 0` is
    # False, so they would otherwise be silently labelled as class 0.
    missing_target_count = int(df_model_data[original_target_col].isna().sum())
    if missing_target_count:
        df_model_data.dropna(subset=[original_target_col], inplace=True)
        df_model_data.reset_index(drop=True, inplace=True)
        logger.warning(f"Dropped {missing_target_count} rows with missing '{original_target_col}' before creating the binary target.")

    df_model_data[classification_target_col] = (df_model_data[original_target_col] > 0).astype(int)
    logger.info(f"Created binary classification target '{classification_target_col}'.")
    logger.info(f"Value counts for '{classification_target_col}':\n{df_model_data[classification_target_col].value_counts(normalize=True)}")


    # --- 3. Handle Missing Values (NaNs) in Features ---
    identifier_cols = ['kalshi_market_ticker', 'decision_point_ts_utc', 'kalshi_strike_price']
    # Feature columns: everything except identifiers, the original regression
    # target, and the new classification target.
    non_feature_cols = set(identifier_cols) | {original_target_col, classification_target_col}
    feature_columns = [col for col in df_model_data.columns if col not in non_feature_cols]

    logger.info(f"Potential feature columns ({len(feature_columns)}): {feature_columns[:10]}...") # Log first 10

    nan_summary = df_model_data[feature_columns].isnull().sum()
    nan_summary = nan_summary[nan_summary > 0].sort_values(ascending=False)
    if not nan_summary.empty:
        logger.warning(f"NaN values found in feature columns:\n{nan_summary}")

        # --- Imputation Strategy ---
        # Columns are grouped by name substring; each group gets its own fill value.
        cols_to_fill_zero = [
            col for col in feature_columns
            if 'kalshi_mid_chg' in col or 'btc_mom' in col
        ]
        cols_to_fill_median = [
            col for col in feature_columns
            if 'btc_vol' in col or 'btc_sma' in col or 'btc_ema' in col
        ]
        cols_to_fill_rsi_neutral = [col for col in feature_columns if 'btc_rsi' in col]

        # Missing quotes are filled with 0 for the bid and 100 for the ask.
        has_bid = 'kalshi_yes_bid' in feature_columns
        has_ask = 'kalshi_yes_ask' in feature_columns
        if has_bid:
            df_model_data['kalshi_yes_bid'] = df_model_data['kalshi_yes_bid'].fillna(0)
            logger.info("Filled NaNs in 'kalshi_yes_bid' with 0.")
        if has_ask:
            df_model_data['kalshi_yes_ask'] = df_model_data['kalshi_yes_ask'].fillna(100)
            logger.info("Filled NaNs in 'kalshi_yes_ask' with 100.")

        # Derived quote features are recomputed so they agree with the filled bid/ask.
        if has_bid and has_ask:
            if 'kalshi_spread' in feature_columns:
                df_model_data['kalshi_spread'] = df_model_data['kalshi_yes_ask'] - df_model_data['kalshi_yes_bid']
                logger.info("Recalculated 'kalshi_spread' after filling bid/ask.")
            if 'kalshi_mid_price' in feature_columns:
                df_model_data['kalshi_mid_price'] = (df_model_data['kalshi_yes_bid'] + df_model_data['kalshi_yes_ask']) / 2
                logger.info("Recalculated 'kalshi_mid_price' after filling bid/ask.")

        for col in cols_to_fill_zero:
            df_model_data[col] = df_model_data[col].fillna(0)
            logger.info(f"Filled NaNs in '{col}' with 0.")

        # NOTE(review): the median is taken over ALL rows, including rows that
        # later land in the test split, so test-period information leaks into
        # the imputed training values. Consider a training-only median.
        for col in cols_to_fill_median:
            median_val = df_model_data[col].median()
            df_model_data[col] = df_model_data[col].fillna(median_val)
            logger.info(f"Filled NaNs in '{col}' with its median ({median_val:.4f}).")

        for col in cols_to_fill_rsi_neutral:
            df_model_data[col] = df_model_data[col].fillna(50)
            logger.info(f"Filled NaNs in '{col}' with 50.")

        # Any feature NaN not covered by a rule above removes the whole row.
        # The index is intentionally not reset here, so row labels stay
        # aligned between df_model_data and the X / y copies taken below.
        original_row_count = len(df_model_data)
        df_model_data.dropna(subset=feature_columns, inplace=True)
        logger.info(f"Dropped {original_row_count - len(df_model_data)} rows due to remaining NaNs in features or target after imputation attempts.")

        final_nan_summary = df_model_data[feature_columns].isnull().sum()
        final_nan_summary = final_nan_summary[final_nan_summary > 0]
        if not final_nan_summary.empty:
            logger.error(f"Still have NaNs after processing feature columns! Columns:\n{final_nan_summary}")
        else:
            logger.info("Successfully handled NaNs in feature columns.")
    else:
        logger.info("No NaNs found in the selected feature columns.")

    # --- 4/5. Define X / y and split chronologically ---
    split_ratio = 0.8
    split_index = int(len(df_model_data) * split_ratio)

    # split_index >= 1 means at least two rows remain, which guarantees a
    # non-empty training set AND a non-empty test set.
    if split_index >= 1:
        X = df_model_data[feature_columns].copy()
        y = df_model_data[classification_target_col].copy() # Binary target
        logger.info(f"Defined X (features) with shape: {X.shape}")
        logger.info(f"Defined y (binary target) with shape: {y.shape}")
        logger.info(f"Target y value counts:\n{y.value_counts(normalize=True)}")

        # The data is sorted by time, so the first 80% is the training period
        # and the last 20% is the test period.
        X_train = X.iloc[:split_index]
        y_train = y.iloc[:split_index]
        X_test = X.iloc[split_index:]
        y_test = y.iloc[split_index:]

        logger.info(f"Data split chronologically:")
        logger.info(f"  X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
        logger.info(f"  X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

        # NOTE(review): assumes 'decision_point_ts_utc' holds numeric epoch
        # seconds (required by datetime.fromtimestamp) - TODO confirm.
        train_start_ts = df_model_data['decision_point_ts_utc'].iloc[0]
        train_end_ts = df_model_data['decision_point_ts_utc'].iloc[split_index - 1]
        test_start_ts = df_model_data['decision_point_ts_utc'].iloc[split_index]
        test_end_ts = df_model_data['decision_point_ts_utc'].iloc[-1]

        logger.info(f"  Training data from: {dt.datetime.fromtimestamp(train_start_ts, tz=timezone.utc).isoformat()} to {dt.datetime.fromtimestamp(train_end_ts, tz=timezone.utc).isoformat()}")
        logger.info(f"  Test data from:     {dt.datetime.fromtimestamp(test_start_ts, tz=timezone.utc).isoformat()} to {dt.datetime.fromtimestamp(test_end_ts, tz=timezone.utc).isoformat()}")

        # --- 6. Feature Scaling ---
        # The scaler is fitted on the training period only and then applied to
        # the test period.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Re-wrap as DataFrames so column names and row labels survive scaling.
        X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
        X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

        logger.info("Features scaled using StandardScaler.")
        print("\nSample of scaled training features (X_train_scaled_df head):")
        print(X_train_scaled_df.head())

        # Save the scaler (filename indicates it's for classifier v1)
        scaler_path = MODEL_OUTPUT_DIR / "feature_scaler_classifier_v1.joblib"
        joblib.dump(scaler, scaler_path)
        logger.info(f"Scaler saved to: {scaler_path}")

        # Save the ordered list of feature columns alongside the scaler so the
        # same columns can be selected again at prediction time.
        feature_columns_list_path = MODEL_OUTPUT_DIR / "feature_columns_classifier_v1.json"
        with open(feature_columns_list_path, 'w') as f:
            json.dump(feature_columns, f)
        logger.info(f"List of feature columns saved to: {feature_columns_list_path}")

    else:
        if df_model_data.empty:
            logger.error("df_model_data is empty after NaN handling. Cannot proceed to define X, y, or split.")
        else:
            logger.error(f"Only {len(df_model_data)} row(s) remain after NaN handling; at least 2 are needed for a chronological train/test split.")
        # Distinct empty placeholders (not one shared object) so the `.empty`
        # guards in the training cell work and no name aliases another.
        X, X_train, X_test, X_train_scaled_df, X_test_scaled_df = (pd.DataFrame() for _ in range(5))
        y, y_train, y_test = (pd.Series(dtype="int64") for _ in range(3))
        scaler = None

In [None]:
# Cell 3: Classification Model Training and Evaluation

# NEW: Import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
# NEW: Consider importing calibration if you want to try it later
# from sklearn.calibration import CalibratedClassifierCV

# Trains a RandomForestClassifier on the scaled training split, evaluates it on
# the held-out (later) split, logs diagnostics, and saves the model together
# with its hyperparameters.
if 'X_train_scaled_df' not in globals() or X_train_scaled_df.empty:
    logger.error("Scaled training data (X_train_scaled_df) not found or is empty. Please ensure Cell 2 ran successfully.")
else:
    logger.info("--- Starting Classification Model Training (Random Forest) ---")

    # --- 1. Initialize and Train RandomForestClassifier Model ---
    # These are starting values, not tuned ones; tune them (e.g. with
    # GridSearchCV or RandomizedSearchCV) before relying on the numbers below.
    classifier_model = RandomForestClassifier(
        n_estimators=200,         # Number of trees in the forest
        max_depth=15,             # Cap tree depth to limit overfitting
        min_samples_split=10,     # Minimum samples required to split an internal node
        min_samples_leaf=5,       # Minimum samples required at a leaf
        class_weight='balanced_subsample', # Class weights recomputed from each tree's bootstrap sample
        random_state=42,          # Reproducibility
        n_jobs=-1,                # Use all available cores
        oob_score=True            # Estimate generalization accuracy from out-of-bag samples
    )

    model_name = "RandomForest_classifier_v1" # For saving artifacts
    logger.info(f"Training {model_name} model on {X_train_scaled_df.shape[0]} samples...")
    logger.info(f"Model parameters: {classifier_model.get_params()}")

    if 'y_train' not in globals() or y_train.empty:
        logger.error("y_train (binary target) is not available. Cannot train model.")
    else:
        try:
            classifier_model.fit(X_train_scaled_df, y_train)
            logger.info(f"{model_name} model training complete.")
            if hasattr(classifier_model, 'oob_score_'):
                logger.info(f"  Out-of-Bag (OOB) Score: {classifier_model.oob_score_:.4f}")

            # --- 2. Make Predictions on the Test Set ---
            logger.info(f"Making predictions on the test set ({X_test_scaled_df.shape[0]} samples)...")
            y_pred_test_class = classifier_model.predict(X_test_scaled_df)
            # Column 1 of predict_proba is P(class 1), i.e. P(market resolves YES).
            y_pred_test_proba = classifier_model.predict_proba(X_test_scaled_df)[:, 1]

            # --- 3. Evaluate Model Performance (Classification Metrics) ---
            if 'y_test' not in globals() or y_test.empty:
                logger.error("y_test (binary target) is not available. Cannot evaluate model.")
            else:
                accuracy = accuracy_score(y_test, y_pred_test_class)
                precision = precision_score(y_test, y_pred_test_class, zero_division=0)
                recall = recall_score(y_test, y_pred_test_class, zero_division=0)
                f1 = f1_score(y_test, y_pred_test_class, zero_division=0)
                try:
                    roc_auc = roc_auc_score(y_test, y_pred_test_proba)
                except ValueError as e:
                    # ROC AUC is undefined when the test window has a single class.
                    logger.warning(f"Could not calculate ROC AUC: {e}")
                    roc_auc = np.nan
                # Explicit labels keep log_loss defined when the test window
                # contains only one class; without them it raises and the rest
                # of the evaluation (including saving the model) is skipped.
                logloss = log_loss(y_test, y_pred_test_proba, labels=[0, 1])

                logger.info(f"\n--- {model_name} Evaluation Metrics (Test Set) ---")
                logger.info(f"  Accuracy:          {accuracy:.4f}")
                logger.info(f"  Precision:         {precision:.4f}")
                logger.info(f"  Recall (TPR):      {recall:.4f}")
                logger.info(f"  F1-Score:          {f1:.4f}")
                logger.info(f"  ROC AUC:           {roc_auc:.4f}")
                logger.info(f"  Log Loss:          {logloss:.4f}")

                logger.info(f"\n--- Confusion Matrix (Test Set) - {model_name} ---")
                # Explicit labels force a 2x2 matrix even if a class is absent,
                # so the four-way unpack below is always valid.
                cm = confusion_matrix(y_test, y_pred_test_class, labels=[0, 1])
                logger.info(f"\n{cm}")
                tn, fp, fn, tp = cm.ravel()
                logger.info(f"  True Negatives (TN):  {tn}")
                logger.info(f"  False Positives (FP): {fp} (Type I Error)")
                logger.info(f"  False Negatives (FN): {fn} (Type II Error)")
                logger.info(f"  True Positives (TP):  {tp}")

                # Per-row results, indexed like y_test so identifier columns can
                # be pulled from df_model_data by label.
                df_results_class = pd.DataFrame({
                    'actual_target_resolves_yes': y_test,
                    'predicted_class_resolves_yes': y_pred_test_class,
                    'predicted_proba_resolves_yes': y_pred_test_proba
                })
                if 'original_target_col' in globals() and original_target_col in df_model_data.columns:
                    df_results_class['original_target_diff'] = df_model_data.loc[y_test.index, original_target_col].values
                if 'kalshi_market_ticker' in df_model_data.columns:
                    df_results_class['kalshi_market_ticker'] = df_model_data.loc[y_test.index, 'kalshi_market_ticker'].values
                if 'decision_point_ts_utc' in df_model_data.columns:
                    df_results_class['decision_point_ts_utc'] = df_model_data.loc[y_test.index, 'decision_point_ts_utc'].values

                print(f"\n--- Sample of Test Set Predictions vs Actuals ({model_name}) ---")
                print(df_results_class.head(10).to_string())

                # --- Detailed Analysis by Actual Outcome (Test Set) ---
                logger.info(f"\n\n--- Detailed Analysis by Actual Outcome (Test Set) - {model_name} ---")

                # --- Analysis for Actual 'NO' markets (target = 0) ---
                df_actual_no = df_results_class[df_results_class['actual_target_resolves_yes'] == 0].copy()
                if not df_actual_no.empty:
                    logger.info(f"\n  --- For Actual 'NO' Markets (Total in Test Set: {len(df_actual_no)}) ---")
                    fp_count_manual = len(df_actual_no[df_actual_no['predicted_class_resolves_yes'] == 1])
                    tn_count_manual = len(df_actual_no[df_actual_no['predicted_class_resolves_yes'] == 0])

                    # The frame is non-empty here, so the divisions are safe.
                    logger.info(f"    Predicted as YES (False Positives): {fp_count_manual} ({fp_count_manual/len(df_actual_no):.2%})")
                    logger.info(f"    Predicted as NO (True Negatives):   {tn_count_manual} ({tn_count_manual/len(df_actual_no):.2%})")

                    logger.info(f"    Distribution of P(model predicts YES) when actual is NO:")
                    logger.info(f"{df_actual_no['predicted_proba_resolves_yes'].describe().to_string()}")

                    # Confidently wrong: predicted YES with probability above the threshold.
                    high_fp_threshold = 0.7
                    df_high_prob_fp = df_actual_no[
                        (df_actual_no['predicted_class_resolves_yes'] == 1) &
                        (df_actual_no['predicted_proba_resolves_yes'] > high_fp_threshold)
                    ].sort_values(by='predicted_proba_resolves_yes', ascending=False)

                    if not df_high_prob_fp.empty:
                        logger.warning(f"    Examples of high-confidence False Positives (Actual NO, P(model YES) > {high_fp_threshold}):")
                        with pd.option_context('display.max_rows', 10, 'display.max_columns', None, 'display.width', 1000):
                            logger.warning(f"\n{df_high_prob_fp.head(10).to_string()}")
                    else:
                        logger.info(f"    No False Positives found with P(model YES) > {high_fp_threshold} when actual was NO.")
                else:
                    logger.info("\n  No 'Actual NO' markets found in the test set for this detailed analysis.")

                # --- Analysis for Actual 'YES' markets (target = 1) ---
                df_actual_yes = df_results_class[df_results_class['actual_target_resolves_yes'] == 1].copy()
                if not df_actual_yes.empty:
                    logger.info(f"\n  --- For Actual 'YES' Markets (Total in Test Set: {len(df_actual_yes)}) ---")
                    tp_count_manual = len(df_actual_yes[df_actual_yes['predicted_class_resolves_yes'] == 1])
                    fn_count_manual = len(df_actual_yes[df_actual_yes['predicted_class_resolves_yes'] == 0])

                    # The frame is non-empty here, so the divisions are safe.
                    logger.info(f"    Predicted as YES (True Positives):  {tp_count_manual} ({tp_count_manual/len(df_actual_yes):.2%})")
                    logger.info(f"    Predicted as NO (False Negatives):  {fn_count_manual} ({fn_count_manual/len(df_actual_yes):.2%})")

                    logger.info(f"    Distribution of P(model predicts YES) when actual is YES:")
                    logger.info(f"{df_actual_yes['predicted_proba_resolves_yes'].describe().to_string()}")

                    # Confidently wrong: predicted NO with probability below the threshold.
                    low_fn_threshold = 0.3
                    df_low_prob_fn = df_actual_yes[
                        (df_actual_yes['predicted_class_resolves_yes'] == 0) &
                        (df_actual_yes['predicted_proba_resolves_yes'] < low_fn_threshold)
                    ].sort_values(by='predicted_proba_resolves_yes', ascending=True)

                    if not df_low_prob_fn.empty:
                        logger.warning(f"    Examples of high-confidence False Negatives (Actual YES, P(model YES) < {low_fn_threshold}):")
                        with pd.option_context('display.max_rows', 10, 'display.max_columns', None, 'display.width', 1000):
                            logger.warning(f"\n{df_low_prob_fn.head(10).to_string()}")
                    else:
                        logger.info(f"    No False Negatives found with P(model YES) < {low_fn_threshold} when actual was YES.")
                else:
                    logger.info("\n  No 'Actual YES' markets found in the test set for this detailed analysis.")


                # --- 4. Inspect Model Feature Importances (for RandomForest) ---
                logger.info(f"\n\n--- {model_name} Feature Importances ---")
                if hasattr(classifier_model, 'feature_importances_'):
                    # Names come from the frame the model was fitted on, so they
                    # always match feature_importances_ in length and order.
                    importances = pd.DataFrame({
                        'feature': list(X_train_scaled_df.columns),
                        'importance': classifier_model.feature_importances_
                    })
                    importances.sort_values(by='importance', ascending=False, inplace=True)

                    print("\nTop Feature Importances:")
                    print(importances.head(20).to_string())
                else:
                    logger.warning(f"Could not retrieve feature importances from the {model_name} model.")

                # --- 5. Save the Trained Model ---
                # The .joblib file contains the fitted estimator itself.
                model_path = MODEL_OUTPUT_DIR / f"{model_name}.joblib"
                joblib.dump(classifier_model, model_path)
                logger.info(f"Trained {model_name} model saved to: {model_path}")

                # Hyperparameters are stored as JSON next to the model.
                model_hyperparams = classifier_model.get_params()
                params_path = MODEL_OUTPUT_DIR / f"{model_name}_hyperparams.json"
                with open(params_path, 'w') as f:
                    # Convert numpy arrays to lists so json can serialize them.
                    serializable_params = {k: (v.tolist() if isinstance(v, np.ndarray) else v) for k, v in model_hyperparams.items()}
                    json.dump(serializable_params, f, indent=4)
                logger.info(f"{model_name} hyperparameters saved to: {params_path}")

                # The scaler and feature-column list (feature_scaler_classifier_v1.joblib,
                # feature_columns_classifier_v1.json) are not written by this cell.

        except Exception as e:
            # Cell-level boundary: log the full traceback instead of crashing the notebook.
            logger.critical(f"An error occurred during {model_name} model training or evaluation: {e}", exc_info=True)
            logger.info(f"{model_name} model training might have partially completed or failed during evaluation.")