In [5]:
# Cell 1: Imports and Load Data for Classification

import pandas as pd
import numpy as np
import os
from pathlib import Path
import datetime as dt
from datetime import timezone, timedelta
import logging
import json # For saving feature_columns_list
import joblib # For saving the model and scaler

from sklearn.model_selection import train_test_split # We'll do a chronological split manually
from sklearn.linear_model import LogisticRegression # CHANGED: For classification
from sklearn.preprocessing import StandardScaler # For feature scaling
# CHANGED: Classification metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss, confusion_matrix 

# --- Logging Setup ---
logger_name = f"model_training_classifier_{dt.datetime.now().strftime('%Y%m%d_%H%M%S')}" # Updated logger name
logger = logging.getLogger(logger_name)
if not logger.handlers: # Avoid adding handlers if re-running cell
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s.%(funcName)s:%(lineno)d - %(message)s')
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
else:
    logger.setLevel(logging.INFO)

# --- Configuration ---
current_notebook_dir = Path.cwd() # Assumes notebook is in notebooks/train/
# Adjust FEATURES_DIR if your features are not in ../features relative to this notebook's parent
# For example, if train.ipynb is in ./notebooks/train and features are in ./notebooks/features:
FEATURES_DIR = current_notebook_dir.parent.parent / "features" # Assuming features are in project_root/features
# If features are in ./notebooks/features:
# FEATURES_DIR = current_notebook_dir.parent / "features" 

logger.info(f"Attempting to find feature files in: {FEATURES_DIR.resolve()}")

try:
    if not FEATURES_DIR.exists():
        # Let's try another common location if the above doesn't exist, e.g. within notebooks/
        alt_features_dir = current_notebook_dir.parent / "features"
        if alt_features_dir.exists():
            FEATURES_DIR = alt_features_dir
            logger.info(f"Primary FEATURES_DIR not found, using alternative: {FEATURES_DIR.resolve()}")
        else:
            raise FileNotFoundError(f"The directory {FEATURES_DIR.resolve()} (and {alt_features_dir.resolve()}) does not exist. Please check the path.")

    # Assuming feature files might still use the 'v1' from previous regression task,
    # or you might have new ones. Adjust pattern if needed.
    feature_files = sorted(FEATURES_DIR.glob("kalshi_btc_features_target_v1_*.csv"), key=os.path.getctime, reverse=True)
    if not feature_files:
        raise FileNotFoundError(f"No feature CSV files found in {FEATURES_DIR.resolve()} matching pattern 'kalshi_btc_features_target_v1_*.csv'")
    FEATURES_CSV_PATH = feature_files[0]
    logger.info(f"Using features CSV: {FEATURES_CSV_PATH.resolve()}")
except FileNotFoundError as e:
    logger.critical(str(e))
    FEATURES_CSV_PATH = None
except Exception as e:
    logger.critical(f"Error finding features CSV: {e}")
    FEATURES_CSV_PATH = None

# Output directory for trained classifier models
MODEL_OUTPUT_DIR = current_notebook_dir.parent / "trained_models" # Keeps trained_models within notebooks/
MODEL_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
logger.info(f"Trained classifier models will be saved in: {MODEL_OUTPUT_DIR.resolve()}")


# --- Load the Features DataFrame ---
df_model_data = pd.DataFrame()

if FEATURES_CSV_PATH and FEATURES_CSV_PATH.exists():
    try:
        df_model_data = pd.read_csv(FEATURES_CSV_PATH)
        logger.info(f"Successfully loaded features data from: {FEATURES_CSV_PATH.resolve()}")
        logger.info(f"Shape of loaded data: {df_model_data.shape}")
        
        print("--- Data Head (Raw from CSV) ---")
        print(df_model_data.head())
        print("\n--- Data Info (Raw from CSV) ---")
        df_model_data.info()
        print("\n--- Data Description (Numerical, Raw from CSV) ---")
        print(df_model_data.describe().to_string())
        
    except Exception as e:
        logger.critical(f"Error loading features CSV {FEATURES_CSV_PATH.resolve()}: {e}")
else:
    if FEATURES_CSV_PATH:
         logger.critical(f"Features CSV file not found at the specified path: {FEATURES_CSV_PATH.resolve()}")
    else:
         logger.critical("FEATURES_CSV_PATH was not set (likely due to an error finding the file). Cannot load data.")

if df_model_data.empty:
    logger.warning("DataFrame df_model_data is empty. Subsequent cells might fail.")

2025-05-20 22:47:13,375 - INFO - model_training_classifier_20250520_224713.<module>:39 - Attempting to find feature files in: /Users/omarabul-hassan/Desktop/projects/kalshi/features
2025-05-20 22:47:13,376 - INFO - model_training_classifier_20250520_224713.<module>:47 - Primary FEATURES_DIR not found, using alternative: /Users/omarabul-hassan/Desktop/projects/kalshi/notebooks/features
2025-05-20 22:47:13,377 - INFO - model_training_classifier_20250520_224713.<module>:57 - Using features CSV: /Users/omarabul-hassan/Desktop/projects/kalshi/notebooks/features/kalshi_btc_features_target_v1_20250520_224529.csv
2025-05-20 22:47:13,378 - INFO - model_training_classifier_20250520_224713.<module>:68 - Trained classifier models will be saved in: /Users/omarabul-hassan/Desktop/projects/kalshi/notebooks/trained_models
2025-05-20 22:47:15,329 - INFO - model_training_classifier_20250520_224713.<module>:77 - Successfully loaded features data from: /Users/omarabul-hassan/Desktop/projects/kalshi/notebo

--- Data Head (Raw from CSV) ---
          kalshi_market_ticker  decision_point_ts_utc  kalshi_strike_price  \
0  KXBTCD-25MAY1522-T106249.99             1747357200            106249.99   
1  KXBTCD-25MAY1522-T106249.99             1747357260            106249.99   
2  KXBTCD-25MAY1522-T106249.99             1747357320            106249.99   
3  KXBTCD-25MAY1522-T106249.99             1747357380            106249.99   
4  KXBTCD-25MAY1522-T106249.99             1747357440            106249.99   

   btc_price_t_minus_1  btc_mom_5m  btc_mom_10m  btc_mom_15m  btc_mom_30m  \
0            103764.81       73.29       -69.79      -182.17        22.80   
1            103709.10       17.67      -111.60      -246.90       -68.90   
2            103785.66       83.21         7.89      -114.10      -109.26   
3            103691.25      -34.40         5.91      -227.89      -285.74   
4            103629.36     -110.19       -98.42      -303.61      -299.95   

   btc_vol_15m  btc_sma_10m  ...  T

In [6]:
# Cell 2: Data Preprocessing, Target Transformation, Feature Selection, and Splitting

if df_model_data.empty:
    logger.error("df_model_data is empty. Cannot proceed with preprocessing and splitting. Please ensure Cell 1 ran correctly and loaded data.")
else:
    logger.info(f"Starting preprocessing for df_model_data with shape: {df_model_data.shape}")

    # --- 1. Ensure Chronological Order ---
    df_model_data.sort_values(by='decision_point_ts_utc', inplace=True)
    df_model_data.reset_index(drop=True, inplace=True)
    logger.info("Data sorted by 'decision_point_ts_utc'.")

    # --- 2. Define NEW Target Variable for Classification ---
    # Original target: 'TARGET_btc_diff_from_strike'
    # New target: 1 if (BTC price at resolution > strike price), 0 otherwise.
    # This means the Kalshi market for "YES" would win.
    original_target_col = 'TARGET_btc_diff_from_strike'
    classification_target_col = 'TARGET_market_resolves_yes' # New binary target

    if original_target_col not in df_model_data.columns:
        logger.critical(f"Original target column '{original_target_col}' not found in DataFrame. Cannot create classification target.")
        # Stop execution or handle error appropriately
        raise ValueError(f"Missing required column: {original_target_col}")
    
    # Create the binary target: 1 if positive difference (YES wins), 0 if non-positive (NO wins or ties)
    df_model_data[classification_target_col] = (df_model_data[original_target_col] > 0).astype(int)
    logger.info(f"Created binary classification target '{classification_target_col}'.")
    logger.info(f"Value counts for '{classification_target_col}':\n{df_model_data[classification_target_col].value_counts(normalize=True)}")


    # --- 3. Handle Missing Values (NaNs) in Features ---
    identifier_cols = ['kalshi_market_ticker', 'decision_point_ts_utc', 'kalshi_strike_price']
    # Feature columns: exclude identifiers, original regression target, and new classification target
    feature_columns = [
        col for col in df_model_data.columns 
        if col not in identifier_cols + [original_target_col, classification_target_col]
    ]
    
    logger.info(f"Potential feature columns ({len(feature_columns)}): {feature_columns[:10]}...") # Log first 10

    nan_summary = df_model_data[feature_columns].isnull().sum()
    nan_summary = nan_summary[nan_summary > 0].sort_values(ascending=False)
    if not nan_summary.empty:
        logger.warning(f"NaN values found in feature columns:\n{nan_summary}")
        
        # --- Imputation Strategy (Same as before, review if needed for classification) ---
        cols_to_fill_zero = [
            col for col in feature_columns if 'kalshi_mid_chg' in col or \
            'btc_mom' in col 
        ]
        cols_to_fill_median = [ 
            col for col in feature_columns if 'btc_vol' in col or \
            'btc_sma' in col or 'btc_ema' in col 
        ]
        cols_to_fill_rsi_neutral = [col for col in feature_columns if 'btc_rsi' in col]
        
        if 'kalshi_yes_bid' in df_model_data.columns and 'kalshi_yes_bid' in feature_columns: # Check if it's a feature
            df_model_data['kalshi_yes_bid'] = df_model_data['kalshi_yes_bid'].fillna(0)
            logger.info("Filled NaNs in 'kalshi_yes_bid' with 0.")
        if 'kalshi_yes_ask' in df_model_data.columns and 'kalshi_yes_ask' in feature_columns:
            df_model_data['kalshi_yes_ask'] = df_model_data['kalshi_yes_ask'].fillna(100)
            logger.info("Filled NaNs in 'kalshi_yes_ask' with 100.")
        
        if 'kalshi_yes_bid' in feature_columns and 'kalshi_yes_ask' in feature_columns:
            if 'kalshi_spread' in feature_columns:
                df_model_data['kalshi_spread'] = df_model_data['kalshi_yes_ask'] - df_model_data['kalshi_yes_bid']
                logger.info("Recalculated 'kalshi_spread' after filling bid/ask.")
            if 'kalshi_mid_price' in feature_columns:
                 df_model_data['kalshi_mid_price'] = (df_model_data['kalshi_yes_bid'] + df_model_data['kalshi_yes_ask']) / 2
                 logger.info("Recalculated 'kalshi_mid_price' after filling bid/ask.")
        
        for col in cols_to_fill_zero:
            if col in df_model_data.columns and col in feature_columns:
                df_model_data[col] = df_model_data[col].fillna(0)
                logger.info(f"Filled NaNs in '{col}' with 0.")

        for col in cols_to_fill_median:
            if col in df_model_data.columns and col in feature_columns:
                median_val = df_model_data[col].median()
                df_model_data[col] = df_model_data[col].fillna(median_val)
                logger.info(f"Filled NaNs in '{col}' with its median ({median_val:.4f}).")

        for col in cols_to_fill_rsi_neutral:
            if col in df_model_data.columns and col in feature_columns:
                df_model_data[col] = df_model_data[col].fillna(50)
                logger.info(f"Filled NaNs in '{col}' with 50.")

        original_row_count = len(df_model_data)
        df_model_data.dropna(subset=feature_columns, inplace=True) # Drop rows with NaNs in any *feature* column
        # Also drop rows where the classification target might be NaN (though astype(int) should handle it from boolean)
        df_model_data.dropna(subset=[classification_target_col], inplace=True) 
        logger.info(f"Dropped {original_row_count - len(df_model_data)} rows due to remaining NaNs in features or target after imputation attempts.")
        
        final_nan_summary = df_model_data[feature_columns].isnull().sum()
        final_nan_summary = final_nan_summary[final_nan_summary > 0]
        if not final_nan_summary.empty:
            logger.error(f"Still have NaNs after processing feature columns! Columns:\n{final_nan_summary}")
        else:
            logger.info("Successfully handled NaNs in feature columns.")
    else:
        logger.info("No NaNs found in the selected feature columns.")
        
    # --- 4. Define Features (X) and New Target (y) ---
    if not df_model_data.empty:
        X = df_model_data[feature_columns].copy()
        y = df_model_data[classification_target_col].copy() # Use the new binary target
        logger.info(f"Defined X (features) with shape: {X.shape}")
        logger.info(f"Defined y (binary target) with shape: {y.shape}")
        logger.info(f"Target y value counts:\n{y.value_counts(normalize=True)}")


        # --- 5. Split Data (Chronological) ---
        split_ratio = 0.8
        split_index = int(len(X) * split_ratio)

        X_train = X.iloc[:split_index]
        y_train = y.iloc[:split_index] # y_train is now binary
        X_test = X.iloc[split_index:]
        y_test = y.iloc[split_index:]   # y_test is now binary

        logger.info(f"Data split chronologically:")
        logger.info(f"  X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
        logger.info(f"  X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
        
        train_start_ts = df_model_data['decision_point_ts_utc'].iloc[0]
        train_end_ts = df_model_data['decision_point_ts_utc'].iloc[split_index - 1]
        test_start_ts = df_model_data['decision_point_ts_utc'].iloc[split_index]
        test_end_ts = df_model_data['decision_point_ts_utc'].iloc[-1]

        logger.info(f"  Training data from: {dt.datetime.fromtimestamp(train_start_ts, tz=timezone.utc).isoformat()} to {dt.datetime.fromtimestamp(train_end_ts, tz=timezone.utc).isoformat()}")
        logger.info(f"  Test data from:     {dt.datetime.fromtimestamp(test_start_ts, tz=timezone.utc).isoformat()} to {dt.datetime.fromtimestamp(test_end_ts, tz=timezone.utc).isoformat()}")
        
        # --- 6. Feature Scaling ---
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
        X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

        logger.info("Features scaled using StandardScaler.")
        print("\nSample of scaled training features (X_train_scaled_df head):")
        print(X_train_scaled_df.head())
        
        # Save the scaler (filename indicates it's for classifier v1)
        scaler_path = MODEL_OUTPUT_DIR / "feature_scaler_classifier_v1.joblib"
        joblib.dump(scaler, scaler_path)
        logger.info(f"Scaler saved to: {scaler_path}")
        
        # Save the list of feature columns (filename indicates it's for classifier v1)
        # This list *should* be the same as for regression if using same features,
        # but good to save it associated with this model run.
        feature_columns_list_path = MODEL_OUTPUT_DIR / "feature_columns_classifier_v1.json"
        with open(feature_columns_list_path, 'w') as f:
            json.dump(feature_columns, f) # feature_columns is already a list here
        logger.info(f"List of feature columns saved to: {feature_columns_list_path}")

    else:
        logger.error("df_model_data is empty after NaN handling. Cannot proceed to define X, y, or split.")
        X, y, X_train, y_train, X_test, y_test, X_train_scaled_df, X_test_scaled_df = [pd.DataFrame()]*8 
        scaler = None

2025-05-20 22:47:26,610 - INFO - model_training_classifier_20250520_224713.<module>:6 - Starting preprocessing for df_model_data with shape: (1294800, 29)
2025-05-20 22:47:26,698 - INFO - model_training_classifier_20250520_224713.<module>:11 - Data sorted by 'decision_point_ts_utc'.
2025-05-20 22:47:26,701 - INFO - model_training_classifier_20250520_224713.<module>:27 - Created binary classification target 'TARGET_market_resolves_yes'.
2025-05-20 22:47:26,706 - INFO - model_training_classifier_20250520_224713.<module>:28 - Value counts for 'TARGET_market_resolves_yes':
TARGET_market_resolves_yes
1    0.518072
0    0.481928
Name: proportion, dtype: float64
2025-05-20 22:47:26,706 - INFO - model_training_classifier_20250520_224713.<module>:39 - Potential feature columns (25): ['btc_price_t_minus_1', 'btc_mom_5m', 'btc_mom_10m', 'btc_mom_15m', 'btc_mom_30m', 'btc_vol_15m', 'btc_sma_10m', 'btc_sma_30m', 'btc_ema_12m', 'btc_ema_26m']...
kalshi_mid_chg_5m                 500715
kalshi_mid_ch


Sample of scaled training features (X_train_scaled_df head):
    btc_price_t_minus_1  btc_mom_5m  btc_mom_10m  btc_mom_15m  btc_mom_30m  \
18            -1.880955   -0.267215    -0.671256    -0.499148      0.23473   
19            -1.880955   -0.267215    -0.671256    -0.499148      0.23473   
20            -1.880955   -0.267215    -0.671256    -0.499148      0.23473   
21            -1.880955   -0.267215    -0.671256    -0.499148      0.23473   
22            -1.880955   -0.267215    -0.671256    -0.499148      0.23473   

    btc_vol_15m  btc_sma_10m  btc_sma_30m  btc_ema_12m  btc_ema_26m  ...  \
18    -0.365124     -1.86631    -1.870939    -1.867321    -1.868163  ...   
19    -0.365124     -1.86631    -1.870939    -1.867321    -1.868163  ...   
20    -0.365124     -1.86631    -1.870939    -1.867321    -1.868163  ...   
21    -0.365124     -1.86631    -1.870939    -1.867321    -1.868163  ...   
22    -0.365124     -1.86631    -1.870939    -1.867321    -1.868163  ...   

    hour_of_

In [7]:
# Cell 3: Classification Model Training and Evaluation

if 'X_train_scaled_df' not in globals() or X_train_scaled_df.empty:
    logger.error("Scaled training data (X_train_scaled_df) not found or is empty. Please ensure Cell 2 ran successfully.")
    # Optionally, raise an error or stop notebook execution
else:
    logger.info("--- Starting Classification Model Training (Logistic Regression) ---")

    # --- 1. Initialize and Train Logistic Regression Model ---
    # You can adjust parameters like C (inverse of regularization strength) or solver.
    # Using class_weight='balanced' can be helpful if classes are imbalanced, though ours are fairly balanced.
    # sag solver is good for large datasets, liblinear for smaller. lbfgs is a good default.
    classifier_model = LogisticRegression(
        solver='lbfgs', # A good default solver
        max_iter=1000,  # Increased for convergence with potentially many features
        random_state=42,
        C=1.0, # Regularization strength
        class_weight='balanced' # Optional: helps if classes are imbalanced
    )
    
    logger.info(f"Training LogisticRegression model on {X_train_scaled_df.shape[0]} samples...")
    
    if 'y_train' not in globals() or y_train.empty:
        logger.error("y_train (binary target) is not available. Cannot train model.")
    else:
        try:
            classifier_model.fit(X_train_scaled_df, y_train)
            logger.info("LogisticRegression model training complete.")

            # --- 2. Make Predictions on the Test Set ---
            logger.info(f"Making predictions on the test set ({X_test_scaled_df.shape[0]} samples)...")
            y_pred_test_class = classifier_model.predict(X_test_scaled_df) # Predicts class labels (0 or 1)
            y_pred_test_proba = classifier_model.predict_proba(X_test_scaled_df)[:, 1] # Probabilities for the positive class (class 1)

            # --- 3. Evaluate Model Performance (Classification Metrics) ---
            if 'y_test' not in globals() or y_test.empty:
                logger.error("y_test (binary target) is not available. Cannot evaluate model.")
            else:
                accuracy = accuracy_score(y_test, y_pred_test_class)
                precision = precision_score(y_test, y_pred_test_class, zero_division=0)
                recall = recall_score(y_test, y_pred_test_class, zero_division=0)
                f1 = f1_score(y_test, y_pred_test_class, zero_division=0)
                try:
                    roc_auc = roc_auc_score(y_test, y_pred_test_proba) # Use probabilities for AUC
                except ValueError as e:
                    logger.warning(f"Could not calculate ROC AUC, possibly due to only one class present in y_test or y_pred_test_proba. Error: {e}")
                    roc_auc = np.nan
                logloss = log_loss(y_test, y_pred_test_proba) # Use probabilities for log loss

                logger.info("\n--- Classification Model Evaluation Metrics (Test Set) ---")
                logger.info(f"  Accuracy:          {accuracy:.4f}")
                logger.info(f"  Precision:         {precision:.4f} (Portion of predicted YES that were actually YES)")
                logger.info(f"  Recall (TPR):      {recall:.4f} (Portion of actual YES that were correctly identified)")
                logger.info(f"  F1-Score:          {f1:.4f}")
                logger.info(f"  ROC AUC:           {roc_auc:.4f}")
                logger.info(f"  Log Loss:          {logloss:.4f}")

                logger.info("\n--- Confusion Matrix (Test Set) ---")
                # Rows: Actual, Columns: Predicted
                # [[TN, FP],
                #  [FN, TP]]
                cm = confusion_matrix(y_test, y_pred_test_class)
                logger.info(f"\n{cm}")
                try:
                    tn, fp, fn, tp = cm.ravel()
                    logger.info(f"  True Negatives (TN) - Actual NO, Predicted NO:  {tn}")
                    logger.info(f"  False Positives (FP) - Actual NO, Predicted YES: {fp} (Type I Error)")
                    logger.info(f"  False Negatives (FN) - Actual YES, Predicted NO: {fn} (Type II Error)")
                    logger.info(f"  True Positives (TP) - Actual YES, Predicted YES: {tp}")
                except ValueError: # If cm doesn't have 4 values (e.g. predicts only one class)
                    logger.warning("Could not unpack full confusion matrix (TN,FP,FN,TP).")


                # Create a DataFrame for easier analysis of predictions vs actuals
                df_results_class = pd.DataFrame({
                    'actual_target_resolves_yes': y_test,
                    'predicted_class_resolves_yes': y_pred_test_class,
                    'predicted_proba_resolves_yes': y_pred_test_proba
                })
                # Add back the original regression target for context if needed
                if original_target_col in df_model_data.columns: # From Cell 2
                    df_results_class['original_target_diff'] = df_model_data.loc[y_test.index, original_target_col]

                print("\n--- Sample of Test Set Predictions vs Actuals (Classification) ---")
                print(df_results_class.head(10).to_string())

                # --- 4. Inspect Model Coefficients (for Logistic Regression) ---
                logger.info("\n--- Logistic Regression Model Coefficients ---")
                # For Logistic Regression, intercept is an array if multi-class, or a float if binary.
                # coef_ is also 2D for multi-class, (1, n_features) for binary.
                if hasattr(classifier_model, 'intercept_') and hasattr(classifier_model, 'coef_'):
                    logger.info(f"Intercept: {classifier_model.intercept_[0]:.4f}") # Assuming binary classification, intercept_ is array of one
                    
                    # Ensure feature_columns is available
                    if 'feature_columns' not in globals(): # Should be defined in Cell 2
                        feature_columns_list_path = MODEL_OUTPUT_DIR / "feature_columns_classifier_v1.json"
                        if feature_columns_list_path.exists():
                            with open(feature_columns_list_path, 'r') as f:
                                feature_columns = json.load(f)
                            logger.info(f"Loaded feature_columns list from {feature_columns_list_path}")
                        else:
                            logger.warning("feature_columns list not found. Cannot display coefficient names.")
                            feature_columns = [f"feature_{i}" for i in range(classifier_model.coef_.shape[1])]
                    
                    # Coefficients are for the positive class (class 1) in binary classification
                    coefficients = pd.DataFrame({'feature': feature_columns, 'coefficient': classifier_model.coef_[0]})
                    coefficients['abs_coefficient'] = np.abs(coefficients['coefficient'])
                    coefficients.sort_values(by='abs_coefficient', ascending=False, inplace=True)
                    
                    print("\nTop Coefficients (by absolute value) for P(TARGET_market_resolves_yes = 1):")
                    print(coefficients.head(20).to_string())
                else:
                    logger.warning("Could not retrieve coefficients from the trained classifier model.")

                # --- 5. Save the Trained Model ---
                model_path = MODEL_OUTPUT_DIR / "logistic_regression_btc_classifier_v1.joblib" # New name
                joblib.dump(classifier_model, model_path)
                logger.info(f"Trained Logistic Regression model saved to: {model_path}")

                # Save model parameters for potential use in a strategy that doesn't load the full joblib
                # For Logistic Regression, this includes coefficients and intercept.
                # The strategy will need to apply the sigmoid function to the raw score if using these.
                model_params_for_backtest = {
                    'model_type': 'logistic_regression',
                    'intercept': classifier_model.intercept_[0].tolist() if isinstance(classifier_model.intercept_, np.ndarray) else classifier_model.intercept_,
                    'coefficients': dict(zip(feature_columns, classifier_model.coef_[0])),
                    'feature_order': feature_columns, # From Cell 2
                    'classes': classifier_model.classes_.tolist() # [0, 1] for binary
                }
                params_path = MODEL_OUTPUT_DIR / "logreg_model_params_v1.json" # New name
                with open(params_path, 'w') as f:
                    json.dump(model_params_for_backtest, f, indent=4)
                logger.info(f"Logistic Regression model parameters saved to: {params_path}")

        except Exception as e:
            logger.critical(f"An error occurred during classification model training or evaluation: {e}", exc_info=True)
            if 'classifier_model' in locals():
                 logger.info("Model training might have partially completed or failed during evaluation.")

2025-05-20 22:48:49,136 - INFO - model_training_classifier_20250520_224713.<module>:7 - --- Starting Classification Model Training (Logistic Regression) ---
2025-05-20 22:48:49,137 - INFO - model_training_classifier_20250520_224713.<module>:21 - Training LogisticRegression model on 669507 samples...
2025-05-20 22:48:49,584 - INFO - model_training_classifier_20250520_224713.<module>:28 - LogisticRegression model training complete.
2025-05-20 22:48:49,584 - INFO - model_training_classifier_20250520_224713.<module>:31 - Making predictions on the test set (167377 samples)...
2025-05-20 22:48:49,643 - INFO - model_training_classifier_20250520_224713.<module>:50 - 
--- Classification Model Evaluation Metrics (Test Set) ---
2025-05-20 22:48:49,643 - INFO - model_training_classifier_20250520_224713.<module>:51 -   Accuracy:          0.9129
2025-05-20 22:48:49,644 - INFO - model_training_classifier_20250520_224713.<module>:52 -   Precision:         0.8872 (Portion of predicted YES that were act


--- Sample of Test Set Predictions vs Actuals (Classification) ---
         actual_target_resolves_yes  predicted_class_resolves_yes  predicted_proba_resolves_yes  original_target_diff
1105142                           0                             0                      0.071834               -608.82
1105143                           0                             0                      0.100146              -2053.99
1105144                           0                             0                      0.061299               -858.82
1105145                           0                             1                      0.925200                -53.99
1105146                           1                             1                      0.982893               1446.01
1105147                           0                             0                      0.057713              -1858.82
1105148                           1                             1                      0.980323           