In [None]:
# Cell 1: Imports and Setup

import pandas as pd
import numpy as np
import datetime as dt
from datetime import timezone
from pathlib import Path
import logging
import joblib 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import json 

# --- Logging Setup ---
# Re-running this cell must not stack handlers (every extra handler would print
# each message one more time), so handlers left over from a previous run are
# detached before a single fresh StreamHandler is attached. On a fresh kernel
# the removal loop is simply a no-op.
logger = logging.getLogger("training_logreg_per_minute_v2")
for stale_handler in logger.handlers[:]:  # iterate over a copy: the list is mutated below
    logger.removeHandler(stale_handler)
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s.%(funcName)s:%(lineno)d - %(message)s')
ch = logging.StreamHandler()
ch.setFormatter(formatter)
logger.addHandler(ch)


# --- Directories ---
# The project root defaults to the original checkout location, but can be
# redirected without editing the notebook by exporting KALSHI_PROJECT_DIR
# before starting the kernel.
BASE_PROJECT_DIR = Path(
    os.environ.get("KALSHI_PROJECT_DIR", "/Users/omarabul-hassan/Desktop/projects/kalshi")
).expanduser()
NOTEBOOKS_DIR = BASE_PROJECT_DIR / "notebooks"
FEATURES_DIR = NOTEBOOKS_DIR / "features"  # historical feature CSVs are read from here

# Every model variant writes its artifacts to its own sub-directory, keyed by
# this suffix, so different variants do not overwrite each other.
MODEL_VERSION_SUFFIX = "no_vol_oi"
TRAINED_MODELS_OUTPUT_DIR = NOTEBOOKS_DIR / "trained_models" / f"logreg_per_minute_{MODEL_VERSION_SUFFIX}"

# Create the output directory (and any missing parents) up front so later
# save steps do not fail on a missing path.
TRAINED_MODELS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

logger.info(f"Features expected from: {FEATURES_DIR}")
logger.info(f"Trained models (per-minute, {MODEL_VERSION_SUFFIX}) will be saved to: {TRAINED_MODELS_OUTPUT_DIR}")

# --- Constants for Train/Test Split (based on market RESOLUTION time) ---
# All three dates are inclusive calendar days interpreted in UTC.
TRAIN_END_DATE_STR = "2025-05-08"
TEST_START_DATE_STR = "2025-05-09"
TEST_END_DATE_STR = "2025-05-15"


def _utc_epoch_seconds(date_str, time_str):
    """Return the integer Unix timestamp of ``"<date_str> <time_str>"`` read as UTC.

    Args:
        date_str: Calendar date formatted as ``YYYY-MM-DD``.
        time_str: Wall-clock time formatted as ``HH:MM:SS``.

    Raises:
        ValueError: If the combined string does not match ``%Y-%m-%d %H:%M:%S``.
    """
    naive = dt.datetime.strptime(f"{date_str} {time_str}", "%Y-%m-%d %H:%M:%S")
    # Attach UTC explicitly; a naive datetime would be interpreted in local time.
    return int(naive.replace(tzinfo=timezone.utc).timestamp())


TRAIN_UPTO_TS = _utc_epoch_seconds(TRAIN_END_DATE_STR, "23:59:59")
TEST_FROM_TS = _utc_epoch_seconds(TEST_START_DATE_STR, "00:00:00")
TEST_UPTO_TS = _utc_epoch_seconds(TEST_END_DATE_STR, "23:59:59")

# Fail fast if the dates above are edited into an overlapping or inverted
# configuration: overlapping windows would leak test rows into training.
if not (TRAIN_UPTO_TS < TEST_FROM_TS <= TEST_UPTO_TS):
    raise ValueError(
        "Invalid train/test split dates: expected "
        "TRAIN_END_DATE_STR < TEST_START_DATE_STR <= TEST_END_DATE_STR, got "
        f"{TRAIN_END_DATE_STR!r}, {TEST_START_DATE_STR!r}, {TEST_END_DATE_STR!r}."
    )

logger.info(f"Training data: Market resolutions up to {TRAIN_END_DATE_STR} (ts: {TRAIN_UPTO_TS})")
logger.info(f"Testing data: Market resolutions from {TEST_START_DATE_STR} (ts: {TEST_FROM_TS}) to {TEST_END_DATE_STR} (ts: {TEST_UPTO_TS})")

logger.info("Cell 1: Setup complete.")

In [None]:
# Cell 2: Load Features
#
# Locates the newest per-minute decision features CSV in FEATURES_DIR, loads it
# into `features_df`, and checks that the columns later cells rely on exist.

# The filename pattern is the one used by the original (historical)
# feature_engineering.ipynb. Files are ordered newest-first by modification
# time. mtime is used instead of ctime because on Unix/macOS ctime is the time
# of the last *metadata* change (chmod, rename, ...), not the creation time, so
# it can promote an old file that was merely touched.
list_of_feature_files = sorted(
    glob.glob(str(FEATURES_DIR / "kalshi_per_minute_decision_features_*.csv")),
    key=os.path.getmtime,
    reverse=True,
)

if not list_of_feature_files:
    logger.critical(f"CRITICAL: No 'kalshi_per_minute_decision_features_*.csv' files found in {FEATURES_DIR}.")
    raise FileNotFoundError("No historical 'per_minute_decision_features' CSV files found. Ensure historical feature_engineering.ipynb has run.")

LATEST_FEATURES_CSV_PATH = Path(list_of_feature_files[0])
logger.info(f"Loading HISTORICAL PER-MINUTE features from: {LATEST_FEATURES_CSV_PATH} for retraining.")

# Only the read itself is wrapped: the handler below reports a *loading*
# failure, so schema validation is kept outside to avoid mislabelling it.
try:
    features_df = pd.read_csv(LATEST_FEATURES_CSV_PATH, low_memory=False)
except Exception as e:
    logger.critical(f"Error loading features CSV {LATEST_FEATURES_CSV_PATH}: {e}", exc_info=True)
    raise

logger.info(f"Loaded HISTORICAL PER-MINUTE features DataFrame with {features_df.shape[0]} rows and {features_df.shape[1]} columns.")

# Identifier, timestamp and label columns that the later cells index by name.
essential_cols = ['market_ticker', 'decision_timestamp_s', 'resolution_time_ts', 'target']
missing = [col for col in essential_cols if col not in features_df.columns]
if missing:
    logger.critical(f"Essential columns ({missing}) not found in features DataFrame.")
    raise ValueError(f"Features DataFrame missing essential columns: {missing}")

logger.info("Cell 2: Historical per-minute feature loading complete for retraining.")

In [None]:
# Cell 3: Data Preprocessing and Feature Selection (MODIFIED for no_vol_oi model)
#
# Builds the feature matrix X and target y for the model variant that excludes
# Kalshi volume / open-interest: selects the feature columns, median-imputes
# missing values, drops any rows that still contain NaNs, and keeps
# `processed_df` row-aligned with X so it can be used for the time-based split.

if 'features_df' not in locals() or features_df.empty:
    logger.error("Features DataFrame is empty or not loaded. Cannot proceed.")
    X, y, processed_df, MODEL_FEATURE_NAMES = pd.DataFrame(), pd.Series(dtype='float64'), pd.DataFrame(), []
else:
    # --- 1. Define Feature Columns for the Per-Minute Model (NO VOLUME/OI) ---
    MODEL_FEATURE_NAMES = [
        'strike_price',
        'time_to_resolution_minutes',
        'current_btc_price',
        'current_dist_strike_abs',
        'current_dist_strike_pct',
        # Lagged BTC price changes
        'btc_price_change_pct_1m', 'btc_price_change_pct_3m', 'btc_price_change_pct_5m',
        'btc_price_change_pct_10m', 'btc_price_change_pct_15m', 'btc_price_change_pct_30m',
        # BTC volatility
        'btc_volatility_5m', 'btc_volatility_15m', 'btc_volatility_30m',
        # Current Kalshi market state
        'current_kalshi_yes_bid', 'current_kalshi_yes_ask', 'current_kalshi_mid_price',
        'current_kalshi_spread_abs', 'current_kalshi_spread_pct'
        # 'current_kalshi_volume' and 'current_kalshi_oi' are deliberately excluded in this version.
    ]

    logger.info(f"MODEL_VERSION_SUFFIX: {MODEL_VERSION_SUFFIX}")
    logger.info(f"Selected {len(MODEL_FEATURE_NAMES)} features for the '{MODEL_VERSION_SUFFIX}' model.")
    logger.info(f"Selected features: {MODEL_FEATURE_NAMES}")

    # Fail early if any selected feature is absent from the loaded data.
    actual_columns_in_df = features_df.columns.tolist()
    missing_model_features = [name for name in MODEL_FEATURE_NAMES if name not in actual_columns_in_df]
    if missing_model_features:
        logger.critical(f"CRITICAL: The following selected MODEL_FEATURE_NAMES are MISSING from the loaded features_df: {missing_model_features}")
        logger.critical(f"Available columns in features_df: {actual_columns_in_df}")
        raise ValueError("Selected model features are not present in the loaded data.")

    # --- 2. Handle Missing Values (NaNs) ---
    # Work on a copy so the loaded `features_df` stays untouched and this cell
    # can be re-run safely.
    processed_df = features_df.copy()

    # Volume/OI are imputed as well (when present) even though this model
    # version does not use them; X below is still built from MODEL_FEATURE_NAMES only.
    potential_features_to_impute = MODEL_FEATURE_NAMES + ['current_kalshi_volume', 'current_kalshi_oi']

    # NOTE(review): medians are computed over ALL rows, including rows that the
    # time-based split later assigns to the test set, so test-period data can
    # influence imputed training values. Consider deriving the medians from the
    # training window only -- confirm before changing, as it alters the model.
    for col in potential_features_to_impute:
        if col in processed_df.columns and processed_df[col].isnull().any():
            median_val = processed_df[col].median()
            if pd.isna(median_val):
                # Happens when the column is entirely NaN.
                logger.warning(f"Median for feature '{col}' is NaN. Filling with 0 as a fallback.")
                fill_value = 0
            else:
                fill_value = median_val
            # Assign the result back instead of calling fillna(inplace=True) on
            # the column selection: the in-place form operates on an
            # intermediate object and does not update the frame under pandas
            # Copy-on-Write (and emits a FutureWarning on pandas 2.x).
            processed_df[col] = processed_df[col].fillna(fill_value)

    # --- Define X (features) and y (target) ---
    X = processed_df[MODEL_FEATURE_NAMES]
    y = processed_df['target']

    # Final NaN check on X: anything left after imputation is dropped row-wise.
    if X.isnull().sum().sum() > 0:
        logger.warning(f"NaNs still present in X (final selected features) after imputation. This is unexpected.")
        logger.warning(f"Columns with NaNs in X: \n{X.isnull().sum()[X.isnull().sum() > 0]}")
        logger.info("Dropping rows with any remaining NaNs in X for robust training.")
        nan_rows_mask = X.isnull().any(axis=1)
        X = X[~nan_rows_mask]
        y = y[~nan_rows_mask]
        logger.info(f"New X shape after dropping NaN rows: {X.shape}")
    else:
        logger.info("Imputation for relevant features complete. No NaNs remaining in the final selected X.")

    logger.info(f"Shape of X (features for {MODEL_VERSION_SUFFIX} model): {X.shape}")
    logger.info(f"Shape of y (target): {y.shape}")
    if not y.empty:
        logger.info(f"Target value counts:\n{y.value_counts(normalize=True)}")

    if not X.empty:
        print(f"Head of X for {MODEL_VERSION_SUFFIX} model:")
        display(X.head())
    else:
        logger.warning("Feature set X is empty after selection/preprocessing.")

    # Keep processed_df row-aligned with X if rows were dropped above, so that
    # masks computed on processed_df can be applied to X and y.
    if X.shape[0] < len(features_df):
        processed_df = processed_df.loc[X.index]
        logger.info(f"Aligned 'processed_df' with cleaned X. New processed_df shape: {processed_df.shape}")

In [None]:
# Cell 4: Time-Based Train-Test Split
#
# Splits X / y into train and test partitions by each row's market resolution
# timestamp, and keeps the identifying columns of the test rows so predictions
# can be joined back to their markets afterwards.

split_inputs_ready = (
    'processed_df' in locals() and not processed_df.empty
    and 'X' in locals() and not X.empty
)

if split_inputs_ready:
    if 'resolution_time_ts' not in processed_df.columns:
        raise ValueError("'resolution_time_ts' column is missing from processed_df for splitting.")

    # Non-numeric timestamps become NaN and are discarded just below.
    processed_df['resolution_time_ts'] = pd.to_numeric(processed_df['resolution_time_ts'], errors='coerce')

    row_count_before = len(processed_df)
    processed_df.dropna(subset=['resolution_time_ts'], inplace=True)
    rows_dropped = row_count_before - len(processed_df)
    if rows_dropped > 0:
        logger.warning(f"Dropped {rows_dropped} rows due to NaN in 'resolution_time_ts' for splitting.")
        # X and y must be rebuilt so they cover exactly the surviving rows.
        X = processed_df[MODEL_FEATURE_NAMES]
        y = processed_df['target']
        logger.info(f"Re-aligned X and y after 'resolution_time_ts' NaN drop. New X shape: {X.shape}")

    resolution_ts = processed_df['resolution_time_ts']
    train_mask = resolution_ts <= TRAIN_UPTO_TS
    test_mask = resolution_ts.between(TEST_FROM_TS, TEST_UPTO_TS)  # inclusive on both ends

    X_train, y_train = X[train_mask], y[train_mask]
    X_test, y_test = X[test_mask], y[test_mask]

    identifier_columns = ['market_ticker', 'decision_timestamp_s', 'resolution_time_ts', 'strike_price']
    test_identifiers_df = processed_df.loc[test_mask, identifier_columns].copy()

    logger.info(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    logger.info(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    if not (X_train.empty or X_test.empty):
        logger.info(f"Training target distribution:\n{y_train.value_counts(normalize=True)}")
        logger.info(f"Testing target distribution:\n{y_test.value_counts(normalize=True)}")

        for split_label, split_frame in (("X_train", X_train), ("X_test", X_test)):
            nan_total = split_frame.isnull().sum().sum()
            if nan_total > 0:
                logger.warning(f"NaNs found in {split_label} after split: {nan_total}")
    else:
        # Log the data's actual time span next to the configured boundaries to
        # make a date-range mismatch easy to spot.
        logger.critical("Training or testing set is empty after time-based split! Check date ranges and data volume.")

        def _as_utc(ts):
            return dt.datetime.fromtimestamp(ts, tz=timezone.utc)

        min_res_ts = resolution_ts.min()
        max_res_ts = resolution_ts.max()
        logger.info(f"Min resolution_time_ts in data: {_as_utc(min_res_ts) if pd.notna(min_res_ts) else 'N/A'}")
        logger.info(f"Max resolution_time_ts in data: {_as_utc(max_res_ts) if pd.notna(max_res_ts) else 'N/A'}")
        for boundary_label, boundary_ts in (
            ("TRAIN_UPTO_TS", TRAIN_UPTO_TS),
            ("TEST_FROM_TS", TEST_FROM_TS),
            ("TEST_UPTO_TS", TEST_UPTO_TS),
        ):
            logger.info(f"{boundary_label}: {_as_utc(boundary_ts)}")
else:
    logger.error("processed_df or X is not available or empty from Cell 3. Cannot split.")
    X_train, X_test = pd.DataFrame(), pd.DataFrame()
    y_train, y_test = pd.Series(dtype='float64'), pd.Series(dtype='float64')
    test_identifiers_df = pd.DataFrame()

In [None]:
# Cell 5: Model Training (Logistic Regression)
#
# Standardises the training features, fits a class-balanced logistic regression,
# and persists the scaler, the model and the ordered feature-name list so the
# three artifacts can be loaded together later.

if 'X_train' not in locals() or X_train.empty:
    logger.error("X_train is not available or empty. Cannot train model.")
    # Remove artifacts left over from an earlier run of this cell. Otherwise the
    # evaluation cell's `'logreg_model' in locals()` check would pick up a stale
    # model / stale scaled test matrix that do not belong to the current data.
    for _stale_name in ('logreg_model', 'scaler', 'X_test_scaled'):
        globals().pop(_stale_name, None)
else:
    # The scaler is fitted on the training rows only and then applied to the
    # test rows, so no test statistics leak into the scaling.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    if not X_test.empty:
        X_test_scaled = scaler.transform(X_test)
    else:
        X_test_scaled = np.array([])
        logger.warning("X_test is empty, X_test_scaled will be empty. Evaluation might not be possible.")

    logger.info("Numeric features scaled using StandardScaler.")

    logreg_model = LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced', C=1.0, max_iter=1000)

    logger.info(f"Training Logistic Regression model ({MODEL_VERSION_SUFFIX} version) on {X_train_scaled.shape[0]} samples...")
    logreg_model.fit(X_train_scaled, y_train)
    logger.info(f"Model training complete ({MODEL_VERSION_SUFFIX} version).")

    # Artifact filenames carry the model version suffix.
    base_filename = f"logreg_per_minute_{MODEL_VERSION_SUFFIX}"
    scaler_filename = f"{base_filename}_scaler.joblib"
    model_filename = f"{base_filename}_model.joblib"
    feature_names_filename = f"{base_filename}_feature_names.json"

    # Persist the columns the scaler/model were actually fitted on (in order),
    # rather than trusting that MODEL_FEATURE_NAMES was not edited after X was
    # built; this keeps the JSON consistent with the saved artifacts.
    feature_names_used = list(X_train.columns)
    if 'MODEL_FEATURE_NAMES' in locals() and list(MODEL_FEATURE_NAMES) != feature_names_used:
        logger.warning("MODEL_FEATURE_NAMES differs from the columns of X_train; saving the columns the model was actually fitted on.")

    # Saving is best-effort: a failure is logged with its traceback, but the
    # in-memory model stays available to the rest of the notebook.
    try:
        joblib.dump(scaler, TRAINED_MODELS_OUTPUT_DIR / scaler_filename)
        logger.info(f"Scaler saved to {TRAINED_MODELS_OUTPUT_DIR / scaler_filename}")

        joblib.dump(logreg_model, TRAINED_MODELS_OUTPUT_DIR / model_filename)
        logger.info(f"Model saved to {TRAINED_MODELS_OUTPUT_DIR / model_filename}")

        with open(TRAINED_MODELS_OUTPUT_DIR / feature_names_filename, 'w', encoding='utf-8') as f:
            json.dump(feature_names_used, f)
        logger.info(f"Feature names used for model saved to {TRAINED_MODELS_OUTPUT_DIR / feature_names_filename}")

    except Exception as e:
        logger.error(f"Error saving model or scaler: {e}", exc_info=True)

In [None]:
# Cell 6: Model Evaluation
#
# Scores the trained model on the held-out test rows (accuracy, ROC AUC,
# classification report, confusion matrix) and writes the per-row predictions,
# together with their market identifiers, to a timestamped CSV.

model_ready = 'logreg_model' in locals()
test_data_available = ('X_test_scaled' in locals() and isinstance(X_test_scaled, np.ndarray) and X_test_scaled.size > 0 and \
                       'y_test' in locals() and not y_test.empty and \
                       'test_identifiers_df' in locals() and not test_identifiers_df.empty)


if not model_ready or not test_data_available:
    logger.error(f"Model ({MODEL_VERSION_SUFFIX}) not trained or test data not available/empty. Cannot evaluate.")
else:
    logger.info(f"Evaluating model ({MODEL_VERSION_SUFFIX} version) on the test set (per-minute decision points)...")

    y_pred_test = logreg_model.predict(X_test_scaled)
    # Column 1 of predict_proba corresponds to the second entry of classes_.
    y_pred_proba_test = logreg_model.predict_proba(X_test_scaled)[:, 1]

    accuracy = accuracy_score(y_test, y_pred_test)
    try:
        roc_auc = roc_auc_score(y_test, y_pred_proba_test)
        logger.info(f"Test Set ROC AUC Score: {roc_auc:.4f}")
    except ValueError as e:
        logger.warning(f"Could not calculate ROC AUC: {e}. This can happen if only one class is present in y_test.")
        roc_auc = np.nan

    logger.info(f"Test Set Accuracy: {accuracy:.4f}")
    logger.info("\nTest Set Classification Report:\n" + classification_report(y_test, y_pred_test, zero_division=0))

    # Pin the label set to the classes the model was trained on. Without it,
    # a test window containing a single class yields a 1x1 matrix that does
    # not match the two tick labels below.
    cm = confusion_matrix(y_test, y_pred_test, labels=logreg_model.classes_)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Pred No', 'Pred Yes'], yticklabels=['Actual No', 'Actual Yes'], ax=ax)
    ax.set_title(f'Test Set Confusion Matrix ({MODEL_VERSION_SUFFIX} Decisions)')
    ax.set_ylabel('Actual Market Outcome')
    ax.set_xlabel('Predicted Outcome at Decision Point')
    plt.show()

    # y_test, the predictions and test_identifiers_df all come from the same
    # test rows in the same order, so positional assignment is used here.
    test_predictions_df = test_identifiers_df.copy()
    test_predictions_df['actual_target'] = y_test.values
    test_predictions_df['predicted_target_logreg'] = y_pred_test
    test_predictions_df['predicted_proba_yes_logreg'] = y_pred_proba_test

    if 'time_to_resolution_minutes' in X_test.columns:
        test_predictions_df['time_to_resolution_at_pred'] = X_test['time_to_resolution_minutes'].values

    # The filename carries the model version and a local-time run stamp so
    # repeated runs do not overwrite each other.
    base_pred_filename = f"logreg_per_minute_{MODEL_VERSION_SUFFIX}"
    predictions_filename = f"{base_pred_filename}_test_predictions_{dt.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    predictions_filepath = TRAINED_MODELS_OUTPUT_DIR / predictions_filename
    test_predictions_df.to_csv(predictions_filepath, index=False)
    logger.info(f"Test set (per-minute, {MODEL_VERSION_SUFFIX} model) predictions saved to: {predictions_filepath}")
    print(f"Head of test predictions for {MODEL_VERSION_SUFFIX} model:")
    display(test_predictions_df.head())

    # Reported only on the success path: when evaluation is skipped above, the
    # error message is the final word for this cell.
    logger.info(f"Cell 6: Model evaluation complete ({MODEL_VERSION_SUFFIX} version).")