In [None]:
# Cell 1: Imports and Setup

import pandas as pd
import numpy as np
import datetime as dt
from datetime import timezone, timedelta
from pathlib import Path
import logging
import json
import os
import re
import glob
from tqdm.notebook import tqdm

# --- Logging Setup ---
# Re-running this cell must not stack duplicate console handlers, so a handler
# is attached only when the logger has none; the level is applied on every run.
logger = logging.getLogger("feature_engineering_per_minute")
logger.setLevel(logging.INFO)
if len(logger.handlers) == 0:
    ch = logging.StreamHandler()
    formatter = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(name)s.%(funcName)s:%(lineno)d - %(message)s'
    )
    ch.setFormatter(formatter)
    logger.addHandler(ch)

# --- Base Directories ---
# The project root defaults to the original checkout location, but can be
# redirected on another machine by setting the KALSHI_PROJECT_DIR environment
# variable instead of editing this cell.
BASE_PROJECT_DIR = Path(
    os.environ.get("KALSHI_PROJECT_DIR", "/Users/omarabul-hassan/Desktop/projects/kalshi")
)
NOTEBOOKS_DIR = BASE_PROJECT_DIR / "notebooks"
DATA_DIR = NOTEBOOKS_DIR / "data"

# Inputs: per-market Kalshi CSVs and per-day Binance kline CSVs.
KALSHI_DATA_BASE_DIR = DATA_DIR / "kalshi_data"
BINANCE_DATA_BASE_DIR = DATA_DIR / "binance_data"
# Output: the engineered feature CSVs written by the save cell.
FEATURES_OUTPUT_DIR = NOTEBOOKS_DIR / "features"

# Only the output directory is created; the input directories must already exist.
FEATURES_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

logger.info(f"Kalshi data expected at: {KALSHI_DATA_BASE_DIR}")
logger.info(f"Binance data expected at: {BINANCE_DATA_BASE_DIR}")
logger.info(f"Per-minute decision features will be saved to: {FEATURES_OUTPUT_DIR}")

# --- Constants ---
# The last decision point of a market is this many minutes before resolution.
MIN_MINUTES_BEFORE_RESOLUTION_FOR_DECISION = 1
# Look-back horizons (minutes) for the btc_price_change_pct_{lag}m features.
LAG_WINDOWS_MINUTES = [1, 3, 5, 10, 15, 30] 
# Window sizes for the btc_volatility_{window}m features.
ROLLING_WINDOWS_MINUTES = [5, 15, 30]

# Detailed BTC price-fetch debugging is limited to the first N markets and,
# within each of those, to their first few decision points.
DEBUG_FIRST_N_ORIG_MARKETS = 1 # Set >0 for targeted debug logs
DEBUG_FIRST_N_DECISION_POINTS_PER_MARKET = 3 # How many decision points to log for the debugged markets
debug_orig_market_count = 0 # Number of markets debugged so far; reset by clear_all_caches()

logger.info(f"Decision points will be generated up to T-{MIN_MINUTES_BEFORE_RESOLUTION_FOR_DECISION}m before market resolution.")
logger.info(f"Using lag windows: {LAG_WINDOWS_MINUTES} minutes.")
logger.info(f"Using rolling windows: {ROLLING_WINDOWS_MINUTES} minutes.")
logger.info(f"Debug logging for first {DEBUG_FIRST_N_ORIG_MARKETS} markets, first {DEBUG_FIRST_N_DECISION_POINTS_PER_MARKET} decision points each.")
logger.info("Cell 1: Setup complete.")

In [None]:
# Cell 2: Utility Functions (Data Loading & Parsing)

# In-memory memoisation for the two loaders below: Binance day frames are keyed
# by the date string passed to load_binance_day_data, Kalshi frames by market
# ticker.
_binance_day_data_cache = {}
_kalshi_market_data_cache = {}


def clear_all_caches():
    """Reset both data caches and restart the debugged-market counter."""
    global _binance_day_data_cache, _kalshi_market_data_cache, debug_orig_market_count
    # Rebind to fresh dicts rather than clearing in place.
    _binance_day_data_cache, _kalshi_market_data_cache = {}, {}
    debug_orig_market_count = 0
    logger.info("Cleared Binance, Kalshi caches and debug_orig_market_count.")

def _read_binance_kline_csv(filepath: Path) -> pd.DataFrame | None:
    """Parse one headerless Binance kline CSV into a frame indexed by open time in seconds.

    Returns None when the file has no usable rows. Any pandas/IO error is left
    to propagate to the caller.
    """
    column_names = [
        "open_time_raw", "open", "high", "low", "close", "volume",
        "close_time_ms_raw", "quote_asset_volume", "number_of_trades",
        "taker_buy_base_asset_volume", "taker_buy_quote_asset_volume", "ignore"
    ]
    df = pd.read_csv(filepath, header=None, names=column_names, low_memory=False)
    if df.empty:
        logger.warning(f"[LOAD_BINANCE] File is empty: {filepath}")
        return None

    # A header line (or any other non-numeric junk) is read as data because of
    # header=None. Left in place it would make the magnitude comparison below
    # raise TypeError and throw away the whole day, so such rows are dropped.
    df["open_time_raw"] = pd.to_numeric(df["open_time_raw"], errors="coerce")
    df = df.dropna(subset=["open_time_raw"]).copy()
    if df.empty:
        logger.warning(f"[LOAD_BINANCE] No rows with a numeric open time in: {filepath}")
        return None
    df["open_time_raw"] = df["open_time_raw"].astype("int64")

    # The unit is inferred from the magnitude of the first row only:
    # microseconds (~1e15), milliseconds (~1e12) or seconds (~1e9).
    first_raw = df["open_time_raw"].iloc[0]
    if first_raw > 1e14:
        logger.info(f"[LOAD_BINANCE DEBUG {filepath.name}] Detected MICROSECONDS. Dividing by 1,000,000.")
        df["timestamp_s"] = df["open_time_raw"] // 1_000_000
    elif 1e12 < first_raw <= 1e14:
        logger.info(f"[LOAD_BINANCE DEBUG {filepath.name}] Detected MILLISECONDS. Dividing by 1,000.")
        df["timestamp_s"] = df["open_time_raw"] // 1_000
    elif 1e9 < first_raw <= 1e10:
        logger.info(f"[LOAD_BINANCE DEBUG {filepath.name}] Detected SECONDS. Using as is.")
        df["timestamp_s"] = df["open_time_raw"]
    else:
        logger.warning(f"[LOAD_BINANCE WARNING {filepath.name}] Unusual timestamp magnitude: {first_raw}. Attempting to use as is.")
        df["timestamp_s"] = df["open_time_raw"]

    # Lookups use searchsorted on the index, which requires ascending order.
    df.set_index("timestamp_s", inplace=True)
    if not df.index.is_monotonic_increasing:
        df.sort_index(inplace=True)

    for col in ["open", "high", "low", "close", "volume"]:
        df[col] = pd.to_numeric(df[col], errors="coerce")
    return df


def load_binance_day_data(date_str_yyyy_mm_dd: str) -> pd.DataFrame | None:
    """Load one day of BTCUSDT 1-minute klines, memoised per date string.

    Reads ``BTCUSDT-1m-<date>.csv`` from ``BINANCE_DATA_BASE_DIR``.

    Args:
        date_str_yyyy_mm_dd: Date used both as cache key and in the file name.

    Returns:
        A frame indexed by ``timestamp_s`` (the kline *open* time converted to
        whole seconds), sorted ascending, with the OHLCV columns numeric; or
        None when the file is missing, empty, or cannot be parsed. The result,
        including None, is cached so the file is touched once per date.
    """
    if date_str_yyyy_mm_dd in _binance_day_data_cache:
        return _binance_day_data_cache[date_str_yyyy_mm_dd]

    filepath = BINANCE_DATA_BASE_DIR / f"BTCUSDT-1m-{date_str_yyyy_mm_dd}.csv"
    day_df = None
    if filepath.exists():
        try:
            day_df = _read_binance_kline_csv(filepath)
        except Exception as e:
            # Best effort: a broken file must not abort the run.
            logger.error(f"[LOAD_BINANCE] Error loading Binance data from {filepath}: {e}", exc_info=True)
            day_df = None

    _binance_day_data_cache[date_str_yyyy_mm_dd] = day_df
    return day_df



def get_btc_kline_at_or_before_ts(target_timestamp_s: int, current_market_ticker_for_debug: str | None = None, decision_point_count_for_debug: int = 0) -> pd.Series | None:
    """Return the latest Binance kline row whose index is <= ``target_timestamp_s``.

    NOTE: ``load_binance_day_data`` indexes rows by kline *open* time, so the
    returned row's ``close`` is only final one kline interval after ``.name``.
    A caller that needs a strictly-past close should pass a target one
    interval earlier.

    Args:
        target_timestamp_s: Unix timestamp in seconds, interpreted as UTC.
        current_market_ticker_for_debug: Ticker used in debug log lines;
            None disables debug logging for this call.
        decision_point_count_for_debug: Index of the decision point within the
            market; debug logging stops once it reaches
            ``DEBUG_FIRST_N_DECISION_POINTS_PER_MARKET``.

    Returns:
        The kline row as a Series, or None when no row at/before the target is
        available or the lookup fails.
    """
    target_dt_utc = dt.datetime.fromtimestamp(target_timestamp_s, tz=timezone.utc)
    date_str_needed = target_dt_utc.strftime("%Y-%m-%d")

    perform_debug_logging = (
        DEBUG_FIRST_N_ORIG_MARKETS > 0
        and current_market_ticker_for_debug is not None
        and debug_orig_market_count < DEBUG_FIRST_N_ORIG_MARKETS
        and decision_point_count_for_debug < DEBUG_FIRST_N_DECISION_POINTS_PER_MARKET
    )
    if perform_debug_logging:
        logger.info(f"[DEBUG BTC KLINE Market: {current_market_ticker_for_debug} (Overall #{debug_orig_market_count}), DecisionPt#{decision_point_count_for_debug}] Request kline for ts: {target_timestamp_s} ({target_dt_utc.isoformat()})")

    binance_df = load_binance_day_data(date_str_needed)
    if binance_df is None or binance_df.empty:
        # With no data for the target's own day, the previous day's file is
        # tried only during the first five minutes after midnight UTC.
        if target_dt_utc.hour == 0 and target_dt_utc.minute < 5:
            prev_date_str = (target_dt_utc - timedelta(days=1)).strftime("%Y-%m-%d")
            binance_df_prev = load_binance_day_data(prev_date_str)
            if binance_df_prev is not None and not binance_df_prev.empty:
                idx_pos_prev = binance_df_prev.index.searchsorted(target_timestamp_s, side='right')
                if idx_pos_prev > 0:
                    kline_data = binance_df_prev.iloc[idx_pos_prev - 1]
                    if kline_data.name <= target_timestamp_s:  # Check against lookahead
                        return kline_data
        return None

    try:
        # side='right' gives the insertion point after any equal index, so
        # position - 1 is the last row with index <= target.
        idx_pos = binance_df.index.searchsorted(target_timestamp_s, side='right')
        if idx_pos == 0:
            return None
        kline_data = binance_df.iloc[idx_pos - 1]
        if kline_data.name > target_timestamp_s:
            logger.error(f"LOOKAHEAD (get_btc_kline)! Kline ts {kline_data.name} > target {target_timestamp_s}")
            return None
        if perform_debug_logging:
            logger.info(f"[DEBUG BTC KLINE Market: {current_market_ticker_for_debug}] Found kline ending ts {kline_data.name} for target {target_timestamp_s}")
        return kline_data
    except Exception as e:
        # Still best effort (returns None), but no longer silent.
        logger.error(f"[GET_BTC_KLINE] Lookup failed for target {target_timestamp_s}: {e}", exc_info=True)
        return None

def get_kalshi_candle_at_or_before_ts(market_df: pd.DataFrame, target_timestamp_s: int) -> pd.Series | None:
    if market_df is None or market_df.empty: return None
    try:
        idx_pos = market_df.index.searchsorted(target_timestamp_s, side='right')
        if idx_pos == 0: return None
        candle_data = market_df.iloc[idx_pos - 1]
        if candle_data.name > target_timestamp_s: logger.error(f"LOOKAHEAD (get_kalshi_candle)! Candle ts {candle_data.name} > target {target_timestamp_s}"); return None
        if target_timestamp_s - candle_data.name > (3 * 60): return None 
        return candle_data
    except Exception: return None

def get_event_details_from_ticker(ticker_string: str | None) -> dict | None:
    if not ticker_string: return None
    m = re.match(r"^(.*?)-(\d{2}[A-Z]{3}\d{2})(\d{2})(?:-(T(\d+\.?\d*)))?$", ticker_string) or \
        re.match(r"^(.*?)-(\d{2}[A-Z]{3}\d{2})(\d{2})$", ticker_string)
    if not m: return None
    g = m.groups(); strike = float(g[4]) if len(g) >=5 and g[4] else None
    return {"series":g[0],"date_str_yymmmdd":g[1],"hour_str_edt":g[2],"strike_price_from_ticker":strike}

def parse_iso_to_unix_timestamp(ds: str|None) -> int|None:
    if not ds: return None
    try:
        dt_obj = dt.datetime.fromisoformat(ds.replace('Z','+00:00')) if ds.endswith('Z') else dt.datetime.fromisoformat(ds)
        return int((dt_obj.replace(tzinfo=timezone.utc) if dt_obj.tzinfo is None else dt_obj).timestamp())
    except Exception: return None

def load_kalshi_market_data(market_ticker: str) -> pd.DataFrame | None:
    """Load the candle CSV of one Kalshi market, memoised per ticker.

    The file is looked up at
    ``KALSHI_DATA_BASE_DIR/<date_str_yymmmdd>/<hour>/<market_ticker>.csv``,
    with the date and hour taken from the ticker itself.

    Args:
        market_ticker: Market ticker; it must parse with
            ``get_event_details_from_ticker``.

    Returns:
        A frame indexed by ``timestamp_s`` (ascending), with every column whose
        name contains ``cents`` converted to numeric and divided by 100, and
        ``volume`` / ``open_interest`` numeric when present. None when the
        ticker cannot be parsed, the file is missing or empty, or loading
        fails.
    """
    if market_ticker in _kalshi_market_data_cache:
        return _kalshi_market_data_cache[market_ticker]

    details = get_event_details_from_ticker(market_ticker)
    if not details:
        return None
    fp = KALSHI_DATA_BASE_DIR / details['date_str_yymmmdd'] / details['hour_str_edt'].zfill(2) / f"{market_ticker}.csv"
    if not fp.exists():
        return None

    try:
        df = pd.read_csv(fp, low_memory=False)
        if df.empty:
            # Cache the miss as None so repeated calls agree with the first one.
            _kalshi_market_data_cache[market_ticker] = None
            return None

        # Rows whose timestamp is not numeric are dropped before indexing.
        df['timestamp_s'] = pd.to_numeric(df['timestamp_s'], errors='coerce').astype('Int64')
        df.dropna(subset=['timestamp_s'], inplace=True)
        df.set_index('timestamp_s', inplace=True)
        if not df.index.is_monotonic_increasing:
            df.sort_index(inplace=True)

        # Values are rescaled by 1/100 but the columns keep their '*cents*'
        # names, which is what downstream lookups use.
        cols = [c for c in df.columns if 'cents' in c]
        if cols:
            df[cols] = df[cols].apply(pd.to_numeric, errors='coerce') / 100.0
        for v_col in ['volume', 'open_interest']:
            if v_col in df.columns:
                df[v_col] = pd.to_numeric(df[v_col], errors='coerce')
    except Exception as e:
        logger.error(f"Err load Kalshi {fp}: {e}", exc_info=True)
        # Never leave a half-processed frame in the cache.
        _kalshi_market_data_cache[market_ticker] = None
        return None

    # Cache only the fully processed frame.
    _kalshi_market_data_cache[market_ticker] = df
    return df

logger.info("Cell 2: Utility functions defined/updated.")

In [None]:
# Cell 3: Load NTM Outcomes Manifest
# Unchanged from the previous notebook version, which loaded 9192 markets successfully.

# Candidate manifests, newest first as ordered by os.path.getctime.
list_of_outcome_files = glob.glob(str(KALSHI_DATA_BASE_DIR / "kalshi_btc_hourly_NTM_filtered_market_outcomes_*.csv"))
list_of_outcome_files.sort(key=os.path.getctime, reverse=True)

# Stays an empty frame whenever anything below fails; later cells test `.empty`.
ntm_outcomes_df = pd.DataFrame()
if list_of_outcome_files:
    LATEST_NTM_OUTCOMES_CSV_PATH = Path(list_of_outcome_files[0])
    logger.info(f"Using NTM outcomes manifest from: {LATEST_NTM_OUTCOMES_CSV_PATH}")
    try:
        ntm_outcomes_df = pd.read_csv(LATEST_NTM_OUTCOMES_CSV_PATH, low_memory=False)
        logger.info(f"Loaded NTM outcomes manifest with {len(ntm_outcomes_df)} markets initially.")

        required_cols = ['market_ticker', 'result', 'event_resolution_time_iso', 'kalshi_strike_price', 'market_open_time_iso', 'market_close_time_iso']
        missing_cols = [c for c in required_cols if c not in ntm_outcomes_df.columns]
        if missing_cols:
            logger.critical("NTM outcomes CSV is missing required columns.")
            ntm_outcomes_df = pd.DataFrame()

        if not ntm_outcomes_df.empty:
            # Binary label: YES -> 1, NO -> 0; any other result is dropped.
            target_by_result = {'YES': 1, 'NO': 0}
            result_labels = ntm_outcomes_df['result'].astype(str).str.upper()
            ntm_outcomes_df['target'] = result_labels.apply(lambda label: target_by_result.get(label, np.nan))
            ntm_outcomes_df.dropna(subset=['target'], inplace=True)

        if not ntm_outcomes_df.empty:
            ntm_outcomes_df['target'] = ntm_outcomes_df['target'].astype(int)

            # ISO time columns -> Unix-second columns.
            timestamp_columns = (
                ('resolution_time_ts', 'event_resolution_time_iso'),
                ('market_open_ts', 'market_open_time_iso'),
                ('market_close_ts', 'market_close_time_iso'),
            )
            for ts_col, iso_col in timestamp_columns:
                ntm_outcomes_df[ts_col] = ntm_outcomes_df[iso_col].apply(parse_iso_to_unix_timestamp)

            ntm_outcomes_df['kalshi_strike_price'] = pd.to_numeric(ntm_outcomes_df['kalshi_strike_price'], errors='coerce')

            # Keep only markets with every field the feature loop needs.
            ntm_outcomes_df.dropna(
                subset=['market_ticker', 'resolution_time_ts', 'market_open_ts', 'market_close_ts', 'kalshi_strike_price', 'target'],
                how='any',
                inplace=True,
            )
            logger.info(f"Processed NTM outcomes. {len(ntm_outcomes_df)} markets remain for feature engineering.")
            if not ntm_outcomes_df.empty:
                display(ntm_outcomes_df.head())
    except Exception as e:
        logger.critical(f"Error loading NTM outcomes CSV: {e}", exc_info=True)
        ntm_outcomes_df = pd.DataFrame()
else:
    logger.critical(f"CRITICAL: No NTM outcome CSV files found in {KALSHI_DATA_BASE_DIR}.")

if ntm_outcomes_df.empty:
    logger.warning("No NTM markets loaded. Feature engineering will not proceed.")
logger.info("Cell 3: NTM Outcomes Manifest loading complete.")

In [None]:
# Cell 4: Per-Minute Feature Engineering Loop (REFINED BTC STATS)

all_decision_point_features_list = []

# Binance frames are indexed by kline OPEN time (load_binance_day_data derives
# timestamp_s from open_time_raw), so the 1-minute bar that opens at t only has
# a final `close` at t + 60s.
BTC_KLINE_INTERVAL_SECONDS = 60

# Every Kalshi-derived feature, used to NaN-fill rows that have no usable candle.
KALSHI_FEATURE_NAMES = [
    'current_kalshi_yes_bid', 'current_kalshi_yes_ask', 'current_kalshi_mid_price',
    'current_kalshi_spread_abs', 'current_kalshi_spread_pct',
    'current_kalshi_volume', 'current_kalshi_oi',
]

if 'ntm_outcomes_df' not in locals() or ntm_outcomes_df.empty:
    logger.warning("Skipping per-minute feature engineering: NTM outcomes manifest is empty.")
else:
    logger.info(f"Starting PER-MINUTE feature engineering for {len(ntm_outcomes_df)} NTM markets...")

    clear_all_caches()

    for index, ntm_market_row in tqdm(ntm_outcomes_df.iterrows(), total=ntm_outcomes_df.shape[0], desc="Processing NTM Markets"):
        market_ticker = ntm_market_row['market_ticker']
        kalshi_strike_price = ntm_market_row['kalshi_strike_price']
        resolution_time_ts = int(ntm_market_row['resolution_time_ts'])
        market_open_ts = int(ntm_market_row['market_open_ts'])
        target_outcome = ntm_market_row['target']

        # Debug control for the outer loop (per NTM market)
        is_market_being_debugged = (DEBUG_FIRST_N_ORIG_MARKETS > 0 and
                                    debug_orig_market_count < DEBUG_FIRST_N_ORIG_MARKETS)
        if is_market_being_debugged:
            logger.info(f"--- Debugging Market #{debug_orig_market_count}: {market_ticker} ---")

        kalshi_market_df = load_kalshi_market_data(market_ticker)

        # One decision point per minute, from one minute after open up to
        # MIN_MINUTES_BEFORE_RESOLUTION_FOR_DECISION minutes before resolution.
        first_possible_decision_ts = market_open_ts + 60
        last_possible_decision_ts = resolution_time_ts - (MIN_MINUTES_BEFORE_RESOLUTION_FOR_DECISION * 60)

        if first_possible_decision_ts > last_possible_decision_ts:
            if is_market_being_debugged:
                debug_orig_market_count += 1  # Count it as debugged even if skipped
            continue

        decision_point_counter_for_this_market = 0  # For debugging first N decision points

        for decision_minute_ts in range(first_possible_decision_ts, last_possible_decision_ts + 1, 60):
            # Debug control for inner loop (per decision point of a debugged market)
            should_log_this_decision_point = (is_market_being_debugged and
                                              decision_point_counter_for_this_market < DEBUG_FIRST_N_DECISION_POINTS_PER_MARKET)

            features = {'market_ticker': market_ticker, 'decision_timestamp_s': decision_minute_ts,
                        'resolution_time_ts': resolution_time_ts, 'strike_price': kalshi_strike_price,
                        'target': target_outcome,
                        'time_to_resolution_minutes': round((resolution_time_ts - decision_minute_ts) / 60.0, 2)}

            # LOOKAHEAD FIX: the bar found "at or before" decision_minute_ts is
            # the one still forming, and its close lies up to a minute in the
            # future (at T-1m it is effectively the resolution price). Stepping
            # back one interval selects the newest bar already closed by the
            # decision time.
            btc_lookup_ts = decision_minute_ts - BTC_KLINE_INTERVAL_SECONDS
            current_btc_kline = get_btc_kline_at_or_before_ts(btc_lookup_ts,
                                                              current_market_ticker_for_debug=market_ticker if is_market_being_debugged else None,
                                                              decision_point_count_for_debug=decision_point_counter_for_this_market if is_market_being_debugged else -1)

            if current_btc_kline is not None and pd.notna(current_btc_kline['close']):
                features['current_btc_price'] = float(current_btc_kline['close'])
                features['current_dist_strike_abs'] = features['current_btc_price'] - kalshi_strike_price
                features['current_dist_strike_pct'] = (features['current_dist_strike_abs'] / kalshi_strike_price) if kalshi_strike_price != 0 else np.nan

                # --- Build BTC close history for the lag / rolling statistics ---
                # Earliest timestamp any statistic can need, plus a 5-minute buffer.
                max_lookback_seconds = (max(LAG_WINDOWS_MINUTES + ROLLING_WINDOWS_MINUTES) + 5) * 60
                history_needed_start_ts = current_btc_kline.name - max_lookback_seconds

                btc_price_series_for_stats = pd.Series(dtype=float)
                relevant_day_dfs_data = []

                # Walk backwards day by day from the current kline's day until
                # history_needed_start_ts is covered or the day budget runs out.
                current_eval_day_ts = current_btc_kline.name
                num_days_to_check = (current_btc_kline.name - history_needed_start_ts) // (24*60*60) + 2  # Estimate days needed

                for i in range(num_days_to_check):
                    day_str_to_load = (dt.datetime.fromtimestamp(current_eval_day_ts, tz=timezone.utc) - timedelta(days=i)).strftime("%Y-%m-%d")
                    daily_df = load_binance_day_data(day_str_to_load)
                    if daily_df is not None and not daily_df.empty:
                        # Only rows inside [history_needed_start_ts, current kline];
                        # nothing after the current kline may enter the history.
                        day_slice = daily_df.loc[
                            (daily_df.index >= history_needed_start_ts) &
                            (daily_df.index <= current_btc_kline.name)
                        ]['close']
                        if not day_slice.empty:
                            relevant_day_dfs_data.append(day_slice)
                        # This day already reaches back far enough; no older day is needed.
                        if daily_df.index.min() <= history_needed_start_ts:
                            break
                    elif i == 0 and daily_df is None:  # Current day data missing, something is wrong
                        if should_log_this_decision_point:
                            logger.warning(f"  [DEBUG] Current day Binance data missing for {day_str_to_load}")
                        break  # Can't get current price or history

                if relevant_day_dfs_data:
                    btc_price_series_for_stats = pd.concat(relevant_day_dfs_data)
                    if not btc_price_series_for_stats.empty:
                        btc_price_series_for_stats = btc_price_series_for_stats[
                            ~btc_price_series_for_stats.index.duplicated(keep='last')
                        ].sort_index()

                if should_log_this_decision_point:
                    if btc_price_series_for_stats.empty:
                        history_min_str = 'N/A'
                        history_max_str = 'N/A'
                    else:
                        history_min_str = dt.datetime.fromtimestamp(btc_price_series_for_stats.index.min(), tz=timezone.utc).isoformat()
                        history_max_str = dt.datetime.fromtimestamp(btc_price_series_for_stats.index.max(), tz=timezone.utc).isoformat()
                    logger.info(f"  [DEBUG] BTC History for {market_ticker} @ decision {decision_minute_ts} (kline_ts {current_btc_kline.name}):")
                    logger.info(f"  [DEBUG]   Needed from: {dt.datetime.fromtimestamp(history_needed_start_ts, tz=timezone.utc).isoformat()}")
                    logger.info(f"  [DEBUG]   Series len: {len(btc_price_series_for_stats)}, min_ts: {history_min_str}, max_ts: {history_max_str}")

                if not btc_price_series_for_stats.empty:
                    # Series.asof needs a DatetimeIndex.
                    temp_series_for_asof = pd.Series(btc_price_series_for_stats.values,
                                                     index=pd.to_datetime(btc_price_series_for_stats.index, unit='s', utc=True))

                    for lag in LAG_WINDOWS_MINUTES:
                        # Price at/before `lag` minutes ahead of the current kline.
                        target_lag_ts = current_btc_kline.name - (lag * 60)
                        past_price = temp_series_for_asof.asof(pd.Timestamp(target_lag_ts, unit='s', tz='utc'))

                        if pd.notna(past_price) and pd.notna(features.get('current_btc_price')):
                            features[f'btc_price_change_pct_{lag}m'] = (features['current_btc_price'] - past_price) / past_price if past_price != 0 else np.nan
                        else:
                            features[f'btc_price_change_pct_{lag}m'] = np.nan
                        if should_log_this_decision_point:
                            logger.info(f"  [DEBUG]   Lag {lag}m: target_ts={dt.datetime.fromtimestamp(target_lag_ts, tz=timezone.utc).isoformat()}, past_price={past_price}, calc_pct={features[f'btc_price_change_pct_{lag}m']}")

                    for window in ROLLING_WINDOWS_MINUTES:
                        # Std of the last `window` closes (row count, not elapsed
                        # time). With fewer rows, fall back to the std of what is
                        # available, or NaN when there are fewer than two.
                        if len(temp_series_for_asof) >= window:
                            std_val = temp_series_for_asof.iloc[-window:].std()
                        elif len(temp_series_for_asof) >= 2:
                            std_val = temp_series_for_asof.std()
                        else:
                            std_val = np.nan
                        features[f'btc_volatility_{window}m'] = std_val
                        if should_log_this_decision_point:
                            logger.info(f"  [DEBUG]   Roll {window}m: std_val={std_val}, assigned_feature={features[f'btc_volatility_{window}m']}")
                else:
                    if should_log_this_decision_point:
                        logger.info(f"  [DEBUG]   BTC price series FOR STATS was EMPTY for decision_ts {decision_minute_ts}.")
                    for lag in LAG_WINDOWS_MINUTES:
                        features[f'btc_price_change_pct_{lag}m'] = np.nan
                    for window in ROLLING_WINDOWS_MINUTES:
                        features[f'btc_volatility_{window}m'] = np.nan
            else:  # No usable BTC kline: every BTC feature is NaN
                features.update({f: np.nan for f in ['current_btc_price', 'current_dist_strike_abs', 'current_dist_strike_pct']})
                for lag in LAG_WINDOWS_MINUTES:
                    features[f'btc_price_change_pct_{lag}m'] = np.nan
                for window in ROLLING_WINDOWS_MINUTES:
                    features[f'btc_volatility_{window}m'] = np.nan

            # --- Kalshi Market Features ---
            current_kalshi_candle = None
            if kalshi_market_df is not None:
                current_kalshi_candle = get_kalshi_candle_at_or_before_ts(kalshi_market_df, decision_minute_ts)

            if current_kalshi_candle is not None:
                features['current_kalshi_yes_bid'] = current_kalshi_candle.get('yes_bid_close_cents', np.nan)
                features['current_kalshi_yes_ask'] = current_kalshi_candle.get('yes_ask_close_cents', np.nan)
                features['current_kalshi_volume'] = current_kalshi_candle.get('volume', np.nan)
                features['current_kalshi_oi'] = current_kalshi_candle.get('open_interest', np.nan)
                if pd.notna(features['current_kalshi_yes_bid']) and pd.notna(features['current_kalshi_yes_ask']):
                    features['current_kalshi_mid_price'] = (features['current_kalshi_yes_bid'] + features['current_kalshi_yes_ask']) / 2.0
                    features['current_kalshi_spread_abs'] = features['current_kalshi_yes_ask'] - features['current_kalshi_yes_bid']
                    features['current_kalshi_spread_pct'] = (features['current_kalshi_spread_abs'] / features['current_kalshi_mid_price']) if features['current_kalshi_mid_price'] != 0 else np.nan
                else:
                    features.update({f: np.nan for f in ['current_kalshi_mid_price', 'current_kalshi_spread_abs', 'current_kalshi_spread_pct']})
            else:
                # No market file, or no fresh candle at this decision point.
                features.update({f: np.nan for f in KALSHI_FEATURE_NAMES})

            all_decision_point_features_list.append(features)
            if is_market_being_debugged:
                decision_point_counter_for_this_market += 1

        if is_market_being_debugged:
            debug_orig_market_count += 1  # Increment after all decision points for this market

    if all_decision_point_features_list:
        output_features_df = pd.DataFrame(all_decision_point_features_list)
        logger.info(f"Successfully engineered features for {len(output_features_df)} (market, decision_minute) points.")
    else:
        output_features_df = pd.DataFrame()
        logger.warning("No (market, decision_minute) features generated.")
logger.info("Cell 4: Per-Minute Feature engineering loop complete.")

In [None]:
# Cell 5: Save Features

# Write the engineered features to a timestamped CSV; nothing is written when
# the feature frame is missing or empty.
if 'output_features_df' not in locals():
    logger.warning("output_features_df not defined. Nothing to save.")
elif output_features_df.empty:
    logger.warning("output_features_df empty. Nothing to save.")
else:
    timestamp_str = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    features_filename = f"kalshi_per_minute_decision_features_{timestamp_str}.csv"
    features_filepath = FEATURES_OUTPUT_DIR / features_filename
    try:
        output_features_df.to_csv(features_filepath, index=False)
        logger.info(f"Successfully saved per-minute decision features for {len(output_features_df)} points to: {features_filepath}")
        print(f"Features saved to: {features_filepath}")
    except Exception as e:
        # A failed write is logged, not raised.
        logger.error(f"Error saving features: {e}", exc_info=True)
logger.info("Cell 5: Feature saving process complete.")

In [None]:
# Cell 6: Inspect Output CSV

# Prefer the file written by the save cell in this session; otherwise fall back
# to the newest matching CSV in the output directory (ordered by getctime).
LATEST_PER_MINUTE_FEATURES_CSV_PATH = None
if 'features_filepath' in locals() and Path(features_filepath).exists():
    LATEST_PER_MINUTE_FEATURES_CSV_PATH = features_filepath
else:
    list_of_feature_files = sorted(
        glob.glob(str(FEATURES_OUTPUT_DIR / "kalshi_per_minute_decision_features_*.csv")),
        key=os.path.getctime,
        reverse=True,
    )
    if list_of_feature_files:
        LATEST_PER_MINUTE_FEATURES_CSV_PATH = Path(list_of_feature_files[0])

if not (LATEST_PER_MINUTE_FEATURES_CSV_PATH and LATEST_PER_MINUTE_FEATURES_CSV_PATH.exists()):
    logger.warning("No per-minute features CSV file found for inspection.")
else:
    logger.info(f"Inspecting features from: {LATEST_PER_MINUTE_FEATURES_CSV_PATH}")
    # Only the first 10,000 rows are read; everything below describes that sample.
    df_inspect = pd.read_csv(LATEST_PER_MINUTE_FEATURES_CSV_PATH, nrows=10000)
    logger.info(f"Shape of loaded sample: {df_inspect.shape}")
    logger.info("\nFirst 5 rows:")
    display(df_inspect.head())
    logger.info("\nBasic Info:")
    df_inspect.info()

    logger.info("\nNaN Percentage per column (for the loaded sample):")
    nan_pct_per_column = (df_inspect.isnull().sum() / len(df_inspect)) * 100
    nan_summary_inspect = nan_pct_per_column[nan_pct_per_column > 0].sort_values(ascending=False)
    if nan_summary_inspect.empty:
        logger.info("No NaNs found in the loaded sample of feature columns.")
    else:
        print(nan_summary_inspect.to_string())

    if 'time_to_resolution_minutes' in df_inspect.columns:
        logger.info("\nValue counts for 'time_to_resolution_minutes' (sample):")
        ttr_counts = df_inspect['time_to_resolution_minutes'].value_counts().sort_index()
        display(ttr_counts.head(10))
        display(ttr_counts.tail(10))

    if 'market_ticker' in df_inspect.columns:
        logger.info("\nNumber of decision points per market (sample of first few markets):")
        display(df_inspect['market_ticker'].value_counts().head(10))

    key_numeric_features = ['strike_price', 'current_btc_price', 'current_dist_strike_pct', 'time_to_resolution_minutes', 'current_kalshi_mid_price', 'current_kalshi_spread_pct']
    numeric_cols_to_describe = [c for c in key_numeric_features if c in df_inspect.columns]
    if numeric_cols_to_describe:
        logger.info("\nDescriptive statistics for key numeric features (sample):")
        display(df_inspect[numeric_cols_to_describe].describe())
logger.info("Cell 6: Inspection complete.")