In [None]:
# Cell 1: Imports and Setup

import pandas as pd
import numpy as np
import os
from pathlib import Path
import datetime as dt
from datetime import timezone, timedelta
import re
from tqdm.notebook import tqdm
import logging
# For TA features (optional, install if needed: pip install ta)
# import ta

# --- Logging setup ---
# The logger name carries a timestamp, so each execution of this cell obtains
# its own logger object from the logging registry.
logger_name = "feature_engineering_" + dt.datetime.now().strftime('%Y%m%d_%H%M%S')
logger = logging.getLogger(logger_name)
# INFO is applied whether or not a handler is already attached.
logger.setLevel(logging.INFO)
if len(logger.handlers) == 0:
    # Attach exactly one stream handler with a location-aware format.
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s.%(funcName)s:%(lineno)d - %(message)s')
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)

# --- Configuration ---
# The notebook is expected to be run from <project>/notebooks/train/, so the
# shared data directory is its sibling <project>/notebooks/data/.
current_notebook_dir = Path.cwd()
DATA_ROOT_DIR = current_notebook_dir.parent / "data"

KALSHI_NTM_DATA_DIR = DATA_ROOT_DIR / "kalshi_data"     # outcomes CSVs and per-market CSV files
BINANCE_FLAT_DATA_DIR = DATA_ROOT_DIR / "binance_data"  # daily BTCUSDT-1m-YYYY-MM-DD.csv files

logger.info(f"Current working directory (notebook location): {current_notebook_dir.resolve()}")
logger.info(f"KALSHI_NTM_DATA_DIR set to: {KALSHI_NTM_DATA_DIR.resolve()}")
logger.info(f"BINANCE_FLAT_DATA_DIR set to: {BINANCE_FLAT_DATA_DIR.resolve()}")


# Find the latest outcomes CSV.
# KALSHI_OUTCOMES_CSV_PATH stays None when discovery fails; the failure is logged
# (not raised) so the rest of the cell still runs and later cells can check it.
KALSHI_OUTCOMES_CSV_PATH = None
try:
    if not KALSHI_NTM_DATA_DIR.exists():
        raise FileNotFoundError(f"KALSHI_NTM_DATA_DIR does not exist: {KALSHI_NTM_DATA_DIR.resolve()}")

    # Rank by modification time. os.path.getctime is the inode-change time on
    # POSIX (it moves on chmod/rename), so it is not a reliable "newest file" key.
    outcomes_files = sorted(
        KALSHI_NTM_DATA_DIR.glob("kalshi_btc_hourly_NTM_filtered_market_outcomes_*.csv"),
        key=os.path.getmtime,
        reverse=True
    )
    if not outcomes_files:
        raise FileNotFoundError(f"No NTM outcomes CSV (kalshi_btc_hourly_NTM_filtered_market_outcomes_*.csv) found in {KALSHI_NTM_DATA_DIR.resolve()}")
    KALSHI_OUTCOMES_CSV_PATH = outcomes_files[0]
    logger.info(f"Using Kalshi NTM outcomes CSV: {KALSHI_OUTCOMES_CSV_PATH.resolve()}")
except FileNotFoundError as e:
    logger.critical(str(e))
except Exception as e:
    logger.critical(f"Error finding outcomes CSV: {e}", exc_info=True)

# --- Parameters for Feature Engineering ---
# BTC features. Window sizes are row counts on the Binance 1-minute series.
BTC_MOMENTUM_WINDOWS = [5, 10, 15, 30, 60] # close-to-close differences over these many rows
BTC_VOLATILITY_WINDOW = 15 # rolling standard deviation of close
BTC_SMA_WINDOWS = [10, 30, 50] # simple moving averages of close
BTC_EMA_WINDOWS = [12, 26, 50] # exponential moving averages of close (used as the EMA span)
BTC_RSI_WINDOW = 14
BTC_ATR_WINDOW = 14 # Average True Range; also names the 'btc_atr_<window>' column

# Kalshi features
KALSHI_PRICE_CHANGE_WINDOWS = [1, 3, 5, 10] # look-back in minutes for mid-price change
KALSHI_VOLATILITY_WINDOWS = [5, 10] # number of most recent Kalshi rows used for mid-price std

# Minutes before market close during which no feature records are generated.
# Tuning note from the author: start with 15; if train.ipynb AUC is still > 0.8, try 20 or 25.
DECISION_OFFSET_MINUTES_BEFORE_CLOSE = 15 # previously 5
logger.info(f"Feature generation will stop generating records {DECISION_OFFSET_MINUTES_BEFORE_CLOSE} minutes before market close.")

# Maximum age, in seconds, of the latest Kalshi row for it to be used at a decision point.
KALSHI_MAX_STALENESS_SECONDS = 120

logger.info("Feature Engineering Setup Complete.")
# Discovery of the outcomes CSV is best-effort; warn here so the failure is visible at the end of setup.
if not KALSHI_OUTCOMES_CSV_PATH:
    logger.warning("KALSHI_OUTCOMES_CSV_PATH is not set or was not found. Data loading in subsequent cells will likely fail.")

In [None]:
# Cell 2: Load Kalshi Outcomes and Define Target Variable

# Load the outcomes table; on any failure fall back to an empty DataFrame so
# the rest of the notebook can detect "no data" via df_outcomes.empty.
if not KALSHI_OUTCOMES_CSV_PATH or not KALSHI_OUTCOMES_CSV_PATH.exists():
    logger.critical("Kalshi NTM outcomes CSV path not found or not set. Cannot proceed.")
    df_outcomes = pd.DataFrame()
else:
    try:
        df_outcomes = pd.read_csv(KALSHI_OUTCOMES_CSV_PATH)
        logger.info(f"Loaded {len(df_outcomes)} NTM market outcomes from {KALSHI_OUTCOMES_CSV_PATH}")

        # Parse the three timestamp columns as tz-aware UTC; unparseable values become NaT.
        for _time_col in ('event_resolution_time_iso', 'market_open_time_iso', 'market_close_time_iso'):
            df_outcomes[_time_col] = pd.to_datetime(df_outcomes[_time_col], errors='coerce', utc=True)

        # Discard rows lacking any field the later cells depend on.
        _required_cols = ['market_ticker', 'event_resolution_time_iso',
                          'market_open_time_iso', 'market_close_time_iso',
                          'kalshi_strike_price']
        df_outcomes.dropna(subset=_required_cols, inplace=True)

        logger.info(f"Outcomes DataFrame shape after initial cleaning: {df_outcomes.shape}")
        print("Outcomes DataFrame head:")
        print(df_outcomes.head())
    except Exception as e:
        logger.critical(f"Error loading or processing outcomes CSV {KALSHI_OUTCOMES_CSV_PATH}: {e}")
        df_outcomes = pd.DataFrame() # Empty df if load fails

# --- Define Target Variable: BTC_price_at_resolution - kalshi_strike_price ---
_binance_daily_data_cache_for_target = {}

def get_btc_price_at_resolution(resolution_dt_utc: pd.Timestamp) -> float | None:
    if pd.isna(resolution_dt_utc):
        return None
    global _binance_daily_data_cache_for_target
    date_str = resolution_dt_utc.strftime("%Y-%m-%d")
    
    if date_str not in _binance_daily_data_cache_for_target:
        filepath = BINANCE_FLAT_DATA_DIR / f"BTCUSDT-1m-{date_str}.csv"
        if not filepath.exists():
            logger.warning(f"Target: Binance data file not found for {date_str} at {filepath}")
            _binance_daily_data_cache_for_target[date_str] = None
            return None
        try:
            column_names = ["open_time_raw", "open", "high", "low", "close", "volume",
                            "close_time_ms", "quote_asset_volume", "number_of_trades",
                            "taker_buy_base_asset_volume", "taker_buy_quote_asset_volume", "ignore"]
            df_binance_day = pd.read_csv(filepath, header=None, names=column_names)
            df_binance_day['timestamp_s'] = df_binance_day['open_time_raw'] // 1_000_000
            df_binance_day.set_index('timestamp_s', inplace=True)
            df_binance_day['close'] = pd.to_numeric(df_binance_day['close'])
            _binance_daily_data_cache_for_target[date_str] = df_binance_day
        except Exception as e:
            logger.error(f"Target: Error loading Binance data from {filepath}: {e}")
            _binance_daily_data_cache_for_target[date_str] = None
            return None
            
    df_binance_day_cached = _binance_daily_data_cache_for_target[date_str]
    if df_binance_day_cached is None:
        return None

    resolution_timestamp_s = int(resolution_dt_utc.timestamp())
    idx_pos = df_binance_day_cached.index.searchsorted(resolution_timestamp_s, side='right')
    
    if idx_pos == 0:
        if resolution_dt_utc.time() < dt.time(0,1,0): 
             logger.warning(f"Target: Resolution time {resolution_dt_utc.isoformat()} is too early in {date_str}, BTC price might be from previous day or ambiguous. Skipping.")
        return None 
        
    btc_price_at_resolution = df_binance_day_cached.iloc[idx_pos - 1]['close']
    return float(btc_price_at_resolution)

# Attach the BTC price at resolution and derive the regression target
# (BTC price at resolution minus the market's strike).
if not df_outcomes.empty:
    tqdm.pandas(desc="Fetching BTC price at resolution for target")
    # When every lookup fails, apply() returns an object column of None and the
    # subtraction below would raise TypeError. Coercing to numeric turns those
    # into NaN so the rows are dropped and reported instead.
    df_outcomes['btc_price_at_resolution'] = pd.to_numeric(
        df_outcomes['event_resolution_time_iso'].progress_apply(get_btc_price_at_resolution),
        errors='coerce'
    )

    df_outcomes['target_btc_diff_from_strike'] = df_outcomes['btc_price_at_resolution'] - df_outcomes['kalshi_strike_price']

    original_len = len(df_outcomes)
    df_outcomes.dropna(subset=['btc_price_at_resolution', 'target_btc_diff_from_strike'], inplace=True)
    logger.info(f"Dropped {original_len - len(df_outcomes)} rows due to missing BTC price at resolution for target calculation.")

    logger.info(f"Target variable 'target_btc_diff_from_strike' calculated for {len(df_outcomes)} markets.")
    print("\nOutcomes DataFrame with target variable (head):")
    print(df_outcomes[['market_ticker', 'kalshi_strike_price', 'event_resolution_time_iso', 'btc_price_at_resolution', 'target_btc_diff_from_strike']].head())
    print("\nTarget variable statistics:")
    print(df_outcomes['target_btc_diff_from_strike'].describe())
else:
    logger.warning("Outcomes DataFrame is empty, cannot calculate target variable.")

In [None]:
# Cell 3: Feature Generation Loop

# --- Helper function to parse Kalshi tickers ---
def get_event_resolution_details(ticker_string: str | None):
    # ... (this function remains the same as your last version) ...
    if not ticker_string: return None
    event_match = re.match(r"^(.*?)-(\d{2}[A-Z]{3}\d{2})(\d{2})$", ticker_string)
    market_match = re.match(r"^(.*?)-(\d{2}[A-Z]{3}\d{2})(\d{2})-(T(\d+\.?\d*))$", ticker_string)
    
    match_to_use = market_match if market_match else event_match
    if not match_to_use:
        return None
        
    groups = match_to_use.groups()
    series, date_str_yymmmdd, hour_str_edt = groups[0], groups[1], groups[2]
    strike_price = float(groups[4]) if market_match and len(groups) > 4 and groups[4] else None
    
    try:
        year_int = 2000 + int(date_str_yymmmdd[:2])
        month_str = date_str_yymmmdd[2:5].upper()
        day_int = int(date_str_yymmmdd[5:])
        month_map = {'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
                     'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12}
        month_int = month_map[month_str]
        hour_edt_int = int(hour_str_edt)
        
        event_resolution_dt_naive_edt = dt.datetime(year_int, month_int, day_int, hour_edt_int, 0, 0)
        utc_offset_hours = 4 
        event_resolution_dt_utc_aware = event_resolution_dt_naive_edt.replace(tzinfo=timezone(timedelta(hours=-utc_offset_hours)))
        event_resolution_dt_utc = event_resolution_dt_utc_aware.astimezone(timezone.utc)
        
        return {
            "series": series,
            "date_str_yymmmdd": date_str_yymmmdd,
            "hour_str_edt": hour_str_edt,
            "strike_price": strike_price,
            "event_resolution_dt_utc": event_resolution_dt_utc
        }
    except Exception as e:
        logger.error(f"Error parsing ticker '{ticker_string}' in get_event_resolution_details: {e}")
        return None

# --- Binance Data Loading and Feature Calculation Helpers (Modified for ATR) ---
_binance_daily_data_with_features_cache = {}

def get_binance_data_with_features(date_str: str) -> pd.DataFrame | None:
    global _binance_daily_data_with_features_cache
    required_configs = [
        'BINANCE_FLAT_DATA_DIR', 'BTC_MOMENTUM_WINDOWS', 'BTC_VOLATILITY_WINDOW',
        'BTC_SMA_WINDOWS', 'BTC_EMA_WINDOWS', 'BTC_RSI_WINDOW', 'BTC_ATR_WINDOW' # Added ATR
    ]
    for config_var_name in required_configs:
        if config_var_name not in globals():
            logger.error(f"Global configuration variable '{config_var_name}' not defined. Cannot calculate Binance features.")
            return None

    if date_str in _binance_daily_data_with_features_cache:
        cached_df = _binance_daily_data_with_features_cache[date_str]
        return cached_df.copy() if cached_df is not None else None

    filepath = BINANCE_FLAT_DATA_DIR / f"BTCUSDT-1m-{date_str}.csv"
    if not filepath.exists():
        _binance_daily_data_with_features_cache[date_str] = None
        return None
        
    try:
        column_names = ["open_time_raw", "open", "high", "low", "close", "volume",
                        "close_time_ms", "quote_asset_volume", "number_of_trades",
                        "taker_buy_base_asset_volume", "taker_buy_quote_asset_volume", "ignore"]
        df = pd.read_csv(filepath, header=None, names=column_names)
        df['timestamp_s'] = df['open_time_raw'] // 1_000_000
        df.set_index('timestamp_s', inplace=True)
        
        for col in ['open', 'high', 'low', 'close', 'volume']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        df.dropna(subset=['close', 'high', 'low'], inplace=True) # Ensure H,L,C are present for ATR

        for window in BTC_MOMENTUM_WINDOWS: 
            df[f'btc_mom_{window}m'] = df['close'].diff(periods=window)
        df[f'btc_vol_{BTC_VOLATILITY_WINDOW}m'] = df['close'].rolling(window=BTC_VOLATILITY_WINDOW, min_periods=1).std()
        for window in BTC_SMA_WINDOWS: 
            df[f'btc_sma_{window}m'] = df['close'].rolling(window=window, min_periods=1).mean()
        for window in BTC_EMA_WINDOWS: 
            df[f'btc_ema_{window}m'] = df['close'].ewm(span=window, adjust=False, min_periods=1).mean()
        
        if BTC_RSI_WINDOW > 0:
            delta = df['close'].diff(1)
            gain = delta.where(delta > 0, 0)
            loss = -delta.where(delta < 0, 0)
            avg_gain = gain.rolling(window=BTC_RSI_WINDOW, min_periods=1).mean()
            avg_loss = loss.rolling(window=BTC_RSI_WINDOW, min_periods=1).mean()
            rs = avg_gain / avg_loss.replace(0, 1e-9) 
            df['btc_rsi'] = 100 - (100 / (1 + rs))
            df['btc_rsi'].fillna(50, inplace=True)

        # *** NEW: Calculate ATR ***
        if BTC_ATR_WINDOW > 0:
            high_low = df['high'] - df['low']
            high_close_prev = np.abs(df['high'] - df['close'].shift(1))
            low_close_prev = np.abs(df['low'] - df['close'].shift(1))
            tr = pd.concat([high_low, high_close_prev, low_close_prev], axis=1).max(axis=1)
            df[f'btc_atr_{BTC_ATR_WINDOW}'] = tr.ewm(alpha=1/BTC_ATR_WINDOW, adjust=False, min_periods=BTC_ATR_WINDOW).mean()
            # df[f'btc_atr_{BTC_ATR_WINDOW}'] = tr.rolling(window=BTC_ATR_WINDOW, min_periods=1).mean() # Simpler rolling mean ATR
            df[f'btc_atr_{BTC_ATR_WINDOW}'].fillna(method='bfill', inplace=True) # Backfill initial NaNs

        _binance_daily_data_with_features_cache[date_str] = df
        return df.copy()
        
    except Exception as e:
        logger.error(f"FeatureGen: Error loading/processing Binance data from {filepath}: {e}")
        _binance_daily_data_with_features_cache[date_str] = None
        return None

# --- Kalshi Data Loading Helper ---
_kalshi_market_data_cache = {}
# ... (load_kalshi_market_data function remains the same as your last version) ...
def load_kalshi_market_data(market_ticker: str, date_str_yymmmdd: str, hour_str_edt: str) -> pd.DataFrame | None:
    global _kalshi_market_data_cache
    if 'KALSHI_NTM_DATA_DIR' not in globals():
        logger.error("KALSHI_NTM_DATA_DIR not defined globally. Cannot load Kalshi data.")
        return None

    if market_ticker in _kalshi_market_data_cache:
        cached_df = _kalshi_market_data_cache[market_ticker]
        return cached_df.copy() if cached_df is not None else None
    
    filepath = KALSHI_NTM_DATA_DIR / date_str_yymmmdd / hour_str_edt.zfill(2) / f"{market_ticker}.csv"
    if not filepath.exists():
        _kalshi_market_data_cache[market_ticker] = None
        return None
    try:
        df = pd.read_csv(filepath)
        if df.empty:
            _kalshi_market_data_cache[market_ticker] = pd.DataFrame()
            return pd.DataFrame()
            
        df['timestamp_s'] = pd.to_numeric(df['timestamp_s'])
        df.set_index('timestamp_s', inplace=True)
        # Add 'mid_price' calculation here for later use in Kalshi volatility
        if 'yes_bid_close_cents' in df.columns and 'yes_ask_close_cents' in df.columns:
            df['mid_price'] = (pd.to_numeric(df['yes_bid_close_cents'], errors='coerce') + 
                               pd.to_numeric(df['yes_ask_close_cents'], errors='coerce')) / 2
            
        for col in df.columns:
            if 'cents' in col or 'volume' in col or 'interest' in col or 'mid_price' in col:
                 if col in df.columns: # Check if column exists after potential creation (mid_price)
                    df[col] = pd.to_numeric(df[col], errors='coerce')
        _kalshi_market_data_cache[market_ticker] = df
        return df.copy()
    except Exception as e:
        logger.error(f"FeatureGen: Error loading Kalshi market data from {filepath}: {e}")
        _kalshi_market_data_cache[market_ticker] = None
        return None

# --- Main Feature List ---
# One dict per (market, decision minute); converted to a DataFrame after the loop.
all_feature_records = []

if 'df_outcomes' not in globals() or df_outcomes.empty:
    logger.error("df_outcomes is not defined or is empty. Please run Cell 2 first.")
else:
    logger.info(f"Starting feature generation for {len(df_outcomes)} Kalshi markets.")
    DECISION_OFFSET = globals().get('DECISION_OFFSET_MINUTES_BEFORE_CLOSE', 15) # Default to 15 if not set
    logger.info(f"Feature generation will use DECISION_OFFSET_MINUTES_BEFORE_CLOSE = {DECISION_OFFSET}")
    # Loop-invariant settings, resolved once.
    STALENESS_LIMIT = globals().get('KALSHI_MAX_STALENESS_SECONDS', 120)
    PRICE_CHANGE_WINDOWS = globals().get('KALSHI_PRICE_CHANGE_WINDOWS', [1,3,5,10])
    one_minute = timedelta(minutes=1)
    # get_binance_data_with_features() returns a fresh copy on every call; keep one
    # frame per day here so the per-minute loop does not copy a whole day each minute.
    binance_frames_by_day = {}

    for idx, market_row in tqdm(df_outcomes.iterrows(), total=len(df_outcomes), desc="Processing Kalshi Markets"):
        kalshi_market_ticker = market_row['market_ticker']
        kalshi_strike_price = market_row['kalshi_strike_price']
        kalshi_market_open_dt = market_row['market_open_time_iso']
        kalshi_market_close_dt = market_row['market_close_time_iso']
        target_value = market_row['target_btc_diff_from_strike']

        if pd.isna(kalshi_market_open_dt) or pd.isna(kalshi_market_close_dt):
            continue

        # The ticker supplies the <date>/<hour> directory names of the market's CSV.
        parsed_ticker_info = get_event_resolution_details(kalshi_market_ticker)
        if not parsed_ticker_info: continue
        if not all(k in parsed_ticker_info for k in ["date_str_yymmmdd", "hour_str_edt"]): continue

        date_dir_str = parsed_ticker_info["date_str_yymmmdd"]
        hour_dir_str = parsed_ticker_info["hour_str_edt"]

        df_kalshi_market = load_kalshi_market_data(kalshi_market_ticker, date_dir_str, hour_dir_str)
        if df_kalshi_market is None or df_kalshi_market.empty:
            continue

        # Decision points run minute by minute from market open up to (but excluding)
        # DECISION_OFFSET minutes before close, and never past the close itself.
        latest_permissible_decision_dt_for_features = kalshi_market_close_dt - timedelta(minutes=DECISION_OFFSET)
        decision_cutoff_dt = min(kalshi_market_close_dt, latest_permissible_decision_dt_for_features)

        current_minute_dt = kalshi_market_open_dt
        while current_minute_dt < decision_cutoff_dt:
            decision_point_dt_utc = current_minute_dt
            # Advance up front so every `continue` below moves to the next minute.
            current_minute_dt = current_minute_dt + one_minute

            decision_point_ts_utc = int(decision_point_dt_utc.timestamp())
            # Features use data stamped one minute before the decision point.
            signal_ts_utc = decision_point_ts_utc - 60

            binance_day_str = dt.datetime.fromtimestamp(signal_ts_utc, tz=timezone.utc).strftime("%Y-%m-%d")
            if binance_day_str not in binance_frames_by_day:
                binance_frames_by_day[binance_day_str] = get_binance_data_with_features(binance_day_str)
            df_binance_day_features = binance_frames_by_day[binance_day_str]

            # *** BTC FEATURES ***
            # A record requires a Binance row at exactly signal_ts_utc with a valid close.
            if df_binance_day_features is None or df_binance_day_features.empty:
                continue
            if signal_ts_utc not in df_binance_day_features.index:
                continue
            btc_row = df_binance_day_features.loc[signal_ts_utc]
            btc_price_t_minus_1 = btc_row.get('close')
            if pd.isna(btc_price_t_minus_1):
                continue

            btc_features = {'btc_price_t_minus_1': btc_price_t_minus_1}
            for window in BTC_MOMENTUM_WINDOWS:
                btc_features[f'btc_mom_{window}m'] = btc_row.get(f'btc_mom_{window}m')
            btc_features[f'btc_vol_{BTC_VOLATILITY_WINDOW}m'] = btc_row.get(f'btc_vol_{BTC_VOLATILITY_WINDOW}m')
            for window in BTC_SMA_WINDOWS:
                sma_val = btc_row.get(f'btc_sma_{window}m')
                btc_features[f'btc_sma_{window}m'] = sma_val
                # Price relative to SMA; 1.0 (price at the average) when the SMA is unusable.
                if pd.notna(sma_val) and sma_val != 0:
                    btc_features[f'btc_price_vs_sma_{window}m'] = btc_price_t_minus_1 / sma_val
                else:
                    btc_features[f'btc_price_vs_sma_{window}m'] = 1.0
            for window in BTC_EMA_WINDOWS:
                ema_val = btc_row.get(f'btc_ema_{window}m')
                btc_features[f'btc_ema_{window}m'] = ema_val
                # Price relative to EMA; same fallback as for the SMA ratio.
                if pd.notna(ema_val) and ema_val != 0:
                    btc_features[f'btc_price_vs_ema_{window}m'] = btc_price_t_minus_1 / ema_val
                else:
                    btc_features[f'btc_price_vs_ema_{window}m'] = 1.0
            if BTC_RSI_WINDOW > 0: btc_features['btc_rsi'] = btc_row.get('btc_rsi')
            if BTC_ATR_WINDOW > 0: btc_features[f'btc_atr_{BTC_ATR_WINDOW}'] = btc_row.get(f'btc_atr_{BTC_ATR_WINDOW}')

            # *** KALSHI FEATURES ***
            kalshi_features = {}
            kalshi_mid_price_t_minus_1 = np.nan # stays NaN unless a fresh two-sided quote exists

            # Kalshi rows known at the signal time; the last one is the current quote.
            relevant_kalshi_rows = df_kalshi_market[df_kalshi_market.index <= signal_ts_utc]
            if not relevant_kalshi_rows.empty:
                latest_kalshi_row = relevant_kalshi_rows.iloc[-1]
                latest_kalshi_ts = latest_kalshi_row.name

                # Quotes older than the staleness limit contribute no Kalshi features.
                if (signal_ts_utc - latest_kalshi_ts) <= STALENESS_LIMIT:
                    kalshi_features['kalshi_yes_bid'] = latest_kalshi_row.get('yes_bid_close_cents')
                    kalshi_features['kalshi_yes_ask'] = latest_kalshi_row.get('yes_ask_close_cents')

                    if pd.notna(kalshi_features.get('kalshi_yes_bid')) and pd.notna(kalshi_features.get('kalshi_yes_ask')):
                        kalshi_features['kalshi_spread'] = kalshi_features['kalshi_yes_ask'] - kalshi_features['kalshi_yes_bid']
                        kalshi_mid_price_t_minus_1 = (kalshi_features['kalshi_yes_bid'] + kalshi_features['kalshi_yes_ask']) / 2.0
                        kalshi_features['kalshi_mid_price'] = kalshi_mid_price_t_minus_1

                    # Mid-price change versus the latest row at or before each look-back time.
                    for window in PRICE_CHANGE_WINDOWS:
                        prev_mid_price_ts = signal_ts_utc - (window * 60)
                        prev_mid_rows = df_kalshi_market[df_kalshi_market.index <= prev_mid_price_ts]
                        if not prev_mid_rows.empty and pd.notna(kalshi_mid_price_t_minus_1):
                            # 'mid_price' is only present when the loader could compute it.
                            prev_mid_val = prev_mid_rows.iloc[-1].get('mid_price')
                            if pd.notna(prev_mid_val):
                                kalshi_features[f'kalshi_mid_chg_{window}m'] = kalshi_mid_price_t_minus_1 - prev_mid_val

                    kalshi_features['kalshi_volume_t_minus_1'] = latest_kalshi_row.get('volume')
                    kalshi_features['kalshi_open_interest_t_minus_1'] = latest_kalshi_row.get('open_interest')

                    # Mid-price volatility over the last `window` Kalshi rows (rows, not minutes).
                    if 'mid_price' in df_kalshi_market.columns:
                        kalshi_history_for_vol = relevant_kalshi_rows['mid_price']
                        for window in KALSHI_VOLATILITY_WINDOWS:
                            if len(kalshi_history_for_vol) >= window:
                                kalshi_features[f'kalshi_mid_vol_{window}m'] = kalshi_history_for_vol.tail(window).std()
                            else:
                                kalshi_features[f'kalshi_mid_vol_{window}m'] = np.nan

            # Signed distance of BTC from the strike, in price units.
            kalshi_features['distance_to_strike'] = btc_features['btc_price_t_minus_1'] - kalshi_strike_price

            # The same distance expressed in ATR units.
            current_atr = btc_features.get(f'btc_atr_{BTC_ATR_WINDOW}')
            atr_is_usable = pd.notna(current_atr) and current_atr > 1e-6 # guards division by zero / tiny ATR
            if atr_is_usable:
                kalshi_features['distance_to_strike_norm_atr'] = kalshi_features['distance_to_strike'] / current_atr
            else:
                # NOTE(review): the fallback is the raw distance, which mixes units with the
                # normalised values - consider NaN instead.
                kalshi_features['distance_to_strike_norm_atr'] = kalshi_features['distance_to_strike']

            # Heuristic comparison of the Kalshi mid with a BTC-implied price: 50 cents
            # plus 15 cents per ATR of distance from the strike, capped at +/-45 cents.
            if pd.notna(kalshi_mid_price_t_minus_1) and pd.notna(kalshi_features.get('distance_to_strike_norm_atr')):
                if atr_is_usable:
                    btc_implied_value_offset = np.clip((kalshi_features['distance_to_strike'] / current_atr) * 15, -45, 45)
                    btc_equiv_kalshi_price = 50 + btc_implied_value_offset
                    kalshi_features['kalshi_vs_btc_implied_spread'] = kalshi_mid_price_t_minus_1 - btc_equiv_kalshi_price
                else:
                    kalshi_features['kalshi_vs_btc_implied_spread'] = 0.0

            time_features = {}
            time_features['time_until_market_close_min'] = (kalshi_market_close_dt - decision_point_dt_utc).total_seconds() / 60
            time_features['hour_of_day_utc'] = decision_point_dt_utc.hour
            time_features['day_of_week_utc'] = decision_point_dt_utc.weekday()
            # US Eastern wall-clock hour. A fixed -4h offset is off by one hour outside
            # daylight saving time, so convert with the real zone. The column keeps its
            # existing name for downstream consumers.
            decision_point_dt_edt = decision_point_dt_utc.tz_convert("America/New_York")
            time_features['hour_of_day_edt'] = decision_point_dt_edt.hour

            current_record = {
                'kalshi_market_ticker': kalshi_market_ticker,
                'decision_point_ts_utc': decision_point_ts_utc,
                'kalshi_strike_price': kalshi_strike_price,
                **btc_features, **kalshi_features, **time_features,
                'TARGET_btc_diff_from_strike': target_value
            }
            all_feature_records.append(current_record)

        # Each market is processed once; release its cached frame to bound memory.
        _kalshi_market_data_cache.pop(kalshi_market_ticker, None)

    # Release the per-day Binance frames now that the loop is finished.
    binance_frames_by_day = {}
    _binance_daily_data_with_features_cache = {}

# Materialise the collected records, preview them, and persist them as CSV.
df_features = pd.DataFrame(all_feature_records)
logger.info(f"Generated {len(df_features)} feature records in total after applying decision offset and adding new features.")

if df_features.empty:
    logger.warning("No feature records were generated. Check DECISION_OFFSET and data availability.")
else:
    print("\nSample of generated features (first 5 rows):")
    # Lift the column display limit so every feature shows in the preview.
    with pd.option_context('display.max_columns', None):
        print(df_features.head().to_string())

    # Output goes to a "features" directory next to the notebook's working directory.
    save_dir = Path.cwd().parent / "features"
    save_dir.mkdir(parents=True, exist_ok=True)

    # The file name records the decision offset used and the time of the run.
    features_csv_path = save_dir / f"kalshi_btc_features_target_v2_filtered_{DECISION_OFFSET}m_{dt.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    try:
        df_features.to_csv(features_csv_path, index=False)
    except Exception as e:
        logger.error(f"Error saving features DataFrame to CSV: {e}")
    else:
        logger.info(f"Successfully saved FILTERED features (v2) and target to: {features_csv_path.resolve()}")