In [1]:
# Cell 1: Imports and Setup

import pandas as pd
import numpy as np
import os
from pathlib import Path
import datetime as dt
from datetime import timezone, timedelta
import re
from tqdm.notebook import tqdm
import logging
# For TA features (optional, install if needed: pip install ta)
# import ta 

# --- Logging Setup ---
# The logger name carries the wall-clock time at which this cell was executed.
logger_name = f"feature_engineering_{dt.datetime.now().strftime('%Y%m%d_%H%M%S')}"
logger = logging.getLogger(logger_name)
# INFO is applied whether or not a handler is already attached.
logger.setLevel(logging.INFO)
if len(logger.handlers) == 0:
    # First time this logger name is seen: attach a single stream handler.
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s.%(funcName)s:%(lineno)d - %(message)s')
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)

# --- Configuration ---
# All data paths below are resolved relative to the current working directory,
# i.e. the directory the notebook kernel was started in.
BASE_PROJECT_DIR = Path.cwd()
# If the notebook lives in ./notebooks/ while kalshi_data/ and binance_data/ live at the
# project root, use the Path.cwd().parent variants below instead of the relative paths.

# KALSHI_NTM_DATA_DIR = Path.cwd().parent / "kalshi_data" # Example if notebook is in 'notebooks'
# BINANCE_FLAT_DATA_DIR = Path.cwd().parent / "binance_data" # Example for flat Binance CSVs
KALSHI_NTM_DATA_DIR = Path("./kalshi_data") # kalshi_data next to the working directory
BINANCE_FLAT_DATA_DIR = Path("./binance_data") # binance_data next to the working directory

# Find the latest outcomes CSV
try:
    outcomes_files = list(KALSHI_NTM_DATA_DIR.glob("kalshi_btc_hourly_NTM_filtered_market_outcomes_*.csv"))
    if not outcomes_files:
        raise FileNotFoundError("No NTM outcomes CSV found in KALSHI_NTM_DATA_DIR.")
    # "Latest" is decided by content-modification time. os.path.getctime is NOT creation
    # time on Unix/macOS (it is the last inode/metadata change), so a rename or chmod of an
    # older file could make it win; getmtime only moves when the file content is written.
    KALSHI_OUTCOMES_CSV_PATH = max(outcomes_files, key=os.path.getmtime)
    logger.info(f"Using Kalshi NTM outcomes CSV: {KALSHI_OUTCOMES_CSV_PATH}")
except FileNotFoundError as e:
    # No matching file (or it vanished between glob and stat): leave the path unset so the
    # loading cell can report it.
    logger.critical(str(e))
    KALSHI_OUTCOMES_CSV_PATH = None # Handle this in loading
except Exception as e:
    logger.critical(f"Error finding outcomes CSV: {e}")
    KALSHI_OUTCOMES_CSV_PATH = None

# --- Parameters for Feature Engineering ---
# BTC features. Every window is a number of rows of the 1-minute Binance series,
# so the values are effectively minutes.
BTC_MOMENTUM_WINDOWS = [5, 10, 15, 30] # Look-backs for close-to-close price differences
BTC_VOLATILITY_WINDOW = 15 # Rolling standard deviation of the close
BTC_SMA_WINDOWS = [10, 30] # Short and long simple moving averages
BTC_EMA_WINDOWS = [12, 26] # EMA spans (the classic MACD pair)
BTC_RSI_WINDOW = 14 # RSI averaging window; the RSI column is only built when this is > 0

# Kalshi features
KALSHI_PRICE_CHANGE_WINDOWS = [1, 3, 5] # In minutes; look-backs for the change in mid price

# Decision point: how many minutes before Kalshi market close a prediction is made.
# Intended to match the backtest logic.
# NOTE(review): the feature-generation loop in Cell 3 emits one record per minute of the
# market's life and does not read this constant — confirm whether it is still needed.
DECISION_OFFSET_MINUTES_BEFORE_CLOSE = 5 

# Maximum age (in seconds) of the most recent Kalshi row for it to be used as a quote.
KALSHI_MAX_STALENESS_SECONDS = 120 # 2 minutes

logger.info("Feature Engineering Setup Complete.")
# The path is None when the outcomes CSV lookup above failed.
if not KALSHI_OUTCOMES_CSV_PATH:
    logger.warning("KALSHI_OUTCOMES_CSV_PATH is not set. Data loading will likely fail.")

2025-05-19 12:06:49,374 - INFO - feature_engineering_20250519_120649.<module>:45 - Using Kalshi NTM outcomes CSV: kalshi_data/kalshi_btc_hourly_NTM_filtered_market_outcomes_20250519_014250.csv
2025-05-19 12:06:49,374 - INFO - feature_engineering_20250519_120649.<module>:72 - Feature Engineering Setup Complete.


In [2]:
# Cell 2: Load Kalshi Outcomes and Define Target Variable

# Without a resolved, existing outcomes CSV there is nothing to load: fall back to an
# empty frame so the later cells can detect the situation.
if not (KALSHI_OUTCOMES_CSV_PATH and KALSHI_OUTCOMES_CSV_PATH.exists()):
    logger.critical("Kalshi NTM outcomes CSV path not found or not set. Cannot proceed.")
    df_outcomes = pd.DataFrame()
else:
    try:
        df_outcomes = pd.read_csv(KALSHI_OUTCOMES_CSV_PATH)
        logger.info(f"Loaded {len(df_outcomes)} NTM market outcomes from {KALSHI_OUTCOMES_CSV_PATH}")

        # Parse the three ISO timestamp columns as UTC; unparseable values become NaT.
        df_outcomes = df_outcomes.assign(**{
            time_col: pd.to_datetime(df_outcomes[time_col], errors='coerce', utc=True)
            for time_col in ('event_resolution_time_iso', 'market_open_time_iso', 'market_close_time_iso')
        })

        # Keep only rows that have a ticker, a strike and all three timestamps.
        df_outcomes = df_outcomes.dropna(subset=[
            'market_ticker',
            'event_resolution_time_iso',
            'market_open_time_iso',
            'market_close_time_iso',
            'kalshi_strike_price',
        ])

        logger.info(f"Outcomes DataFrame shape after initial cleaning: {df_outcomes.shape}")
        print("Outcomes DataFrame head:")
        print(df_outcomes.head())
    except Exception as e:
        # Any read/convert failure leaves an empty frame rather than a half-processed one.
        logger.critical(f"Error loading or processing outcomes CSV {KALSHI_OUTCOMES_CSV_PATH}: {e}")
        df_outcomes = pd.DataFrame() # Empty df if load fails

# --- Define Target Variable: BTC_price_at_resolution - kalshi_strike_price ---
# We need the actual BTC price at the Kalshi event_resolution_time_iso.
# This requires loading Binance data for the resolution time of each market.

# Create a cache for daily Binance data to avoid reloading the same day multiple times
_binance_daily_data_cache_for_target = {}

def get_btc_price_at_resolution(resolution_dt_utc: pd.Timestamp) -> float | None:
    """
    Fetches the BTC closing price from the 1-minute candle that contains or immediately precedes
    the Kalshi market's resolution_dt_utc.
    """
    if pd.isna(resolution_dt_utc):
        return None
        
    global _binance_daily_data_cache_for_target
    date_str = resolution_dt_utc.strftime("%Y-%m-%d")
    
    if date_str not in _binance_daily_data_cache_for_target:
        filepath = BINANCE_FLAT_DATA_DIR / f"BTCUSDT-1m-{date_str}.csv"
        if not filepath.exists():
            logger.warning(f"Target: Binance data file not found for {date_str} at {filepath}")
            _binance_daily_data_cache_for_target[date_str] = None # Mark as tried
            return None
        try:
            column_names = ["open_time_raw", "open", "high", "low", "close", "volume",
                            "close_time_ms", "quote_asset_volume", "number_of_trades",
                            "taker_buy_base_asset_volume", "taker_buy_quote_asset_volume", "ignore"]
            df_binance_day = pd.read_csv(filepath, header=None, names=column_names)
            df_binance_day['timestamp_s'] = df_binance_day['open_time_raw'] // 1_000_000
            df_binance_day.set_index('timestamp_s', inplace=True)
            df_binance_day['close'] = pd.to_numeric(df_binance_day['close'])
            _binance_daily_data_cache_for_target[date_str] = df_binance_day
        except Exception as e:
            logger.error(f"Target: Error loading Binance data from {filepath}: {e}")
            _binance_daily_data_cache_for_target[date_str] = None
            return None
            
    df_binance_day_cached = _binance_daily_data_cache_for_target[date_str]
    if df_binance_day_cached is None:
        return None

    # Find the candle whose open_time_s <= resolution_timestamp_s < next_candle_open_time_s
    resolution_timestamp_s = int(resolution_dt_utc.timestamp())
    
    # searchsorted finds where resolution_timestamp_s would be inserted to maintain order.
    # 'right' means if resolution_timestamp_s is an exact match, it gives the index AFTER the match.
    # So, the relevant candle is at index position - 1.
    idx_pos = df_binance_day_cached.index.searchsorted(resolution_timestamp_s, side='right')
    
    if idx_pos == 0:
        # Resolution time is before the first candle of the day. This might happen if
        # resolution is exactly at 00:00:00 UTC and our timestamp logic is off by a bit,
        # or data for the day is missing its first minute.
        # Try to get last candle of previous day.
        prev_day_dt = resolution_dt_utc.date() - timedelta(days=1)
        prev_day_str = prev_day_dt.strftime("%Y-%m-%d")
        # logger.debug(f"Resolution {resolution_dt_utc.isoformat()} at start of day {date_str}, trying previous day {prev_day_str} for target.")
        # Recursively call, but prevent infinite loop with a depth counter or by ensuring it only goes back once.
        # For simplicity here, just return None if it's at the very start. A more robust solution may be needed.
        if resolution_dt_utc.time() < dt.time(0,1,0): # If within first minute of UTC day
             logger.warning(f"Target: Resolution time {resolution_dt_utc.isoformat()} is too early in {date_str}, BTC price might be from previous day or ambiguous. Skipping.")
        return None # Or attempt to load previous day's last minute
        
    btc_price_at_resolution = df_binance_day_cached.iloc[idx_pos - 1]['close']
    # actual_candle_ts = df_binance_day_cached.index[idx_pos - 1]
    # logger.debug(f"Target: For resolution {resolution_dt_utc.isoformat()} ({resolution_timestamp_s}), BTC price is {btc_price_at_resolution} from candle at {actual_candle_ts}")
    return float(btc_price_at_resolution)

# Attach the realised BTC price and the regression target to every outcome row.
if df_outcomes.empty:
    logger.warning("Outcomes DataFrame is empty, cannot calculate target variable.")
else:
    tqdm.pandas(desc="Fetching BTC price at resolution for target")
    df_outcomes['btc_price_at_resolution'] = df_outcomes['event_resolution_time_iso'].progress_apply(get_btc_price_at_resolution)

    # Target = realised BTC price minus the market's strike.
    df_outcomes['target_btc_diff_from_strike'] = df_outcomes['btc_price_at_resolution'].sub(df_outcomes['kalshi_strike_price'])

    # Rows without a resolved BTC price cannot be used for training.
    original_len = len(df_outcomes)
    df_outcomes = df_outcomes.dropna(subset=['btc_price_at_resolution', 'target_btc_diff_from_strike'])
    logger.info(f"Dropped {original_len - len(df_outcomes)} rows due to missing BTC price at resolution for target calculation.")

    logger.info(f"Target variable 'target_btc_diff_from_strike' calculated for {len(df_outcomes)} markets.")
    print("\nOutcomes DataFrame with target variable (head):")
    print(df_outcomes[['market_ticker', 'kalshi_strike_price', 'event_resolution_time_iso', 'btc_price_at_resolution', 'target_btc_diff_from_strike']].head())
    print("\nTarget variable statistics:")
    print(df_outcomes['target_btc_diff_from_strike'].describe())

2025-05-19 12:06:52,682 - INFO - feature_engineering_20250519_120649.<module>:6 - Loaded 9192 NTM market outcomes from kalshi_data/kalshi_btc_hourly_NTM_filtered_market_outcomes_20250519_014250.csv


2025-05-19 12:06:52,690 - INFO - feature_engineering_20250519_120649.<module>:18 - Outcomes DataFrame shape after initial cleaning: (9192, 8)


Outcomes DataFrame head:
                 market_ticker result event_resolution_time_iso  \
0  KXBTCD-25MAY1522-T106249.99     no 2025-05-16 02:00:00+00:00   
1  KXBTCD-25MAY1522-T105999.99     no 2025-05-16 02:00:00+00:00   
2  KXBTCD-25MAY1522-T105749.99     no 2025-05-16 02:00:00+00:00   
3  KXBTCD-25MAY1522-T105499.99     no 2025-05-16 02:00:00+00:00   
4  KXBTCD-25MAY1522-T105249.99     no 2025-05-16 02:00:00+00:00   

   reference_btc_price_for_ntm  kalshi_strike_price      market_open_time_iso  \
0                     103709.1            106249.99 2025-05-16 01:00:00+00:00   
1                     103709.1            105999.99 2025-05-16 01:00:00+00:00   
2                     103709.1            105749.99 2025-05-16 01:00:00+00:00   
3                     103709.1            105499.99 2025-05-16 01:00:00+00:00   
4                     103709.1            105249.99 2025-05-16 01:00:00+00:00   

      market_close_time_iso event_ticker_parent  
0 2025-05-16 02:00:00+00:00    KXBT

Fetching BTC price at resolution for target:   0%|          | 0/9192 [00:00<?, ?it/s]

2025-05-19 12:06:53,076 - INFO - feature_engineering_20250519_120649.<module>:107 - Dropped 0 rows due to missing BTC price at resolution for target calculation.
2025-05-19 12:06:53,076 - INFO - feature_engineering_20250519_120649.<module>:109 - Target variable 'target_btc_diff_from_strike' calculated for 9192 markets.



Outcomes DataFrame with target variable (head):
                 market_ticker  kalshi_strike_price event_resolution_time_iso  \
0  KXBTCD-25MAY1522-T106249.99            106249.99 2025-05-16 02:00:00+00:00   
1  KXBTCD-25MAY1522-T105999.99            105999.99 2025-05-16 02:00:00+00:00   
2  KXBTCD-25MAY1522-T105749.99            105749.99 2025-05-16 02:00:00+00:00   
3  KXBTCD-25MAY1522-T105499.99            105499.99 2025-05-16 02:00:00+00:00   
4  KXBTCD-25MAY1522-T105249.99            105249.99 2025-05-16 02:00:00+00:00   

   btc_price_at_resolution  target_btc_diff_from_strike  
0                 104238.1                     -2011.89  
1                 104238.1                     -1761.89  
2                 104238.1                     -1511.89  
3                 104238.1                     -1261.89  
4                 104238.1                     -1011.89  

Target variable statistics:
count    9192.000000
mean       21.333182
std      1434.677324
min     -3932.710000
25%

In [4]:
# Cell 3: Feature Generation Loop

# --- Helper function to parse Kalshi tickers (needed for directory lookup) ---
# This function was previously in the NTM data download notebook.
# We need it here to correctly locate Kalshi market CSV files.
def get_event_resolution_details(ticker_string: str | None):
    if not ticker_string: return None
    # Pattern for event ticker like KXBTCD-25MAY1523 (less common here, mostly market tickers)
    event_match = re.match(r"^(.*?)-(\d{2}[A-Z]{3}\d{2})(\d{2})$", ticker_string)
    # Pattern for market ticker like KXBTCD-25MAY1523-T104999.99
    market_match = re.match(r"^(.*?)-(\d{2}[A-Z]{3}\d{2})(\d{2})-(T(\d+\.?\d*))$", ticker_string)
    
    match_to_use = market_match if market_match else event_match
    if not match_to_use:
        # logger.debug(f"Ticker {ticker_string} did not match event/market pattern in get_event_resolution_details.")
        return None
        
    groups = match_to_use.groups()
    series, date_str_yymmmdd, hour_str_edt = groups[0], groups[1], groups[2]
    # Strike price is only present in market_match
    strike_price = float(groups[4]) if market_match and len(groups) > 4 and groups[4] else None
    
    try:
        year_int = 2000 + int(date_str_yymmmdd[:2])
        month_str = date_str_yymmmdd[2:5].upper()
        day_int = int(date_str_yymmmdd[5:])
        month_map = {'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
                     'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12}
        month_int = month_map[month_str]
        hour_edt_int = int(hour_str_edt) # This is the closing hour in EDT
        
        # For event_resolution_dt_utc, it's the same logic as before
        event_resolution_dt_naive_edt = dt.datetime(year_int, month_int, day_int, hour_edt_int, 0, 0)
        utc_offset_hours = 4 # Assuming EDT is UTC-4
        event_resolution_dt_utc_aware = event_resolution_dt_naive_edt.replace(tzinfo=timezone(timedelta(hours=-utc_offset_hours)))
        event_resolution_dt_utc = event_resolution_dt_utc_aware.astimezone(timezone.utc)
        
        return {
            "series": series,
            "date_str_yymmmdd": date_str_yymmmdd, # e.g., 25MAY15
            "hour_str_edt": hour_str_edt,         # e.g., 23 (closing hour EDT)
            "strike_price": strike_price,         # Can be None if it was an event_ticker
            "event_resolution_dt_utc": event_resolution_dt_utc
        }
    except Exception as e:
        logger.error(f"Error parsing ticker '{ticker_string}' in get_event_resolution_details: {e}")
        return None

# --- Binance Data Loading and Feature Calculation Helpers ---
# Cache for daily Binance data with pre-calculated TA features
_binance_daily_data_with_features_cache = {}

def get_binance_data_with_features(date_str: str) -> pd.DataFrame | None:
    """
    Loads Binance 1-minute data for a given date_str (YYYY-MM-DD),
    calculates TA features, and caches it.
    Assumes feature configuration variables (BTC_MOMENTUM_WINDOWS, etc.)
    are defined globally (e.g., in Cell 1).
    """
    global _binance_daily_data_with_features_cache # To modify the cache
    
    # Check if configuration variables are globally defined (they should be by Cell 1)
    # If any are missing, log an error and return None as features cannot be calculated.
    required_configs = [
        'BINANCE_FLAT_DATA_DIR', 'BTC_MOMENTUM_WINDOWS', 'BTC_VOLATILITY_WINDOW',
        'BTC_SMA_WINDOWS', 'BTC_EMA_WINDOWS', 'BTC_RSI_WINDOW'
    ]
    for config_var_name in required_configs:
        if config_var_name not in globals():
            logger.error(f"Global configuration variable '{config_var_name}' not defined. Cannot calculate Binance features.")
            return None

    if date_str in _binance_daily_data_with_features_cache:
        cached_df = _binance_daily_data_with_features_cache[date_str]
        return cached_df.copy() if cached_df is not None else None

    filepath = BINANCE_FLAT_DATA_DIR / f"BTCUSDT-1m-{date_str}.csv"
    if not filepath.exists():
        _binance_daily_data_with_features_cache[date_str] = None
        return None
        
    try:
        column_names = ["open_time_raw", "open", "high", "low", "close", "volume",
                        "close_time_ms", "quote_asset_volume", "number_of_trades",
                        "taker_buy_base_asset_volume", "taker_buy_quote_asset_volume", "ignore"]
        df = pd.read_csv(filepath, header=None, names=column_names)
        df['timestamp_s'] = df['open_time_raw'] // 1_000_000
        df.set_index('timestamp_s', inplace=True)
        
        for col in ['open', 'high', 'low', 'close', 'volume']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        df.dropna(subset=['close'], inplace=True) 

        # Access global configuration variables directly
        for window in BTC_MOMENTUM_WINDOWS: # Uses global BTC_MOMENTUM_WINDOWS
            df[f'btc_mom_{window}m'] = df['close'].diff(periods=window)
        df[f'btc_vol_{BTC_VOLATILITY_WINDOW}m'] = df['close'].rolling(window=BTC_VOLATILITY_WINDOW, min_periods=1).std() # Uses global BTC_VOLATILITY_WINDOW
        for window in BTC_SMA_WINDOWS: # Uses global BTC_SMA_WINDOWS
            df[f'btc_sma_{window}m'] = df['close'].rolling(window=window, min_periods=1).mean()
        for window in BTC_EMA_WINDOWS: # Uses global BTC_EMA_WINDOWS
            df[f'btc_ema_{window}m'] = df['close'].ewm(span=window, adjust=False, min_periods=1).mean()
        if BTC_RSI_WINDOW > 0: # Uses global BTC_RSI_WINDOW
            delta = df['close'].diff(1)
            gain = delta.where(delta > 0, 0)
            loss = -delta.where(delta < 0, 0)
            avg_gain = gain.rolling(window=BTC_RSI_WINDOW, min_periods=1).mean()
            avg_loss = loss.rolling(window=BTC_RSI_WINDOW, min_periods=1).mean()
            rs = avg_gain / avg_loss.replace(0, 0.000001) # Avoid division by zero for rs
            df['btc_rsi'] = 100 - (100 / (1 + rs))
            df['btc_rsi'].fillna(50, inplace=True) # Fill initial NaNs/Infs with neutral 50

        _binance_daily_data_with_features_cache[date_str] = df
        return df.copy()
        
    except Exception as e:
        logger.error(f"FeatureGen: Error loading/processing Binance data from {filepath}: {e}")
        _binance_daily_data_with_features_cache[date_str] = None
        return None

# --- Kalshi Data Loading Helper ---
_kalshi_market_data_cache = {}

def load_kalshi_market_data(market_ticker: str, date_str_yymmmdd: str, hour_str_edt: str) -> pd.DataFrame | None:
    global _kalshi_market_data_cache
    
    if 'KALSHI_NTM_DATA_DIR' not in globals():
        logger.error("KALSHI_NTM_DATA_DIR not defined globally. Cannot load Kalshi data.")
        return None

    if market_ticker in _kalshi_market_data_cache:
        cached_df = _kalshi_market_data_cache[market_ticker]
        return cached_df.copy() if cached_df is not None else None
    
    filepath = KALSHI_NTM_DATA_DIR / date_str_yymmmdd / hour_str_edt.zfill(2) / f"{market_ticker}.csv"
    if not filepath.exists():
        _kalshi_market_data_cache[market_ticker] = None
        return None
    try:
        df = pd.read_csv(filepath)
        if df.empty:
            _kalshi_market_data_cache[market_ticker] = pd.DataFrame()
            return pd.DataFrame()
            
        df['timestamp_s'] = pd.to_numeric(df['timestamp_s'])
        df.set_index('timestamp_s', inplace=True)
        for col in df.columns:
            if 'cents' in col or 'volume' in col or 'interest' in col:
                df[col] = pd.to_numeric(df[col], errors='coerce')
        _kalshi_market_data_cache[market_ticker] = df
        return df.copy()
    except Exception as e:
        logger.error(f"FeatureGen: Error loading Kalshi market data from {filepath}: {e}")
        _kalshi_market_data_cache[market_ticker] = None
        return None

# --- Main Feature List ---
# One dict per (market, minute); turned into a DataFrame after the loop.
all_feature_records = []

# Ensure df_outcomes is not empty (should be loaded in Cell 2)
if 'df_outcomes' not in globals() or df_outcomes.empty:
    logger.error("df_outcomes is not defined or is empty. Please run Cell 2 first.")
else:
    logger.info(f"Starting feature generation for {len(df_outcomes)} Kalshi markets.")

    # Outer loop: one Kalshi market per outcomes row.
    for idx, market_row in tqdm(df_outcomes.iterrows(), total=len(df_outcomes), desc="Processing Kalshi Markets"):
        kalshi_market_ticker = market_row['market_ticker']
        kalshi_strike_price = market_row['kalshi_strike_price']
        kalshi_market_open_dt = market_row['market_open_time_iso']
        kalshi_market_close_dt = market_row['market_close_time_iso']
        target_value = market_row['target_btc_diff_from_strike']

        if pd.isna(kalshi_market_open_dt) or pd.isna(kalshi_market_close_dt):
            logger.warning(f"Market {kalshi_market_ticker} has invalid open/close times. Skipping.")
            continue
            
        # The ticker is parsed only to obtain the date/hour directory names of the market CSV.
        parsed_ticker_info = get_event_resolution_details(kalshi_market_ticker)
        if not parsed_ticker_info:
            logger.warning(f"Could not parse {kalshi_market_ticker} for dir info. Skipping market.")
            continue
        if not all(k in parsed_ticker_info for k in ["date_str_yymmmdd", "hour_str_edt"]):
            logger.warning(f"Parsed info for {kalshi_market_ticker} missing date/hour for dir. Parsed: {parsed_ticker_info}. Skipping.")
            continue

        date_dir_str = parsed_ticker_info["date_str_yymmmdd"]
        hour_dir_str = parsed_ticker_info["hour_str_edt"]

        # Markets without a data file (None) or without rows (empty) are skipped silently.
        df_kalshi_market = load_kalshi_market_data(kalshi_market_ticker, date_dir_str, hour_dir_str)
        if df_kalshi_market is None or df_kalshi_market.empty:
            continue

        # Inner loop: one candidate record per minute in [market open, market close).
        current_minute_dt = kalshi_market_open_dt
        while current_minute_dt < kalshi_market_close_dt:
            decision_point_dt_utc = current_minute_dt 
            decision_point_ts_utc = int(decision_point_dt_utc.timestamp())
            # Features are read from the rows stamped one minute before the decision point.
            signal_ts_utc = decision_point_ts_utc - 60
            
            # The Binance daily file is chosen by the UTC date of the signal timestamp.
            signal_dt_utc_obj = dt.datetime.fromtimestamp(signal_ts_utc, tz=timezone.utc)
            binance_day_str = signal_dt_utc_obj.strftime("%Y-%m-%d")
            df_binance_day_features = get_binance_data_with_features(binance_day_str)

            btc_features = {}
            if df_binance_day_features is not None and not df_binance_day_features.empty:
                # Only an exact timestamp match is accepted; there is no nearest-candle fallback.
                if signal_ts_utc in df_binance_day_features.index:
                    btc_row = df_binance_day_features.loc[signal_ts_utc]
                    btc_features['btc_price_t_minus_1'] = btc_row['close']
                    # Copy the pre-computed TA columns for the configured windows.
                    for window in BTC_MOMENTUM_WINDOWS: btc_features[f'btc_mom_{window}m'] = btc_row.get(f'btc_mom_{window}m')
                    btc_features[f'btc_vol_{BTC_VOLATILITY_WINDOW}m'] = btc_row.get(f'btc_vol_{BTC_VOLATILITY_WINDOW}m')
                    for window in BTC_SMA_WINDOWS: btc_features[f'btc_sma_{window}m'] = btc_row.get(f'btc_sma_{window}m')
                    for window in BTC_EMA_WINDOWS: btc_features[f'btc_ema_{window}m'] = btc_row.get(f'btc_ema_{window}m')
                    if BTC_RSI_WINDOW > 0: btc_features['btc_rsi'] = btc_row.get('btc_rsi')
            
            # No BTC price for this minute -> no record; advance to the next minute.
            if 'btc_price_t_minus_1' not in btc_features or pd.isna(btc_features['btc_price_t_minus_1']):
                current_minute_dt += timedelta(minutes=1)
                continue
            
            kalshi_features = {}
            # Latest Kalshi row at or before the signal timestamp.
            # NOTE(review): taking .iloc[-1] presumes the index is in ascending time order — confirm.
            relevant_kalshi_rows = df_kalshi_market[df_kalshi_market.index <= signal_ts_utc]
            if not relevant_kalshi_rows.empty:
                latest_kalshi_row = relevant_kalshi_rows.iloc[-1]
                latest_kalshi_ts = latest_kalshi_row.name
                
                # Fallback default if the configuration cell was not run.
                if 'KALSHI_MAX_STALENESS_SECONDS' not in globals(): KALSHI_MAX_STALENESS_SECONDS = 120
                
                # Quote features are only emitted when the latest row is fresh enough.
                if (signal_ts_utc - latest_kalshi_ts) <= KALSHI_MAX_STALENESS_SECONDS:
                    kalshi_features['kalshi_yes_bid'] = latest_kalshi_row.get('yes_bid_close_cents')
                    kalshi_features['kalshi_yes_ask'] = latest_kalshi_row.get('yes_ask_close_cents')
                    
                    if pd.notna(kalshi_features.get('kalshi_yes_bid')) and pd.notna(kalshi_features.get('kalshi_yes_ask')):
                        kalshi_features['kalshi_spread'] = kalshi_features['kalshi_yes_ask'] - kalshi_features['kalshi_yes_bid']
                        kalshi_features['kalshi_mid_price'] = (kalshi_features['kalshi_yes_bid'] + kalshi_features['kalshi_yes_ask']) / 2
                    
                    # Fallback default if the configuration cell was not run.
                    if 'KALSHI_PRICE_CHANGE_WINDOWS' not in globals(): KALSHI_PRICE_CHANGE_WINDOWS = [1,3,5]

                    # Mid-price change versus the latest row at or before (signal - window minutes).
                    # NOTE(review): the reference row is not subject to the staleness limit above.
                    for window in KALSHI_PRICE_CHANGE_WINDOWS:
                        prev_mid_price_ts = signal_ts_utc - (window * 60)
                        prev_mid_rows = df_kalshi_market[df_kalshi_market.index <= prev_mid_price_ts]
                        if not prev_mid_rows.empty and 'kalshi_mid_price' in kalshi_features and pd.notna(kalshi_features.get('kalshi_mid_price')):
                            prev_mid_latest_row = prev_mid_rows.iloc[-1]
                            prev_yes_bid = prev_mid_latest_row.get('yes_bid_close_cents')
                            prev_yes_ask = prev_mid_latest_row.get('yes_ask_close_cents')
                            if pd.notna(prev_yes_bid) and pd.notna(prev_yes_ask):
                                prev_mid = (prev_yes_bid + prev_yes_ask) / 2
                                kalshi_features[f'kalshi_mid_chg_{window}m'] = kalshi_features['kalshi_mid_price'] - prev_mid
                    
                    kalshi_features['kalshi_volume_t_minus_1'] = latest_kalshi_row.get('volume')
                    kalshi_features['kalshi_open_interest_t_minus_1'] = latest_kalshi_row.get('open_interest')

            # Always present, even when no fresh Kalshi quote was found: the record is still
            # appended and its quote keys are simply absent.
            kalshi_features['distance_to_strike'] = btc_features['btc_price_t_minus_1'] - kalshi_strike_price

            time_features = {}
            time_features['time_until_market_close_min'] = (kalshi_market_close_dt - decision_point_dt_utc).total_seconds() / 60
            time_features['hour_of_day_utc'] = decision_point_dt_utc.hour 
            time_features['day_of_week_utc'] = decision_point_dt_utc.weekday()
            # NOTE(review): fixed UTC-4 offset; this is not adjusted for standard time (UTC-5).
            decision_point_dt_edt = decision_point_dt_utc.astimezone(timezone(timedelta(hours=-4)))
            time_features['hour_of_day_edt'] = decision_point_dt_edt.hour

            # Later dicts win on key collisions; the target goes last.
            current_record = {
                'kalshi_market_ticker': kalshi_market_ticker,
                'decision_point_ts_utc': decision_point_ts_utc,
                'kalshi_strike_price': kalshi_strike_price,
                **btc_features, **kalshi_features, **time_features,
                'TARGET_btc_diff_from_strike': target_value
            }
            all_feature_records.append(current_record)
            current_minute_dt += timedelta(minutes=1)

        # The market's frame is dropped from the cache as soon as the market is finished.
        if kalshi_market_ticker in _kalshi_market_data_cache: # Check before popping
            _kalshi_market_data_cache.pop(kalshi_market_ticker, None)
    
    # Release the per-day Binance feature frames once all markets are processed.
    _binance_daily_data_with_features_cache = {}

# Collect every per-minute record into a single frame.
df_features = pd.DataFrame(all_feature_records)
logger.info(f"Generated {len(df_features)} feature records in total.")

if df_features.empty:
    logger.warning("No feature records were generated.")
else:
    print("\nSample of generated features (first 5 rows):")
    print(df_features.head().to_string())

    # The CSV is written to the current working directory under a timestamped name.
    save_dir = Path.cwd()
    features_csv_path = save_dir / f"kalshi_btc_features_target_v1_{dt.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    try:
        df_features.to_csv(features_csv_path, index=False)
    except Exception as e:
        logger.error(f"Error saving features DataFrame to CSV: {e}")
    else:
        logger.info(f"Successfully saved features and target to: {features_csv_path}")

2025-05-19 12:08:43,654 - INFO - feature_engineering_20250519_120649.<module>:163 - Starting feature generation for 9192 Kalshi markets.


Processing Kalshi Markets:   0%|          | 0/9192 [00:00<?, ?it/s]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['btc_rsi'].fillna(50, inplace=True) # Fill initial NaNs/Infs with neutral 50
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['btc_rsi'].fillna(50, inplace=True) # Fill initial NaNs/Infs with neutral 50
The behavior will change in pandas 3.0. This inplace method will never w


Sample of generated features (first 5 rows):
          kalshi_market_ticker  decision_point_ts_utc  kalshi_strike_price  btc_price_t_minus_1  btc_mom_5m  btc_mom_10m  btc_mom_15m  btc_mom_30m  btc_vol_15m  btc_sma_10m    btc_sma_30m    btc_ema_12m    btc_ema_26m    btc_rsi  distance_to_strike  time_until_market_close_min  hour_of_day_utc  day_of_week_utc  hour_of_day_edt  TARGET_btc_diff_from_strike  kalshi_yes_bid  kalshi_yes_ask  kalshi_spread  kalshi_mid_price  kalshi_volume_t_minus_1  kalshi_open_interest_t_minus_1  kalshi_mid_chg_1m  kalshi_mid_chg_3m  kalshi_mid_chg_5m
0  KXBTCD-25MAY1522-T106249.99             1747357200            106249.99            103764.81       73.29       -69.79      -182.17        22.80    95.991753   103732.700  103868.674000  103766.593435  103814.379274  30.460910            -2485.18                         60.0                1                4               21                     -2011.89             NaN             NaN            NaN             

2025-05-19 12:16:09,886 - INFO - feature_engineering_20250519_120649.<module>:284 - Successfully saved features and target to: /Users/omarabul-hassan/Desktop/projects/kalshi/notebooks/kalshi_btc_features_target_v1_20250519_121558.csv
