In [10]:
# Cell 1: Imports and Setup

import pandas as pd
import numpy as np
from pathlib import Path
import datetime as dt
from datetime import timezone, timedelta
import logging
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import re 

# --- Path Setup for Imports ---
current_notebook_path = Path.cwd() 
project_root_level1 = current_notebook_path.parent 
project_root_level2 = project_root_level1.parent   

if str(project_root_level1) not in sys.path: sys.path.insert(0, str(project_root_level1))
if str(project_root_level2) not in sys.path: sys.path.insert(0, str(project_root_level2))

from live_backtester import live_utils 

# --- Basic Logging Setup for this Notebook ---
logger_understanding = logging.getLogger("live_data_understanding_iter") # New logger name
if not logger_understanding.handlers:
    logger_understanding.setLevel(logging.INFO) # Set to DEBUG for very detailed logs
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s')
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger_understanding.addHandler(ch)
else:
    logger_understanding.setLevel(logging.INFO)

# --- Data Directories ---
LIVE_DATA_ROOT_DIR = project_root_level1 
KALSHI_LIVE_LOGS_DIR = LIVE_DATA_ROOT_DIR / "market_data_logs"
BINANCE_LIVE_LOGS_DIR = LIVE_DATA_ROOT_DIR / "binance_market_data_logs"

# --- Plotting Style ---
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("muted")

logger_understanding.info("Setup complete for iterative understanding.ipynb")
logger_understanding.info(f"Kalshi live logs: {KALSHI_LIVE_LOGS_DIR.resolve()}")
logger_understanding.info(f"Binance live logs: {BINANCE_LIVE_LOGS_DIR.resolve()}")

2025-05-22 21:36:06,610 - INFO - live_data_understanding_iter - Setup complete for iterative understanding.ipynb
2025-05-22 21:36:06,611 - INFO - live_data_understanding_iter - Kalshi live logs: /Users/omarabul-hassan/Desktop/projects/kalshi/random/market_data_logs
2025-05-22 21:36:06,612 - INFO - live_data_understanding_iter - Binance live logs: /Users/omarabul-hassan/Desktop/projects/kalshi/random/binance_market_data_logs


In [11]:
# Cell 2: Define Session Mapping and Iteration Parameters

# --- Comprehensive Session to Binance File Map ---
# Based on your ls output and the assumption of incrementing numbers.
# Verify these carefully!
session_to_binance_file_map = {
    "25MAY1920": "btcusdt_kline_1m.csv",    # Assuming this is #1
    "25MAY2015": "btcusdt_2kline_1m.csv",
    "25MAY2016": "btcusdt_3kline_1m.csv",
    "25MAY2017": "btcusdt_4kline_1m.csv",
    "25MAY2018": "btcusdt_5kline_1m.csv",
    "25MAY2019": "btcusdt_6kline_1m.csv",
    "25MAY2117": "btcusdt_7kline_1m.csv",   # May 21st sessions start here
    "25MAY2118": "btcusdt_8kline_1m.csv",
    "25MAY2119": "btcusdt_9kline_1m.csv",
    "25MAY2120": "btcusdt_10kline_1m.csv",
    "25MAY2121": "btcusdt_11kline_1m.csv",
    "25MAY2122": "btcusdt_12kline_1m.csv",
    "25MAY2123": "btcusdt_13kline_1m.csv",
    "25MAY2219": "btcusdt_14kline_1m.csv",  # May 22nd sessions start here
    "25MAY2221": "btcusdt_15kline_1m.csv", # Note: Your ls shows 1801, 1802 files for this, check which binance file
    "25MAY2222": "btcusdt_16kline_1m.csv",
    "25MAY2223": "btcusdt_17kline_1m.csv",
    "25MAY2300": "btcusdt_18kline_1m.csv"   # May 23rd session
}
# You might need to manually create this full map by checking your collection script's logs
# or the start/end times within the Binance CSVs against the Kalshi session hours.

SESSIONS_TO_CHECK = list(session_to_binance_file_map.keys())
# SESSIONS_TO_CHECK = ["25MAY2223"] # For testing with a single session initially

# For each Kalshi session, how many market files to sample?
NUM_KALSHI_MARKETS_TO_SAMPLE_PER_SESSION = 2 
# For each sampled Kalshi market, how many decision points to check?
NUM_DECISION_POINTS_TO_SAMPLE_PER_MARKET = 5 

logger_understanding.info(f"Will iterate through {len(SESSIONS_TO_CHECK)} sessions.")
logger_understanding.info(f"For each session, will sample up to {NUM_KALSHI_MARKETS_TO_SAMPLE_PER_SESSION} Kalshi market files.")
logger_understanding.info(f"For each market, will sample up to {NUM_DECISION_POINTS_TO_SAMPLE_PER_MARKET} decision points for lookahead check.")

# Load the market outcomes file to get resolution times (needed by live_strategy's feature gen)
outcomes_file_path = LIVE_DATA_ROOT_DIR / "live_backtester" / "live_sessions_market_outcomes.csv"
if outcomes_file_path.exists():
    df_all_market_outcomes = pd.read_csv(outcomes_file_path)
    logger_understanding.info(f"Loaded outcomes file from {outcomes_file_path}")
else:
    logger_understanding.error(f"Outcomes file not found: {outcomes_file_path}. Some checks might be limited.")
    df_all_market_outcomes = pd.DataFrame()

# For live_strategy.py, it needs its artifacts loaded.
# We are not running the full strategy here, but if generate_features_from_live_data uses globals from it:
# from live_backtester import live_strategy # Already done in live_utils if structure is correct
# MOCK_MODEL_ARTIFACTS_DIR = project_root_level2 / "notebooks" / "trained_models" / "logreg_per_minute" # Example
# if not live_strategy.load_classifier_artifacts(MOCK_MODEL_ARTIFACTS_DIR): # Or whichever model type your live_strategy is set up for
#    logger_understanding.warning(f"Could not load mock model artifacts for live_strategy. Feature generation might fail if needed.")

# The live_strategy.py's generate_features_from_live_data uses TA params from live_utils,
# which itself tries to import from notebooks.utils. Ensure that path is also correct if needed.
# For this EDA, we might not need to call generate_features_from_live_data directly,
# but focus on the timestamps used by it.

2025-05-22 21:36:59,385 - INFO - live_data_understanding_iter - Will iterate through 18 sessions.
2025-05-22 21:36:59,386 - INFO - live_data_understanding_iter - For each session, will sample up to 2 Kalshi market files.
2025-05-22 21:36:59,386 - INFO - live_data_understanding_iter - For each market, will sample up to 5 decision points for lookahead check.
2025-05-22 21:36:59,391 - INFO - live_data_understanding_iter - Loaded outcomes file from /Users/omarabul-hassan/Desktop/projects/kalshi/random/live_backtester/live_sessions_market_outcomes.csv


In [12]:
# Cell 3: Iterative Analysis Loop

lookahead_issues_found = 0
total_decision_points_checked = 0

for session_key in SESSIONS_TO_CHECK:
    logger_understanding.info(f"\n--- Analyzing Session: {session_key} ---")
    
    binance_csv_name = session_to_binance_file_map.get(session_key)
    if not binance_csv_name:
        logger_understanding.warning(f"No Binance file mapped for session {session_key}. Skipping.")
        continue
        
    binance_file_path = BINANCE_LIVE_LOGS_DIR / binance_csv_name
    if not binance_file_path.exists():
        logger_understanding.warning(f"Binance file {binance_file_path} not found for session {session_key}. Skipping.")
        continue

    # Load Binance data for the session
    # df_binance_all_closed_klines index is kline_start_time_s (seconds)
    # It also has 'kline_start_dt_utc' column (pd.Timestamp UTC)
    df_binance_session_klines = live_utils.load_live_binance_csv_and_extract_closed_klines(binance_file_path)
    if df_binance_session_klines is None or df_binance_session_klines.empty:
        logger_understanding.warning(f"No Binance kline data loaded for {binance_csv_name}. Skipping session {session_key}.")
        continue
    
    # Get list of Kalshi market files for this session
    kalshi_session_pattern = f"*{session_key}-T*.csv" # More specific if needed: f"KXBTCD-{session_key}-T*.csv"
    kalshi_market_files_for_session = sorted(list(KALSHI_LIVE_LOGS_DIR.glob(kalshi_session_pattern)))
    
    if not kalshi_market_files_for_session:
        logger_understanding.warning(f"No Kalshi market files found for session pattern '{kalshi_session_pattern}'. Skipping session.")
        continue

    sampled_kalshi_files = kalshi_market_files_for_session[:NUM_KALSHI_MARKETS_TO_SAMPLE_PER_SESSION]

    for kalshi_csv_path in sampled_kalshi_files:
        market_ticker = kalshi_csv_path.stem
        logger_understanding.info(f"  -- Market: {market_ticker} --")
        
        df_kalshi_market = live_utils.load_live_kalshi_csv(kalshi_csv_path)
        if df_kalshi_market is None or df_kalshi_market.empty:
            logger_understanding.warning(f"    No data in Kalshi file {kalshi_csv_path.name}. Skipping market.")
            continue

        # Sample decision points from this Kalshi market's data
        # Kalshi df_kalshi_market.index is 'timestamp_utc' (pd.Timestamp UTC) of the Kalshi message
        # We want to simulate decisions made *at these Kalshi message times*.
        
        num_points_to_sample = min(NUM_DECISION_POINTS_TO_SAMPLE_PER_MARKET, len(df_kalshi_market))
        if num_points_to_sample == 0 : continue

        # Sample some decision points (can be random or first/last N)
        # Let's take some points spread out, or just the first N for simplicity
        sampled_indices = np.linspace(0, len(df_kalshi_market) - 1, num_points_to_sample, dtype=int)
        
        for i in sampled_indices:
            decision_dt_utc = df_kalshi_market.index[i] # This is the time of the Kalshi event (our decision_dt_utc)
            total_decision_points_checked += 1
            
            # --- This mimics the logic in live_strategy.generate_features_from_live_data ---
            # The Binance features are based on the kline that CLOSED at or before (decision_dt_utc - 1 minute)
            signal_features_dt_utc = (decision_dt_utc - timedelta(minutes=1)).replace(second=0, microsecond=0)
            signal_features_ts_s = int(signal_features_dt_utc.timestamp())
            # -----------------------------------------------------------------------------

            # Find the Binance kline data that would be used based on signal_features_ts_s
            # df_binance_session_klines is indexed by kline_start_time_s
            if signal_features_ts_s in df_binance_session_klines.index:
                binance_kline_used_for_features = df_binance_session_klines.loc[signal_features_ts_s]
                
                kline_start_dt_utc = binance_kline_used_for_features['kline_start_dt_utc']
                kline_assumed_close_dt_utc = kline_start_dt_utc + timedelta(minutes=1) # kline from T to T+1 closes at T+1
                
                logger_understanding.debug(f"    Decision @ {decision_dt_utc.isoformat()}:")
                logger_understanding.debug(f"      Signal Features Time (target for Binance kline START): {signal_features_dt_utc.isoformat()} (ts: {signal_features_ts_s})")
                logger_understanding.debug(f"      Binance Kline Used: Start={kline_start_dt_utc.isoformat()}, Assumed Close={kline_assumed_close_dt_utc.isoformat()}")

                # THE CRITICAL CHECK:
                # The kline data used for features (starting at kline_start_dt_utc)
                # must have its information fully available (i.e., it must have closed)
                # *before or at* the `signal_features_dt_utc`.
                # A kline starting at `kline_start_dt_utc` is considered "closed" and its data final at `kline_start_dt_utc + 1 minute`.
                # So, `kline_start_dt_utc + 1 minute` must be `<= signal_features_dt_utc`.
                
                if kline_assumed_close_dt_utc > signal_features_dt_utc:
                    logger_understanding.error(f"    POTENTIAL LOOKAHEAD BIAS for Market {market_ticker}:")
                    logger_understanding.error(f"      Decision Time (Kalshi msg): {decision_dt_utc.isoformat()}")
                    logger_understanding.error(f"      Binance Feature Signal Time:  {signal_features_dt_utc.isoformat()}")
                    logger_understanding.error(f"      Binance Kline Used: Starts at {kline_start_dt_utc.isoformat()}, Assumed Closes at {kline_assumed_close_dt_utc.isoformat()}")
                    logger_understanding.error(f"      ISSUE: Kline close ({kline_assumed_close_dt_utc.isoformat()}) is AFTER signal time ({signal_features_dt_utc.isoformat()})")
                    lookahead_issues_found += 1
                # Additional check: the `signal_features_dt_utc` itself (derived from `decision_dt_utc`) should not be too far in the future
                # relative to the data we actually have. The `load_live_binance_csv_and_extract_closed_klines` helps ensure we use final klines.

            else:
                logger_understanding.warning(f"    No Binance kline found with start_ts = {signal_features_ts_s} for decision @ {decision_dt_utc.isoformat()}")

logger_understanding.info(f"\n--- Iterative Lookahead Check Complete ---")
logger_understanding.info(f"Total decision points checked: {total_decision_points_checked}")
if lookahead_issues_found > 0:
    logger_understanding.error(f"CRITICAL: Found {lookahead_issues_found} potential lookahead bias instances!")
else:
    logger_understanding.info("No obvious lookahead bias instances found based on kline timing relative to signal feature time.")

2025-05-22 21:37:13,486 - INFO - live_data_understanding_iter - 
--- Analyzing Session: 25MAY1920 ---
2025-05-22 21:37:13,505 - INFO - live_data_understanding_iter -   -- Market: KXBTCD-25MAY1920-T105499.99 --
2025-05-22 21:37:13,733 - ERROR - live_data_understanding_iter -     POTENTIAL LOOKAHEAD BIAS for Market KXBTCD-25MAY1920-T105499.99:
2025-05-22 21:37:13,733 - ERROR - live_data_understanding_iter -       Decision Time (Kalshi msg): 2025-05-19T23:05:05.798010+00:00
2025-05-22 21:37:13,734 - ERROR - live_data_understanding_iter -       Binance Feature Signal Time:  2025-05-19T23:04:00+00:00
2025-05-22 21:37:13,734 - ERROR - live_data_understanding_iter -       Binance Kline Used: Starts at 2025-05-19T23:04:00+00:00, Assumed Closes at 2025-05-19T23:05:00+00:00
2025-05-22 21:37:13,734 - ERROR - live_data_understanding_iter -       ISSUE: Kline close (2025-05-19T23:05:00+00:00) is AFTER signal time (2025-05-19T23:04:00+00:00)
2025-05-22 21:37:13,735 - ERROR - live_data_understanding_