In [3]:
import requests
import os
import time
import datetime as dt
from datetime import timezone, timedelta
import base64
import json
from pathlib import Path
from cryptography.hazmat.primitives import serialization, hashes
from cryptography.hazmat.primitives.asymmetric import padding, rsa
import pandas as pd
from tqdm.notebook import tqdm # Use tqdm.notebook for Jupyter
from dotenv import load_dotenv
import logging

# --- Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(module)s - %(message)s')
logger = logging.getLogger("kalshi_historical_fetcher_notebook")

# --- Configuration ---
load_dotenv()
KALSHI_API_KEY_ID = os.getenv("KALSHI_API_KEY_ID")
KALSHI_PRIVATE_KEY_PATH = os.getenv("KALSHI_PRIVATE_KEY_PATH")

IS_DEMO_MODE = os.getenv("KALSHI_DEMO_MODE", "false").lower() == "true"
if IS_DEMO_MODE:
    logger.info("KALSHI: Running in DEMO mode.")
    KALSHI_BASE_URL = "https://demo-api.kalshi.co"
    # ... (demo key handling if needed) ...
else:
    logger.info("KALSHI: Running in PRODUCTION mode.")
    KALSHI_BASE_URL = "https://api.elections.kalshi.com"

# Auth functions (load_private_key, sign_pss_text, get_kalshi_auth_headers)
# These are the same as in your historical.py. Copy them here.
# ... (paste your auth functions here) ...
def load_private_key(file_path: str) -> rsa.RSAPrivateKey | None:
    try:
        with open(file_path, "rb") as key_file:
            private_key = serialization.load_pem_private_key(key_file.read(), password=None)
        return private_key
    except FileNotFoundError:
        logger.error(f"Private key file not found: {file_path}")
        return None
    except Exception as e:
        logger.error(f"Error loading private key from {file_path}: {e}")
        return None

def sign_pss_text(private_key: rsa.RSAPrivateKey, text: str) -> str | None:
    message = text.encode('utf-8')
    try:
        signature = private_key.sign(
            message, padding.PSS(mgf=padding.MGF1(hashes.SHA256()), salt_length=padding.PSS.DIGEST_LENGTH),
            hashes.SHA256()
        )
        return base64.b64encode(signature).decode('utf-8')
    except Exception as e:
        logger.error(f"Error during signing: {e}")
        return None

def get_kalshi_auth_headers(method: str, path: str, private_key: rsa.RSAPrivateKey, key_id: str) -> dict | None:
    timestamp_ms_str = str(int(time.time() * 1000))
    message_to_sign = timestamp_ms_str + method.upper() + path
    signature = sign_pss_text(private_key, message_to_sign)
    if signature is None: return None
    return {
        'accept': 'application/json',
        'KALSHI-ACCESS-KEY': key_id,
        'KALSHI-ACCESS-SIGNATURE': signature,
        'KALSHI-ACCESS-TIMESTAMP': timestamp_ms_str
    }

# Initialize private key (do this once)
private_key_global = None
if KALSHI_API_KEY_ID and KALSHI_PRIVATE_KEY_PATH:
    private_key_global = load_private_key(KALSHI_PRIVATE_KEY_PATH)
    if not private_key_global:
        logger.critical("Failed to load private key. Further API calls will fail.")
else:
    logger.critical("Kalshi API Key ID or Private Key Path not set in .env file.")

2025-05-15 02:04:40,805 - INFO - 3611112791 - KALSHI: Running in PRODUCTION mode.


In [4]:
def get_events_with_markets(series_ticker: str, target_date_str: str, status: str = "settled") -> list:
    """
    Fetches events for a given series and date, including their nested markets.
    target_date_str should be YYYY-MM-DD for the day the markets CLOSED (EDT).
    """
    if not private_key_global:
        logger.error("Private key not loaded. Cannot fetch events.")
        return []

    target_dt_edt_day_start = dt.datetime.strptime(target_date_str, "%Y-%m-%d")
    
    # Kalshi uses EDT for its daily event grouping.
    # min_close_ts: start of target_date_str EDT (which is 04:00 UTC on target_date_str)
    # max_close_ts: end of target_date_str EDT (which is 03:59:59 UTC on target_date_str + 1 day)
    
    min_close_dt_edt = target_dt_edt_day_start.replace(hour=0, minute=0, second=0)
    max_close_dt_edt = target_dt_edt_day_start.replace(hour=23, minute=59, second=59)

    # Convert EDT to UTC for API query
    # EDT is UTC-4. So, 00:00 EDT is 04:00 UTC. 23:59 EDT is 03:59 UTC NEXT DAY.
    # It's usually easier to think of the UTC day range for the API.
    # If target_date_str is "2025-05-15" (EDT):
    #   - Markets closing at 00:00 EDT on May 15 are closing at 04:00 UTC on May 15.
    #   - Markets closing at 23:00 EDT on May 15 are closing at 03:00 UTC on May 16.
    # So, min_close_ts (UTC) would be start of May 15 04:00 UTC
    # And max_close_ts (UTC) would be start of May 16 04:00 UTC (to catch everything up to 23:59:59 EDT)
    
    # Let's define the UTC window based on the target EDT day
    # Start of the EDT day in UTC
    min_close_ts_utc = int(dt.datetime(target_dt_edt_day_start.year, target_dt_edt_day_start.month, target_dt_edt_day_start.day, 0, 0, 0, tzinfo=dt.timezone(dt.timedelta(hours=-4))).timestamp())
    # End of the EDT day in UTC (start of next EDT day in UTC)
    max_close_ts_utc = int((dt.datetime(target_dt_edt_day_start.year, target_dt_edt_day_start.month, target_dt_edt_day_start.day, 0, 0, 0, tzinfo=dt.timezone(dt.timedelta(hours=-4))) + dt.timedelta(days=1)).timestamp()) -1 # up to end of day

    logger.info(f"Querying events for series {series_ticker} that closed on {target_date_str} EDT.")
    logger.info(f"UTC close timestamp range: {min_close_ts_utc} to {max_close_ts_utc}")

    all_events = []
    cursor = None
    api_path_template = "/trade-api/v2/events"
    
    while True:
        params = {
            "series_ticker": series_ticker,
            "status": status,
            "min_close_ts": min_close_ts_utc,
            "max_close_ts": max_close_ts_utc,
            "with_nested_markets": "true", # Crucial!
            "limit": 100 # Max page size seems to be 200 based on docs, 100 is safe
        }
        if cursor:
            params["cursor"] = cursor

        auth_headers = get_kalshi_auth_headers("GET", api_path_template, private_key_global, KALSHI_API_KEY_ID)
        if not auth_headers: break

        try:
            response = requests.get(f"{KALSHI_BASE_URL}{api_path_template}", headers=auth_headers, params=params)
            logger.debug(f"Request URL: {response.url}")
            response.raise_for_status()
            data = response.json()
            
            events_on_page = data.get("events", [])
            if events_on_page:
                all_events.extend(events_on_page)
                logger.info(f"Fetched {len(events_on_page)} events. Total so far: {len(all_events)}")
            
            cursor = data.get("cursor")
            if not cursor or not events_on_page: # No more pages or no events on this page
                break
            time.sleep(0.5) # Rate limit
        except requests.exceptions.HTTPError as e:
            logger.error(f"HTTP Error fetching events: {e.response.status_code} - {e.response.text}")
            break
        except Exception as e:
            logger.exception(f"Error fetching events: {e}")
            break
            
    logger.info(f"Total events found for {series_ticker} on {target_date_str} (EDT): {len(all_events)}")
    return all_events

In [5]:
# Candlestick fetching function (fetch_and_parse_kalshi_candlesticks)
# This is the same as in your historical.py. Copy it here.
# Make sure it takes private_key and key_id as arguments if they are not global in the notebook.
# ... (paste your fetch_and_parse_kalshi_candlesticks function here) ...
def fetch_and_parse_kalshi_candlesticks(
    series_ticker: str, market_ticker: str, start_dt_utc: dt.datetime, end_dt_utc: dt.datetime,
    period_minutes: int, private_key: rsa.RSAPrivateKey, key_id: str
) -> pd.DataFrame | None:
    # ... (same logic as your working historical.py) ...
    all_candlesticks_processed = []
    start_ts_s = int(start_dt_utc.timestamp())
    end_ts_s = int(end_dt_utc.timestamp())
    current_start_ts = start_ts_s
    
    # logger.info(f"Fetching candlesticks for {market_ticker}")
    # logger.info(f"Target Period: {start_dt_utc.strftime('%Y-%m-%d %H:%M:%S %Z')} to {end_dt_utc.strftime('%Y-%m-%d %H:%M:%S %Z')}")
    
    total_duration_seconds = max(0, end_ts_s - start_ts_s)
    total_expected_intervals = (total_duration_seconds // (period_minutes * 60)) +1 if total_duration_seconds >=0 else 0

    # Reset for each market if it's a static attribute of the function
    fetch_and_parse_kalshi_candlesticks.first_candle_logged_this_market = False


    with tqdm(total=total_expected_intervals, desc=f"Candles: {market_ticker.split('-T-')[-1] if '-T-' in market_ticker else market_ticker}", leave=False) as pbar:
        while current_start_ts <= end_ts_s: 
            chunk_end_ts = min(end_ts_s, current_start_ts + (MAX_PERIODS_PER_REQUEST -1) * period_minutes * 60)
            api_path = f"/trade-api/v2/series/{series_ticker}/markets/{market_ticker}/candlesticks"
            params = {"start_ts": current_start_ts, "end_ts": chunk_end_ts, "period_interval": period_minutes}
            headers = get_kalshi_auth_headers("GET", api_path, private_key, key_id)
            if headers is None: return None
            try:
                response = requests.get(f"{KALSHI_BASE_URL}{api_path}", headers=headers, params=params)
                response.raise_for_status()
                api_response_data = response.json()
                
                if not fetch_and_parse_kalshi_candlesticks.first_candle_logged_this_market and api_response_data.get("candlesticks"):
                    logger.debug("Structure of the first candlestick received (this market):")
                    logger.debug(json.dumps(api_response_data["candlesticks"][0], indent=2))
                    fetch_and_parse_kalshi_candlesticks.first_candle_logged_this_market = True

                candlesticks_from_api = api_response_data.get("candlesticks", [])
                if candlesticks_from_api:
                    for candle_data in candlesticks_from_api:
                        ts = candle_data.get("end_period_ts")
                        if ts is None or ts < current_start_ts : continue
                        # ... (rest of parsing logic for price, yes_bid, yes_ask) ...
                        trade_price_info = candle_data.get("price", {})
                        yes_bid_info = candle_data.get("yes_bid", {})
                        yes_ask_info = candle_data.get("yes_ask", {})
                        all_candlesticks_processed.append({
                            "timestamp_s": ts,
                            "trade_open_cents": trade_price_info.get("open"), "trade_high_cents": trade_price_info.get("high"),
                            "trade_low_cents": trade_price_info.get("low"), "trade_close_cents": trade_price_info.get("close"),
                            "yes_bid_open_cents": yes_bid_info.get("open"), "yes_bid_high_cents": yes_bid_info.get("high"),
                            "yes_bid_low_cents": yes_bid_info.get("low"), "yes_bid_close_cents": yes_bid_info.get("close"),
                            "yes_ask_open_cents": yes_ask_info.get("open"), "yes_ask_high_cents": yes_ask_info.get("high"),
                            "yes_ask_low_cents": yes_ask_info.get("low"), "yes_ask_close_cents": yes_ask_info.get("close"),
                            "volume": candle_data.get("volume"), "open_interest": candle_data.get("open_interest")
                        })
                    last_candle_ts_in_chunk = candlesticks_from_api[-1]["end_period_ts"]
                    current_start_ts = last_candle_ts_in_chunk + (period_minutes * 60)
                    pbar.update(len(candlesticks_from_api))
                else:
                    current_start_ts = chunk_end_ts + (period_minutes * 60)
                    pbar.update(max(1, (chunk_end_ts - params["start_ts"]) // (period_minutes * 60) +1 ) if chunk_end_ts >= params["start_ts"] else 1)
                time.sleep(0.7) # Increased sleep
            except requests.exceptions.HTTPError as e:
                # logger.error(f"HTTP Error for {market_ticker}: {e.response.status_code} - {e.response.text if e.response else str(e)}")
                if e.response and e.response.status_code == 404:
                    logger.debug(f"Market {market_ticker} not found or no data for period.")
                else: # Log other HTTP errors more loudly
                    logger.error(f"HTTP Error for {market_ticker}: {e.response.status_code if e.response else 'N/A'} - {e.response.text if e.response else str(e)}")
                break 
            except Exception as e:
                logger.exception(f"Unexpected error for {market_ticker}: {e}")
                break
    if pbar.n < pbar.total and total_expected_intervals > 0: pbar.update(pbar.total - pbar.n)
    pbar.close()
    if not all_candlesticks_processed: return None
    df = pd.DataFrame(all_candlesticks_processed)
    df['timestamp_utc_end_of_period'] = pd.to_datetime(df['timestamp_s'], unit='s', utc=True)
    df.set_index('timestamp_utc_end_of_period', inplace=True)
    df.drop(columns=['timestamp_s'], inplace=True)
    df = df[~df.index.duplicated(keep='first')].sort_index()
    numeric_cols = [col for col in df.columns if 'cents' in col or 'volume' in col or 'interest' in col]
    for col in numeric_cols: df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

In [6]:
TARGET_SERIES_FOR_NOTEBOOK = "KXBTCD" # Or "KXETHD"
TARGET_DATE_FOR_NOTEBOOK = "2025-05-14" # Yesterday, for example

OUTPUT_BASE_DIR_NOTEBOOK = Path(f"./kalshi_historical_data_discovered/{TARGET_SERIES_FOR_NOTEBOOK}_{TARGET_DATE_FOR_NOTEBOOK.replace('-', '')}")
OUTPUT_BASE_DIR_NOTEBOOK.mkdir(parents=True, exist_ok=True)

PERIOD_INTERVAL_MINUTES_NOTEBOOK = 1

if private_key_global:
    # 1. Get all settled events for the target series and date, with their markets
    logger.info(f"Fetching events for {TARGET_SERIES_FOR_NOTEBOOK} on {TARGET_DATE_FOR_NOTEBOOK} (EDT)")
    events_data = get_events_with_markets(TARGET_SERIES_FOR_NOTEBOOK, TARGET_DATE_FOR_NOTEBOOK, status="settled")

    if not events_data:
        logger.warning("No events found. Exiting.")
    else:
        logger.info(f"Found {len(events_data)} events. Iterating through their markets...")
        total_markets_processed = 0
        
        for event in tqdm(events_data, desc="Processing Events"):
            event_ticker = event.get('ticker')
            markets_in_event = event.get('markets', [])
            
            if not markets_in_event:
                logger.info(f"No markets listed within event {event_ticker}. Skipping.")
                continue
            
            logger.info(f"\nProcessing Event: {event_ticker} ({event.get('title')}) - {len(markets_in_event)} markets")
            
            # Determine the event's close time to define market window
            # The close_date in the event object is usually what we need
            event_close_ts = event.get('close_ts') # This should be UTC timestamp
            if not event_close_ts:
                logger.warning(f"Event {event_ticker} missing close_ts. Cannot determine market window. Skipping.")
                continue
            
            event_market_close_dt_utc = dt.datetime.fromtimestamp(event_close_ts, timezone.utc)
            event_market_open_dt_utc = event_market_close_dt_utc - dt.timedelta(hours=1) # Assuming 1-hour markets

            fetch_and_parse_kalshi_candlesticks.first_candle_logged_this_market = False # Reset for new event

            for market_details in tqdm(markets_in_event, desc=f"Markets in {event_ticker}", leave=False):
                actual_market_ticker = market_details.get('ticker')
                if not actual_market_ticker:
                    logger.warning("Market details missing ticker. Skipping.")
                    continue
                
                # logger.info(f"--- Fetching data for Market: {actual_market_ticker} ---")
                # logger.info(f"    (Event: {event_ticker}, Window: {event_market_open_dt_utc} to {event_market_close_dt_utc})")

                market_history_df = fetch_and_parse_kalshi_candlesticks(
                    series_ticker=TARGET_SERIES_FOR_NOTEBOOK, # Use the main series ticker
                    market_ticker=actual_market_ticker,
                    start_dt_utc=event_market_open_dt_utc,
                    end_dt_utc=event_market_close_dt_utc,
                    period_minutes=PERIOD_INTERVAL_MINUTES_NOTEBOOK,
                    private_key=private_key_global,
                    key_id=KALSHI_API_KEY_ID
                )

                if market_history_df is not None and not market_history_df.empty:
                    base_output_filename = f"{actual_market_ticker}_detailed_{PERIOD_INTERVAL_MINUTES_NOTEBOOK}min"
                    output_filepath_csv = OUTPUT_BASE_DIR_NOTEBOOK / f"{base_output_filename}.csv"
                    
                    try:
                        market_history_df.to_csv(output_filepath_csv)
                        logger.info(f"Saved: {output_filepath_csv} (Shape: {market_history_df.shape})")
                        total_markets_processed +=1
                    except Exception as e:
                        logger.error(f"Error saving CSV for {actual_market_ticker}: {e}")
                # else:
                    # logger.debug(f"No data or empty data for {actual_market_ticker}.")
                
                time.sleep(1.1) # Be respectful to API - increased pause
            
            logger.info(f"--- Finished processing event {event_ticker} ---")
            # time.sleep(2) # Pause between events

        logger.info(f"--- All Historical Data Fetching Complete for {TARGET_DATE_FOR_NOTEBOOK}. Processed {total_markets_processed} markets with data. ---")
else:
    logger.error("Could not initialize Kalshi private key. Script cannot run.")

2025-05-15 02:05:23,239 - INFO - 3171678952 - Fetching events for KXBTCD on 2025-05-14 (EDT)
2025-05-15 02:05:23,239 - INFO - 1303599394 - Querying events for series KXBTCD that closed on 2025-05-14 EDT.
2025-05-15 02:05:23,240 - INFO - 1303599394 - UTC close timestamp range: 1747195200 to 1747281599
2025-05-15 02:05:25,031 - INFO - 1303599394 - Fetched 100 events. Total so far: 100
2025-05-15 02:05:27,187 - INFO - 1303599394 - Fetched 100 events. Total so far: 200
2025-05-15 02:05:29,435 - INFO - 1303599394 - Fetched 100 events. Total so far: 300
2025-05-15 02:05:31,594 - INFO - 1303599394 - Fetched 100 events. Total so far: 400
2025-05-15 02:05:33,786 - INFO - 1303599394 - Fetched 100 events. Total so far: 500
2025-05-15 02:05:35,933 - INFO - 1303599394 - Fetched 100 events. Total so far: 600
2025-05-15 02:05:38,132 - INFO - 1303599394 - Fetched 100 events. Total so far: 700
2025-05-15 02:05:40,308 - INFO - 1303599394 - Fetched 100 events. Total so far: 800
2025-05-15 02:05:42,594 - 

Processing Events:   0%|          | 0/2497 [00:00<?, ?it/s]

2025-05-15 02:06:14,363 - INFO - 3171678952 - 
Processing Event: None (Bitcoin price on May 15, 2025 at 12am EDT?) - 75 markets
2025-05-15 02:06:14,364 - INFO - 3171678952 - 
Processing Event: None (Bitcoin price on May 14, 2025 at 11pm EDT?) - 75 markets
2025-05-15 02:06:14,364 - INFO - 3171678952 - 
Processing Event: None (Bitcoin price on May 14, 2025 at 10pm EDT?) - 75 markets
2025-05-15 02:06:14,365 - INFO - 3171678952 - 
Processing Event: None (Bitcoin price on May 14, 2025 at 9pm EDT?) - 75 markets
2025-05-15 02:06:14,365 - INFO - 3171678952 - 
Processing Event: None (Bitcoin price on May 14, 2025 at 8pm EDT?) - 75 markets
2025-05-15 02:06:14,366 - INFO - 3171678952 - 
Processing Event: None (Bitcoin price on May 14, 2025 at 7pm EDT?) - 75 markets
2025-05-15 02:06:14,366 - INFO - 3171678952 - 
Processing Event: None (Bitcoin price on May 14, 2025 at 6pm EDT?) - 75 markets
2025-05-15 02:06:14,367 - INFO - 3171678952 - 
Processing Event: None (Bitcoin price on May 14, 2025 at 5pm 