# Payoff Asymmetry: BTC-Standard Classification (with Funding & OI)
This notebook applies two treatments: **Relative Value** + **Microstructure (Funding, OI)**.
We aim to predict the probability of a high payoff asymmetry ($Y > 0.1$).

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, confusion_matrix, classification_report
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a global filter, so library deprecation warnings are hidden
# too — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib style sheet plus the default seaborn colour palette.
plt.style.use('seaborn-v0_8-muted')
sns.set_palette("viridis")

## 1. Constants & Data Config

In [None]:
# Root of the Binance futures data tree. Override it by exporting
# LEAN_BINANCE_FUTURES_DIR before starting the kernel; the default keeps the
# original machine-specific location so existing runs are unaffected.
_BINANCE_ROOT = os.environ.get(
    "LEAN_BINANCE_FUTURES_DIR",
    "/Users/chenzhao/Documents/lean_workspace/data/cryptofuture/binance",
)
DATA_DIR = os.path.join(_BINANCE_ROOT, "minute")   # per-ticker folders of daily *_trade.zip files
EXTRA_DIR = os.path.join(_BINANCE_ROOT, "extra")   # one <ticker>_extra.csv per ticker

# Inclusive date range of the daily files to load.
START_DATE = datetime(2025, 1, 1)
END_DATE = datetime(2026, 1, 31)

# Universe of alt contracts measured against BTC (BTC itself is loaded separately
# as the reference asset).
# NOTE(review): some of these symbols may have been renamed or delisted by the
# exchange (e.g. MATIC) — confirm that files exist for the configured date range.
TICKERS = [
    "ethusdt", "bnbusdt", "solusdt", "xrpusdt",
    "dogeusdt", "adausdt", "avaxusdt", "dotusdt", "linkusdt",
    "maticusdt", "ltcusdt", "uniusdt", "atomusdt", "etcusdt",
    "filusdt", "aptusdt", "nearusdt", "arbusdt", "opusdt",
    "injusdt", "suiusdt", "tiausdt", "seiusdt", "stxusdt",
    "imxusdt", "runeusdt", "aaveusdt", "mkrusdt", "ldousdt"
]

## 2. Shared Data Functions

In [None]:
def load_ticker_data(ticker, start_date, end_date):
    """Load the daily minute-bar archives of one ticker into a single frame.

    Files are read from ``DATA_DIR/<ticker>/`` and must be named
    ``YYYYMMDD_trade.zip``. Each archive is a header-less CSV whose columns are
    assigned as ``ms, open, high, low, close, volume``, where ``ms`` is the
    millisecond offset added to the file date to build the bar timestamp.

    Args:
        ticker: Sub-directory name of the contract (e.g. ``"ethusdt"``).
        start_date: First file date to include (inclusive ``datetime``).
        end_date: Last file date to include (inclusive ``datetime``).

    Returns:
        DataFrame indexed by ``time`` (sorted, one row per timestamp) with the
        columns ``high, low, close, volume``; an empty DataFrame when the ticker
        directory is missing or no file could be loaded.
    """
    ticker_dir = os.path.join(DATA_DIR, ticker)
    if not os.path.isdir(ticker_dir):
        return pd.DataFrame()

    frames = []
    for fname in sorted(os.listdir(ticker_dir)):
        if not fname.endswith("_trade.zip"):
            continue
        try:
            file_date = datetime.strptime(fname.split("_")[0], "%Y%m%d")
        except ValueError:
            # File name does not start with a YYYYMMDD date: not a daily archive.
            continue
        if not (start_date <= file_date <= end_date):
            continue
        try:
            day = pd.read_csv(os.path.join(ticker_dir, fname), header=None, compression='zip')
            day.columns = ['ms', 'open', 'high', 'low', 'close', 'volume']
            day['time'] = file_date + pd.to_timedelta(day['ms'], unit='ms')
            day = day.set_index('time')
        except Exception as exc:
            # Best effort: one unreadable/malformed day must not abort the load,
            # but report it instead of hiding it (and let Ctrl-C through).
            print(f"Skipping {fname}: {exc!r}")
            continue
        frames.append(day[['high', 'low', 'close', 'volume']])

    if not frames:
        return pd.DataFrame()

    bars = pd.concat(frames).sort_index(kind='mergesort')
    # De-duplicate on the timestamp, not on the values: DataFrame.drop_duplicates()
    # ignores the index and would delete distinct minutes that merely share the
    # same high/low/close/volume.
    return bars[~bars.index.duplicated(keep='first')]

def load_extra_data(ticker):
    """Read the auxiliary CSV of a ticker, indexed by its parsed ``time`` column.

    The file is looked up at ``EXTRA_DIR/<ticker>_extra.csv``. When it does not
    exist an empty DataFrame is returned instead of raising.
    """
    csv_path = os.path.join(EXTRA_DIR, f"{ticker}_extra.csv")
    if os.path.exists(csv_path):
        extra = pd.read_csv(csv_path)
        # Convert the textual timestamps before promoting them to the index.
        extra['time'] = pd.to_datetime(extra['time'])
        return extra.set_index('time')
    return pd.DataFrame()

## 3. Dataset Construction (Relative + Microstructure)

In [None]:
# Division guard. It is only ever added to scale-free quantities (return
# volatilities, fractional price moves): BTC-relative prices of cheap alts are
# of order 1e-6 themselves, so adding it to raw price differences would swamp
# the signal.
EPS = 1e-6
LABEL_HORIZON = timedelta(hours=12)   # forward window used to build the label
LABEL_THRESHOLD = 0.1                 # label = 1 when Y exceeds this value

print("Loading BTC reference data (OHLC + Funding)...")
btc_raw = load_ticker_data("btcusdt", START_DATE, END_DATE)
btc_extra = load_extra_data("btcusdt")
if btc_raw.empty or btc_extra.empty:
    # Without the reference asset every relative feature is undefined.
    raise RuntimeError("BTC reference data is missing: check DATA_DIR / EXTRA_DIR and the date range.")

samples = []
n_errors = 0  # timestamps skipped because of an unexpected exception
for ticker in TICKERS:
    print(f"Processing {ticker}...", end='\r')
    asset_raw = load_ticker_data(ticker, START_DATE, END_DATE)
    asset_extra = load_extra_data(ticker)
    if asset_raw.empty or asset_extra.empty:
        continue

    # 1. Relative OHLC (asset priced in BTC), on the minutes both series share.
    df = asset_raw.join(btc_raw, rsuffix='_btc', how='inner').sort_index()
    if df.empty:
        continue
    df['close_rel'] = df['close'] / df['close_btc']
    # Widest possible bar range: asset high over BTC low, asset low over BTC high.
    df['high_rel'] = df['high'] / df['low_btc']
    df['low_rel'] = df['low'] / df['high_btc']
    df['log_ret_rel'] = np.log(df['close_rel'] / df['close_rel'].shift(1))

    # 2. Merge Extra Features (Forward Fill to Hour)
    # Note: Funding is usually 8h, we'll ffill it.
    ex_df = asset_extra.join(btc_extra, rsuffix='_btc', how='outer').sort_index().ffill()
    ex_df['rel_funding'] = ex_df['fundingRate'] - ex_df['fundingRate_btc']
    ex_times = ex_df.index

    # One candidate sample per hour that contains data ('1h': the upper-case
    # alias is deprecated in recent pandas).
    hourly_idx = df.resample('1h').last().dropna().index
    last_ts = df.index[-1]

    for ts in hourly_idx:
        # The label needs a complete forward window; check first, it is the cheapest test.
        f_end = ts + LABEL_HORIZON
        if f_end > last_ts:
            continue
        try:
            w1h = df.loc[ts - timedelta(hours=1):ts]
            w12h = df.loc[ts - timedelta(hours=12):ts]
            w24h = df.loc[ts - timedelta(hours=24):ts]
            if len(w1h) < 40 or len(w12h) < 500:
                continue

            price_rel = w1h['close_rel'].iloc[-1]
            sigma_rel = w1h['log_ret_rel'].std() + EPS

            # -- Price Features (Relative) --
            Rel_HighPressure = (w24h['high_rel'].max() - price_rel) / (price_rel * sigma_rel)
            base_rel = w12h['close_rel'].iloc[0]
            # EPS guards the volatility, not the price-scaled product, so the
            # feature is comparable across assets of very different price level.
            Rel_Momentum = (price_rel - base_rel) / (base_rel * (w12h['log_ret_rel'].std() + EPS))
            # Downside vs upside return volatility. A NaN std (fewer than two
            # returns on one side) propagates and the sample is dropped by dropna().
            neg_std = w12h.loc[w12h['log_ret_rel'] < 0, 'log_ret_rel'].std()
            pos_std = w12h.loc[w12h['log_ret_rel'] > 0, 'log_ret_rel'].std()
            Rel_VolRatio = neg_std / (pos_std + EPS)

            # -- Microstructure Features --
            # Latest row at or before ts (index is sorted, so a binary search suffices).
            cur_pos = ex_times.searchsorted(ts, side='right') - 1
            if cur_pos < 0:
                continue
            cur_ex = ex_df.iloc[cur_pos]

            Ext_Funding = cur_ex['fundingRate']
            Ext_RelFunding = cur_ex['rel_funding']
            # OI Change (Last 12h if available, otherwise 0 by comparing with itself)
            prev_pos = ex_times.searchsorted(ts - timedelta(hours=12), side='right') - 1
            prev_ex = ex_df.iloc[prev_pos] if prev_pos >= 0 else cur_ex
            Ext_OIChange = (cur_ex['openInterest'] - prev_ex['openInterest']) / (prev_ex['openInterest'] + 1e-6)

            # -- Label (Classification) --
            f_period = df.loc[ts + timedelta(minutes=1):f_end]
            if f_period.empty:
                continue
            # Fractional distance to the forward low / high. Clipped at 0 so a
            # one-sided window yields an extreme but finite Y instead of log(<0) = NaN.
            downside = max(price_rel - f_period['low_rel'].min(), 0.0) / price_rel
            upside = max(f_period['high_rel'].max() - price_rel, 0.0) / price_rel
            y_actual = np.log(downside / (upside + EPS) + EPS)
            label = 1 if y_actual > LABEL_THRESHOLD else 0

            samples.append({
                'ticker': ticker, 'time': ts, 'label': label, 'y_value': y_actual,
                'Rel_HighP': Rel_HighPressure, 'Rel_Mom': Rel_Momentum, 'Rel_VolR': Rel_VolRatio,
                'Funding': Ext_Funding, 'RelFunding': Ext_RelFunding, 'OI_Change': Ext_OIChange
            })
        except Exception:
            # Keep the build best-effort, but count the failures so that a
            # systematic problem (e.g. a missing column) does not go unnoticed.
            n_errors += 1
            continue

dataset = pd.DataFrame(samples).dropna()
if n_errors:
    print(f"\nSkipped {n_errors} timestamps because of unexpected errors.")
if dataset.empty:
    raise RuntimeError("No samples were built: check the input data and the date range.")
print(f"\nDone. Dataset size: {len(dataset)} | Target 1 Rate: {dataset['label'].mean():.2%}")

## 4. Model Training & Validation

In [None]:
# Out-of-time split: samples before SPLIT_DATE train the model, the rest test it.
SPLIT_DATE = datetime(2026, 1, 1)
# Each label looks 12h ahead, so the last 12h of training samples would have
# labels computed from test-period prices. Purge them to avoid leakage.
# (Must stay equal to the forward window used when the labels were built.)
LABEL_HORIZON = timedelta(hours=12)

dataset = dataset.sort_values('time')
# Explicit copies: columns are added to test_df below.
train_df = dataset[dataset['time'] < SPLIT_DATE - LABEL_HORIZON].copy()
test_df = dataset[dataset['time'] >= SPLIT_DATE].copy()
if train_df.empty or test_df.empty:
    raise ValueError(
        f"Empty split around {SPLIT_DATE:%Y-%m-%d}: train={len(train_df)}, test={len(test_df)} samples."
    )

X_cols = ['Rel_HighP', 'Rel_Mom', 'Rel_VolR', 'Funding', 'RelFunding', 'OI_Change']
model = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
model.fit(train_df[X_cols], train_df['label'])

# Probability of the positive class (label == 1).
test_df['prob'] = model.predict_proba(test_df[X_cols])[:, 1]
if test_df['label'].nunique() < 2:
    # ROC AUC is undefined when only one class is present in the test period.
    auc = float('nan')
    print("OOT ROC AUC: undefined (test set contains a single class)")
else:
    auc = roc_auc_score(test_df['label'], test_df['prob'])
    print(f"OOT ROC AUC: {auc:.4f}")

## 5. Diagnostic Plots

In [None]:
# Impurity-based importance of every feature, smallest first so the largest
# bar ends up at the top of the horizontal chart.
importances = pd.Series(model.feature_importances_, index=X_cols).sort_values()
fig, ax = plt.subplots()
importances.plot(kind='barh', ax=ax, title='Feature Importance (Alpha + Microstructure)')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

# Bucket the test samples by predicted probability and compare the realized Y.
# duplicates='drop' keeps qcut from raising when tied probabilities make two
# quantile edges coincide (fewer than 5 buckets are produced in that case).
test_df['prob_q'] = pd.qcut(test_df['prob'], 5, labels=False, duplicates='drop')
fig, ax = plt.subplots()
test_df.groupby('prob_q')['y_value'].mean().plot(kind='bar', ax=ax, title='Mean Realized Y by Prob Quintile')
ax.set_xlabel('Predicted-probability quintile (0 = lowest)')
ax.set_ylabel('Mean realized Y')
plt.show()