In [1]:
import os
import time
import math
from dataclasses import dataclass
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from aipricepatterns import Client

# Widen pandas' rendering limits so wide result tables are shown without truncation.
for _option, _value in (('display.max_columns', 160), ('display.width', 180)):
    pd.set_option(_option, _value)

def safe_float(x, default=np.nan) -> float:
    """Convert ``x`` to ``float``, falling back to ``float(default)`` on any failure.

    Args:
        x: Value to convert (number, numeric string, ...).
        default: Value used when ``float(x)`` raises; it is itself passed
            through ``float``.

    Returns:
        ``float(x)`` when the conversion succeeds, otherwise ``float(default)``.
    """
    try:
        value = float(x)
    except Exception:
        # Best-effort parsing: anything float() rejects maps to the default.
        value = float(default)
    return value

def map_suggested_action_to_pos(x) -> int:
    """Translate a ``suggestedAction`` value into a position in ``{-1, 0, 1}``.

    Accepted encodings:
      * numbers already in ``{-1, 0, 1}`` (short / flat / long);
      * the alternative numeric code ``2``, which is read as short;
      * case-insensitive strings such as ``'buy'``/``'long'`` (1),
        ``'sell'``/``'short'`` (-1) and ``'hold'``/``'flat'`` (0).

    Anything else (``None``, NaN, +/-inf, unknown strings, other types)
    maps to ``0`` so an unreadable signal never opens a position.

    Args:
        x: Raw action value taken from an episode transition.

    Returns:
        ``1`` for long, ``-1`` for short, ``0`` for flat / unknown.
    """
    if x is None:
        return 0
    if isinstance(x, (int, float)):
        # int() raises ValueError on NaN and OverflowError on +/-inf;
        # treat non-finite numbers as "no signal" instead of crashing.
        if isinstance(x, float) and not math.isfinite(x):
            return 0
        v = int(x)  # fractional values are truncated toward zero
        if v in (-1, 0, 1):
            return v
        # 0 and 1 were already handled above, so in the 0/1/2 encoding
        # only 2 (short) can reach this point.
        return -1 if v == 2 else 0
    if not isinstance(x, str):
        return 0
    s = x.strip().lower()
    if s in ('long', 'buy', 'bull', 'up'):
        return 1
    if s in ('short', 'sell', 'bear', 'down'):
        return -1
    # 'hold', 'flat', 'none', 'neutral', 'wait' and any unknown label are flat.
    return 0

def episode_horizon_pnl_for_pos(ep: dict, horizon: int, pos: int, trade_cost_pct: float) -> float:
    """Net PnL of holding a fixed position over the first ``horizon`` transitions.

    Each transition contributes ``pos * ret`` where ``ret`` is read from the
    ``'ret'`` key (falling back to ``'return'``, then ``0.0``). A single entry
    cost of ``abs(pos) * trade_cost_pct / 100`` is subtracted at the end.

    NOTE(review): the cost is converted from percent to a fraction, which
    assumes the per-step returns are fractional as well -- confirm the units
    of ``ret`` against the API.

    Args:
        ep: Episode dict; its ``'transitions'`` entry is expected to be a list.
        horizon: Maximum number of leading transitions to accumulate.
        pos: Position held for the whole horizon (-1, 0 or 1).
        trade_cost_pct: One-time entry cost, in percent.

    Returns:
        The accumulated PnL minus the entry cost, or ``0.0`` when the episode
        has no usable transition list.
    """
    transitions = ep.get('transitions')
    if not (isinstance(transitions, list) and transitions):
        return 0.0

    n_steps = min(int(horizon), len(transitions))
    direction = float(pos)

    total = 0.0
    # max(..., 0) keeps a negative horizon equivalent to "no steps".
    for step in transitions[:max(n_steps, 0)]:
        if isinstance(step, dict):
            raw_ret = step.get('ret', step.get('return', 0.0))
        else:
            # Malformed transition: contributes nothing.
            raw_ret = 0.0
        total += direction * float(safe_float(raw_ret, 0.0))

    # One-time entry cost (position change 0 -> pos).
    entry_cost = abs(int(pos)) * (trade_cost_pct / 100.0)
    return float(total - entry_cost)

def episode_suggested_pnl(ep: dict, horizon: int, trade_cost_pct: float) -> float:
    """PnL of following the episode's first ``suggestedAction`` for the whole horizon.

    The action on the first transition is mapped to a position with
    ``map_suggested_action_to_pos`` and held for ``horizon`` steps.

    Args:
        ep: Episode dict; its ``'transitions'`` entry is expected to be a list.
        horizon: Number of leading transitions to accumulate.
        trade_cost_pct: One-time entry cost, in percent.

    Returns:
        Net PnL from ``episode_horizon_pnl_for_pos``, or ``0.0`` when the
        episode has no usable transition list.
    """
    transitions = ep.get('transitions')
    if isinstance(transitions, list) and transitions:
        first_step = transitions[0]
        # A non-dict first transition carries no action, i.e. a flat position.
        action = first_step.get('suggestedAction') if isinstance(first_step, dict) else None
        return episode_horizon_pnl_for_pos(
            ep,
            horizon=horizon,
            pos=map_suggested_action_to_pos(action),
            trade_cost_pct=trade_cost_pct,
        )
    return 0.0

def spearman_corr(x: np.ndarray, y: np.ndarray) -> float:
    """Spearman rank correlation computed as the Pearson correlation of ranks.

    Implemented with pandas/numpy only, so no scipy dependency is needed.
    Ties receive their average rank.

    Args:
        x: First sample.
        y: Second sample, paired element-wise with ``x``.

    Returns:
        The rank correlation, or ``nan`` when there are fewer than three
        observations or either set of ranks has zero variance.
    """
    def _average_ranks(values) -> np.ndarray:
        return pd.Series(values).rank(method='average').to_numpy(dtype=float)

    x_rank = _average_ranks(x)
    y_rank = _average_ranks(y)

    undefined = len(x_rank) < 3 or np.std(x_rank) == 0 or np.std(y_rank) == 0
    if undefined:
        return float('nan')

    corr_matrix = np.corrcoef(x_rank, y_rank)
    return float(corr_matrix[0, 1])

## Parameters
Defaults aim for a realistic sample size without hammering the API. The notebook caches the episode-level dataset to CSV.

In [2]:
# --- API connection (the key comes from the environment, never from the notebook) ---
BASE_URL = os.getenv('AIPP_BASE_URL', 'https://aipricepatterns.com/api/rust')
API_KEY = os.getenv('AIPP_API_KEY')

# --- Market ---
SYMBOL = os.getenv('AIPP_RL_SYMBOL', 'BTCUSDT')
INTERVAL = os.getenv('AIPP_RL_INTERVAL', '1h')

# --- Sweep size: number of anchor timestamps and how far back they reach ---
ANCHOR_POINTS = int(os.getenv('AIPP_SWEEP_ANCHORS', '300'))
LOOKBACK_DAYS = int(os.getenv('AIPP_SWEEP_LOOKBACK_DAYS', '120'))

# --- Episode request parameters ---
FORECAST_HORIZON = int(os.getenv('AIPP_RL_HORIZON', '24'))
EPISODES_PER_ANCHOR = int(os.getenv('AIPP_SWEEP_EPISODES_PER_ANCHOR', '60'))
MIN_SIMILARITY = float(os.getenv('AIPP_RL_MIN_SIMILARITY', '0.70'))
SAMPLING_STRATEGY = os.getenv('AIPP_RL_SAMPLING_STRATEGY', 'uniform')

# One-time entry cost applied to each simulated trade, in percent.
TRADE_COST_PCT = float(os.getenv('AIPP_RL_TRADE_COST_PCT', '0.00'))

# Number of quantile bins for the reliability diagram.
BINS = int(os.getenv('AIPP_CALIB_BINS', '12'))

# Pause between consecutive API requests, in seconds.
SWEEP_SLEEP_SEC = float(os.getenv('AIPP_SWEEP_SLEEP_SEC', '0.05'))

CACHE_DIR = os.getenv('AIPP_RESEARCH_CACHE_DIR', 'python-sdk/research/_cache')
# Every parameter that shapes the cached rows is part of the file name, so
# changing the lookback, horizon, episode count, similarity floor, sampling
# strategy or trade cost triggers a fresh sweep instead of silently reloading
# a dataset that was built with different settings.
CACHE_PATH = os.path.join(
    CACHE_DIR,
    f'05_similarity_calib_eps_{SYMBOL}_{INTERVAL}_{ANCHOR_POINTS}'
    f'_lb{LOOKBACK_DAYS}_h{FORECAST_HORIZON}_n{EPISODES_PER_ANCHOR}'
    f'_sim{MIN_SIMILARITY:g}_{SAMPLING_STRATEGY}_cost{TRADE_COST_PCT:g}.csv',
)

print('Base URL:', BASE_URL)
print(f'Symbol: {SYMBOL}  Interval: {INTERVAL}')
print(f'Anchors: {ANCHOR_POINTS}  LookbackDays: {LOOKBACK_DAYS}')
print(f'Episodes/anchor: {EPISODES_PER_ANCHOR}  minSimilarity: {MIN_SIMILARITY:.2f}  horizon: {FORECAST_HORIZON}')
print(f'TradeCostPct: {TRADE_COST_PCT:.4f}%  bins: {BINS}')
print('Cache:', CACHE_PATH)

Base URL: https://aipricepatterns.com/api/rust
Symbol: BTCUSDT  Interval: 1h
Anchors: 300  LookbackDays: 120
Episodes/anchor: 60  minSimilarity: 0.70  horizon: 24
TradeCostPct: 0.0000%  bins: 12
Cache: python-sdk/research/_cache/05_similarity_calib_eps_BTCUSDT_1h_300.csv


## Build episode-level dataset (cached)
We collect episodes across anchors and compute:
- `similarity`
- `pnl_suggested` and `win_suggested` (win = pnl > 0)

This makes similarity calibration tangible: *higher similarity should correspond to higher winrate and/or better avgPnL*.

In [3]:
os.makedirs(CACHE_DIR, exist_ok=True)

# Column layout of the episode-level dataset. Passing it to the DataFrame
# constructor keeps the frame well-formed even when the sweep yields no rows.
EPS_COLUMNS = ['anchorTs', 'anchorDtUtc', 'similarity', 'pnl_suggested', 'win_suggested']

if os.path.exists(CACHE_PATH):
    # Reuse the previous sweep instead of calling the API again.
    eps_df = pd.read_csv(CACHE_PATH)
    print('loaded cache:', CACHE_PATH, 'rows:', len(eps_df))
else:
    client = Client(base_url=BASE_URL, api_key=API_KEY)

    # Anchors are evenly spaced millisecond timestamps covering the last
    # LOOKBACK_DAYS days up to "now".
    now_ms = int(time.time() * 1000)
    start_ms = now_ms - LOOKBACK_DAYS * 24 * 60 * 60 * 1000
    anchors = np.linspace(start_ms, now_ms, num=ANCHOR_POINTS, dtype=np.int64).tolist()

    rows = []
    for idx, anchor_ts in enumerate(anchors, start=1):
        res = client.get_rl_episodes(
            symbol=SYMBOL,
            interval=INTERVAL,
            anchor_ts=int(anchor_ts),
            forecast_horizon=FORECAST_HORIZON,
            num_episodes=EPISODES_PER_ANCHOR,
            min_similarity=MIN_SIMILARITY,
            include_actions=True,
            reward_type='returns',
            sampling_strategy=SAMPLING_STRATEGY,
        )
        # Named `episodes` (not `eps`) so it cannot be confused with the
        # `eps` DataFrame used by the analysis cells further down.
        episodes = res.get('episodes') if isinstance(res, dict) else None

        if isinstance(episodes, list) and episodes:
            anchor_dt = datetime.fromtimestamp(anchor_ts / 1000, tz=timezone.utc).strftime('%Y-%m-%d %H:%M')
            for ep in episodes:
                sim = safe_float(ep.get('similarity'), np.nan)
                pnl = episode_suggested_pnl(ep, horizon=FORECAST_HORIZON, trade_cost_pct=TRADE_COST_PCT)
                rows.append({
                    'anchorTs': int(anchor_ts),
                    'anchorDtUtc': anchor_dt,
                    'similarity': float(sim),
                    'pnl_suggested': float(pnl),
                    'win_suggested': 1 if pnl > 0 else 0,
                })

        if idx % 10 == 0:
            print(f'{idx}/{len(anchors)} anchors... rows={len(rows)}')
        # Throttle after every request, including those that returned no
        # episodes, so empty responses cannot turn the sweep into a tight loop.
        time.sleep(SWEEP_SLEEP_SEC)

    # Explicit columns: an empty sweep would otherwise build a column-less
    # frame and make dropna(subset=['similarity']) raise KeyError.
    eps_df = pd.DataFrame(rows, columns=EPS_COLUMNS)
    eps_df = eps_df.dropna(subset=['similarity']).reset_index(drop=True)
    if eps_df.empty:
        # Do not persist an empty result: it would be reloaded as a valid
        # cache on every later run and hide the fact that nothing was fetched.
        print('no episodes collected; cache not written:', CACHE_PATH)
    else:
        eps_df.to_csv(CACHE_PATH, index=False)
        print('wrote cache:', CACHE_PATH, 'rows:', len(eps_df))

eps_df.head()

10/300 anchors... rows=600
20/300 anchors... rows=1200
30/300 anchors... rows=1800
40/300 anchors... rows=2400
50/300 anchors... rows=3000
60/300 anchors... rows=3600
70/300 anchors... rows=4200
80/300 anchors... rows=4800
90/300 anchors... rows=5400
100/300 anchors... rows=6000
110/300 anchors... rows=6600
120/300 anchors... rows=7200
130/300 anchors... rows=7800
140/300 anchors... rows=8400
150/300 anchors... rows=9000
160/300 anchors... rows=9600
170/300 anchors... rows=10200
180/300 anchors... rows=10800
190/300 anchors... rows=11400
200/300 anchors... rows=12000
210/300 anchors... rows=12600
220/300 anchors... rows=13200
230/300 anchors... rows=13800
240/300 anchors... rows=14400
250/300 anchors... rows=15000
260/300 anchors... rows=15600
270/300 anchors... rows=16200
280/300 anchors... rows=16800
290/300 anchors... rows=17400
300/300 anchors... rows=17998
wrote cache: python-sdk/research/_cache/05_similarity_calib_eps_BTCUSDT_1h_300.csv rows: 17998


Unnamed: 0,anchorTs,anchorDtUtc,similarity,pnl_suggested,win_suggested
0,1755726914204,2025-08-20 21:55,0.9571,0.0,0
1,1755726914204,2025-08-20 21:55,0.9548,0.0,0
2,1755726914204,2025-08-20 21:55,0.9535,0.0,0
3,1755726914204,2025-08-20 21:55,0.9531,0.0,0
4,1755726914204,2025-08-20 21:55,0.953,0.0,0


## Reliability diagram (similarity bins → winrate / avgPnL)

In [4]:
# Work on a copy so the cached frame `eps_df` stays untouched when re-running this cell.
eps = eps_df.copy()
print('episodes:', len(eps))

# Equal-count bins by quantile for stable estimates; duplicate quantile edges
# are dropped, so fewer than BINS bins can come out.
eps['bin'] = pd.qcut(eps['similarity'], q=BINS, duplicates='drop')

# One row per similarity bin: size, similarity range, winrate and PnL statistics.
# `observed=True` is passed explicitly: grouping on a categorical column without
# it emits a pandas FutureWarning, and it also guarantees the quantile lambdas
# are never called on an empty (unobserved) bin.
cal = (
    eps.groupby('bin', observed=True)
      .agg(
          n=('similarity', 'size'),
          sim_mean=('similarity', 'mean'),
          sim_min=('similarity', 'min'),
          sim_max=('similarity', 'max'),
          winrate=('win_suggested', 'mean'),
          pnl_mean=('pnl_suggested', 'mean'),
          pnl_p25=('pnl_suggested', lambda s: float(np.quantile(s, 0.25))),
          pnl_p50=('pnl_suggested', lambda s: float(np.quantile(s, 0.50))),
          pnl_p75=('pnl_suggested', lambda s: float(np.quantile(s, 0.75))),
      )
      .reset_index()
)
cal

episodes: 17998


  eps.groupby('bin')


Unnamed: 0,bin,n,sim_mean,sim_min,sim_max,winrate,pnl_mean,pnl_p25,pnl_p50,pnl_p75
0,"(0.727, 0.864]",1502,0.829747,0.7278,0.8643,0.13715,0.203613,0.0,0.0,0.0
1,"(0.864, 0.894]",1502,0.879052,0.8644,0.894,0.157124,0.296771,0.0,0.0,0.0
2,"(0.894, 0.911]",1502,0.903173,0.8941,0.9108,0.163782,0.199647,0.0,0.0,0.0
3,"(0.911, 0.923]",1493,0.917005,0.9109,0.9228,0.174816,0.300114,0.0,0.0,0.0
4,"(0.923, 0.934]",1508,0.928726,0.9229,0.9344,0.139257,0.238003,0.0,0.0,0.0
5,"(0.934, 0.944]",1509,0.939473,0.9345,0.944,0.160371,0.258848,0.0,0.0,0.0
6,"(0.944, 0.953]",1483,0.948515,0.9441,0.9528,0.190155,0.314722,0.0,0.0,0.0
7,"(0.953, 0.961]",1511,0.957141,0.9529,0.9612,0.215089,0.542475,0.0,0.0,0.0
8,"(0.961, 0.968]",1502,0.965089,0.9613,0.9683,0.177763,0.232285,0.0,0.0,0.0
9,"(0.968, 0.975]",1498,0.971581,0.9684,0.9749,0.184913,0.296794,0.0,0.0,0.0


In [5]:
# Marker size scales with the bin's episode count, clamped to the 6..18 range.
marker_sizes = np.clip(cal['n'] / cal['n'].max() * 18, 6, 18)

winrate_trace = go.Scatter(
    x=cal['sim_mean'],
    y=cal['winrate'],
    mode='markers+lines',
    name='winrate',
    marker={'size': marker_sizes},
)

winrate_layout = {
    'title': 'Reliability diagram: similarity → factual winrate (suggestedAction)',
    'xaxis_title': 'mean similarity (bin)',
    'yaxis_title': 'winrate = P(pnl>0)',
    'height': 420,
}

fig = go.Figure()
fig.add_trace(winrate_trace)
fig.update_layout(**winrate_layout)
fig

In [6]:
# Marker size scales with the bin's episode count, clamped to the 6..18 range.
marker_sizes = np.clip(cal['n'] / cal['n'].max() * 18, 6, 18)

pnl_trace = go.Scatter(
    x=cal['sim_mean'],
    y=cal['pnl_mean'],
    mode='markers+lines',
    name='avgPnL',
    marker={'size': marker_sizes},
)

pnl_layout = {
    'title': 'Similarity → avgPnL (suggestedAction)',
    'xaxis_title': 'mean similarity (bin)',
    'yaxis_title': 'avg net PnL (horizon)',
    'height': 420,
}

fig = go.Figure()
fig.add_trace(pnl_trace)
fig.update_layout(**pnl_layout)
fig

## Monotonicity tests
We check whether performance improves with similarity:
- rank correlation (Spearman)
- bin-level monotonicity violations (non-decreasing winrate/avgPnL)

In [7]:
# Episode-level arrays: similarity (x), win flag (w) and suggested-action PnL (p).
x, w, p = (
    eps[column].to_numpy(dtype=float)
    for column in ('similarity', 'win_suggested', 'pnl_suggested')
)

# Rank correlation of similarity against each outcome.
for outcome_label, outcome in (('win', w), ('pnl', p)):
    print(f'Spearman(sim, {outcome_label}):', spearman_corr(x, outcome))

def monotonic_violations(y: np.ndarray) -> int:
    """Count adjacent pairs where the sequence decreases.

    A pair ``(y[i-1], y[i])`` counts as a violation when ``y[i]`` is smaller
    than ``y[i-1]`` by more than a ``1e-12`` tolerance.

    Args:
        y: Sequence of values, ordered along the axis being checked.

    Returns:
        Number of decreasing steps (``0`` for empty or single-element input).
    """
    return sum(
        1
        for previous, current in zip(y[:-1], y[1:])
        if float(previous) > float(current) + 1e-12
    )

# Order the bins by mean similarity once, then count drops in each metric.
cal_by_similarity = cal.sort_values('sim_mean')
for metric_label, metric_column in (('winrate', 'winrate'), ('avgPnL', 'pnl_mean')):
    print(f'bin {metric_label} violations:', monotonic_violations(cal_by_similarity[metric_column].to_numpy()))

Spearman(sim, win): 0.057430798378592524
Spearman(sim, pnl): 0.02626652280766235
bin winrate violations: 2
bin avgPnL violations: 4


## Gating functions: hard threshold vs top-k vs smooth weights
We compare ways to turn similarity into a production decision rule.

Definitions (per anchor):
- **hard threshold**: trade suggestedAction only if similarity ≥ t
- **top-k**: trade suggestedAction only on the top-k most similar episodes
- **smooth weights**: expected PnL weighted by a function of similarity (e.g., linear ramp or logistic)

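The code below approximates these rules by pooling all episodes across anchors (ignoring anchor identity), so thresholds and top-k are applied globally rather than per anchor. This gives a global view of the tradeoffs.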

In [8]:
def eval_hard_threshold(eps: pd.DataFrame, thresholds: np.ndarray) -> pd.DataFrame:
    """Evaluate the rule "trade only when similarity >= threshold".

    Args:
        eps: Episode frame with ``similarity``, ``pnl_suggested`` and
            ``win_suggested`` columns.
        thresholds: Similarity cut-offs to evaluate.

    Returns:
        One row per threshold with ``method='hard'``, the threshold as
        ``param``, the fraction of episodes kept (``coverage``) and the mean
        PnL / winrate of the kept episodes (``nan`` when none are kept).
    """
    records = []
    for raw_threshold in thresholds:
        cutoff = float(raw_threshold)
        keep = eps['similarity'] >= cutoff

        # Defaults for the "nothing passes the gate" case.
        avg_pnl = float('nan')
        win_rate = float('nan')
        if keep.any():
            kept = eps.loc[keep, ['pnl_suggested', 'win_suggested']]
            avg_pnl = float(kept['pnl_suggested'].mean())
            win_rate = float(kept['win_suggested'].mean())

        records.append({
            'method': 'hard',
            'param': cutoff,
            'coverage': float(keep.mean()),
            'avgPnL': avg_pnl,
            'winrate': win_rate,
        })
    return pd.DataFrame(records)

def eval_topk(eps: pd.DataFrame, ks: np.ndarray) -> pd.DataFrame:
    """Evaluate the rule "trade only the k most similar episodes".

    Episodes are ranked by ``similarity`` (descending) across the whole frame.
    Each requested ``k`` is clamped to ``[1, len(eps)]``.

    Args:
        eps: Episode frame with ``similarity``, ``pnl_suggested`` and
            ``win_suggested`` columns.
        ks: Candidate values of k.

    Returns:
        One row per k with ``method='topk'``, the clamped k as ``param``,
        ``coverage = k / len(eps)`` and the mean PnL / winrate of the selected
        episodes. When ``eps`` or ``ks`` is empty, an empty frame with the
        same columns is returned.
    """
    columns = ['method', 'param', 'coverage', 'avgPnL', 'winrate']
    eps_sorted = eps.sort_values('similarity', ascending=False).reset_index(drop=True)
    n = len(eps_sorted)
    if n == 0:
        # Nothing to rank: avoid the k / n division by zero below.
        return pd.DataFrame(columns=columns)

    out = []
    for k in ks:
        k = max(1, min(int(k), n))
        top = eps_sorted.iloc[:k]
        out.append({
            'method': 'topk',
            'param': int(k),
            'coverage': float(k / n),
            'avgPnL': float(top['pnl_suggested'].mean()),
            'winrate': float(top['win_suggested'].mean()),
        })
    return pd.DataFrame(out, columns=columns)

def eval_smooth(eps: pd.DataFrame, alpha: float, power: float, name: str) -> dict:
    """Evaluate a smooth similarity weighting instead of a hard gate.

    Each episode gets ``weight = clamp((sim - alpha) / (1 - alpha), 0, 1) ** power``
    and the reported PnL / winrate are weight-averaged over all episodes.

    Args:
        eps: Episode frame with ``similarity``, ``pnl_suggested`` and
            ``win_suggested`` columns.
        alpha: Similarity at which the weight starts rising from zero.
        power: Exponent applied to the linear ramp.
        name: Label stored in the ``method`` field.

    Returns:
        Dict with ``method``, ``param`` (formatted alpha/power), ``coverage``
        (mean weight, used as a coverage proxy), ``avgPnL`` and ``winrate``.
        The last two are ``nan`` when the total weight is effectively zero.
    """
    similarity = eps['similarity'].to_numpy(dtype=float)
    # Guard against alpha >= 1, which would make the ramp width zero or negative.
    ramp_width = max(1e-9, 1.0 - float(alpha))
    ramp = np.clip((similarity - float(alpha)) / ramp_width, 0.0, 1.0)
    weights = np.power(ramp, float(power))

    total_weight = np.sum(weights)
    result = {
        'method': name,
        'param': f'alpha={alpha:.2f},p={power:.2f}',
        'coverage': float(np.mean(weights)),
        'avgPnL': float('nan'),
        'winrate': float('nan'),
    }
    if not (float(total_weight) <= 1e-12):
        pnl_values = eps['pnl_suggested'].to_numpy(dtype=float)
        result['avgPnL'] = float(np.sum(weights * pnl_values) / total_weight)
        win_values = eps['win_suggested'].to_numpy(dtype=float)
        result['winrate'] = float(np.sum(weights * win_values) / total_weight)
    return result

# Hard thresholds: 25 cut-offs between the 5th and 95th similarity percentiles.
sim_p05 = eps['similarity'].quantile(0.05)
sim_p95 = eps['similarity'].quantile(0.95)
thresholds = np.round(np.linspace(sim_p05, sim_p95, 25), 4)
hard = eval_hard_threshold(eps, thresholds)

# Top-k: geometrically spaced k from 50 up to half of the dataset.
largest_k = max(50, len(eps) // 2)
ks = np.unique(np.round(np.geomspace(50, largest_k, 20)).astype(int))
topk = eval_topk(eps, ks)

# Smooth weights: grid over ramp start (alpha) and exponent (power).
smooth_rows = [
    eval_smooth(eps, alpha=alpha, power=power, name='smooth')
    for alpha in [0.60, 0.70, 0.80, 0.85]
    for power in [1.0, 2.0, 4.0]
]
smooth = pd.DataFrame(smooth_rows)

# All gating rules in one table, best average PnL first.
gate = pd.concat([hard, topk, smooth], ignore_index=True)
gate.sort_values(by=['avgPnL', 'coverage'], ascending=[False, False]).head(15)

Unnamed: 0,method,param,coverage,avgPnL,winrate
40,topk,3016,0.167574,0.379751,0.213859
19,hard,0.9548,0.398767,0.36621,0.200641
18,hard,0.9492,0.452106,0.361033,0.200934
33,topk,445,0.024725,0.360974,0.235955
21,hard,0.9659,0.283143,0.360627,0.204082
42,topk,5210,0.289477,0.355911,0.204223
44,topk,8999,0.5,0.355781,0.199133
17,hard,0.9436,0.504056,0.354149,0.198743
22,hard,0.9715,0.207523,0.351888,0.207764
41,topk,3964,0.220247,0.351495,0.206609


In [9]:
# Coverage vs. average PnL for every gating rule; hover shows the rule's
# parameter and winrate.
gating_scatter_options = {
    'x': 'coverage',
    'y': 'avgPnL',
    'color': 'method',
    'hover_data': ['param', 'winrate'],
    'title': 'Gating tradeoff: coverage vs avgPnL (suggestedAction)',
}
fig = px.scatter(gate, **gating_scatter_options)
fig.update_layout(height=420)
fig

In [10]:
# Ten gating configurations with the highest average PnL (copied so later
# edits to `best` cannot touch `gate`).
gate_by_pnl = gate.sort_values(by='avgPnL', ascending=False)
best = gate_by_pnl.iloc[:10].copy()
best

Unnamed: 0,method,param,coverage,avgPnL,winrate
40,topk,3016.0,0.167574,0.379751,0.213859
19,hard,0.9548,0.398767,0.36621,0.200641
18,hard,0.9492,0.452106,0.361033,0.200934
33,topk,445.0,0.024725,0.360974,0.235955
21,hard,0.9659,0.283143,0.360627,0.204082
42,topk,5210.0,0.289477,0.355911,0.204223
44,topk,8999.0,0.5,0.355781,0.199133
17,hard,0.9436,0.504056,0.354149,0.198743
22,hard,0.9715,0.207523,0.351888,0.207764
41,topk,3964.0,0.220247,0.351495,0.206609


## Product guidance (how to pick threshold in prod)
Use these rules of thumb:
- pick a threshold to hit a target coverage (ops capacity) while keeping winrate above a minimum
- monitor calibration drift over time (run this notebook weekly and compare the curve)
- if similarity is monotonic but noisy, consider smooth weighting instead of a hard cutoff

Next step: extend this notebook to compute *per-anchor* calibration curves (true walk-forward), not just global pooling.