# RL Episode Gating Research (Plotly)

Goal: turn `rl-episodes` into quant research artifacts: similarity distributions, outcome quantiles, and a confidence-gating curve.

This is *not* online RL training. It is *retrieval + statistics* (optionally with simple ML later).


In [1]:
import os
import time
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from aipricepatterns import Client

pd.set_option('display.max_columns', 80)
pd.set_option('display.width', 160)

def to_ms_utc(dt: str) -> int:
    return int(datetime.strptime(dt, "%Y-%m-%d %H:%M").replace(tzinfo=timezone.utc).timestamp() * 1000)

def period_anchor_ms(days_ago: int) -> int:
    return int((time.time() - days_ago * 24 * 60 * 60) * 1000)

def safe_float(x, default=0.0) -> float:
    try:
        return float(x)
    except Exception:
        return float(default)

def map_suggested_action_to_pos(x) -> int:
    """Map suggestedAction to position: -1/0/+1."""
    if x is None:
        return 0
    if isinstance(x, (int, float)):
        v = int(x)
        if v in (-1, 0, 1):
            return int(v)
        if v in (0, 1, 2):
            return 1 if v == 1 else (-1 if v == 2 else 0)
        return 0
    if not isinstance(x, str):
        return 0
    s = x.strip().lower()
    if s in ("hold", "flat", "none", "neutral", "wait"):
        return 0
    if s in ("long", "buy", "bull", "up"):
        return 1
    if s in ("short", "sell", "bear", "down"):
        return -1
    return 0


## Parameters
Use `anchorTs` for reproducible “replay”. For “now”, set `ANCHOR_TS_MS` close to the latest bar (or use `currentState` in the API directly).


In [2]:
BASE_URL = os.getenv("AIPP_BASE_URL", "https://aipricepatterns.com/api/rust")
API_KEY = os.getenv("AIPP_API_KEY")

SYMBOL = os.getenv("AIPP_RL_SYMBOL", "BTCUSDT")
INTERVAL = os.getenv("AIPP_RL_INTERVAL", "1h")

# Example: "2025-10-10 15:00" (UTC)
ANCHOR_DT_UTC = os.getenv("AIPP_RESEARCH_ANCHOR_DT_UTC", "")
ANCHOR_TS_MS = int(os.getenv("AIPP_RL_ANCHOR_TS", "0")) or (to_ms_utc(ANCHOR_DT_UTC) if ANCHOR_DT_UTC else period_anchor_ms(30))

FORECAST_HORIZON = int(os.getenv("AIPP_RL_HORIZON", "24"))
NUM_EPISODES = int(os.getenv("AIPP_RL_NUM_EPISODES", "2000"))
MIN_SIMILARITY = float(os.getenv("AIPP_RL_MIN_SIMILARITY", "0.70"))
SAMPLING_STRATEGY = os.getenv("AIPP_RL_SAMPLING_STRATEGY", "uniform")

# Simple execution friction for policy simulations (pct-per-trade semantics: 0.04 = 0.04%)
TRADE_COST_PCT = float(os.getenv("AIPP_RL_TRADE_COST_PCT", "0.00"))

# Confidence gating threshold for suggestedAction
SUGGESTED_MIN_SIM = float(os.getenv("AIPP_RL_SUGGESTED_MIN_SIMILARITY", "0.90"))

print("Base URL:", BASE_URL)
print(f"Symbol: {SYMBOL}  Interval: {INTERVAL}")
print(f"AnchorTs: {ANCHOR_TS_MS}  (dt={datetime.fromtimestamp(ANCHOR_TS_MS/1000, tz=timezone.utc)})")
print(f"Episodes: {NUM_EPISODES}  minSimilarity={MIN_SIMILARITY:.2f}  horizon={FORECAST_HORIZON}  sampling={SAMPLING_STRATEGY}")
print(f"TradeCostPct: {TRADE_COST_PCT:.4f}%")
print(f"Gating: suggestedMinSimilarity={SUGGESTED_MIN_SIM:.2f}")


Base URL: https://aipricepatterns.com/api/rust
Symbol: BTCUSDT  Interval: 1h
AnchorTs: 1763646319562  (dt=2025-11-20 13:45:19.562000+00:00)
Episodes: 2000  minSimilarity=0.70  horizon=24  sampling=uniform
TradeCostPct: 0.0000%
Gating: suggestedMinSimilarity=0.90


In [3]:
client = Client(base_url=BASE_URL, api_key=API_KEY)
res = client.get_rl_episodes(
    symbol=SYMBOL,
    interval=INTERVAL,
    anchor_ts=ANCHOR_TS_MS,
    forecast_horizon=FORECAST_HORIZON,
    num_episodes=NUM_EPISODES,
    min_similarity=MIN_SIMILARITY,
    include_actions=True,
    reward_type="returns",
    sampling_strategy=SAMPLING_STRATEGY,
)
episodes = res.get("episodes") if isinstance(res, dict) else None
if not isinstance(episodes, list) or not episodes:
    raise RuntimeError("No episodes returned. Lower MIN_SIMILARITY or change ANCHOR_TS_MS.")
print("episodes:", len(episodes))


episodes: 251


## Optional: actually train an RL agent (PPO)



Up to now we did **retrieval + statistics** (great for gating). This section trains a tiny PPO agent on the fixed episode trajectories returned by `rl-episodes`.



Notes:

- This is a *toy offline training loop* on replayed episodes (not production-grade online RL).

- Requires `gymnasium` + `stable-baselines3`.


In [4]:
# If you don't have these deps yet:

# /Users/serg/projects/prod/ai_patterns/.venv/bin/python -m pip install gymnasium stable-baselines3


import numpy as np
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv



class ReplayEpisodeEnv(gym.Env):
    metadata = {"render_modes": []}

    def __init__(
        self, episodes, trade_cost_pct=0.0, dd_penalty=0.0, horizon=None, seed=0
    ):
        super().__init__()

        self.episodes = [
            ep
            for ep in episodes
            if isinstance(ep, dict)
            and isinstance(ep.get("transitions"), list)
            and ep.get("transitions")
        ]

        if not self.episodes:
            raise ValueError("No usable episodes with transitions")

        self.trade_cost_pct = float(trade_cost_pct)

        self.dd_penalty = float(dd_penalty)

        self.horizon = int(horizon) if horizon is not None else None

        self.rng = np.random.default_rng(seed)

        self.action_space = spaces.Discrete(3)

        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(5,), dtype=np.float32
        )

        self._ep = None

        self._ts = None

        self._i = 0

        self._pos = 0

        self._prev_dd = 0.0

    def _obs(self):
        t = self._ts[self._i]

        price = float(t.get("price", 0.0))

        vol = float(t.get("volatility", 0.0))

        cumret = float(t.get("cumulativeReturn", 0.0))

        dd = float(t.get("maxDrawdown", 0.0))

        sim = float(self._ep.get("similarity", 0.0))

        log_price = float(np.log(max(price, 1e-12)))

        return np.array([log_price, vol, cumret, dd, sim], dtype=np.float32)

    def reset(self, *, seed=None, options=None):
        super().reset(seed=seed)

        self._ep = self.episodes[int(self.rng.integers(0, len(self.episodes)))]

        self._ts = self._ep["transitions"]

        self._i = 0

        self._pos = 0

        self._prev_dd = (
            float(self._ts[0].get("maxDrawdown", 0.0)) if self._ts else 0.0
        )

        return self._obs(), {}

    def step(self, action):
        # map action -> position

        new_pos = 0 if int(action) == 0 else (1 if int(action) == 1 else -1)

        trade_cost = 0.0

        if new_pos != self._pos and self.trade_cost_pct != 0.0:
            trade_cost = abs(new_pos - self._pos) * (self.trade_cost_pct / 100.0)

        self._pos = new_pos

        t = self._ts[self._i]

        ret = float(t.get("ret", t.get("return", 0.0)))

        dd = float(t.get("maxDrawdown", 0.0))

        dd_increase = max(0.0, dd - self._prev_dd)

        self._prev_dd = dd

        reward = float(self._pos) * ret - trade_cost - self.dd_penalty * dd_increase

        self._i += 1

        done = self._i >= len(self._ts) or (
            self.horizon is not None and self._i >= self.horizon
        )

        if done:
            # dummy obs at terminal

            obs = np.zeros((5,), dtype=np.float32)

        else:
            obs = self._obs()

        return obs, reward, done, False, {"ret": ret, "pos": self._pos}

RL_TRAIN_STEPS = int(os.getenv("AIPP_RL_NOTEBOOK_TRAIN_STEPS", "20000"))

RL_TRADE_COST_PCT = float(os.getenv("AIPP_RL_TRADE_COST_PCT", str(TRADE_COST_PCT)))

RL_DD_PENALTY = float(os.getenv("AIPP_RL_DD_PENALTY", "0.0"))

def make_env():
    return ReplayEpisodeEnv(
        episodes,
        trade_cost_pct=RL_TRADE_COST_PCT,
        dd_penalty=RL_DD_PENALTY,
        horizon=FORECAST_HORIZON,
        seed=42,
    )

env = DummyVecEnv([make_env])

model = PPO("MlpPolicy", env, verbose=0)

model.learn(total_timesteps=RL_TRAIN_STEPS)

print("trained PPO steps:", RL_TRAIN_STEPS)

def eval_policy(policy_name: str, n_episodes: int = 50):
    # Reuse the same env definition but do manual rollouts.

    base = make_env()

    rewards = []

    for _ in range(n_episodes):
        obs, _ = base.reset()

        done = False

        total = 0.0

        while not done:
            if policy_name == "ppo":
                a, _ = model.predict(obs, deterministic=True)

                action = int(a)

            elif policy_name == "alwaysFlat":
                action = 0

            elif policy_name == "alwaysLong":
                action = 1

            elif policy_name == "alwaysShort":
                action = 2

            else:
                action = 0

            obs, r, done, _, _ = base.step(action)

            total += float(r)

        rewards.append(total)

    return {
        "policy": policy_name,
        "avgReward": float(np.mean(rewards)),
        "medReward": float(np.median(rewards)),
        "p10": float(np.quantile(rewards, 0.10)),
        "p90": float(np.quantile(rewards, 0.90)),
    }

rl_summary = pd.DataFrame(
    [
        eval_policy("ppo"),
        eval_policy("alwaysFlat"),
        eval_policy("alwaysLong"),
        eval_policy("alwaysShort"),
    ]
).sort_values("avgReward", ascending=False)

rl_summary


trained PPO steps: 20000


Unnamed: 0,policy,avgReward,medReward,p10,p90
0,ppo,2.943804,2.45705,0.83953,4.52616
3,alwaysShort,0.30368,-0.10325,-2.62325,3.47961
1,alwaysFlat,0.0,0.0,0.0,0.0
2,alwaysLong,-0.30368,0.10325,-3.47961,2.62325


### Train/test split + learning curve (more honest)



This evaluates generalization: train PPO on earlier episodes and measure performance on later episodes.



It also plots a simple learning curve vs baselines on the **test** split.


In [5]:
# Sort by episode start time, then take an 80/20 split.
import plotly.graph_objects as go

usable_eps = [
    ep
    for ep in episodes
    if isinstance(ep, dict)
    and isinstance(ep.get("transitions"), list)
    and ep.get("transitions")
]

usable_eps = sorted(usable_eps, key=lambda ep: int(ep.get("startTs", 0)))

split = max(1, int(0.8 * len(usable_eps)))

train_eps = usable_eps[:split]

test_eps = usable_eps[split:]

if len(test_eps) == 0:
    # fallback if too few episodes

    train_eps = usable_eps

    test_eps = usable_eps


print(
    "episodes usable:",
    len(usable_eps),
    "train:",
    len(train_eps),
    "test:",
    len(test_eps),
)


def make_env_for(eps, seed=0):
    return ReplayEpisodeEnv(
        eps,
        trade_cost_pct=RL_TRADE_COST_PCT,
        dd_penalty=RL_DD_PENALTY,
        horizon=FORECAST_HORIZON,
        seed=seed,
    )


def eval_on_eps(model_or_none, eps, n_eval=100, policy="ppo"):
    base = make_env_for(eps, seed=123)

    rewards = []

    for _ in range(n_eval):
        obs, _ = base.reset()

        done = False

        total = 0.0

        while not done:
            if policy == "ppo":
                a, _ = model_or_none.predict(obs, deterministic=True)

                action = int(a)

            elif policy == "alwaysFlat":
                action = 0

            elif policy == "alwaysLong":
                action = 1

            elif policy == "alwaysShort":
                action = 2

            else:
                action = 0

            obs, r, done, _, _ = base.step(action)

            total += float(r)

        rewards.append(total)

    rewards = np.asarray(rewards, dtype=float)

    return {
        "avg": float(rewards.mean()),
        "med": float(np.median(rewards)),
        "p10": float(np.quantile(rewards, 0.10)),
        "p90": float(np.quantile(rewards, 0.90)),
    }


# Learning curve: train in chunks and evaluate on TEST after each chunk.

chunks = [0, 2000, 5000, 10000, 20000]

chunks = [c for c in chunks if c <= RL_TRAIN_STEPS]

if chunks[-1] != RL_TRAIN_STEPS:
    chunks.append(RL_TRAIN_STEPS)

chunks = sorted(set(chunks))


env_train = DummyVecEnv([lambda: make_env_for(train_eps, seed=42)])

lc_model = PPO("MlpPolicy", env_train, verbose=0)


rows_lc = []

prev = 0

for steps in chunks:
    delta = steps - prev

    if delta > 0:
        lc_model.learn(total_timesteps=delta)

    prev = steps

    m = eval_on_eps(lc_model, test_eps, n_eval=100, policy="ppo")

    rows_lc.append(
        {"trainSteps": int(steps), "avg": m["avg"], "p10": m["p10"], "p90": m["p90"]}
    )


# Baselines on the same TEST split.

b_flat = eval_on_eps(None, test_eps, n_eval=200, policy="alwaysFlat")

b_long = eval_on_eps(None, test_eps, n_eval=200, policy="alwaysLong")

b_short = eval_on_eps(None, test_eps, n_eval=200, policy="alwaysShort")


lc = pd.DataFrame(rows_lc)

baseline_tbl = pd.DataFrame(
    [
        {"policy": "alwaysFlat", **b_flat},
        {"policy": "alwaysLong", **b_long},
        {"policy": "alwaysShort", **b_short},
    ]
).sort_values("avg", ascending=False)


display(lc)

display(baseline_tbl)




fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=lc["trainSteps"], y=lc["avg"], mode="lines+markers", name="PPO avg (test)"
    )
)

fig.add_trace(
    go.Scatter(x=lc["trainSteps"], y=lc["p10"], mode="lines", name="PPO p10 (test)")
)

fig.add_trace(
    go.Scatter(x=lc["trainSteps"], y=lc["p90"], mode="lines", name="PPO p90 (test)")
)


for row in baseline_tbl.itertuples(index=False):
    fig.add_hline(
        y=float(row.avg),
        line_dash="dot",
        annotation_text=str(row.policy),
        annotation_position="top left",
    )


fig.update_layout(
    title="Learning curve on TEST split (PPO vs baselines)",
    xaxis_title="training timesteps",
    yaxis_title="episode reward (net)",
    height=460,
)

fig


episodes usable: 251 train: 200 test: 51


Unnamed: 0,trainSteps,avg,p10,p90
0,0,-0.098654,-3.05,4.2565
1,2000,1.568448,-0.636,4.8094
2,5000,2.376238,0.259,5.63612
3,10000,2.451176,0.4864,5.63612
4,20000,2.455048,0.4864,5.61055


Unnamed: 0,policy,avg,med,p10,p90
1,alwaysLong,0.137506,0.6461,-4.2565,2.56593
0,alwaysFlat,0.0,0.0,0.0,0.0
2,alwaysShort,-0.137506,-0.6461,-2.56593,4.2565


## Episode-level metrics
We compute simple realized PnL for a few policies on each episode:
- `alwaysFlat`
- `alwaysLong`
- `alwaysShort`
- `suggested` (follow `suggestedAction` each step, if provided)
- `suggestedIfConf` (only follow suggested if `episode.similarity >= threshold`, else flat)


In [6]:
def simulate_policy(ep: dict, policy: str) -> dict:
    ts = ep.get("transitions")
    if not isinstance(ts, list) or not ts:
        return {"steps": 0, "pnl": 0.0, "cost": 0.0, "trades": 0}
    sim = safe_float(ep.get("similarity"), 0.0)
    pos = 0
    pnl = 0.0
    cost = 0.0
    trades = 0
    steps = min(len(ts), FORECAST_HORIZON)
    for i in range(steps):
        t = ts[i] if isinstance(ts[i], dict) else {}
        ret = safe_float(t.get("ret", t.get("return", 0.0)), 0.0)
        old_pos = pos
        if policy == "alwaysFlat":
            pos = 0
        elif policy == "alwaysLong":
            pos = 1
        elif policy == "alwaysShort":
            pos = -1
        elif policy == "suggested":
            pos = map_suggested_action_to_pos(t.get("suggestedAction"))
        elif policy == "suggestedIfConf":
            pos = map_suggested_action_to_pos(t.get("suggestedAction")) if sim >= SUGGESTED_MIN_SIM else 0
        else:
            pos = 0

        if pos != old_pos and TRADE_COST_PCT != 0.0:
            trades += 1
            cost += abs(pos - old_pos) * (TRADE_COST_PCT / 100.0)

        pnl += float(pos) * ret
    return {"steps": steps, "pnl": float(pnl), "cost": float(cost), "trades": int(trades)}

rows = []
for ep in episodes:
    if not isinstance(ep, dict):
        continue
    sim = safe_float(ep.get("similarity"), 0.0)
    r_flat = simulate_policy(ep, "alwaysFlat")
    r_long = simulate_policy(ep, "alwaysLong")
    r_short = simulate_policy(ep, "alwaysShort")
    r_sug = simulate_policy(ep, "suggested")
    r_conf = simulate_policy(ep, "suggestedIfConf")
    rows.append({
        "similarity": sim,
        "pnl_flat": r_flat["pnl"] - r_flat["cost"],
        "pnl_long": r_long["pnl"] - r_long["cost"],
        "pnl_short": r_short["pnl"] - r_short["cost"],
        "pnl_suggested": r_sug["pnl"] - r_sug["cost"],
        "pnl_suggestedIfConf": r_conf["pnl"] - r_conf["cost"],
        "trades_suggested": r_sug["trades"],
        "trades_conf": r_conf["trades"],
    })

df = pd.DataFrame(rows).sort_values("similarity", ascending=False)
df.head(10)


Unnamed: 0,similarity,pnl_flat,pnl_long,pnl_short,pnl_suggested,pnl_suggestedIfConf,trades_suggested,trades_conf
0,0.9512,0.0,-0.1967,0.1967,5.9339,5.9339,0,0
1,0.9464,0.0,1.8693,-1.8693,9.2271,9.2271,0,0
2,0.9377,0.0,0.3068,-0.3068,5.9339,5.9339,0,0
3,0.9349,0.0,-9.6435,9.6435,18.257,18.257,0,0
4,0.9311,0.0,0.0748,-0.0748,5.9339,5.9339,0,0
5,0.9283,0.0,2.4025,-2.4025,8.4629,8.4629,0,0
6,0.9271,0.0,0.9383,-0.9383,2.5789,2.5789,0,0
7,0.9268,0.0,-7.4956,7.4956,22.3791,22.3791,0,0
8,0.9219,0.0,-9.1424,9.1424,20.6626,20.6626,0,0
9,0.9199,0.0,0.5417,-0.5417,2.5789,2.5789,0,0


## Similarity distribution


In [7]:
fig = px.histogram(df, x="similarity", nbins=40, title="Episode similarity distribution")
fig.update_layout(height=360)
fig


## Similarity vs outcomes
A quick diagnostic: do higher-similarity episodes actually have better outcomes for the policy?


In [8]:
fig = px.scatter(df, x="similarity", y="pnl_suggested", title="Similarity vs net PnL (suggested)")



# Add a simple linear fit without extra deps (no statsmodels).

x = df["similarity"].to_numpy(dtype=float)

y = df["pnl_suggested"].to_numpy(dtype=float)

if len(x) >= 2:

    m, b = np.polyfit(x, y, 1)

    xs = np.linspace(float(np.min(x)), float(np.max(x)), 50)

    ys = m * xs + b

    fig.add_trace(go.Scatter(x=xs, y=ys, mode="lines", name="linear fit"))



fig.update_layout(height=360)

fig


## Confidence gating curve
We sweep a similarity threshold and compute:
- coverage (fraction of episodes that pass the threshold)
- avg/median PnL for suggested policy on passing episodes
- win rate on passing episodes


In [9]:
thr_min = float(df["similarity"].min())
thr_max = float(df["similarity"].max())
thresholds = np.linspace(thr_min, thr_max, num=16)

curve_rows = []
for thr in thresholds:
    sub = df[df["similarity"] >= thr]
    if len(sub) == 0:
        continue
    pnl = sub["pnl_suggested"].astype(float)
    curve_rows.append({
        "threshold": float(thr),
        "count": int(len(sub)),
        "coverage": float(len(sub) / len(df)),
        "avgPnl": float(pnl.mean()),
        "medPnl": float(pnl.median()),
        "p10": float(pnl.quantile(0.10)),
        "p90": float(pnl.quantile(0.90)),
        "winRate": float((pnl > 0).mean()),
    })

curve = pd.DataFrame(curve_rows)
curve


Unnamed: 0,threshold,count,coverage,avgPnl,medPnl,p10,p90,winRate
0,0.8543,251,1.0,5.879608,4.3823,0.0,13.2598,0.884462
1,0.86076,201,0.800797,5.86154,4.1931,0.0,13.2598,0.875622
2,0.86722,142,0.565737,5.622022,3.935,0.0,13.2598,0.852113
3,0.87368,99,0.394422,5.761405,3.9532,0.0,13.76766,0.878788
4,0.88014,72,0.286853,6.166389,4.15715,0.56669,14.43441,0.916667
5,0.8866,52,0.207171,5.664302,3.935,0.7862,14.27329,0.923077
6,0.89306,34,0.135458,6.576712,4.93335,0.93416,14.42363,0.941176
7,0.89952,28,0.111554,7.157782,5.56505,1.13144,15.58496,0.928571
8,0.90598,19,0.075697,7.854458,5.9339,1.53304,18.73812,0.894737
9,0.91244,14,0.055777,9.847114,7.1984,2.96044,19.94092,1.0


In [10]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=curve["threshold"], y=curve["coverage"], mode="lines+markers", name="coverage"))
fig.add_trace(go.Scatter(x=curve["threshold"], y=curve["avgPnl"], mode="lines+markers", name="avgPnl", yaxis="y2"))
fig.add_trace(go.Scatter(x=curve["threshold"], y=curve["p10"], mode="lines", name="p10", yaxis="y2"))
fig.add_trace(go.Scatter(x=curve["threshold"], y=curve["p90"], mode="lines", name="p90", yaxis="y2"))
fig.update_layout(
    title="Gating curve: threshold vs coverage and PnL quantiles",
    xaxis_title="similarity threshold",
    yaxis=dict(title="coverage"),
    yaxis2=dict(title="net PnL", overlaying="y", side="right"),
    height=420,
)
fig


## Policy comparison summary
Quick aggregated view to sanity-check whether confidence gating helps (and how it trades off coverage).


In [11]:
summary = pd.DataFrame({
    "policy": ["alwaysFlat", "alwaysLong", "alwaysShort", "suggested", "suggestedIfConf"],
    "avgPnl": [
        float(df["pnl_flat"].mean()),
        float(df["pnl_long"].mean()),
        float(df["pnl_short"].mean()),
        float(df["pnl_suggested"].mean()),
        float(df["pnl_suggestedIfConf"].mean()),
    ],
    "medPnl": [
        float(df["pnl_flat"].median()),
        float(df["pnl_long"].median()),
        float(df["pnl_short"].median()),
        float(df["pnl_suggested"].median()),
        float(df["pnl_suggestedIfConf"].median()),
    ],
    "winRate": [
        float((df["pnl_flat"] > 0).mean()),
        float((df["pnl_long"] > 0).mean()),
        float((df["pnl_short"] > 0).mean()),
        float((df["pnl_suggested"] > 0).mean()),
        float((df["pnl_suggestedIfConf"] > 0).mean()),
    ],
})
summary.sort_values(["avgPnl"], ascending=False)


Unnamed: 0,policy,avgPnl,medPnl,winRate
3,suggested,5.879608,4.3823,0.884462
4,suggestedIfConf,0.774849,0.0,0.099602
1,alwaysLong,0.013672,0.177,0.561753
0,alwaysFlat,0.0,0.0,0.0
2,alwaysShort,-0.013672,-0.177,0.438247


In [12]:
fig = go.Figure()
for col, name in [
    ("pnl_long", "alwaysLong"),
    ("pnl_short", "alwaysShort"),
    ("pnl_suggested", "suggested"),
    ("pnl_suggestedIfConf", "suggestedIfConf"),
]:
    fig.add_trace(go.Box(y=df[col], name=name, boxmean=True))
fig.update_layout(title="Outcome distributions by policy (net PnL)", height=420)
fig


## Similarity bins (does gating make sense?)
Binning similarity helps see whether higher-similarity episodes are actually “better” for the suggested policy.


In [13]:
bins = pd.qcut(df["similarity"].astype(float), q=10, duplicates="drop")
binned = (
    df.assign(sim_bin=bins)
      .groupby("sim_bin", as_index=False)
      .agg(
          count=("similarity", "size"),
          simMin=("similarity", "min"),
          simMax=("similarity", "max"),
          avgSuggested=("pnl_suggested", "mean"),
          medSuggested=("pnl_suggested", "median"),
          winSuggested=("pnl_suggested", lambda s: float((s > 0).mean())),
      )
)
binned






Unnamed: 0,sim_bin,count,simMin,simMax,avgSuggested,medSuggested,winSuggested
0,"(0.853, 0.858]",26,0.8543,0.8575,6.107112,4.76195,0.884615
1,"(0.858, 0.861]",26,0.8579,0.8608,5.486054,5.1411,0.961538
2,"(0.861, 0.864]",24,0.8609,0.8639,6.4413,3.9996,0.916667
3,"(0.864, 0.866]",26,0.8641,0.8665,5.765408,5.38955,0.923077
4,"(0.866, 0.869]",24,0.8667,0.8694,8.130008,5.1686,0.875
5,"(0.869, 0.873]",25,0.8695,0.8733,3.745556,3.5766,0.76
6,"(0.873, 0.88]",26,0.8734,0.8798,5.166169,4.46655,0.807692
7,"(0.88, 0.887]",24,0.88,0.8872,6.568254,4.55945,0.833333
8,"(0.887, 0.901]",25,0.8878,0.9007,3.905884,2.8228,0.96
9,"(0.901, 0.951]",25,0.9009,0.9512,7.660116,5.6245,0.92


In [14]:
fig = go.Figure()
fig.add_trace(go.Bar(x=binned["sim_bin"].astype(str), y=binned["avgSuggested"], name="avgSuggested"))
fig.add_trace(go.Scatter(x=binned["sim_bin"].astype(str), y=binned["winSuggested"], name="winRate", yaxis="y2"))
fig.update_layout(
    title="Suggested policy by similarity deciles",
    xaxis_title="similarity bin (deciles)",
    yaxis=dict(title="avg net PnL"),
    yaxis2=dict(title="win rate", overlaying="y", side="right", rangemode="tozero"),
    height=420,
)
fig


## Gating curve for `suggestedIfConf`
This uses the *sweep threshold* policy itself (flat if below threshold), so it directly answers: “what threshold maximizes expected net PnL?”


In [15]:
thresholds = np.linspace(float(df["similarity"].min()), float(df["similarity"].max()), num=21)
rows2 = []
for thr in thresholds:
    active = df[df["similarity"] >= thr]
    if len(active) == 0:
        continue
    pnl_active = active["pnl_suggested"].astype(float)
    # If we go flat below threshold, overall expected PnL is: mean(pnl on active) * coverage
    coverage = float(len(active) / len(df))
    rows2.append({
        "threshold": float(thr),
        "coverage": coverage,
        "avgActive": float(pnl_active.mean()),
        "expectedOverall": float(pnl_active.mean() * coverage),
        "winActive": float((pnl_active > 0).mean()),
    })
curve2 = pd.DataFrame(rows2)
curve2


Unnamed: 0,threshold,coverage,avgActive,expectedOverall,winActive
0,0.8543,1.0,5.879608,5.879608,0.884462
1,0.859145,0.840637,5.890058,4.951403,0.881517
2,0.86399,0.697211,5.827247,4.062822,0.868571
3,0.868835,0.517928,5.408504,2.801217,0.853846
4,0.87368,0.394422,5.761405,2.272427,0.878788
5,0.878525,0.330677,5.917731,1.956859,0.891566
6,0.88337,0.243028,5.877593,1.428419,0.934426
7,0.888215,0.191235,5.601846,1.071269,0.9375
8,0.89306,0.135458,6.576712,0.890869,0.941176
9,0.897905,0.119522,7.122193,0.851258,0.933333


In [16]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=curve2["threshold"], y=curve2["expectedOverall"], mode="lines+markers", name="expectedOverall"))
fig.add_trace(go.Scatter(x=curve2["threshold"], y=curve2["coverage"], mode="lines+markers", name="coverage", yaxis="y2"))
fig.update_layout(
    title="Threshold sweep (flat below thr): expected overall PnL vs coverage",
    xaxis_title="similarity threshold",
    yaxis=dict(title="expected overall net PnL"),
    yaxis2=dict(title="coverage", overlaying="y", side="right"),
    height=420,
)
fig


## Optional: simple ML baseline (requires scikit-learn)
If you install `scikit-learn`, you can fit a quick classifier to predict whether the suggested policy is profitable, using basic episode-level features.


In [17]:
try:
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score
    SKLEARN_OK = True
except Exception as e:
    SKLEARN_OK = False
    print("scikit-learn not installed. To enable this cell:")
    print("  /Users/serg/projects/prod/ai_patterns/.venv/bin/python -m pip install scikit-learn")
    print("Error:", e)

if SKLEARN_OK:
    X = df[["similarity", "trades_suggested"]].fillna(0.0).astype(float)
    y = (df["pnl_suggested"] > 0).astype(int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
    m = LogisticRegression(max_iter=200)
    m.fit(X_train, y_train)
    p = m.predict_proba(X_test)[:, 1]
    print("AUC:", roc_auc_score(y_test, p))


AUC: 0.5255102040816326
