# Turning Massive Grid: Post Analysis

This notebook reads saved grid-search artifacts and performs focused post-analysis.
It does **not** rerun the massive grid search.


In [None]:
from __future__ import annotations

import sys
from pathlib import Path
from typing import Any, Dict

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight

# Locate the repository root: start from the working directory and, when it
# has no RESEARCH folder, switch to the nearest ancestor that does. If no
# ancestor qualifies, the working directory is kept.
PROJECT_ROOT = Path.cwd().resolve()
if not (PROJECT_ROOT / "RESEARCH").exists():
    PROJECT_ROOT = next(
        (ancestor for ancestor in PROJECT_ROOT.parents if (ancestor / "RESEARCH").exists()),
        PROJECT_ROOT,
    )

# Put the root first on sys.path (once) so the project imports below resolve.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from RESEARCH.config import cfg as project_cfg
from RESEARCH.data_loader import load_market_data
from RESEARCH.model_training import check_cuda_available
from src.models.xgb import XGBBaseline

from RESEARCH2.Moon_cycles.turning_points import TurningPointLabelConfig, label_turning_points
from RESEARCH2.Moon_cycles.turning_targets import build_turning_target_frame, merge_features_with_turning_target
from RESEARCH2.Moon_cycles.turning_astro_features import TurningAstroFeatureConfig, build_turning_astro_feature_set
from RESEARCH2.Moon_cycles.eval_utils import compute_binary_metrics
from RESEARCH2.Moon_cycles.splits import make_classic_split

# Show every column and keep wide frames on a single row block when displayed.
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)

# Record the interpreter version and the resolved project root in the cell output.
print("Python:", sys.version.split()[0])
print("PROJECT_ROOT:", PROJECT_ROOT)


In [None]:
# Analysis settings
# RUN_TAG is the file-name prefix of the saved grid-search artifacts read below.
RUN_TAG = "turning_massive_label_grid"
# Market rows dated before DATA_START are dropped when the shared data is built.
DATA_START = "2017-11-01"
SEED = 42  # random_state passed to every model refit in this notebook
# Ratios handed to make_classic_split; the remainder is presumably the test
# share — confirm in RESEARCH2.Moon_cycles.splits.
TRAIN_RATIO = 0.70
VAL_RATIO = 0.15

# Honest candidate rules (can be tightened/relaxed).
# *_recall_min values are lower bounds and *_recall_gap values are upper bounds
# applied to the checkpoint columns of the same name. The strict rule is tried
# first; the relaxed rule is used only when strict matches no rows.
HONEST_RULE_STRICT = {
    "test_recall_min": 0.55,
    "test_recall_gap": 0.25,
    "val_recall_min": 0.50,
    "val_recall_gap": 0.25,
}
HONEST_RULE_RELAXED = {
    "test_recall_min": 0.50,
    "test_recall_gap": 0.35,
    "val_recall_min": 0.45,
    "val_recall_gap": 0.35,
}

# Prefer the project's configured reports directory; otherwise fall back to
# <PROJECT_ROOT>/data/market/reports.
REPORTS_DIR = project_cfg.reports_dir if hasattr(project_cfg, "reports_dir") else (PROJECT_ROOT / "data" / "market" / "reports")
CHECKPOINT_PATH = Path(REPORTS_DIR) / f"{RUN_TAG}_checkpoint.csv"
DONE_PAIRS_PATH = Path(REPORTS_DIR) / f"{RUN_TAG}_done_pairs.txt"

# The checkpoint is mandatory for everything below; the done-pairs file is
# only reported, not required.
if not CHECKPOINT_PATH.exists():
    raise FileNotFoundError(f"Checkpoint not found: {CHECKPOINT_PATH}")

print("CHECKPOINT:", CHECKPOINT_PATH)
print("DONE_PAIRS:", DONE_PAIRS_PATH, "exists=", DONE_PAIRS_PATH.exists())


In [None]:
def sort_results_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Return grid-search rows ranked from best to worst.

    Ranking priority: feasible rows first, then the smallest total constraint
    violation, then the highest ``test_profit_y_obj``, ``test_profit_y`` and
    ``test_recall_min``, then the smallest ``test_recall_gap``, then the
    highest ``mcc``. Ranking columns missing from ``df`` are added with a
    worst-case filler so the sort never fails on an older checkpoint.

    An empty frame is returned unchanged (same object); otherwise a sorted
    copy is returned and ``df`` itself is not modified. The original index
    labels are kept.
    """
    if df.empty:
        return df

    # (column, filler used when the column is absent, sort ascending?)
    ranking = (
        ("is_feasible", 0, False),
        ("total_constraint_violation", 1e9, True),
        ("test_profit_y_obj", -1e9, False),
        ("test_profit_y", -1e9, False),
        ("test_recall_min", -1.0, False),
        ("test_recall_gap", 1e9, True),
        ("mcc", -1e9, False),
    )

    ranked = df.copy()
    for column, filler, _ in ranking:
        if column not in ranked.columns:
            ranked[column] = filler

    return ranked.sort_values(
        [column for column, _, _ in ranking],
        ascending=[ascending for _, _, ascending in ranking],
    )


def _extract_label_cfg(row: pd.Series) -> Dict[str, Any]:
    """Collect the turning-point label parameters stored in one checkpoint row.

    Each value is read from the ``label_<name>`` column and cast to the type
    listed below; a missing column raises ``KeyError``.
    """
    field_casts = (
        ("up_move_pct", float),
        ("down_move_pct", float),
        ("cluster_gap_days", int),
        ("min_turn_gap_days", int),
        ("past_horizon_days", int),
        ("past_up_move_pct", float),
        ("past_down_move_pct", float),
    )
    return {name: cast(row[f"label_{name}"]) for name, cast in field_casts}


def _extract_target_cfg(row: pd.Series) -> Dict[str, Any]:
    """Collect the target-construction parameters stored in one checkpoint row.

    The common fields (``mode``, ``min_weight``, ``use_amplitude_weight``) are
    always read; the remaining fields depend on ``target_mode``. Every value
    comes from the ``target_<name>`` column.

    Raises:
        ValueError: if ``target_mode`` is neither ``window_kernel`` nor
            ``segment_midpoint``.
    """
    # Extra (name, cast) pairs required by each supported mode.
    fields_by_mode = {
        "window_kernel": (
            ("window_radius_days", int),
            ("window_distance_power", float),
        ),
        "segment_midpoint": (
            ("segment_center_power", float),
            ("segment_direction_anchor", str),
            ("include_last_open_segment", bool),
            ("segment_open_tail_direction_mode", str),
            ("segment_open_tail_min_move_pct", float),
        ),
    }

    mode = str(row["target_mode"])
    target_cfg = {
        "mode": mode,
        "min_weight": float(row["target_min_weight"]),
        "use_amplitude_weight": bool(row["target_use_amplitude_weight"]),
    }

    mode_fields = fields_by_mode.get(mode)
    if mode_fields is None:
        raise ValueError(f"Unsupported target_mode: {mode}")

    for name, cast in mode_fields:
        target_cfg[name] = cast(row[f"target_{name}"])
    return target_cfg


def _extract_model_cfg(row: pd.Series) -> Dict[str, Any]:
    """Collect the model hyper-parameters stored in one checkpoint row.

    Each value is read from the ``model_<name>`` column and cast to the type
    listed below; a missing column raises ``KeyError``.
    """
    field_casts = (
        ("n_estimators", int),
        ("max_depth", int),
        ("learning_rate", float),
        ("subsample", float),
        ("colsample_bytree", float),
        ("early_stopping_rounds", int),
    )
    return {name: cast(row[f"model_{name}"]) for name, cast in field_casts}


def _safe_predict_proba_up(model_obj: XGBBaseline, X: np.ndarray) -> np.ndarray:
    """Return the predicted probability of class 1 for every row of ``X``.

    When the wrapper exposes a non-None ``constant_class`` the model is not
    consulted: the result is all ones if that class is 1 and all zeros
    otherwise. For a regular model the features are passed through
    ``model_obj.scaler`` and the booster is switched to CPU for the prediction
    call, then switched back if the wrapper's ``device`` starts with "cuda".
    Both device switches are best-effort and their failures are ignored.
    """
    constant = getattr(model_obj, "constant_class", None)
    if constant is not None:
        fill_value = 1.0 if int(constant) == 1 else 0.0
        return np.full(X.shape[0], fill_value, dtype=float)

    scaled = model_obj.scaler.transform(X)

    switched_booster = None
    original_device = None
    try:
        candidate = model_obj.model.get_booster()
        original_device = str(getattr(model_obj, "device", "cpu"))
        candidate.set_param({"device": "cpu"})
    except Exception:
        # Best-effort: without a booster handle we simply predict as-is.
        switched_booster = None
    else:
        switched_booster = candidate

    try:
        up_column = model_obj.model.predict_proba(scaled)[:, 1]
    finally:
        must_restore = (
            switched_booster is not None
            and original_device
            and original_device.startswith("cuda")
        )
        if must_restore:
            try:
                switched_booster.set_param({"device": original_device})
            except Exception:
                # Restoring the device is best-effort as well.
                pass

    return np.asarray(up_column, dtype=float)


def _profit_y(y_pred: np.ndarray, next_ret: np.ndarray) -> float:
    """Mean signed return of following the predictions.

    A prediction of 1 takes the return as-is (+1 position); any other value
    takes its negative (-1 position). Returns 0.0 when ``next_ret`` is empty.
    """
    is_up = np.asarray(y_pred, dtype=np.int32) == 1
    position = np.where(is_up, 1.0, -1.0)
    returns = np.asarray(next_ret, dtype=float)
    if len(returns) == 0:
        return 0.0
    return float(np.mean(position * returns))


def _weighted_move_vector(next_ret: np.ndarray, sample_weight: np.ndarray, power: float = 1.5, clip_q: float = 0.98) -> np.ndarray:
    """Per-sample weights that grow with the size of the move.

    The absolute return is divided by its ``clip_q`` quantile, clipped to
    [0, 1], raised to ``power`` and multiplied by ``sample_weight``. The
    result is floored at 1e-8. An empty input yields an empty float array.

    If the quantile is not a positive finite number, the largest non-NaN
    absolute return is used as the cap instead, and 1.0 if that is unusable
    too.
    """
    magnitude = np.abs(np.asarray(next_ret, dtype=float))
    if magnitude.size == 0:
        return np.array([], dtype=float)

    cap = float(np.quantile(magnitude, clip_q))
    if not (np.isfinite(cap) and cap > 0.0):
        largest = np.nanmax(magnitude)
        cap = float(largest) if (np.isfinite(largest) and largest > 0) else 1.0

    scaled_move = np.clip(magnitude / cap, 0.0, 1.0) ** float(power)
    weights = scaled_move * np.asarray(sample_weight, dtype=float)
    return np.maximum(weights, 1e-8)


def _profit_y_obj(y_pred: np.ndarray, next_ret: np.ndarray, sample_weight: np.ndarray) -> float:
    """Weighted mean signed return of following the predictions.

    Same position convention as ``_profit_y`` (prediction 1 -> +1, anything
    else -> -1), but each sample is weighted by ``_weighted_move_vector`` so
    larger moves and higher ``sample_weight`` count more.

    Returns 0.0 for an empty ``next_ret``, matching ``_profit_y``. Without
    this guard the weight sum is zero and the division yields NaN.
    """
    y = np.asarray(next_ret, dtype=float)
    if y.size == 0:
        return 0.0
    p = np.where(np.asarray(y_pred, dtype=np.int32) == 1, 1.0, -1.0)
    w = _weighted_move_vector(next_ret=y, sample_weight=sample_weight)
    return float(np.sum(w * (p * y)) / np.sum(w))


In [None]:
# Load the saved grid-search checkpoint and rank its rows (best first).
df_results_raw = pd.read_csv(CHECKPOINT_PATH)
if df_results_raw.empty:
    # Fail with a clear message instead of an opaque IndexError at iloc[0] below.
    raise ValueError(f"Checkpoint has no result rows: {CHECKPOINT_PATH}")
df_results = sort_results_frame(df_results_raw).reset_index(drop=True)

print(f"rows={len(df_results)} cols={len(df_results.columns)}")
if "is_feasible" in df_results.columns:
    print(f"feasible={int((df_results['is_feasible']==1).sum())}/{len(df_results)}")

# [A] grid_best: the top row of the ranked checkpoint.
best_row = df_results.iloc[0].copy()
# sort_results_frame guarantees the is_feasible column on a non-empty frame.
feas = df_results[df_results["is_feasible"] == 1].copy()


def _filter_by_honest_rule(frame: pd.DataFrame, rule: Dict[str, float]) -> pd.DataFrame:
    """Keep rows whose val/test recall floor and recall gap satisfy ``rule``."""
    mask = (
        (frame["test_recall_min"] >= rule["test_recall_min"])
        & (frame["test_recall_gap"] <= rule["test_recall_gap"])
        & (frame["val_recall_min"] >= rule["val_recall_min"])
        & (frame["val_recall_gap"] <= rule["val_recall_gap"])
    )
    return frame[mask].copy()


# [B] honest candidate pool: strict rule, then relaxed rule, then all
# feasible rows, then (last resort) every row.
strict = _filter_by_honest_rule(feas, HONEST_RULE_STRICT)

if strict.empty:
    strict = _filter_by_honest_rule(feas, HONEST_RULE_RELAXED)

if strict.empty:
    strict = feas.copy() if not feas.empty else df_results.copy()

# Within the pool, prefer balanced recalls over profit.
honest_row = strict.sort_values(
    ["test_recall_min", "test_recall_gap", "mcc", "bal_acc", "test_profit_y_obj"],
    ascending=[False, True, False, False, False],
).iloc[0].copy()

metric_cols = [
    "test_profit_y_obj", "test_profit_y",
    "val_recall_min", "val_recall_gap",
    "test_recall_min", "test_recall_gap",
    "mcc", "bal_acc",
]
print("\n[A] grid_best from checkpoint")
display(pd.DataFrame([best_row[metric_cols]]))
print("[B] honest candidate from checkpoint")
display(pd.DataFrame([honest_row[metric_cols]]))

# Every label/target/model hyper-parameter column stored in the checkpoint.
param_cols = [c for c in df_results.columns if c.startswith("label_") or c.startswith("target_") or c.startswith("model_")]
print("\nParameters: grid_best")
display(pd.DataFrame([best_row[param_cols]]))
print("Parameters: honest")
display(pd.DataFrame([honest_row[param_cols]]))


In [None]:
# Build shared data once (market + close_map + astro features).
# Uses cache namespace from grid run, so this should be fast if cache exists.

# Market frame: keep rows from DATA_START on, coerce types, drop unusable rows
# and keep one row per date in chronological order.
# NOTE(review): the DATA_START filter runs before the date column is converted
# to datetime — confirm load_market_data() returns a dtype for which that
# comparison is chronological.
df_market = load_market_data()
df_market = df_market[df_market["date"] >= DATA_START].copy()
df_market["date"] = pd.to_datetime(df_market["date"])
df_market["close"] = pd.to_numeric(df_market["close"], errors="coerce")
df_market = df_market.dropna(subset=["date", "close"]).sort_values("date").drop_duplicates("date").reset_index(drop=True)

# close_map: next-row close and next-row simple return per date. The last row
# has no successor, so its next_close/next_ret are NaN.
close_map = df_market[["date", "close"]].copy()
close_map["date"] = pd.to_datetime(close_map["date"])
close_map = close_map.sort_values("date").reset_index(drop=True)
close_map["next_close"] = close_map["close"].shift(-1)
close_map["next_ret"] = close_map["next_close"] / close_map["close"] - 1.0

# Feature settings are taken from the grid_best row only and then shared by
# every candidate evaluated later; hard-coded fallbacks apply when the
# checkpoint lacks these columns.
birth_dt_utc = str(best_row.get("birth_dt_utc", project_cfg.subject.get("birth_dt_utc", "2009-10-10T18:15:05Z")))
feature_coord_mode = str(best_row.get("feature_coord_mode", "both"))
feature_orb_mult = float(best_row.get("feature_orb_mult", 0.10))

astro_cfg = TurningAstroFeatureConfig(
    coord_mode=feature_coord_mode,
    orb_mult=feature_orb_mult,
    include_pair_aspects=True,
    include_phases=True,
    include_transit_aspects=True,
    add_trig_for_longitudes=True,
    add_trig_for_moon_phase=True,
    add_trig_for_elongations=True,
)

# Feature matrix for all market dates, read through the cache when available.
df_features = build_turning_astro_feature_set(
    df_market=df_market,
    birth_dt_utc=birth_dt_utc,
    cfg=astro_cfg,
    cache_namespace="research2_turning_grid",
    use_cache=True,
    verbose=True,
    progress=True,
)

print("market rows:", len(df_market), "features rows:", len(df_features))


In [None]:
def evaluate_candidate(row: pd.Series, tag: str) -> Dict[str, Any]:
    """Rebuild the dataset for one checkpoint row, refit the model and score it.

    Reads the notebook-level ``df_market``, ``df_features`` and ``close_map``
    frames plus ``TRAIN_RATIO``, ``VAL_RATIO`` and ``SEED``.

    Args:
        row: One checkpoint row holding the ``label_*``, ``target_*`` and
            ``model_*`` columns (and optionally ``threshold`` and the
            ``*_fixed`` label settings).
        tag: Name stored in the result and used in log messages.

    Returns:
        A dict with ``tag``, ``used_device``, ``threshold``, raw and weighted
        profit for val/test, the val/test metric dicts, the test confusion
        matrix (``conf``, labels ordered [0, 1]) and ``test_frame`` with
        per-date predictions.

    Raises:
        ValueError: from ``_extract_target_cfg`` for an unsupported target mode.
        Exception: whatever the model fit raises when no CPU fallback is left.
    """
    label_cfg = _extract_label_cfg(row)
    target_cfg = _extract_target_cfg(row)
    model_cfg = _extract_model_cfg(row)

    turn_cfg = TurningPointLabelConfig(
        horizon_days=int(row.get("horizon_days_fixed", 10)),
        up_move_pct=float(label_cfg["up_move_pct"]),
        down_move_pct=float(label_cfg["down_move_pct"]),
        cluster_gap_days=int(label_cfg["cluster_gap_days"]),
        min_turn_gap_days=int(label_cfg["min_turn_gap_days"]),
        past_horizon_days=int(label_cfg["past_horizon_days"]),
        past_up_move_pct=float(label_cfg["past_up_move_pct"]),
        past_down_move_pct=float(label_cfg["past_down_move_pct"]),
        tail_direction_mode=str(row.get("tail_direction_mode_fixed", "endpoint_sign")),
        tail_min_move_pct=float(row.get("tail_min_move_pct_fixed", 0.0)),
    )

    # Only the turning-point frame (second element) is needed here.
    _, df_turns, _ = label_turning_points(df_market=df_market, cfg=turn_cfg)

    # _extract_target_cfg only lets the two supported modes through, so the
    # else branch is always "segment_midpoint".
    if target_cfg["mode"] == "window_kernel":
        df_target = build_turning_target_frame(
            df_market=df_market,
            df_turning_points=df_turns,
            mode="window_kernel",
            window_radius_days=int(target_cfg["window_radius_days"]),
            window_distance_power=float(target_cfg["window_distance_power"]),
            min_weight=float(target_cfg["min_weight"]),
            use_amplitude_weight=bool(target_cfg["use_amplitude_weight"]),
            use_numba=True,
        )
    else:
        df_target = build_turning_target_frame(
            df_market=df_market,
            df_turning_points=df_turns,
            mode="segment_midpoint",
            segment_center_power=float(target_cfg["segment_center_power"]),
            segment_direction_anchor=str(target_cfg["segment_direction_anchor"]),
            include_last_open_segment=bool(target_cfg["include_last_open_segment"]),
            segment_open_tail_direction_mode=str(target_cfg["segment_open_tail_direction_mode"]),
            segment_open_tail_min_move_pct=float(target_cfg["segment_open_tail_min_move_pct"]),
            min_weight=float(target_cfg["min_weight"]),
            use_amplitude_weight=bool(target_cfg["use_amplitude_weight"]),
            use_numba=True,
        )

    df_dataset = merge_features_with_turning_target(
        df_features=df_features,
        df_target=df_target,
        df_market_close=df_market[["date", "close"]],
    )
    # Attach the next-row return and drop rows that cannot be trained/scored.
    df_dataset = pd.merge(df_dataset, close_map[["date", "next_ret"]], on="date", how="left")
    df_dataset = df_dataset.dropna(subset=["next_ret", "target", "sample_weight", "close"]).sort_values("date").reset_index(drop=True)

    split = make_classic_split(df_dataset, train_ratio=TRAIN_RATIO, val_ratio=VAL_RATIO)
    train_df = df_dataset.iloc[split.train_idx].copy().reset_index(drop=True)
    val_df = df_dataset.iloc[split.val_idx].copy().reset_index(drop=True)
    test_df = df_dataset.iloc[split.test_idx].copy().reset_index(drop=True)

    # Everything that is not bookkeeping, a label or a price is a feature.
    feature_cols = [
        c for c in df_dataset.columns
        if c not in {
            "date", "target", "close", "next_ret",
            "turning_direction", "sample_weight", "target_mode",
            "event_index", "segment_index",
        }
    ]

    X_train = train_df[feature_cols].to_numpy(dtype=np.float32)
    y_train = train_df["target"].to_numpy(dtype=np.int32)
    X_val = val_df[feature_cols].to_numpy(dtype=np.float32)
    y_val = val_df["target"].to_numpy(dtype=np.int32)
    X_test = test_df[feature_cols].to_numpy(dtype=np.float32)
    y_test = test_df["target"].to_numpy(dtype=np.int32)

    # Final weights = target-frame weight x per-split balanced class weight.
    sw_train_base = pd.to_numeric(train_df["sample_weight"], errors="coerce").fillna(1.0).to_numpy(dtype=np.float32)
    sw_val_base = pd.to_numeric(val_df["sample_weight"], errors="coerce").fillna(1.0).to_numpy(dtype=np.float32)
    sw_test_base = pd.to_numeric(test_df["sample_weight"], errors="coerce").fillna(1.0).to_numpy(dtype=np.float32)
    sw_train = sw_train_base * compute_sample_weight(class_weight="balanced", y=y_train).astype(np.float32)
    sw_val = sw_val_base * compute_sample_weight(class_weight="balanced", y=y_val).astype(np.float32)
    sw_test = sw_test_base * compute_sample_weight(class_weight="balanced", y=y_test).astype(np.float32)

    _, device = check_cuda_available()

    def _make_model(device_name: str) -> XGBBaseline:
        return XGBBaseline(
            n_classes=2,
            device=device_name,
            random_state=SEED,
            early_stopping_rounds=int(model_cfg["early_stopping_rounds"]),
            n_estimators=int(model_cfg["n_estimators"]),
            max_depth=int(model_cfg["max_depth"]),
            learning_rate=float(model_cfg["learning_rate"]),
            subsample=float(model_cfg["subsample"]),
            colsample_bytree=float(model_cfg["colsample_bytree"]),
            tree_method="hist",
            eval_metric="logloss",
        )

    def _fit_on(device_name: str) -> XGBBaseline:
        """Create a fresh model on ``device_name`` and fit it on train/val."""
        fitted = _make_model(device_name)
        fitted.fit(
            X_train=X_train, y_train=y_train,
            X_val=X_val, y_val=y_val,
            feature_names=feature_cols,
            sample_weight=sw_train,
            sample_weight_val=sw_val,
        )
        return fitted

    used_device = str(device)
    try:
        model = _fit_on(used_device)
    except Exception as exc:
        if used_device == "cpu":
            # Already on CPU: a retry cannot help, so surface the real error.
            raise
        # Report why the first attempt failed instead of hiding it.
        print(f"[{tag}] fit on {used_device} failed ({type(exc).__name__}: {exc}); retrying on cpu")
        used_device = "cpu"
        model = _fit_on(used_device)

    p_val = _safe_predict_proba_up(model, X_val)
    p_test = _safe_predict_proba_up(model, X_test)

    threshold = float(row.get("threshold", 0.5))
    if not np.isfinite(threshold):
        # A NaN cell would make `p >= threshold` false everywhere and silently
        # collapse every prediction to DOWN; use the missing-column default.
        threshold = 0.5
    pred_val = (p_val >= threshold).astype(np.int32)
    pred_test = (p_test >= threshold).astype(np.int32)

    val_metrics = compute_binary_metrics(y_val, pred_val)
    test_metrics = compute_binary_metrics(y_test, pred_test)

    ret_val = pd.to_numeric(val_df["next_ret"], errors="coerce").fillna(0.0).to_numpy(dtype=np.float32)
    ret_test = pd.to_numeric(test_df["next_ret"], errors="coerce").fillna(0.0).to_numpy(dtype=np.float32)

    out = {
        "tag": tag,
        "used_device": used_device,
        "threshold": threshold,
        "val_profit_y": _profit_y(pred_val, ret_val),
        "test_profit_y": _profit_y(pred_test, ret_test),
        "val_profit_y_obj": _profit_y_obj(pred_val, ret_val, sw_val),
        "test_profit_y_obj": _profit_y_obj(pred_test, ret_test, sw_test),
        "val_metrics": val_metrics,
        "test_metrics": test_metrics,
        "conf": confusion_matrix(y_test, pred_test, labels=[0, 1]),
        "test_frame": test_df[["date", "close", "next_ret", "target"]].assign(
            pred=pred_test,
            proba_up=p_test,
            match=(y_test == pred_test).astype(np.int32),
        ),
    }
    return out


# Refit and score both checkpoint candidates (one model fit per call).
best_eval = evaluate_candidate(best_row, tag="grid_best")
honest_eval = evaluate_candidate(honest_row, tag="honest")

def _flat(res: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one evaluation result into a single summary-table row.

    Scalar fields are copied over (``tag`` -> ``candidate``, ``used_device``
    -> ``device``) and the nested val/test metric dicts are unpacked into
    prefixed columns.
    """
    val_m, test_m = res["val_metrics"], res["test_metrics"]

    flat_row = {"candidate": res["tag"], "device": res["used_device"]}
    for key in ("threshold", "val_profit_y_obj", "test_profit_y_obj", "val_profit_y", "test_profit_y"):
        flat_row[key] = res[key]

    for prefix, metrics in (("val", val_m), ("test", test_m)):
        flat_row[f"{prefix}_recall_min"] = metrics["recall_min"]
        flat_row[f"{prefix}_recall_gap"] = metrics["recall_gap"]

    flat_row["test_mcc"] = test_m["mcc"]
    flat_row["test_bal_acc"] = test_m["balanced_accuracy"]
    flat_row["test_acc"] = test_m["accuracy"]
    return flat_row

# Side-by-side summary of the two refitted candidates.
compare_df = pd.DataFrame([_flat(best_eval), _flat(honest_eval)])
print("Recomputed on test period (true labels vs predicted):")
display(compare_df)

# Test-split confusion matrices; label order is [0, 1].
print("\nConfusion matrix grid_best [rows=true DOWN/UP, cols=pred DOWN/UP]")
print(best_eval["conf"])
print("\nConfusion matrix honest [rows=true DOWN/UP, cols=pred DOWN/UP]")
print(honest_eval["conf"])

# First 40 test rows with target, prediction, probability and match flag.
show_cols = ["date", "close", "next_ret", "target", "pred", "proba_up", "match"]
print("\nSample test markup: grid_best")
display(best_eval["test_frame"][show_cols].head(40))
print("Sample test markup: honest")
display(honest_eval["test_frame"][show_cols].head(40))

# NOTE: the summary below is fixed text, not derived from the numbers above.
print("\nQuick take:")
print("- grid_best: more aggressive profit objective.")
print("- honest: typically more balanced recalls and lower class-collapse risk.")


In [None]:
import matplotlib.pyplot as plt

# Default visual style for the comparison charts; these are the default
# arguments of plot_price_pred_true_stack.
DEFAULT_PRICE_COLOR = "#1f77b4"  # price line
DEFAULT_UP_COLOR = "green"  # background where the label is 1 (UP)
DEFAULT_DOWN_COLOR = "red"  # background where the label is 0 (DOWN)
DEFAULT_SHADE_ALPHA = 0.20  # opacity of the background shading


def _add_market_truth(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``frame`` with two reference-label columns added.

    ``true_surrogate`` is the ``target`` column as nullable integers;
    ``true_market`` is 1 where ``next_ret`` is >= 0 and 0 otherwise (a missing
    ``next_ret`` counts as 0.0, i.e. as 1). The input frame is not modified.
    """
    labelled = frame.copy()
    surrogate = pd.to_numeric(labelled["target"], errors="coerce").astype("Int64")
    forward_return = pd.to_numeric(labelled["next_ret"], errors="coerce").fillna(0.0)
    market = (forward_return >= 0.0).astype(np.int32)
    return labelled.assign(true_surrogate=surrogate, true_market=market)


def plot_price_pred_true_stack(
    eval_result: Dict[str, Any],
    title_prefix: str,
    true_col: str = "true_market",
    price_color: str = DEFAULT_PRICE_COLOR,
    up_color: str = DEFAULT_UP_COLOR,
    down_color: str = DEFAULT_DOWN_COLOR,
    shade_alpha: float = DEFAULT_SHADE_ALPHA,
) -> None:
    """
    Draw the test-period price twice, one panel above the other.

    Top: background shaded by the predicted labels.
    Bottom: background shaded by the reference labels in ``true_col``
    (market truth by default, ``true_surrogate`` for the training target).

    Raises ValueError for an unknown ``true_col``; prints a notice and
    returns without plotting when the test frame is empty.
    """
    frame = _add_market_truth(eval_result["test_frame"]).reset_index(drop=True)
    frame["date"] = pd.to_datetime(frame["date"])

    if true_col not in frame.columns:
        raise ValueError(f"Unknown true_col={true_col}")

    dates = frame["date"]
    prices = pd.to_numeric(frame["close"], errors="coerce").to_numpy(dtype=float)
    labels_true = pd.to_numeric(frame[true_col], errors="coerce").fillna(0).to_numpy(dtype=np.int32)
    labels_pred = pd.to_numeric(frame["pred"], errors="coerce").fillna(0).to_numpy(dtype=np.int32)

    if len(frame) == 0:
        print(f"[{title_prefix}] empty test frame, nothing to plot")
        return

    # Vertical extent of the shaded band: price range plus a 5% margin
    # (a fixed margin of 1.0 when the price is flat).
    lowest = float(np.nanmin(prices))
    highest = float(np.nanmax(prices))
    pad = (highest - lowest) * 0.05 if highest > lowest else 1.0
    band_lo = lowest - pad
    band_hi = highest + pad

    fig, (ax_pred, ax_true) = plt.subplots(2, 1, figsize=(16, 9), sharex=True)

    def _paint_panel(ax, title: str, labels: np.ndarray, with_date_axis: bool) -> None:
        """Draw the price line over an UP/DOWN shaded background on ``ax``."""
        ax.set_title(title)
        ax.plot(dates, prices, color=price_color, linewidth=1.5, label="Price")
        for label_value, shade, legend_name in ((1, up_color, "UP"), (0, down_color, "DOWN")):
            ax.fill_between(
                dates, band_lo, band_hi,
                where=(labels == label_value),
                color=shade, alpha=shade_alpha, step="mid", label=legend_name,
            )
        ax.set_ylabel("Price")
        if with_date_axis:
            ax.set_xlabel("Date")
        ax.set_ylim(band_lo, band_hi)
        ax.grid(True, alpha=0.3, linestyle=":")
        ax.legend(loc="upper left")

    # Top: predicted background.
    pred_title = (
        f"{title_prefix} | PREDICTED (R_MIN={eval_result['test_metrics']['recall_min']:.3f}, "
        f"GAP={eval_result['test_metrics']['recall_gap']:.3f}, MCC={eval_result['test_metrics']['mcc']:.3f})"
    )
    _paint_panel(ax_pred, pred_title, labels_pred, with_date_axis=False)

    # Bottom: true background, titled with the share of each class.
    if true_col == "true_market":
        true_name = "TRUE_MARKET(next_ret>=0)"
    else:
        true_name = "TRUE_SURROGATE(target)"
    true_down = float(np.mean(labels_true == 0))
    true_up = float(np.mean(labels_true == 1))
    true_title = f"{title_prefix} | {true_name} (DOWN={true_down:.3f}, UP={true_up:.3f})"
    _paint_panel(ax_true, true_title, labels_true, with_date_axis=True)

    fig.suptitle(
        f"{title_prefix} | test_profit_y_obj={eval_result['test_profit_y_obj']:+.6f} "
        f"| test_profit_y={eval_result['test_profit_y']:+.6f}",
        fontsize=13,
        y=0.98,
    )
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()


def print_truth_disagreement(eval_result: Dict[str, Any], tag: str) -> None:
    """Print the share of test rows where the surrogate target and the
    market-truth label (sign of ``next_ret``) differ."""
    labelled = _add_market_truth(eval_result["test_frame"]).reset_index(drop=True)
    surrogate = labelled["true_surrogate"].astype(int)
    market = labelled["true_market"].astype(int)
    disagree = (surrogate != market).mean()
    print(f"[{tag}] surrogate-vs-market disagreement on test: {disagree:.3f}")


# How often the training target differs from the realised next-row direction.
print_truth_disagreement(best_eval, "grid_best")
print_truth_disagreement(honest_eval, "honest")

# Main chart requested: predicted (top) vs REAL market truth (bottom)
plot_price_pred_true_stack(best_eval, title_prefix="grid_best", true_col="true_market")
plot_price_pred_true_stack(honest_eval, title_prefix="honest", true_col="true_market")

# Optional debug: old surrogate truth (to see why it looked wrong)
# Same charts with the surrogate training target in the bottom panel.
plot_price_pred_true_stack(best_eval, title_prefix="grid_best (debug surrogate)", true_col="true_surrogate")
plot_price_pred_true_stack(honest_eval, title_prefix="honest (debug surrogate)", true_col="true_surrogate")


## Retrain On Market Truth

This section retrains models directly on market truth labels (future return sign),
so we can compare against the surrogate turning-target training.


In [None]:
# Market-truth retrain settings (fast, no massive grid rerun)
# Label horizon in rows, taken from the grid_best row (default 10).
TRUTH_HORIZON_DAYS = int(best_row.get("horizon_days_fixed", 10))
# Rows with |return| below this are dropped; 0.0 disables the filter.
TRUTH_DEADZONE_PCT = 0.0
# Exponent and clipping quantile for the move-size sample weights below.
TRUTH_WEIGHT_POWER = 1.5
TRUTH_WEIGHT_CLIP_Q = 0.98

# Threshold constraints: prioritize non-collapsed class behavior.
TRUTH_VAL_RECALL_MIN_FLOOR = 0.50
TRUTH_VAL_RECALL_GAP_CEIL = 0.35
TRUTH_TEST_RECALL_MIN_FLOOR = 0.50
TRUTH_TEST_RECALL_GAP_CEIL = 0.35
# Multiplier applied to the constraint violation when scoring thresholds.
TRUTH_CONSTRAINT_PENALTY = 2.0

# Build market-truth dataset from existing astro features.
# NOTE: "next_ret" here is the return over TRUTH_HORIZON_DAYS rows, not the
# one-row return stored in close_map.
cm = df_market[["date", "close"]].copy().sort_values("date").reset_index(drop=True)
cm["future_close"] = cm["close"].shift(-TRUTH_HORIZON_DAYS)
cm["next_ret"] = cm["future_close"] / cm["close"] - 1.0

# Keep dates present in both frames; the last TRUTH_HORIZON_DAYS rows have no
# future close and are dropped by dropna.
df_truth = pd.merge(df_features, cm[["date", "close", "next_ret"]], on="date", how="inner")
df_truth = df_truth.dropna(subset=["next_ret", "close"]).sort_values("date").reset_index(drop=True)
if TRUTH_DEADZONE_PCT > 0.0:
    df_truth = df_truth[df_truth["next_ret"].abs() >= TRUTH_DEADZONE_PCT].copy().reset_index(drop=True)

# Binary label: 1 when the horizon return is non-negative, else 0.
# NOTE(review): labels look TRUTH_HORIZON_DAYS rows ahead; if
# make_classic_split is a plain chronological cut, rows next to a split
# boundary share future prices with the following split — confirm whether a
# gap is needed.
df_truth["target"] = (df_truth["next_ret"] >= 0.0).astype(np.int32)

# Sample weight = (|return| / cap) clipped to [0, 1], raised to the power.
# NOTE(review): the cap is computed over all rows, before any train/val/test
# split is made.
cap = float(df_truth["next_ret"].abs().quantile(TRUTH_WEIGHT_CLIP_Q))
if not np.isfinite(cap) or cap <= 0:
    cap = max(float(df_truth["next_ret"].abs().max()), 1e-6)
df_truth["sample_weight"] = np.clip(df_truth["next_ret"].abs() / cap, 0.0, 1.0) ** TRUTH_WEIGHT_POWER

# Floor the weights so no row ends up with a zero weight.
df_truth["sample_weight"] = np.maximum(df_truth["sample_weight"].astype(float), 1e-8)

print(f"truth dataset rows={len(df_truth)} horizon={TRUTH_HORIZON_DAYS}d")
print("target share up:", float((df_truth["target"] == 1).mean()))
print("period:", df_truth["date"].min(), "->", df_truth["date"].max())


In [None]:
def tune_threshold_market(y_val: np.ndarray, p_val: np.ndarray, ret_val: np.ndarray, sw_val: np.ndarray) -> tuple[float, float, float, dict, float, float, int]:
    """Pick the decision threshold on the validation split.

    Scans 91 thresholds from 0.05 to 0.95. Each one is scored as the weighted
    profit minus ``TRUTH_CONSTRAINT_PENALTY`` times the constraint violation,
    where the violation is the shortfall of ``recall_min`` below
    ``TRUTH_VAL_RECALL_MIN_FLOOR`` plus the excess of ``recall_gap`` above
    ``TRUTH_VAL_RECALL_GAP_CEIL``. The highest score wins; among scores that
    are numerically close, a feasible threshold beats an infeasible one and
    then the smaller violation wins.

    Returns:
        ``(threshold, score, weighted_profit, metrics, raw_profit, violation,
        feasible)``. When a threshold is adopted through the tie-break, the
        returned score stays at the earlier maximum it tied with.
    """
    best_t = 0.5
    best_score = -1e18
    best_profit_obj = -1e18
    best_profit_raw = -1e18
    best_metrics = {}
    best_violation = 1e18
    best_feasible = 0

    for t in np.linspace(0.05, 0.95, 91):
        pred = (p_val >= t).astype(np.int32)
        m = compute_binary_metrics(y_val, pred)
        profit_obj = _profit_y_obj(pred, ret_val, sw_val)
        profit_raw = _profit_y(pred, ret_val)

        recall_shortfall = max(0.0, TRUTH_VAL_RECALL_MIN_FLOOR - float(m["recall_min"]))
        gap_excess = max(0.0, float(m["recall_gap"]) - TRUTH_VAL_RECALL_GAP_CEIL)
        violation = recall_shortfall + gap_excess
        feasible = int(violation <= 1e-12)
        score = float(profit_obj) - TRUTH_CONSTRAINT_PENALTY * float(violation)

        if score > best_score:
            # Strict improvement: this score becomes the new reference.
            best_score = float(score)
        elif not np.isclose(score, best_score):
            continue
        else:
            # Tie on score: adopt only a more feasible / less violating threshold.
            wins_tie = feasible > best_feasible or (
                feasible == best_feasible and float(violation) < float(best_violation)
            )
            if not wins_tie:
                continue

        best_t = float(t)
        best_profit_obj = float(profit_obj)
        best_profit_raw = float(profit_raw)
        best_metrics = m
        best_violation = float(violation)
        best_feasible = int(feasible)

    return best_t, best_score, best_profit_obj, best_metrics, best_profit_raw, best_violation, best_feasible


def is_better_market(candidate: dict, current: dict | None) -> bool:
    if current is None:
        return True
    if int(candidate.get("is_feasible", 0)) != int(current.get("is_feasible", 0)):
        return int(candidate.get("is_feasible", 0)) > int(current.get("is_feasible", 0))

    c_v = float(candidate.get("total_constraint_violation", np.inf))
    p_v = float(current.get("total_constraint_violation", np.inf))
    if not np.isclose(c_v, p_v):
        return c_v < p_v

    c_obj = float(candidate.get("test_profit_y_obj", -1e9))
    p_obj = float(current.get("test_profit_y_obj", -1e9))
    if not np.isclose(c_obj, p_obj):
        return c_obj > p_obj

    c_r = float(candidate.get("test_recall_min", -1.0))
    p_r = float(current.get("test_recall_min", -1.0))
    if not np.isclose(c_r, p_r):
        return c_r > p_r

    c_g = float(candidate.get("test_recall_gap", 1e9))
    p_g = float(current.get("test_recall_gap", 1e9))
    if not np.isclose(c_g, p_g):
        return c_g < p_g

    return float(candidate.get("mcc", -1e9)) > float(current.get("mcc", -1e9))


def train_eval_market_truth(model_cfg: dict, tag: str) -> dict:
    """Train one XGBBaseline configuration on ``df_truth`` and score it.

    Reads the notebook-level ``df_truth`` frame, splits it with
    ``make_classic_split`` (``TRAIN_RATIO`` / ``VAL_RATIO``), fits the model on
    the train part with the validation part passed to ``fit``, picks a decision
    threshold on validation via ``tune_threshold_market`` and applies that same
    threshold to both the validation and the test part.

    Args:
        model_cfg: Hyper-parameters; must provide ``early_stopping_rounds``,
            ``n_estimators``, ``max_depth``, ``learning_rate``, ``subsample``
            and ``colsample_bytree``.
        tag: Label copied into the result and used in the fallback message.

    Returns:
        A dict with the run ``tag``, the ``device`` the final model was fitted
        on, the chosen ``threshold``, feasibility / constraint-violation
        fields, profit and recall summaries for validation and test, the full
        ``test_metrics`` dict and a ``test_frame`` with per-row predictions.

    Raises:
        Exception: Re-raises the original ``fit`` error when the first attempt
            already ran on CPU; otherwise propagates any error raised by the
            CPU retry.
    """
    split = make_classic_split(df_truth, train_ratio=TRAIN_RATIO, val_ratio=VAL_RATIO)
    train_df = df_truth.iloc[split.train_idx].copy().reset_index(drop=True)
    val_df = df_truth.iloc[split.val_idx].copy().reset_index(drop=True)
    test_df = df_truth.iloc[split.test_idx].copy().reset_index(drop=True)

    # Every column except these bookkeeping / label / weight ones is a model input.
    feature_cols = [
        c for c in df_truth.columns
        if c not in {"date", "target", "close", "next_ret", "sample_weight"}
    ]

    X_train = train_df[feature_cols].to_numpy(dtype=np.float32)
    y_train = train_df["target"].to_numpy(dtype=np.int32)
    X_val = val_df[feature_cols].to_numpy(dtype=np.float32)
    y_val = val_df["target"].to_numpy(dtype=np.int32)
    X_test = test_df[feature_cols].to_numpy(dtype=np.float32)
    y_test = test_df["target"].to_numpy(dtype=np.int32)

    def _balanced_weights(frame, labels):
        # Per-row "sample_weight" column times balanced class weights that are
        # computed on this split's own labels.
        base = frame["sample_weight"].to_numpy(dtype=np.float32)
        return base * compute_sample_weight(class_weight="balanced", y=labels).astype(np.float32)

    sw_train = _balanced_weights(train_df, y_train)
    sw_val = _balanced_weights(val_df, y_val)
    sw_test = _balanced_weights(test_df, y_test)

    # The second element is used as the device name -- presumably "cuda" or
    # "cpu"; confirm against check_cuda_available.
    _, device = check_cuda_available()

    def _make_model(device_name: str) -> XGBBaseline:
        return XGBBaseline(
            n_classes=2,
            device=device_name,
            random_state=SEED,
            early_stopping_rounds=int(model_cfg["early_stopping_rounds"]),
            n_estimators=int(model_cfg["n_estimators"]),
            max_depth=int(model_cfg["max_depth"]),
            learning_rate=float(model_cfg["learning_rate"]),
            subsample=float(model_cfg["subsample"]),
            colsample_bytree=float(model_cfg["colsample_bytree"]),
            tree_method="hist",
            eval_metric="logloss",
        )

    def _fit_on(device_name: str) -> XGBBaseline:
        # Build a fresh model for the device and fit it on train/val.
        fitted = _make_model(device_name)
        fitted.fit(
            X_train=X_train, y_train=y_train,
            X_val=X_val, y_val=y_val,
            feature_names=feature_cols,
            sample_weight=sw_train,
            sample_weight_val=sw_val,
        )
        return fitted

    used_device = str(device)
    try:
        model = _fit_on(used_device)
    except Exception as exc:
        if used_device.lower() == "cpu":
            # Already on CPU: repeating the identical fit cannot help and would
            # only bury the original traceback under a second failure.
            raise
        # Best-effort fallback, but say why so a broken GPU setup is visible.
        print(
            f"[{tag}] fit on device '{used_device}' failed "
            f"({type(exc).__name__}: {exc}); retrying on cpu"
        )
        used_device = "cpu"
        model = _fit_on(used_device)

    p_val = _safe_predict_proba_up(model, X_val)
    p_test = _safe_predict_proba_up(model, X_test)

    ret_val = val_df["next_ret"].to_numpy(dtype=np.float32)
    ret_test = test_df["next_ret"].to_numpy(dtype=np.float32)

    # Only the threshold and the two validation profit figures are used here;
    # metrics and violations are recomputed below from the final predictions.
    thr, _, val_profit_obj, _, val_profit_raw, _, _ = tune_threshold_market(
        y_val=y_val,
        p_val=p_val,
        ret_val=ret_val,
        sw_val=sw_val,
    )

    pred_val = (p_val >= thr).astype(np.int32)
    pred_test = (p_test >= thr).astype(np.int32)

    val_metrics = compute_binary_metrics(y_val, pred_val)
    test_metrics = compute_binary_metrics(y_test, pred_test)

    def _constraint_violation(metrics: dict, recall_min_floor: float, recall_gap_ceil: float) -> float:
        # Shortfall of recall_min below its floor plus excess of recall_gap
        # above its ceiling; 0.0 when both constraints hold.
        shortfall = max(0.0, recall_min_floor - float(metrics["recall_min"]))
        excess = max(0.0, float(metrics["recall_gap"]) - recall_gap_ceil)
        return shortfall + excess

    val_violation = _constraint_violation(
        val_metrics, TRUTH_VAL_RECALL_MIN_FLOOR, TRUTH_VAL_RECALL_GAP_CEIL
    )
    test_violation = _constraint_violation(
        test_metrics, TRUTH_TEST_RECALL_MIN_FLOOR, TRUTH_TEST_RECALL_GAP_CEIL
    )
    total_violation = float(val_violation + test_violation)

    out = {
        "tag": tag,
        "device": used_device,
        "threshold": float(thr),
        # Feasible means no violation on either split (up to float noise).
        "is_feasible": int(total_violation <= 1e-12),
        "total_constraint_violation": float(total_violation),
        "val_constraint_violation": float(val_violation),
        "test_constraint_violation": float(test_violation),
        "test_profit_y_obj": float(_profit_y_obj(pred_test, ret_test, sw_test)),
        "test_profit_y": float(_profit_y(pred_test, ret_test)),
        "val_profit_y_obj": float(val_profit_obj),
        "val_profit_y": float(val_profit_raw),
        "test_recall_up": float(test_metrics["recall_up"]),
        "test_recall_down": float(test_metrics["recall_down"]),
        "test_recall_min": float(test_metrics["recall_min"]),
        "test_recall_gap": float(test_metrics["recall_gap"]),
        "val_recall_min": float(val_metrics["recall_min"]),
        "val_recall_gap": float(val_metrics["recall_gap"]),
        "mcc": float(test_metrics["mcc"]),
        "bal_acc": float(test_metrics["balanced_accuracy"]),
        "test_metrics": test_metrics,
        "test_frame": test_df[["date", "close", "next_ret", "target"]].assign(
            pred=pred_test,
            proba_up=p_test,
            match=(pred_test == y_test).astype(np.int32),
        ),
    }
    return out


In [None]:
# Hyper-parameter sets to retrain quickly: two extracted from `best_row` and
# `honest_row`, plus one hand-written lighter preset.
model_candidates = {
    "from_grid_best": _extract_model_cfg(best_row),
    "from_honest": _extract_model_cfg(honest_row),
    "honest_light": dict(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.03,
        subsample=0.8,
        colsample_bytree=0.8,
        early_stopping_rounds=50,
    ),
}

market_runs = []
best_market = None

# Retrain every candidate, keep all results and track the preferred run.
for tag, cfg in model_candidates.items():
    r = train_eval_market_truth(cfg, tag=tag)
    r["model_cfg"] = cfg
    market_runs.append(r)
    best_market = r if is_better_market(r, best_market) else best_market

# One summary row per run: (table column, key in the run dict).
market_table = pd.DataFrame(
    [
        {
            column: run[key]
            for column, key in (
                ("tag", "tag"),
                ("is_feasible", "is_feasible"),
                ("violation", "total_constraint_violation"),
                ("test_profit_y_obj", "test_profit_y_obj"),
                ("test_profit_y", "test_profit_y"),
                ("test_recall_min", "test_recall_min"),
                ("test_recall_gap", "test_recall_gap"),
                ("mcc", "mcc"),
                ("bal_acc", "bal_acc"),
                ("threshold", "threshold"),
                ("device", "device"),
            )
        }
        for run in market_runs
    ]
)
# Sort with the same priority that is_better_market applies.
market_table = market_table.sort_values(
    by=["is_feasible", "violation", "test_profit_y_obj", "test_recall_min", "test_recall_gap", "mcc"],
    ascending=[False, True, False, False, True, False],
)
market_table = market_table.reset_index(drop=True)

print("Market-truth retrain results:")
display(market_table)

print("Best market-truth run:")
print(best_market["tag"], best_market["model_cfg"])


In [None]:
# Visualise the selected run against the real market truth
# (no surrogate target is involved here).
winner = best_market

if "plot_price_pred_true_stack" not in globals():
    # The plotting helper from the previous section is not defined:
    # draw a compact two-panel fallback instead.
    f = winner["test_frame"].copy().reset_index(drop=True)
    f["date"] = pd.to_datetime(f["date"])
    # The "true" direction is the sign of next_ret (>= 0 counts as up).
    y_true = (f["next_ret"].to_numpy(dtype=float) >= 0.0).astype(np.int32)
    y_pred = f["pred"].to_numpy(dtype=np.int32)
    prices = f["close"].to_numpy(dtype=float)

    p_min = float(np.nanmin(prices))
    p_max = float(np.nanmax(prices))
    if p_max > p_min:
        margin = (p_max - p_min) * 0.05
    else:
        # Flat price series: use a fixed margin so the band has some height.
        margin = 1.0
    band_lo, band_hi = p_min - margin, p_max + margin

    fig, axes = plt.subplots(2, 1, figsize=(16, 9), sharex=True)
    for ax, lab, ttl in zip(axes, (y_pred, y_true), ("PREDICTED", "TRUE_MARKET")):
        ax.plot(f["date"], prices, color="#1f77b4", linewidth=1.5)
        # Shade label-1 spans green first, then label-0 spans red.
        for cls, shade in ((1, "green"), (0, "red")):
            ax.fill_between(
                f["date"], band_lo, band_hi,
                where=(lab == cls), color=shade, alpha=0.2, step="mid",
            )
        ax.set_title(ttl)
        ax.grid(True, alpha=0.3, linestyle=":")
    plt.tight_layout()
    plt.show()
else:
    # Reuse the plotting function from the previous section.
    plot_price_pred_true_stack(
        {
            key: winner[key]
            for key in ("test_frame", "test_metrics", "test_profit_y_obj", "test_profit_y")
        },
        title_prefix=f"market_truth_retrain::{winner['tag']}",
        true_col="true_market",
    )

print("winner metrics:")
print(
    {
        key: winner[key]
        for key in (
            "test_profit_y_obj",
            "test_profit_y",
            "test_recall_min",
            "test_recall_gap",
            "mcc",
            "bal_acc",
        )
    }
)
