# Astro pipeline: target variable and XGBoost (BTC)

This notebook shows the full cycle:
1) load quotes (daily)
2) build target variable (oracle labels)
3) compute astro data and build astro features
4) train and evaluate XGBoost.

Important: features are astro-only; price is used only for targets.


## 0. Environment setup

If any packages are missing, install them from conda-forge into the active environment:

```
conda install -c conda-forge xgboost scikit-learn matplotlib seaborn tqdm pyarrow jupyterlab
```

Also check:
- `configs/astro.yaml` -> `ephe_path` (path to Swiss Ephemeris)
- `configs/subjects.yaml` -> `active_subject_id` and subject birth date


In [None]:
# Check dependencies (stop notebook if missing)
import importlib.util as iu

# Core packages the notebook needs; each is probed without importing it.
required = ["xgboost", "sklearn", "matplotlib", "seaborn", "tqdm", "pyarrow"]
missing = [name for name in required if iu.find_spec(name) is None]

if not missing:
    print("OK: all core dependencies found")
else:
    # Report what is absent, show the install command, and stop the run.
    print("Missing packages:", ", ".join(missing))
    print("Install them with:")
    print("conda install -c conda-forge xgboost scikit-learn matplotlib seaborn tqdm pyarrow jupyterlab")
    raise SystemExit("Stopped: install dependencies and rerun")


In [None]:
# Base imports and environment setup
from pathlib import Path
import sys
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: seaborn whitegrid theme and a wide default figure size.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 4)

# Let pandas print wide frames without truncating columns.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)

# Locate the project root: the working directory, or its nearest ancestor,
# that contains configs/market.yaml.
PROJECT_ROOT = Path.cwd().resolve()
for candidate in (PROJECT_ROOT, *PROJECT_ROOT.parents):
    if (candidate / "configs/market.yaml").exists():
        PROJECT_ROOT = candidate
        break
else:
    # Loop finished without a match: no directory on the path has the marker.
    raise FileNotFoundError("Project root not found: configs/market.yaml")

# Put the project root on sys.path so later cells can import from it.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print(f"PROJECT_ROOT = {PROJECT_ROOT}")


In [None]:
# Load configs and market data (from Postgres)
from src.common.config import load_yaml, load_subjects
from src.db.connection import psql_connection

# Load the five YAML configs in one pass (same order as listed).
cfg_market, cfg_astro, cfg_labels, cfg_db, cfg_train = (
    load_yaml(PROJECT_ROOT / rel_path)
    for rel_path in (
        "configs/market.yaml",
        "configs/astro.yaml",
        "configs/labels.yaml",
        "configs/db.yaml",
        "configs/training.yaml",
    )
)

# load_subjects yields the subject mapping and the id of the active subject.
subjects, active_id = load_subjects(PROJECT_ROOT / "configs/subjects.yaml")
subject = subjects[active_id]

market_cfg = cfg_market["market"]

# NOTE: if path is relative, resolve from PROJECT_ROOT
def _resolve_path(value: str | Path) -> Path:
    """Return *value* as a Path, anchoring relative paths at PROJECT_ROOT.

    Absolute paths are returned unchanged (not resolved); relative paths are
    joined onto PROJECT_ROOT and then resolved.
    """
    candidate = Path(value)
    return candidate if candidate.is_absolute() else (PROJECT_ROOT / candidate).resolve()

# Data directories derived from the market config; the reports dir is created up front.
data_root = _resolve_path(market_cfg["data_root"])
processed_dir = data_root / "processed"
reports_dir = data_root / "reports"
reports_dir.mkdir(parents=True, exist_ok=True)

print(f"Active subject: {subject.subject_id}")
print(f"Data root: {data_root}")

# Market source: Postgres only, so db.url is mandatory.
if not ("db" in cfg_db and "url" in cfg_db["db"]):
    raise KeyError("configs/db.yaml must define db.url")

db_url = cfg_db["db"]["url"]

# Load the daily close series of the active subject, ordered by date.
with psql_connection(db_url) as conn:
    df_market = pd.read_sql_query(
        sql="SELECT date, close FROM market_daily WHERE subject_id = %s ORDER BY date",
        con=conn,
        params=(subject.subject_id,),
    )

if df_market.empty:
    raise ValueError(
        f"No market data for subject_id={subject.subject_id}. "
        "Load market_daily into Postgres first."
    )

if not {"date", "close"}.issubset(df_market.columns):
    raise ValueError("market_daily must have date and close columns")

df_market["date"] = pd.to_datetime(df_market["date"])
print(df_market.head())
print(f"Market range: {df_market['date'].min().date()} -> {df_market['date'].max().date()}")
print(f"Rows: {len(df_market)}")

# Plot price mode: the notebook override wins, otherwise labels.price_mode
# from the config (default 'log'). Unknown values fall back to 'log'.
NB_PLOT_PRICE_MODE = None  # 'log' or 'raw'
if NB_PLOT_PRICE_MODE:
    PLOT_PRICE_MODE = str(NB_PLOT_PRICE_MODE).lower()
else:
    PLOT_PRICE_MODE = str(cfg_labels['labels'].get('price_mode', 'log')).lower()
if PLOT_PRICE_MODE not in ('log', 'raw'):
    print(f"[WARN] Unknown PLOT_PRICE_MODE={PLOT_PRICE_MODE}, fallback to 'log'")
    PLOT_PRICE_MODE = 'log'
print(f"PLOT_PRICE_MODE = {PLOT_PRICE_MODE}")


In [None]:
# Quick look at price and daily change distribution
# Two stacked panels: the price series on top, daily log-return histogram below.
fig, ax = plt.subplots(2, 1, figsize=(12, 6), sharex=False)

if PLOT_PRICE_MODE == 'log':
    price_series = np.log(df_market['close'])
    price_label = 'log(close)'
else:
    price_series = df_market['close']
    price_label = 'close'

ax[0].plot(df_market['date'], price_series, color='tab:blue', linewidth=1)
ax[0].set(title='BTC close (daily)', xlabel='Date', ylabel=price_label)

# Log returns (first difference of log close) for a rough distribution check.
log_ret = np.log(df_market['close']).diff().dropna()
ax[1].hist(log_ret, bins=80, color='tab:gray')
ax[1].set(title='Daily log return distribution', xlabel='log_return', ylabel='frequency')

plt.tight_layout()
plt.show()


## 1. Oracle labels (target variable)

Idea: smooth the price (log price by default), take the slope of the smoothed series, and classify each day by comparing that slope against a threshold.


In [None]:
from src.labeling.oracle import (
    create_oracle_labels,
    analyze_label_distribution,
    estimate_threshold_for_move_balance,
)

# Labeling parameters from the "labels" config section; every key falls back
# to the default shown here when it is absent.
labels_cfg = cfg_labels["labels"]
SIGMA = int(labels_cfg.get("sigma", 3))
THRESHOLD = float(labels_cfg.get("threshold", 0.0005))
THRESHOLD_MODE = str(labels_cfg.get("threshold_mode", "fixed")).strip().lower()
TARGET_MOVE_SHARE = float(labels_cfg.get("target_move_share", 0.5))

# An explicit null threshold_min counts as 0.0; threshold_max may remain None.
_raw_thr_min = labels_cfg.get("threshold_min", 0.0)
THRESHOLD_MIN = 0.0 if _raw_thr_min is None else float(_raw_thr_min)
_raw_thr_max = labels_cfg.get("threshold_max", None)
THRESHOLD_MAX = None if _raw_thr_max is None else float(_raw_thr_max)

HORIZON = int(labels_cfg.get("horizon", 1))
PRICE_MODE = str(labels_cfg.get("price_mode", "log")).lower()
AUTO_GRID_SELECT = bool(labels_cfg.get("auto_grid_select", True))
BINARY_TREND = bool(labels_cfg.get("binary_trend", False))
BINARY_FALLBACK = str(labels_cfg.get("binary_fallback", "up"))

# Notebook overrides (optional)
# SIDEWAYS share + MOVE share = 1.0, so MOVE share = 1 - SIDEWAYS.
SIDEWAYS_SHARE_TARGET = 0.33  # set to None to use config target_move_share
NB_THRESHOLD = None  # set float to force fixed threshold
NB_THRESHOLD_MODE = None  # "auto" or "fixed" to override config

# Apply overrides in order; a forced NB_THRESHOLD always ends in "fixed" mode.
if NB_THRESHOLD_MODE is not None:
    THRESHOLD_MODE = str(NB_THRESHOLD_MODE).strip().lower()
if SIDEWAYS_SHARE_TARGET is not None:
    TARGET_MOVE_SHARE = 1.0 - float(SIDEWAYS_SHARE_TARGET)
if NB_THRESHOLD is not None:
    THRESHOLD, THRESHOLD_MODE = float(NB_THRESHOLD), "fixed"

# In auto mode the threshold is estimated from the data for the target MOVE share.
if THRESHOLD_MODE == "auto":
    THRESHOLD = estimate_threshold_for_move_balance(
        df_market,
        sigma=SIGMA,
        price_col="close",
        price_mode=PRICE_MODE,
        target_move_share=TARGET_MOVE_SHARE,
        min_threshold=THRESHOLD_MIN,
        max_threshold=THRESHOLD_MAX,
    )
    print(f"Auto threshold={THRESHOLD:.8f} (target_move_share={TARGET_MOVE_SHARE:.2f})")

print(
    f"Label params: sigma={SIGMA}, threshold={THRESHOLD}, "
    f"horizon={HORIZON}, price_mode={PRICE_MODE}, threshold_mode={THRESHOLD_MODE}, "
    f"target_move_share={TARGET_MOVE_SHARE:.2f}"
)
print(f"Label mode: {'binary' if BINARY_TREND else '3-class'}")

# Build oracle labels with the parameters chosen above.
df_labels = create_oracle_labels(
    df_market,
    sigma=SIGMA,
    threshold=THRESHOLD,
    price_col="close",
    price_mode=PRICE_MODE,
    binary_trend=BINARY_TREND,
    binary_fallback=BINARY_FALLBACK,
)

# In binary mode the 3-class labels are read from "target_3"; class 1 is SIDEWAYS.
label_3_col = "target" if not BINARY_TREND else "target_3"
is_sideways = df_labels[label_3_col] == 1
sideways_share = is_sideways.mean()
move_share = (~is_sideways).mean()
print(f"MOVE share: {move_share*100:.1f}%, SIDEWAYS share: {sideways_share*100:.1f}%")

# Quick visual: how the SIDEWAYS share reacts to the threshold.
PLOT_THRESHOLD_SWEEP = True
SWEEP_STEPS = 25
SWEEP_FACTOR = 0.5  # +/-50% around current threshold
if PLOT_THRESHOLD_SWEEP:
    thr_min = max(THRESHOLD * (1.0 - SWEEP_FACTOR), 1e-10)
    thr_max = THRESHOLD * (1.0 + SWEEP_FACTOR) if THRESHOLD > 0 else 1e-3
    thresholds = np.linspace(thr_min, thr_max, SWEEP_STEPS)

    # Relabel once per candidate threshold and record the share of class 1.
    sweep_col = "target_3" if BINARY_TREND else "target"
    sideways_vals = [
        (
            create_oracle_labels(
                df_market,
                sigma=SIGMA,
                threshold=float(candidate),
                price_col="close",
                price_mode=PRICE_MODE,
                binary_trend=BINARY_TREND,
                binary_fallback=BINARY_FALLBACK,
            )[sweep_col]
            == 1
        ).mean()
        for candidate in thresholds
    ]

    # Reference line: the explicit SIDEWAYS target, else 1 - MOVE target.
    if SIDEWAYS_SHARE_TARGET is None:
        target_sideways = 1.0 - float(TARGET_MOVE_SHARE)
    else:
        target_sideways = float(SIDEWAYS_SHARE_TARGET)

    plt.figure(figsize=(7, 4))
    plt.plot(thresholds, sideways_vals, marker=".")
    plt.axvline(THRESHOLD, color="orange", linestyle="--", label="current threshold")
    plt.axhline(target_sideways, color="red", linestyle=":", label="target sideways")
    plt.title("Threshold vs SIDEWAYS share")
    plt.xlabel("threshold")
    plt.ylabel("sideways share")
    plt.legend()
    plt.show()

# Preview of the labeled frame; binary mode also shows the 3-class column.
cols = ["date", "close", "smoothed_close", "smooth_slope", "target"] + (
    ["target_3"] if BINARY_TREND else []
)
print(df_labels[cols].head())


In [None]:
# Class distribution
# Class id -> display name and bar color, for the active label mode.
if BINARY_TREND:
    label_map = {0: "DOWN", 1: "UP"}
    color_map = {0: "#d62728", 1: "#2ca02c"}
else:
    label_map = {0: "DOWN", 1: "SIDEWAYS", 2: "UP"}
    color_map = {0: "#d62728", 1: "#7f7f7f", 2: "#2ca02c"}

# Percentage share of each class that is actually present in the labels.
counts = df_labels["target"].value_counts(normalize=True).sort_index() * 100

# Colors are looked up per class id rather than taken positionally from a
# fixed list: if a class is absent from `counts`, a positional list would
# shift and paint the remaining bars with the wrong color.
colors = [color_map[i] for i in counts.index]

plt.figure(figsize=(6, 4))
plt.bar([label_map[i] for i in counts.index], counts.values, color=colors)
plt.title("Class share (oracle)")
plt.ylabel("%")
plt.show()


In [None]:
# Visual: price vs smoothed price
# Choose raw or log-transformed series according to the plot price mode.
if PLOT_PRICE_MODE != 'log':
    close_series = df_labels['close']
    smoothed_series = df_labels['smoothed_close']
    ylabel = 'Price'
    title = 'Close and smoothed line (oracle)'
else:
    close_series = np.log(df_labels['close'])
    smoothed_series = np.log(df_labels['smoothed_close'])
    ylabel = 'log(price)'
    title = 'Log close and log-smoothed line (oracle)'

# Overlay the close series with its smoothed counterpart.
fig, ax = plt.subplots(figsize=(12, 4))
for series, tag, width in (
    (close_series, 'close', 0.8),
    (smoothed_series, 'smoothed', 1.5),
):
    ax.plot(df_labels['date'], series, label=tag, linewidth=width)
ax.set(title=title, xlabel='Date', ylabel=ylabel)
ax.legend()
plt.show()


In [None]:
# Auto-tune sigma/threshold (minimize DOWN vs UP imbalance)
# Also support auto-threshold (move share target)

if AUTO_GRID_SELECT:
    # Search sigma in a +/-2 window around the current value, never below 2.
    sigma_min = max(2, SIGMA - 2)
    sigma_max = SIGMA + 2

    if THRESHOLD_MODE == "auto":
        # Auto mode: for each candidate sigma, get a threshold from
        # estimate_threshold_for_move_balance (given target_move_share),
        # relabel, and record the resulting class shares.
        sigmas = np.unique(np.linspace(sigma_min, sigma_max, 5, dtype=int))
        label_col = "target_3" if BINARY_TREND else "target"  # loop-invariant
        rows = []
        for s in sigmas:
            thr = estimate_threshold_for_move_balance(
                df_market,
                sigma=int(s),
                price_col="close",
                price_mode=PRICE_MODE,
                target_move_share=TARGET_MOVE_SHARE,
                min_threshold=THRESHOLD_MIN,
                max_threshold=THRESHOLD_MAX,
            )
            labeled = create_oracle_labels(
                df_market,
                sigma=int(s),
                threshold=float(thr),
                price_col="close",
                price_mode=PRICE_MODE,
                binary_trend=BINARY_TREND,
                binary_fallback=BINARY_FALLBACK,
            )
            shares = labeled[label_col].value_counts(normalize=True)
            down_share = shares.get(0, 0)
            up_share = shares.get(2, 0)
            rows.append({
                "sigma": int(s),
                "threshold": float(thr),
                "down_pct": down_share * 100,
                "sideways_pct": shares.get(1, 0) * 100,
                "up_pct": up_share * 100,
                "imbalance": abs(down_share - up_share) * 100,
            })
        grid = pd.DataFrame(rows)
    else:
        # Fixed mode: sweep sigma and a +/-50% threshold window via the helper.
        thr_min = max(THRESHOLD * 0.5, 1e-6)
        thr_max = THRESHOLD * 1.5 if THRESHOLD > 0 else 1e-3
        grid = analyze_label_distribution(
            df_market,
            sigma_range=(sigma_min, sigma_max),
            threshold_range=(thr_min, thr_max),
            n_steps=5,
            price_mode=PRICE_MODE,
            price_col="close",
        )

    if grid.empty:
        raise ValueError("Auto-grid produced no sigma/threshold candidates")

    # Rank explicitly in BOTH branches. The row picked below must be the one
    # with the smallest DOWN/UP imbalance, and the ordering returned by
    # analyze_label_distribution is not visible from this notebook.
    # A stable sort keeps the first candidate among exact ties.
    grid = grid.sort_values("imbalance", kind="stable")

    best = grid.iloc[0]
    SIGMA = int(best["sigma"])
    THRESHOLD = float(best["threshold"])
    print(f"Auto-grid: sigma={SIGMA}, threshold={THRESHOLD:.8f} (imbalance={best['imbalance']:.2f}%)")

    # Rebuild the labels with the selected parameters.
    df_labels = create_oracle_labels(
        df_market,
        sigma=SIGMA,
        threshold=THRESHOLD,
        price_col="close",
        price_mode=PRICE_MODE,
        binary_trend=BINARY_TREND,
        binary_fallback=BINARY_FALLBACK,
    )


In [None]:
# Simple oracle label plot (matplotlib)
# Date-sorted copy of the columns needed for the label plot.
plot_df = (
    df_labels[['date', 'close', 'target']]
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .reset_index(drop=True)
)

def shade_up_down(ax, dates, close, up_mask, down_mask, title: str, y_label: str):
    """Plot the price line on *ax* and shade UP / DOWN spans behind it.

    The shaded bands cover the full axes height (y from 0 to 1 in the x-axis
    transform): green where *up_mask* is true, red where *down_mask* is true.
    The title, y-label and an upper-left legend are set on *ax*.
    """
    ax.plot(dates, close, color='black', linewidth=1.2, label='BTC close')
    for mask, shade, tag in ((up_mask, 'green', 'UP'), (down_mask, 'red', 'DOWN')):
        ax.fill_between(
            dates,
            0,
            1,
            where=mask,
            transform=ax.get_xaxis_transform(),
            color=shade,
            alpha=0.15,
            label=tag,
        )
    ax.set_title(title)
    ax.set_ylabel(y_label)
    ax.legend(loc='upper left')

# Class 0 is DOWN in both modes; UP is class 1 (binary) or class 2 (3-class).
labels = plot_df['target'].to_numpy()
down_mask = labels == 0
if BINARY_TREND:
    up_mask = labels == 1
    title = 'Oracle labels (binary: UP/DOWN)'
else:
    up_mask = labels == 2
    title = 'Oracle labels (3-class: UP/DOWN shaded)'

dates = plot_df['date'].to_numpy()
close = plot_df['close'].to_numpy()
if PLOT_PRICE_MODE == 'log':
    close = np.log(close)
    y_label = 'log(price)'
else:
    y_label = 'Close'

fig, ax = plt.subplots(figsize=(12, 4))
shade_up_down(ax, dates, close, up_mask, down_mask, title, y_label)
ax.set_xlabel('Date')
plt.tight_layout()
plt.show()


## 2. Target shift by horizon


In [None]:
# Shift target to the future (features(t) -> target(t+horizon))
# Pair each row with the label HORIZON rows ahead (features(t) -> target(t+horizon)),
# then drop the trailing rows that have no future label.
df_labels_shifted = (
    df_labels.assign(target=df_labels["target"].shift(-HORIZON))
    .dropna(subset=["target"])
    .reset_index(drop=True)
)
# The shift introduces NaN (float dtype); restore integer class ids.
df_labels_shifted["target"] = df_labels_shifted["target"].astype(int)

print(df_labels_shifted[["date", "target"]].tail())
print(f"Rows after shift: {len(df_labels_shifted)}")


## 3. Astro data and astro features


In [None]:
from datetime import datetime
from tqdm import tqdm

from src.astro.engine.settings import AstroSettings
from src.astro.engine.calculator import set_ephe_path, calculate_daily_bodies
from src.astro.engine.aspects import calculate_aspects
from src.features.builder import build_features_daily

# Astro settings; config paths follow the same rule as elsewhere
# (relative paths are resolved against PROJECT_ROOT).
astro_cfg = cfg_astro["astro"]
_ephe_path = _resolve_path(astro_cfg["ephe_path"])
set_ephe_path(str(_ephe_path))

settings = AstroSettings(
    bodies_path=_resolve_path(astro_cfg["bodies_path"]),
    aspects_path=_resolve_path(astro_cfg["aspects_path"]),
)

# Time of day (UTC) passed to every daily body calculation.
time_utc = datetime.strptime(astro_cfg["daily_time_utc"], "%H:%M:%S").time()

# Per-subject output files.
bodies_path = processed_dir / f"{subject.subject_id}_astro_bodies.parquet"
aspects_path = processed_dir / f"{subject.subject_id}_astro_aspects.parquet"
features_path = processed_dir / f"{subject.subject_id}_features.parquet"

# Attributes copied from each body / aspect record into the output tables.
_BODY_FIELDS = ("date", "body", "lon", "lat", "speed", "is_retro", "sign", "declination")
_ASPECT_FIELDS = ("date", "p1", "p2", "aspect", "orb", "is_exact", "is_applying")

# Ignore astro cache, recompute
print("Ignoring astro cache, recomputing...")
bodies_rows = []
aspects_rows = []
dates = pd.to_datetime(df_market["date"]).dt.date

# One pass per market date: compute the bodies, then the aspects from them.
for day in tqdm(dates, desc="astro days"):
    bodies = calculate_daily_bodies(day, time_utc, settings.bodies)
    aspects = calculate_aspects(bodies, settings.aspects)

    bodies_rows.extend(
        {field: getattr(body, field) for field in _BODY_FIELDS} for body in bodies
    )
    aspects_rows.extend(
        {field: getattr(aspect, field) for field in _ASPECT_FIELDS} for aspect in aspects
    )

df_bodies = pd.DataFrame(bodies_rows)
df_aspects = pd.DataFrame(aspects_rows)

# Persist both tables (the output directory is created when missing).
bodies_path.parent.mkdir(parents=True, exist_ok=True)
df_bodies.to_parquet(bodies_path, index=False)
df_aspects.to_parquet(aspects_path, index=False)
print(f"Saved bodies: {bodies_path}")
print(f"Saved aspects: {aspects_path}")

print(df_bodies.head())
print(df_aspects.head())


In [None]:
# Build astro features from the bodies and aspects tables.
# Any previously saved features file is ignored: features are always
# recomputed here and the parquet file at features_path is overwritten.
print("Ignoring features cache, recomputing...")
df_features = build_features_daily(df_bodies, df_aspects)
df_features.to_parquet(features_path, index=False)
print(f"Saved features: {features_path}")

# Quick preview and shape of the resulting feature table.
print(df_features.head())
print(f"Features: {df_features.shape}")


## 4. Merge features and target


In [None]:
# Merge by date
# Normalize both date columns to datetime so the join keys are comparable.
features = df_features.assign(date=pd.to_datetime(df_features["date"]))
labels = df_labels_shifted[["date", "target"]].assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

# Inner join: keep only dates present on both sides.
df_dataset = features.merge(labels, on="date", how="inner")

# Keep the first row per date if the join produced duplicates.
if not df_dataset["date"].is_unique:
    df_dataset = df_dataset.drop_duplicates(subset=["date"]).reset_index(drop=True)

print(df_dataset.head())
print(f"Final dataset: {df_dataset.shape}")


## 5. Train/val/test split (time-based)


In [None]:
# Time-based split without shuffling
# Split fractions; the remainder (1 - train - val) becomes the test set.
train_ratio = 0.7
val_ratio = 0.15

# A positional split is only time-based when the rows are in chronological
# order. The merged frame is never sorted before this point, so sort here
# (a stable sort keeps the existing order among equal dates).
df_dataset = df_dataset.sort_values("date", kind="stable").reset_index(drop=True)

n = len(df_dataset)
train_end = int(n * train_ratio)
val_end = int(n * (train_ratio + val_ratio))

train_df = df_dataset.iloc[:train_end].copy()
val_df = df_dataset.iloc[train_end:val_end].copy()
test_df = df_dataset.iloc[val_end:].copy()

# Fail with a clear message instead of crashing later on an empty split
# (the date-range prints below cannot handle an empty frame).
if min(len(train_df), len(val_df), len(test_df)) == 0:
    raise ValueError(f"Dataset too small for a train/val/test split: {n} rows")

print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")
print(f"Train range: {train_df['date'].min().date()} -> {train_df['date'].max().date()}")
print(f"Val range  : {val_df['date'].min().date()} -> {val_df['date'].max().date()}")
print(f"Test range : {test_df['date'].min().date()} -> {test_df['date'].max().date()}")


## 6. Prepare X/y matrices


In [None]:
# Feature list (astro only)
# Every column except the join key and the label is a feature.
feature_cols = [col for col in df_dataset.columns if col not in ("date", "target")]


def _split_xy(frame):
    """Return (X, y) arrays for *frame*: float64 features and int32 targets."""
    return (
        frame[feature_cols].to_numpy(dtype=np.float64),
        frame["target"].to_numpy(dtype=np.int32),
    )


X_train, y_train = _split_xy(train_df)
X_val, y_val = _split_xy(val_df)
X_test, y_test = _split_xy(test_df)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val  : {X_val.shape}, y_val  : {y_val.shape}")
print(f"X_test : {X_test.shape}, y_test : {y_test.shape}")


## 7. XGBoost training


In [None]:
import xgboost as xgb
import numpy as np
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    balanced_accuracy_score,
    matthews_corrcoef,
    recall_score,
)
from src.models.xgb import XGBBaseline

# Two-stage training is the default; binary labels force single-stage.
train_cfg = cfg_train.get("training", {})
TWO_STAGE = bool(train_cfg.get("two_stage", True))
if TWO_STAGE and BINARY_TREND:
    print("Binary trend active -> forcing SINGLE_STAGE")
    TWO_STAGE = False

# Check if this XGBoost build supports CUDA (best effort: any failure
# while reading the build info leaves the device on CPU).
use_cuda = False
try:
    info = xgb.build_info()
    use_cuda = bool(info.get("USE_CUDA", False))
    print(f"XGBoost build_info USE_CUDA = {info.get('USE_CUDA', None)}")
except Exception as e:
    print("Failed to read build_info:", e)

device = "cpu" if not use_cuda else "cuda"
print(f"Using device={device}")

# Class names in id order; the ids are 0..N-1.
label_names = ["DOWN", "UP"] if BINARY_TREND else ["DOWN", "SIDEWAYS", "UP"]
label_ids = list(range(len(label_names)))
N_CLASSES = len(label_ids)


def majority_baseline_pred(y_true, lbls):
    """Return an array shaped like *y_true* filled with its most frequent label.

    Only labels listed in *lbls* are counted; on ties the label that comes
    first in *lbls* wins.
    """
    tallies = np.array([int((y_true == candidate).sum()) for candidate in lbls])
    winner = lbls[int(tallies.argmax())]
    return np.full_like(y_true, winner)


def prev_label_baseline_pred(y_true, fallback_label: int = 0):
    """Naive time baseline: predict each label as the one just before it.

    The first position has no predecessor and receives *fallback_label*.
    An empty input yields an empty array of the same dtype.
    """
    if not len(y_true):
        return np.array([], dtype=y_true.dtype)
    shifted = np.roll(y_true, shift=1)
    shifted[0] = fallback_label  # overwrite the wrapped-around last element
    return shifted


def calc_metrics(y_true, y_pred, lbls):
    """Compute the headline classification metrics for one set of predictions.

    Args:
        y_true: true class ids.
        y_pred: predicted class ids.
        lbls: the full list of class ids to average over.

    Returns:
        A dict with "acc", "bal_acc" (macro recall over *lbls*), "mcc",
        "f1_macro" (macro F1 over *lbls*) and "summary", the mean of
        bal_acc and f1_macro.
    """
    acc = accuracy_score(y_true, y_pred)
    bal = recall_score(y_true, y_pred, labels=lbls, average="macro", zero_division=0)
    mcc = matthews_corrcoef(y_true, y_pred)
    # Pass labels=lbls so macro F1 averages over the same class set as the
    # macro recall above. Without it, a class missing from both y_true and
    # y_pred (e.g. in a bootstrap resample) is silently dropped from the mean.
    f1m = f1_score(y_true, y_pred, labels=lbls, average="macro", zero_division=0)
    return {
        "acc": acc,
        "bal_acc": bal,
        "mcc": mcc,
        "f1_macro": f1m,
        "summary": 0.5 * (bal + f1m),
    }


def bootstrap_metrics(y_true, y_pred, lbls, n_boot=200, seed=42):
    """Estimate 95% bootstrap intervals for the metrics of calc_metrics.

    Rows are resampled with replacement *n_boot* times using a generator
    seeded with *seed*. Returns None for empty input, otherwise a dict
    mapping each metric name to a (2.5th, 97.5th) percentile pair.
    """
    n = len(y_true)
    if n == 0:
        return None

    rng = np.random.default_rng(seed)
    draws = {key: [] for key in ("acc", "bal_acc", "mcc", "f1_macro", "summary")}
    for _ in range(n_boot):
        picked = rng.integers(0, n, size=n)
        scores = calc_metrics(y_true[picked], y_pred[picked], lbls)
        for key, bucket in draws.items():
            bucket.append(scores[key])

    intervals = {}
    for key, bucket in draws.items():
        low, high = np.percentile(bucket, [2.5, 97.5])
        intervals[key] = (float(low), float(high))
    return intervals

In [None]:
def print_ci(ci, key, name):
    """Print the interval stored under *key* in *ci*, labeled *name*.

    Prints nothing when *ci* is None.
    """
    if ci is not None:
        lo, hi = ci[key]
        print(f"  {name} 95% CI: [{lo:.4f}, {hi:.4f}]")


def print_basic_metrics(y_true, y_pred, lbls, names, title: str) -> None:
    """Print a full evaluation report for one set of predictions.

    The report holds, in order: headline metrics, the true-class
    distribution, the classification report, two naive baselines (majority
    class and previous label), bootstrap confidence intervals, and sanity
    warnings.

    Args:
        y_true: true class ids.
        y_pred: predicted class ids.
        lbls: class ids to report on.
        names: display names matching *lbls* position by position.
        title: heading printed above the report.
    """
    print()
    print(title)

    metrics = calc_metrics(y_true, y_pred, lbls)
    for caption, key in (
        ("Accuracy:", "acc"),
        ("Balanced acc:", "bal_acc"),
        ("MCC:", "mcc"),
        ("F1 macro:", "f1_macro"),
        ("Summary score (avg bal_acc + f1_macro):", "summary"),
    ):
        print(caption, metrics[key])

    # True-label distribution over the requested classes.
    n = len(y_true)
    counts = [int((y_true == lbl).sum()) for lbl in lbls]
    dist_parts = []
    for name, cnt in zip(names, counts):
        pct = 100.0 * cnt / n if n else 0.0
        dist_parts.append(f"{name}={cnt} ({pct:.1f}%)")
    print("Class distribution:", ", ".join(dist_parts))

    # Text form for display, dict form for the per-class recall check below.
    report_kwargs = dict(labels=lbls, target_names=names, zero_division=0)
    report_str = classification_report(y_true, y_pred, **report_kwargs)
    report_dict = classification_report(y_true, y_pred, output_dict=True, **report_kwargs)
    print("Classification report:")
    print(report_str)

    # Baseline 1: always predict majority class
    base_metrics = calc_metrics(y_true, majority_baseline_pred(y_true, lbls), lbls)
    print(
        f"Majority baseline -> acc={base_metrics['acc']:.4f}, "
        f"bal_acc={base_metrics['bal_acc']:.4f}, f1_macro={base_metrics['f1_macro']:.4f}, "
        f"summary={base_metrics['summary']:.4f}"
    )

    # Baseline 2: predict previous label (naive time baseline)
    prev_metrics = calc_metrics(
        y_true, prev_label_baseline_pred(y_true, fallback_label=lbls[0]), lbls
    )
    print(
        f"Prev-label baseline -> acc={prev_metrics['acc']:.4f}, "
        f"bal_acc={prev_metrics['bal_acc']:.4f}, f1_macro={prev_metrics['f1_macro']:.4f}, "
        f"summary={prev_metrics['summary']:.4f}"
    )

    # Bootstrap CI for model metrics
    ci = bootstrap_metrics(y_true, y_pred, lbls, n_boot=200, seed=42)
    if ci is not None:
        print("Model 95% bootstrap CI:")
        for key in ("acc", "bal_acc", "f1_macro", "summary"):
            print_ci(ci, key, key)

    # Sanity warnings: the model should beat the better baseline by a margin.
    warn_margin = 0.02
    for key, message in (
        ("acc", "WARNING: accuracy barely above naive baselines"),
        ("bal_acc", "WARNING: balanced accuracy barely above naive baselines"),
        ("f1_macro", "WARNING: macro F1 barely above naive baselines"),
        ("summary", "WARNING: summary score barely above naive baselines"),
    ):
        if metrics[key] < max(base_metrics[key], prev_metrics[key]) + warn_margin:
            print(message)

    # Flag classes the model almost never recovers.
    low_recall = [
        f"{name} (recall={report_dict[name]['recall']:.2f})"
        for name in names
        if name in report_dict and report_dict[name]["recall"] < 0.2
    ]
    if low_recall:
        print("WARNING: low recall ->", ", ".join(low_recall))

    if counts and min(counts) < 30:
        print("WARNING: some classes have <30 samples; metrics may be unstable")


def plot_confusion(y_true, y_pred, lbls, names, title: str) -> None:
    """Draw an annotated confusion-matrix heatmap (rows: true, columns: predicted).

    The matrix is computed over *lbls*; *names* label both axes.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=lbls)
    plt.figure(figsize=(4.5, 3.8))
    heatmap_ax = sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=names,
        yticklabels=names,
    )
    heatmap_ax.set_title(title)
    heatmap_ax.set_xlabel("Predicted")
    heatmap_ax.set_ylabel("True")
    plt.tight_layout()
    plt.show()


# Hyperparameters shared by every model below; only n_classes and max_depth vary.
_XGB_COMMON = dict(
    device=device,
    random_state=42,
    n_estimators=300,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
)

if not TWO_STAGE:
    if BINARY_TREND:
        print("Training mode: SINGLE_STAGE (binary)")
    else:
        print("Training mode: SINGLE_STAGE (3 classes)")

    # Balanced sample weights for both the training and validation rows.
    w_train = compute_sample_weight(class_weight="balanced", y=y_train)
    w_val = compute_sample_weight(class_weight="balanced", y=y_val)

    model = XGBBaseline(n_classes=N_CLASSES, max_depth=3, **_XGB_COMMON)

    model.fit(
        X_train, y_train,
        X_val=X_val, y_val=y_val,
        feature_names=feature_cols,
        sample_weight=w_train,
        sample_weight_val=w_val,
    )

    y_pred = model.predict(X_test)
else:
    print("Training mode: TWO_STAGE (MOVE/NO_MOVE -> UP/DOWN)")

    # --- Stage 1: MOVE vs NO_MOVE (MOVE = any label other than SIDEWAYS(1)) ---
    y_train_move = (y_train != 1).astype(np.int32)
    y_val_move = (y_val != 1).astype(np.int32)
    y_test_move = (y_test != 1).astype(np.int32)

    w_train_move = compute_sample_weight(class_weight="balanced", y=y_train_move)
    w_val_move = compute_sample_weight(class_weight="balanced", y=y_val_move)

    model_move = XGBBaseline(n_classes=2, max_depth=6, **_XGB_COMMON)

    model_move.fit(
        X_train, y_train_move,
        X_val=X_val, y_val=y_val_move,
        feature_names=feature_cols,
        sample_weight=w_train_move,
        sample_weight_val=w_val_move,
    )

    # --- Stage 2: direction (UP vs DOWN) only on MOVE rows ---
    mask_train_dir = y_train != 1
    mask_val_dir = y_val != 1
    mask_test_dir = y_test != 1

    # Direction target: 1 for UP (original label 2), 0 for DOWN.
    X_train_dir = X_train[mask_train_dir]
    y_train_dir = (y_train[mask_train_dir] == 2).astype(np.int32)
    X_val_dir = X_val[mask_val_dir]
    y_val_dir = (y_val[mask_val_dir] == 2).astype(np.int32)

    w_train_dir = compute_sample_weight(class_weight="balanced", y=y_train_dir)
    w_val_dir = compute_sample_weight(class_weight="balanced", y=y_val_dir)

    model_dir = XGBBaseline(n_classes=2, max_depth=6, **_XGB_COMMON)

    model_dir.fit(
        X_train_dir, y_train_dir,
        X_val=X_val_dir, y_val=y_val_dir,
        feature_names=feature_cols,
        sample_weight=w_train_dir,
        sample_weight_val=w_val_dir,
    )

    # --- Combine predictions to preserve the original sequence ---
    # Both models score every test row, so the result keeps the test order.
    move_pred = model_move.predict(X_test)
    dir_pred_full = model_dir.predict(X_test)

    # If MOVE then UP(2)/DOWN(0) else SIDEWAYS(1)
    direction_as_3c = np.where(dir_pred_full == 1, 2, 0)
    y_pred = np.where(move_pred == 1, direction_as_3c, 1)

# --- Metrics: overall (binary or 3-class, depending on the label mode) ---
if BINARY_TREND:
    overall_title = "Overall (binary) metrics"
    cm_title = "Confusion matrix (binary)"
else:
    overall_title = "Overall (3-class) metrics"
    cm_title = "Confusion matrix (3-class)"
print_basic_metrics(y_test, y_pred, label_ids, label_names, overall_title)
plot_confusion(y_test, y_pred, label_ids, label_names, cm_title)

if TWO_STAGE:
    # Stage 1 metrics (MOVE vs NO_MOVE)
    move_names = ["NO_MOVE", "MOVE"]
    print_basic_metrics(y_test_move, move_pred, [0, 1], move_names, "Stage 1 (MOVE vs NO_MOVE) metrics")
    plot_confusion(y_test_move, move_pred, [0, 1], move_names, "Confusion matrix (MOVE vs NO_MOVE)")

    # Stage 2 metrics (UP vs DOWN) only on rows whose true label is a MOVE
    if not mask_test_dir.any():
        print()
        print("Stage 2 metrics skipped: no MOVE samples in test set.")
    else:
        X_test_dir = X_test[mask_test_dir]
        y_test_dir = (y_test[mask_test_dir] == 2).astype(np.int32)
        dir_pred = model_dir.predict(X_test_dir)
        dir_names = ["DOWN", "UP"]
        print_basic_metrics(y_test_dir, dir_pred, [0, 1], dir_names, "Stage 2 (UP vs DOWN) metrics")
        plot_confusion(y_test_dir, dir_pred, [0, 1], dir_names, "Confusion matrix (UP vs DOWN)")

In [None]:
# Plot BTC close with prediction background
# Green: UP, Red: DOWN, no color: SIDEWAYS

# --- Plot options ---
PLOT_SCOPE = "test"  # "test", "val", "train", "full"
PLOT_START = None    # e.g. "2023-01-01"
PLOT_END = None      # e.g. "2024-01-01"
PLOT_LAST_N = 1400    # set None to disable
PRED_MODE = "dir_only"  # "three_class" or "dir_only"
PRED_DIR_MASK_MOVE = False  # if True, show dir preds only when MOVE predicted
SHOW_TRUE = True           # second panel with true labels
PRICE_COL = "close"

# Select base dataframe; any unrecognized scope falls back to the test split.
_scope_frames = {"full": df_dataset, "train": train_df, "val": val_df}
base_df = _scope_frames.get(PLOT_SCOPE, test_df).copy()

if PLOT_SCOPE != "test":
    print("NOTE: PLOT_SCOPE is not test; this is in-sample visualization.")

base_df["date"] = pd.to_datetime(base_df["date"])

# Compute predictions for chosen scope
X_plot = base_df[feature_cols].to_numpy(dtype=np.float64)
if not TWO_STAGE:
    # Single-stage: there are no separate move/direction models, so those
    # columns are filled with NaN.
    move_pred_plot = np.full(len(base_df), np.nan)
    dir_pred_plot = np.full(len(base_df), np.nan)
    pred_3c_plot = model.predict(X_plot)
else:
    move_pred_plot = model_move.predict(X_plot)
    dir_pred_plot = model_dir.predict(X_plot)
    pred_3c_plot = np.where(move_pred_plot == 1, np.where(dir_pred_plot == 1, 2, 0), 1)

plot_df = base_df[["date", "target"]].assign(
    pred_3c=pred_3c_plot,
    pred_move=move_pred_plot,
    pred_dir=dir_pred_plot,
)

# Attach the price column by date and drop rows that have no price.
market_dates = df_market[["date", PRICE_COL]].copy()
market_dates["date"] = pd.to_datetime(market_dates["date"])

plot_df = (
    plot_df.merge(market_dates, on="date", how="left")
    .dropna(subset=[PRICE_COL])
    .sort_values("date")
    .reset_index(drop=True)
)

# Apply date window
if PLOT_START is not None:
    plot_df = plot_df[plot_df["date"] >= pd.to_datetime(PLOT_START)]
if PLOT_END is not None:
    plot_df = plot_df[plot_df["date"] <= pd.to_datetime(PLOT_END)]
if PLOT_LAST_N is not None and len(plot_df) > PLOT_LAST_N:
    plot_df = plot_df.tail(PLOT_LAST_N)

if plot_df.empty:
    raise ValueError("Plot window is empty. Check PLOT_START/PLOT_END/PLOT_LAST_N")

# Helper to shade UP/DOWN zones

def shade_up_down(ax, dates, close, up_mask, down_mask, title: str, y_label: str):
    """Draw the price line on *ax* and shade the UP / DOWN zones behind it.

    Args:
        ax: Axes-like object to draw on.
        dates: X values shared by the price line and the shaded bands.
        close: Y values of the price line.
        up_mask: Boolean mask over *dates*; True entries are shaded green ("UP").
        down_mask: Boolean mask over *dates*; True entries are shaded red ("DOWN").
        title: Axes title.
        y_label: Y-axis label.
    """
    ax.plot(dates, close, color="black", linewidth=1.2, label="BTC close")

    # Both bands are drawn identically apart from mask, colour and legend label.
    # The band's y-range 0..1 is passed together with the x-axis transform.
    zones = (
        ("UP", "green", up_mask),
        ("DOWN", "red", down_mask),
    )
    for zone_label, zone_color, zone_mask in zones:
        ax.fill_between(
            dates,
            0,
            1,
            where=zone_mask,
            transform=ax.get_xaxis_transform(),
            color=zone_color,
            alpha=0.15,
            label=zone_label,
        )

    ax.set_title(title)
    ax.set_ylabel(y_label)
    ax.legend(loc="upper left")

# Choose prediction masks
preds_3c = plot_df["pred_3c"].to_numpy()

# The direction model is shaded only for non-binary, two-stage runs in "dir_only" mode
use_dir_model = (not BINARY_TREND) and PRED_MODE == "dir_only" and TWO_STAGE

if use_dir_model:
    dir_pred = plot_df["pred_dir"].to_numpy().astype(int)
    up_mask_pred = dir_pred == 1
    down_mask_pred = dir_pred == 0
    title_pred = "Predicted direction (dir model, all points)"
    if PRED_DIR_MASK_MOVE:
        # Keep direction shading only where the move model predicted MOVE
        move_mask = plot_df["pred_move"].to_numpy() == 1
        up_mask_pred = up_mask_pred & move_mask
        down_mask_pred = down_mask_pred & move_mask
        title_pred = "Predicted direction (dir model, MOVE only)"
else:
    # DOWN is code 0 in both encodings; UP is 1 for binary labels and 2 for 3-class
    up_code_pred = 1 if BINARY_TREND else 2
    up_mask_pred = preds_3c == up_code_pred
    down_mask_pred = preds_3c == 0
    if BINARY_TREND:
        title_pred = "Predicted binary (UP/DOWN shaded)"
    else:
        title_pred = "Predicted 3-class (UP/DOWN shaded)"

# Series to draw; the price is log-transformed only in 'log' mode
dates = plot_df["date"].to_numpy()
close = plot_df[PRICE_COL].to_numpy()
use_log_price = PLOT_PRICE_MODE == 'log'
if use_log_price:
    close = np.log(close)
y_label = 'log(price)' if use_log_price else 'Close'

if not SHOW_TRUE:
    # Single panel: predictions only
    fig, ax = plt.subplots(figsize=(12, 4))
    shade_up_down(ax, dates, close, up_mask_pred, down_mask_pred, title_pred, y_label)
    ax.set_xlabel("Date")
else:
    # Two stacked panels sharing the date axis: predictions on top, true labels below
    fig, axes = plt.subplots(2, 1, figsize=(12, 7), sharex=True)
    pred_ax, true_ax = axes
    shade_up_down(pred_ax, dates, close, up_mask_pred, down_mask_pred, title_pred, y_label)

    # DOWN is code 0 in both encodings; UP is 1 for binary labels and 2 for 3-class
    true_labels = plot_df["target"].to_numpy()
    up_code_true = 1 if BINARY_TREND else 2
    up_mask_true = true_labels == up_code_true
    down_mask_true = true_labels == 0
    shade_up_down(true_ax, dates, close, up_mask_true, down_mask_true, "True labels (UP/DOWN shaded)", y_label)

    true_ax.set_xlabel("Date")

plt.tight_layout()
plt.show()



In [None]:
# Confusion matrix of test-set predictions (rows: actual, columns: predicted)
from sklearn.metrics import confusion_matrix

# Class names in id order; ids are 0..len(labels)-1
labels = ['DOWN', 'UP'] if BINARY_TREND else ['DOWN', 'SIDEWAYS', 'UP']
lbl_ids = list(range(len(labels)))

# Passing the ids explicitly keeps the matrix shape fixed even if a class is absent
cm = confusion_matrix(y_test, y_pred, labels=lbl_ids)

plt.figure(figsize=(5, 4))
heat_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
heat_ax.set_title('Confusion Matrix')
heat_ax.set_xlabel('Predicted')
heat_ax.set_ylabel('Actual')
plt.show()


In [None]:
# Feature importance (top 20)
# In two-stage mode you can plot one stage or both.
IMP_MODEL_STAGE = "dir"  # "dir", "move", or "both"

if not TWO_STAGE:
    stage_models = [("3-class", model)]
else:
    move_stage = ("MOVE vs NO_MOVE", model_move)
    dir_stage = ("UP vs DOWN", model_dir)
    # Any value other than "both" / "move" selects the direction model
    stage_models = {
        "both": [move_stage, dir_stage],
        "move": [move_stage],
    }.get(IMP_MODEL_STAGE, [dir_stage])

# One bar chart per selected stage
for stage_name, stage_model in stage_models:
    importances = stage_model.model.feature_importances_
    imp_df = pd.DataFrame({"feature": feature_cols, "importance": importances})
    imp_df = imp_df.sort_values("importance", ascending=False)
    top_features = imp_df.head(20)

    plt.figure(figsize=(8, 6))
    bar_ax = sns.barplot(data=top_features, x="importance", y="feature", color="tab:blue")
    bar_ax.set_title(f"Top-20 astro features by importance ({stage_name})")
    bar_ax.set_xlabel("Importance")
    bar_ax.set_ylabel("Feature")
    plt.tight_layout()
    plt.show()



In [None]:
# Save model (optional)
from joblib import dump

artifact_dir = PROJECT_ROOT / "models_artifacts"
artifact_dir.mkdir(parents=True, exist_ok=True)

# Mode-specific part of the artifact: fitted estimator(s) with their scaler(s)
if TWO_STAGE:
    artifact_mode = "two_stage"
    model_payload = {
        stage_key: {"model": wrapper.model, "scaler": wrapper.scaler}
        for stage_key, wrapper in (("move", model_move), ("dir", model_dir))
    }
    artifact_name = f"xgb_astro_baseline_two_stage_h{HORIZON}.joblib"
else:
    artifact_mode = "single_stage"
    model_payload = {"model": model.model, "scaler": model.scaler}
    artifact_name = f"xgb_astro_baseline_h{HORIZON}.joblib"

# Shared part: feature order and the labelling config stored alongside the model
artifact = {
    "mode": artifact_mode,
    **model_payload,
    "feature_names": feature_cols,
    "config": {
        "sigma": SIGMA,
        "threshold": THRESHOLD,
        "horizon": HORIZON,
    },
}
out_path = artifact_dir / artifact_name

dump(artifact, out_path)
print(f"Saved: {out_path}")



## 8. Ideas for improvement

- Pick sigma/threshold based on model metrics, not only class balance.
- Add transit-to-natal aspects as extra features.
- Use separate models for different market regimes.
