# Moon Cycles Deep Search (v2) — Model Bakeoff

Goal of this notebook:
- Answer one simple question in the most honest way possible:

**Do Moon-phase-only features contain any real predictive edge for BTC direction, or is it basically random?**

How we make this answer convincing:
1. We test multiple model families (linear, trees, small neural net, XGBoost).
2. We use a strict time split (no shuffling).
3. We choose Gaussian label parameters using **validation only**.
4. We touch the test set only for the final report.

If all models look random on the test set, the problem is NOT the model.
It means the features probably do not carry useful signal (for this target).

In [None]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

# Root directory that is put on sys.path so the project-local `RESEARCH2`
# imports resolve. It defaults to the original machine-specific location; set
# the OSTROFUN_ROOT environment variable to run the notebook from a different
# checkout without editing this cell. An empty variable falls back to the default.
PROJECT_ROOT = Path(os.environ.get('OSTROFUN_ROOT') or '/home/rut/ostrofun').expanduser()

# Prepend (not append) so the project copy is found first; the membership
# check keeps re-running this cell from adding duplicate sys.path entries.
_project_root_str = str(PROJECT_ROOT)
if _project_root_str not in sys.path:
    sys.path.insert(0, _project_root_str)

from RESEARCH2.Moon_cycles.moon_data import (
    MoonLabelConfig,
    build_moon_phase_features,
    load_market_slice,
)
from RESEARCH2.Moon_cycles.bakeoff_utils import run_moon_model_bakeoff
from RESEARCH2.Moon_cycles.eval_visuals import VisualizationConfig, evaluate_with_visuals

# Widen pandas' rendering limits so wide tables are shown with more columns
# and a wider text layout than the defaults allow.
for _option_name, _option_value in (
    ('display.max_columns', 200),
    ('display.width', 160),
):
    pd.set_option(_option_name, _option_value)

In [None]:
# ==============================
# Research configuration
# ==============================
# Everything a reader might want to tune lives in this cell; the cells below
# only read these names.

# --- Data range and runtime switches ---
START_DATE = '2017-11-01'
END_DATE = None
USE_CACHE = True
VERBOSE = True

# --- Label definition ---
# These label settings are held constant while the Gaussian window/std grid
# below is searched.
LABEL_CFG = MoonLabelConfig(
    label_mode='balanced_detrended',
    price_mode='raw',
    horizon=1,
    move_share=0.5,
)

# --- Gaussian detrending grid (every window is paired with every std) ---
GAUSS_WINDOWS = [101, 151, 201, 251, 301]
GAUSS_STDS = [30.0, 50.0, 70.0, 90.0]

# --- Threshold tuning penalties ---
# Intended to discourage decision thresholds that collapse predictions into
# a single class.
THRESHOLD_GAP_PENALTY = 0.25
THRESHOLD_PRIOR_PENALTY = 0.05

# --- XGBoost baseline hyper-parameters ---
XGB_PARAMS = dict(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.03,
    colsample_bytree=0.8,
    subsample=0.8,
    early_stopping_rounds=50,
)

# --- Diagnostic plot settings (rolling-metric window and histogram bins) ---
VIS_CFG = VisualizationConfig(
    rolling_window_days=90,
    rolling_min_periods=30,
    probability_bins=64,
)

# Number of (window, std) combinations the bakeoff will evaluate.
GAUSS_GRID_SIZE = len(GAUSS_WINDOWS) * len(GAUSS_STDS)

print('Config loaded.')
print('Gaussian grid size =', GAUSS_GRID_SIZE)

In [None]:
# ===========================================
# Market data -> Moon-phase feature table
# ===========================================

# Both loaders take the same cache/verbosity switches from the config cell.
loader_opts = {'use_cache': USE_CACHE, 'verbose': VERBOSE}

df_market = load_market_slice(
    start_date=START_DATE,
    end_date=END_DATE,
    **loader_opts,
)

# Features are derived from the market frame loaded just above.
df_moon_features = build_moon_phase_features(
    df_market=df_market,
    progress=True,
    **loader_opts,
)

# Quick sanity summary: row counts and the calendar span actually loaded.
market_dates = df_market['date']
print('Market rows:', len(df_market))
print('Moon feature rows:', len(df_moon_features))
print('Market range:', market_dates.min().date(), '->', market_dates.max().date())

display(df_moon_features.head(5))

## Bakeoff Run

This section runs a grid over Gaussian label parameters and evaluates multiple models.

Important:
- We select the best Gaussian params **per model** using **validation metrics**.
- Only after that do we look at test metrics.

In [None]:
# Arguments for the bakeoff runner, grouped by what they control.
bakeoff_kwargs = dict(
    # Inputs built in the data-loading cell.
    df_market=df_market,
    df_moon_features=df_moon_features,
    # Label definition plus the Gaussian grid to search.
    label_cfg=LABEL_CFG,
    gauss_windows=GAUSS_WINDOWS,
    gauss_stds=GAUSS_STDS,
    # Model options.
    include_xgb=True,
    xgb_params=XGB_PARAMS,
    # Threshold tuning penalties.
    threshold_gap_penalty=THRESHOLD_GAP_PENALTY,
    threshold_prior_penalty=THRESHOLD_PRIOR_PENALTY,
    # Runtime switches.
    use_cache=USE_CACHE,
    verbose=VERBOSE,
)

bakeoff = run_moon_model_bakeoff(**bakeoff_kwargs)

# Unpack the three result objects that the following cells work with.
results_table = bakeoff['results_table']
best_by_val = bakeoff['best_by_val_table']
best_runs = bakeoff['best_runs']

print('Bakeoff results rows:', len(results_table))
print('Best-by-validation table (one row per model):')
display(best_by_val)

In [None]:
# =================================================
# Top-10 configurations per model, ranked on VALIDATION
# =================================================
# Rows are ordered by validation metrics only; the test_* columns are shown
# next to them for reference and do not influence the ranking.

id_cols = ['model', 'gauss_window', 'gauss_std']
val_cols = ['val_recall_min', 'val_recall_gap', 'val_mcc', 'val_acc']
test_cols = ['test_recall_min', 'test_recall_gap', 'test_mcc', 'test_acc']
extra_cols = [
    'p_value_vs_random',
    'baseline_majority_test_acc',
    'baseline_random_test_acc',
    'pred_up_share',
]
show_cols = id_cols + val_cols + test_cols + extra_cols

# Ranking rule: high minimum recall first, then a small recall gap, then
# high MCC and high accuracy (ascending flag is True only for the gap).
rank_keys = ['val_recall_min', 'val_recall_gap', 'val_mcc', 'val_acc']
rank_ascending = [False, True, False, False]

for model_key in sorted(results_table['model'].unique()):
    is_this_model = results_table['model'] == model_key
    ranked = results_table.loc[is_this_model].sort_values(
        by=rank_keys,
        ascending=rank_ascending,
    )
    print('\n' + '=' * 90)
    print('MODEL:', model_key)
    display(ranked[show_cols].head(10))

## Visual Diagnostics For Winners

For each model, we take its best-by-validation configuration and plot:
- confusion matrix
- predicted vs true label background over price
- rolling metrics
- probability histogram

This is the most "human readable" way to see if the model actually does something useful.

In [None]:
# For every model's best-by-validation run, draw the diagnostic plots twice on
# the same test rows: first for the majority baseline, then for the model.
for model_name, run in best_runs.items():
    print()
    print('#' * 100)
    print('WINNER MODEL:', model_name)

    predictions = run['predictions']

    # Keep test rows that actually have a prediction. The index is reset AFTER
    # dropping rows so it is contiguous (0..n-1) and lines up with the
    # positional numpy arrays passed alongside the frame. Boolean filtering
    # already returns a new frame, so the stored predictions are not modified.
    test_df = (
        predictions[predictions['split_role'] == 'test']
        .dropna(subset=['pred_label'])
        .reset_index(drop=True)
    )

    # Nothing to evaluate or plot for this model; move on instead of failing
    # inside the plotting helper.
    if test_df.empty:
        print('No test rows with predictions for this model; skipping visuals.')
        continue

    # Ground truth is shared by both evaluations below.
    y_true = test_df['target'].to_numpy(dtype=np.int32)

    # Evaluate baseline (majority) vs model on the SAME test period.
    # The baseline has no probability output, hence y_prob_up=None.
    _ = evaluate_with_visuals(
        df_plot=test_df,
        y_true=y_true,
        y_pred=test_df['baseline_majority'].to_numpy(dtype=np.int32),
        y_prob_up=None,
        title=f"{model_name.upper()} - BEFORE training (majority baseline)",
        vis_cfg=VIS_CFG,
        show_visuals=True,
    )

    # Same rows, now with the model's predicted labels and probabilities.
    _ = evaluate_with_visuals(
        df_plot=test_df,
        y_true=y_true,
        y_pred=test_df['pred_label'].to_numpy(dtype=np.int32),
        y_prob_up=test_df['pred_proba_up'].to_numpy(dtype=float),
        title=f"{model_name.upper()} - AFTER training (Moon-only)",
        vis_cfg=VIS_CFG,
        show_visuals=True,
    )

In [None]:
# =================================================
# Final report: one row per model + reading guide
# =================================================
# Shows the best-by-validation row of every model with its test metrics, then
# prints a short plain-text guide for reading the table.

report_cols = (
    ['model', 'gauss_window', 'gauss_std']
    + ['val_recall_min', 'val_recall_gap', 'val_mcc']
    + ['test_recall_min', 'test_recall_gap', 'test_mcc', 'test_acc']
    + ['accuracy_ci95_low', 'accuracy_ci95_high', 'p_value_vs_random']
)

display(best_by_val[report_cols])

# Guide text, printed one entry per line after a blank separator line.
interpretation_guide = (
    'Interpretation guide (very simple):',
    '- If test_acc CI includes 0.50 and p-value is not small (e.g. > 0.05), it looks like random.',
    '- If test_recall_min is below 0.50, the weaker class is basically not predictable.',
    '- If MCC is near 0, it is basically random.',
)

print()
print(*interpretation_guide, sep='\n')