# Classic Split Notebook: Transit-Natal Core + Optional Progression/Direction Blocks

This notebook uses the classic, time-ordered train/validation/test split protocol only.

Core feature block (enabled by default):
- transit aspects to natal,
- moon phases and elongations.

Optional prebuilt blocks (intended to be off by default; note that the configuration cell below currently enables both via `FEATURE_BLOCKS`):
- progressed-to-natal aspects,
- directed-to-natal aspects.

Houses are intentionally excluded.


In [17]:
from __future__ import annotations

from pathlib import Path
import json
import sys
from datetime import datetime
from itertools import product

import numpy as np
import pandas as pd

PROJECT_ROOT = Path('/home/rut/ostrofun')
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_sample_weight

from RESEARCH.cache_utils import load_cache, save_cache
from RESEARCH.config import cfg as project_cfg
from RESEARCH.astro_engine import (
    init_ephemeris,
    parse_birth_dt_utc,
    calculate_bodies_for_dates,
    calculate_phases_for_dates,
    calculate_transits_for_dates,
    calculate_aspects_for_dates,
    get_natal_bodies,
)
from RESEARCH.astro.aspects import scale_aspects
from src.astro.engine.aspects import calculate_transit_aspects
from src.astro.engine.calculator import calculate_bodies
from src.astro.engine.models import BodyPosition
from src.features.builder import build_transit_aspect_features, build_aspect_pair_features

from RESEARCH.model_training import train_xgb_model, check_cuda_available
from RESEARCH.features import merge_features_with_labels

from RESEARCH2.Moon_cycles.moon_data import (
    MoonLabelConfig,
    load_market_slice,
    build_balanced_labels_for_gauss,
)
from RESEARCH2.Moon_cycles.splits import make_classic_split
from RESEARCH2.Moon_cycles.threshold_utils import tune_threshold_with_balance, predict_proba_up_safe, evaluate_threshold_grid
from RESEARCH2.Moon_cycles.eval_utils import compute_binary_metrics, compute_statistical_significance

pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 180)


In [None]:
# ------------------------------
# Configuration
# ------------------------------

# Data range. Both values are forwarded unchanged to `load_market_slice`.
# NOTE(review): END_DATE=None presumably means "no upper bound" -- confirm in load_market_slice.
START_DATE = '2017-11-01'
END_DATE = None

# Split protocol: classic only (any other value is rejected in the dataset cell).
# The test share is the remainder: 1.0 - TRAIN_RATIO - VAL_RATIO.
SPLIT_MODE = 'classic'
TRAIN_RATIO = 0.50
VAL_RATIO = 0.20

# Feature/caching settings.
# ORB_MULT is passed to every aspect computation and is part of each block's cache key.
ORB_MULT = 0.1
CACHE_NAMESPACE = 'research2_transit_blocks'
USE_CACHE = True
VERBOSE = True
# XGB_DEVICE is handed to `train_xgb_model`; XGB_USE_CUDA is only reported in the summary print.
XGB_USE_CUDA, XGB_DEVICE = check_cuda_available()
# PROGRESS toggles progress output of the feature block builders.
PROGRESS = True

# Subject birth datetime for natal chart.
# Taken from the project config; the literal is only a fallback when the key is missing.
BIRTH_DT_UTC = str(project_cfg.subject.get('birth_dt_utc', '2009-10-10T18:15:05Z'))

# Birthdate sweep list from your phase-1 transits-only ranking.
# Date-only entries receive the time-of-day of BIRTH_DT_UTC when the sweep is enabled.
ENABLE_BIRTHDATE_SWEEP =  False
BIRTHDATE_CANDIDATES = [
     '2009-10-10', '2009-10-12', '2009-10-13', '2009-01-03',
    # '2009-03-27', '2009-03-21', '2009-08-22', '2009-11-05', '2009-01-18','2009-12-03',
    # '2009-01-01', '2009-01-06', '2009-04-29', '2009-06-14', '2009-12-25',
    # '2009-04-22', '2009-04-04', '2009-02-17', '2009-07-11', '2009-05-04',
]


def _compose_birth_dt_utc(date_yyyy_mm_dd: str, base_birth_dt_utc: str) -> str:
    """Attach baseline UTC time to a date-only candidate."""
    if 'T' in date_yyyy_mm_dd:
        return date_yyyy_mm_dd
    time_part = str(base_birth_dt_utc).split('T', 1)[1]
    return f"{date_yyyy_mm_dd}T{time_part}"


# Birth datetimes to evaluate: the composed candidates when the sweep is on,
# otherwise only the configured subject datetime.
if ENABLE_BIRTHDATE_SWEEP:
    BIRTH_DT_SWEEP_LIST = [_compose_birth_dt_utc(d, BIRTH_DT_UTC) for d in BIRTHDATE_CANDIDATES]
else:
    BIRTH_DT_SWEEP_LIST = [BIRTH_DT_UTC]

# Label config. LABEL_CFG is passed to every label build in this notebook;
# GAUSS_WINDOW / GAUSS_STD are the Gaussian settings of the default dataset.
LABEL_CFG = MoonLabelConfig(
    horizon=1,
    move_share=0.5,
    label_mode='balanced_detrended',
    price_mode='raw',
)
GAUSS_WINDOW = 201
GAUSS_STD = 70.0

# Threshold tuning objective penalties (forwarded to `tune_threshold_with_balance`).
THRESHOLD_GAP_PENALTY = 0.25
THRESHOLD_PRIOR_PENALTY = 0.05
# Candidate decision thresholds: 197 evenly spaced points in [0.01, 0.99] (step 0.005).
THRESHOLD_CANDIDATE_GRID = np.linspace(0.01, 0.99, 197)
THRESHOLD_CURVE_TOP_K = 10

# Optional sensitivity sweep for threshold penalties (no model retraining).
ENABLE_THRESHOLD_SENSITIVITY = True
THRESHOLD_GAP_PENALTY_GRID = [0.00, 0.10, 0.25, 0.40, 0.80]
THRESHOLD_PRIOR_PENALTY_GRID = [0.00, 0.05, 0.10, 0.20, 0.40]

# Feature blocks. Keys must match the builder names used in `build_feature_matrix`.
# NOTE: all three blocks, including the two "optional" ones, are currently enabled.
FEATURE_BLOCKS = {
    'classic_transit_phase': True,   # transit->natal + phases/elongations
    'progressed_to_natal': True,     # optional block
    'directed_to_natal': True,       # optional block
}

# Optional extension for the classic block: transit-to-transit aspects.
# This flag is part of the classic block's cache key.
CLASSIC_INCLUDE_TRANSIT_PAIR_ASPECTS = True

# If True, disabled blocks are still built once and cached.
PRECOMPUTE_DISABLED_BLOCKS = True

# Models used for classification comparison.
# MODEL_SET = ('xgb', 'rf')
MODEL_SET = ('xgb',)

# ---------------------------------------------------------------------
# Sweep options: birth dates + Gaussian labels + key model hyperparameters.
# Model selection is done by VALIDATION only.
# TEST is evaluated only for best validation configs.
# ---------------------------------------------------------------------
ENABLE_GAUSS_HYPER_SWEEP = True

GAUSS_WINDOWS = [151, 201]
GAUSS_STDS = [30.0]

# Dict-of-lists grids; keys correspond to the keyword arguments that
# `train_predict_xgb` forwards to `train_xgb_model`.
XGB_PARAM_GRID = {
    'n_estimators': [30, 50, 75,100,150],
    'max_depth': [3, 4,6],
    'learning_rate': [0.01],
    'colsample_bytree': [0.7,0.8,0.9],
    'subsample': [0.7,0.8,0.9],
    'early_stopping_rounds': [75],
    'weight_power': [1.0],
    'sideways_penalty': [1.0],
}

# Keys correspond to the RandomForestClassifier arguments set in `train_predict_rf`.
RF_PARAM_GRID = {
    'n_estimators': [600, 900],
    'max_depth': [5, 7],
    'min_samples_leaf': [4, 8],
}

# Optional cap for fast dry runs. None = full grid.
MAX_SWEEP_RUNS = None

# Echo the effective configuration so the cell output documents the run.
print('Split mode:', SPLIT_MODE)
print('Ratios train/val/test:', TRAIN_RATIO, VAL_RATIO, 1.0 - TRAIN_RATIO - VAL_RATIO)
print('Birth datetime:', BIRTH_DT_UTC)
print('Birthdate sweep enabled:', ENABLE_BIRTHDATE_SWEEP, 'num_dates=', len(BIRTH_DT_SWEEP_LIST))
print('Feature blocks:', FEATURE_BLOCKS)
print('Use transit pair aspects:', CLASSIC_INCLUDE_TRANSIT_PAIR_ASPECTS)
print('XGB device:', XGB_DEVICE, '(CUDA available:', XGB_USE_CUDA, ')')
print('Sweep enabled:', ENABLE_GAUSS_HYPER_SWEEP)
print('Threshold sensitivity:', ENABLE_THRESHOLD_SENSITIVITY)
print('Threshold grid points:', len(THRESHOLD_CANDIDATE_GRID), 'range=', (float(np.min(THRESHOLD_CANDIDATE_GRID)), float(np.max(THRESHOLD_CANDIDATE_GRID))))
print('Threshold penalties grid sizes:', len(THRESHOLD_GAP_PENALTY_GRID), 'x', len(THRESHOLD_PRIOR_PENALTY_GRID))


Split mode: classic
Ratios train/val/test: 0.5 0.2 0.3
Birth datetime: 2009-01-03T18:15:05Z
Birthdate sweep enabled: False num_dates= 1
Feature blocks: {'classic_transit_phase': True, 'progressed_to_natal': True, 'directed_to_natal': True}
Use transit pair aspects: True
XGB device: cuda (CUDA available: True )
Sweep enabled: True
Threshold sensitivity: False


In [90]:
# ------------------------------
# Feature block builders
# ------------------------------

def _market_range_key(df_market: pd.DataFrame) -> dict:
    return {
        'start_date': pd.to_datetime(df_market['date']).min().strftime('%Y-%m-%d'),
        'end_date': pd.to_datetime(df_market['date']).max().strftime('%Y-%m-%d'),
        'rows': int(len(df_market)),
    }


def _cache_params(
    df_market: pd.DataFrame,
    block_name: str,
    birth_dt_utc: str,
    extra: dict | None = None,
) -> dict:
    """Assemble the parameter dict that identifies one cached feature block.

    The dict combines the block name, the market date range / row count, the
    orb multiplier, the birth datetime and a schema tag. Entries of ``extra``
    are applied last and therefore override any of the standard keys.
    """
    params = {'kind': block_name}
    params.update(_market_range_key(df_market))
    params['orb_mult'] = float(ORB_MULT)
    params['birth_dt_utc'] = str(birth_dt_utc)
    params['schema'] = 'v1_no_houses'
    if extra:
        params.update(extra)
    return params


def _normalize_phase_cols(df_phases: pd.DataFrame) -> pd.DataFrame:
    out = df_phases.copy()
    out['date'] = pd.to_datetime(out['date'])

    if 'moon_phase_angle' in out.columns:
        rad = np.deg2rad(pd.to_numeric(out['moon_phase_angle'], errors='coerce').astype(float))
        out['moon_phase_angle_trig_sin'] = np.sin(rad)
        out['moon_phase_angle_trig_cos'] = np.cos(rad)

    elong_cols = [c for c in out.columns if c.endswith('_elongation')]
    for col in elong_cols:
        rad = np.deg2rad(pd.to_numeric(out[col], errors='coerce').astype(float))
        out[f'{col}_trig_sin'] = np.sin(rad)
        out[f'{col}_trig_cos'] = np.cos(rad)

    return out


def _features_from_transit_df(df_market: pd.DataFrame, df_transits: pd.DataFrame) -> pd.DataFrame:
    base = df_market[['date']].copy()
    base['date'] = pd.to_datetime(base['date'])

    if df_transits.empty:
        return base

    df_work = df_transits.copy()
    df_work['date'] = pd.to_datetime(df_work['date'])
    feat = build_transit_aspect_features(df_work)
    feat['date'] = pd.to_datetime(feat['date'])

    out = base.merge(feat, on='date', how='left')
    feature_cols = [c for c in out.columns if c != 'date']
    out[feature_cols] = out[feature_cols].fillna(0)
    return out


def _features_from_pair_aspects_df(df_market: pd.DataFrame, df_pair_aspects: pd.DataFrame) -> pd.DataFrame:
    base = df_market[['date']].copy()
    base['date'] = pd.to_datetime(base['date'])

    if df_pair_aspects.empty:
        return base

    df_work = df_pair_aspects.copy()
    df_work['date'] = pd.to_datetime(df_work['date'])
    feat = build_aspect_pair_features(df_work)
    feat['date'] = pd.to_datetime(feat['date'])

    rename_map = {c: f'tr_pair_{c}' for c in feat.columns if c != 'date'}
    feat = feat.rename(columns=rename_map)

    out = base.merge(feat, on='date', how='left')
    feature_cols = [c for c in out.columns if c != 'date']
    out[feature_cols] = out[feature_cols].fillna(0)
    return out


def build_classic_transit_phase_block(df_market: pd.DataFrame, birth_dt_utc: str) -> pd.DataFrame:
    """Build (or load from cache) the classic transit + phase feature block.

    The block contains transit-to-natal aspect features, optionally
    transit-to-transit pair aspect features (controlled by
    ``CLASSIC_INCLUDE_TRANSIT_PAIR_ASPECTS``) and the phase/elongation columns
    with their sin/cos encodings, all joined on ``date`` with missing values
    filled by 0.

    Args:
        df_market: Market frame; its ``date`` column defines the rows.
        birth_dt_utc: Birth datetime string used for the natal chart.

    Returns:
        One row per market date with a ``date`` column plus feature columns.
    """
    block_name = 'classic_transit_phase'
    cache_key = _cache_params(
        df_market,
        block_name,
        birth_dt_utc=birth_dt_utc,
        extra={'include_transit_pair_aspects': bool(CLASSIC_INCLUDE_TRANSIT_PAIR_ASPECTS)},
    )

    cached = load_cache(CACHE_NAMESPACE, block_name, cache_key, verbose=VERBOSE) if USE_CACHE else None
    if cached is not None:
        return cached

    # Geocentric positions for every market date plus the natal chart.
    settings = init_ephemeris()
    _, geo_by_date = calculate_bodies_for_dates(
        dates=df_market['date'],
        settings=settings,
        center='geo',
        progress=PROGRESS,
    )
    natal_bodies = get_natal_bodies(str(birth_dt_utc), settings, center='geo')

    transit_hits = calculate_transits_for_dates(
        bodies_by_date=geo_by_date,
        natal_bodies=natal_bodies,
        settings=settings,
        orb_mult=float(ORB_MULT),
        progress=PROGRESS,
    )
    if not transit_hits.empty:
        # Tag the moving body so its feature names carry a "tr_" prefix.
        transit_hits = transit_hits.assign(
            transit_body='tr_' + transit_hits['transit_body'].astype(str)
        )

    block = _features_from_transit_df(df_market, transit_hits)

    if CLASSIC_INCLUDE_TRANSIT_PAIR_ASPECTS:
        pair_hits = calculate_aspects_for_dates(
            bodies_by_date=geo_by_date,
            settings=settings,
            orb_mult=float(ORB_MULT),
            progress=PROGRESS,
            prefix='tr_',
        )
        pair_block = _features_from_pair_aspects_df(df_market, pair_hits)
        block = block.merge(pair_block, on='date', how='left')

    phases = _normalize_phase_cols(calculate_phases_for_dates(geo_by_date, progress=PROGRESS))
    phase_value_cols = [name for name in phases.columns if name != 'date']
    block = block.merge(phases[['date', *phase_value_cols]], on='date', how='left')

    value_cols = [name for name in block.columns if name != 'date']
    block[value_cols] = block[value_cols].fillna(0)

    if USE_CACHE:
        save_cache(block, CACHE_NAMESPACE, block_name, cache_key, verbose=VERBOSE)

    return block


def _secondary_progressed_dt(birth_dt: datetime, market_dt: pd.Timestamp) -> datetime:
    age_days = (market_dt.date() - birth_dt.date()).days
    age_years = float(age_days) / 365.2425
    progressed_dt = birth_dt + pd.to_timedelta(age_years, unit='D')
    if isinstance(progressed_dt, pd.Timestamp):
        return progressed_dt.to_pydatetime()
    return progressed_dt


def build_progressed_to_natal_block(df_market: pd.DataFrame, birth_dt_utc: str) -> pd.DataFrame:
    """Build (or load from cache) the progressed-to-natal aspect feature block.

    For every market date the secondary-progressed chart is computed and its
    aspects to the natal chart are collected; the moving body is labelled with
    a ``prog_`` prefix. The hits are then converted to per-date features.

    Args:
        df_market: Market frame; its ``date`` column defines the rows.
        birth_dt_utc: Birth datetime string used for the natal chart.

    Returns:
        One row per market date with a ``date`` column plus feature columns.
    """
    block_name = 'progressed_to_natal'
    cache_key = _cache_params(df_market, block_name, birth_dt_utc=birth_dt_utc)

    cached = load_cache(CACHE_NAMESPACE, block_name, cache_key, verbose=VERBOSE) if USE_CACHE else None
    if cached is not None:
        return cached

    settings = init_ephemeris()
    birth_dt = parse_birth_dt_utc(str(birth_dt_utc))
    natal_bodies = get_natal_bodies(str(birth_dt_utc), settings, center='geo')
    aspects_cfg = scale_aspects(settings.aspects, float(ORB_MULT))

    market_dates = pd.to_datetime(df_market['date']).reset_index(drop=True)
    total_days = len(market_dates)
    hit_rows = []

    for day_no, market_dt in enumerate(market_dates, start=1):
        progressed_dt = _secondary_progressed_dt(birth_dt, market_dt)
        progressed_bodies = calculate_bodies(progressed_dt, settings.bodies, center='geo')

        hit_rows.extend(
            {
                'date': market_dt.date(),
                'transit_body': f'prog_{hit.transit_body}',
                'natal_body': hit.natal_body,
                'aspect': hit.aspect,
                'orb': hit.orb,
                'is_exact': hit.is_exact,
                'is_applying': hit.is_applying,
            }
            for hit in calculate_transit_aspects(progressed_bodies, natal_bodies, aspects_cfg)
        )

        # Report on the first day, every 500th day and the last day.
        if PROGRESS and (day_no in (1, total_days) or day_no % 500 == 0):
            print(f'[progressed_to_natal] birth={str(birth_dt_utc)[:10]} {day_no}/{total_days} days processed')

    block = _features_from_transit_df(df_market, pd.DataFrame(hit_rows))

    if USE_CACHE:
        save_cache(block, CACHE_NAMESPACE, block_name, cache_key, verbose=VERBOSE)

    return block


def _solar_arc_directed_bodies(
    natal_bodies: list[BodyPosition],
    solar_arc_deg: float,
    market_date: pd.Timestamp,
) -> list[BodyPosition]:
    """Shift every natal body by the solar arc and return the directed positions.

    Each result is named ``dir_<body>``, dated with ``market_date`` and has its
    longitude advanced by ``solar_arc_deg`` (wrapped into [0, 360)). Speed is
    set to 0 and the retrograde flag to False; latitude, sign and declination
    are copied from the natal body.

    NOTE(review): ``sign`` and ``declination`` are taken unchanged from the
    natal position even though the longitude moves -- confirm that no
    downstream consumer relies on them matching the directed longitude.
    """
    return [
        BodyPosition(
            date=market_date.date(),
            body=f'dir_{natal.body}',
            lon=(float(natal.lon) + float(solar_arc_deg)) % 360.0,
            lat=float(natal.lat),
            speed=0.0,
            is_retro=False,
            sign=natal.sign,
            declination=float(natal.declination),
        )
        for natal in natal_bodies
    ]


def build_directed_to_natal_block(df_market: pd.DataFrame, birth_dt_utc: str) -> pd.DataFrame:
    """Build (or load from cache) the solar-arc directed-to-natal feature block.

    For every market date the solar arc is the longitude difference between the
    secondary-progressed Sun and the natal Sun (mod 360). All natal bodies are
    shifted by that arc and their aspects to the natal chart are collected and
    converted to per-date features.

    Args:
        df_market: Market frame; its ``date`` column defines the rows.
        birth_dt_utc: Birth datetime string used for the natal chart.

    Returns:
        One row per market date with a ``date`` column plus feature columns.

    Raises:
        ValueError: If the natal chart contains no body named ``'Sun'``.
    """
    block_name = 'directed_to_natal'
    cache_key = _cache_params(df_market, block_name, birth_dt_utc=birth_dt_utc)

    cached = load_cache(CACHE_NAMESPACE, block_name, cache_key, verbose=VERBOSE) if USE_CACHE else None
    if cached is not None:
        return cached

    settings = init_ephemeris()
    birth_dt = parse_birth_dt_utc(str(birth_dt_utc))
    natal_bodies = get_natal_bodies(str(birth_dt_utc), settings, center='geo')
    natal_by_name = {body.body: body for body in natal_bodies}
    if 'Sun' not in natal_by_name:
        raise ValueError('Sun is missing in natal bodies; solar-arc direction cannot be built.')

    aspects_cfg = scale_aspects(settings.aspects, float(ORB_MULT))

    market_dates = pd.to_datetime(df_market['date']).reset_index(drop=True)
    total_days = len(market_dates)
    hit_rows = []

    for day_no, market_dt in enumerate(market_dates, start=1):
        progressed_dt = _secondary_progressed_dt(birth_dt, market_dt)
        progressed_by_name = {
            body.body: body
            for body in calculate_bodies(progressed_dt, settings.bodies, center='geo')
        }

        # Without a progressed Sun there is no arc for this day; the day is
        # skipped entirely (including its progress message).
        if 'Sun' not in progressed_by_name:
            continue

        solar_arc = (float(progressed_by_name['Sun'].lon) - float(natal_by_name['Sun'].lon)) % 360.0
        directed_bodies = _solar_arc_directed_bodies(natal_bodies, solar_arc, market_dt)

        hit_rows.extend(
            {
                'date': market_dt.date(),
                'transit_body': hit.transit_body,
                'natal_body': hit.natal_body,
                'aspect': hit.aspect,
                'orb': hit.orb,
                'is_exact': hit.is_exact,
                'is_applying': hit.is_applying,
            }
            for hit in calculate_transit_aspects(directed_bodies, natal_bodies, aspects_cfg)
        )

        if PROGRESS and (day_no in (1, total_days) or day_no % 500 == 0):
            print(f'[directed_to_natal] birth={str(birth_dt_utc)[:10]} {day_no}/{total_days} days processed')

    block = _features_from_transit_df(df_market, pd.DataFrame(hit_rows))

    if USE_CACHE:
        save_cache(block, CACHE_NAMESPACE, block_name, cache_key, verbose=VERBOSE)

    return block


def build_feature_matrix(df_market: pd.DataFrame, birth_dt_utc: str) -> tuple[pd.DataFrame, list[str]]:
    """Assemble the feature matrix from all enabled feature blocks.

    Blocks are processed in a fixed order (classic, progressed, directed).
    An enabled block is built and joined on ``date`` with its columns prefixed
    by ``<block_name>__``. A disabled block is skipped unless
    ``PRECOMPUTE_DISABLED_BLOCKS`` is set, in which case it is still built
    (so that it gets cached) but not joined.

    Returns:
        ``(matrix, used_blocks)`` where ``matrix`` has one row per market date
        with NaNs replaced by 0 and ``used_blocks`` lists the joined blocks.
    """
    block_builders = (
        ('classic_transit_phase', build_classic_transit_phase_block),
        ('progressed_to_natal', build_progressed_to_natal_block),
        ('directed_to_natal', build_directed_to_natal_block),
    )

    matrix = df_market[['date']].copy()
    matrix['date'] = pd.to_datetime(matrix['date'])
    used_blocks = []

    for block_name, builder in block_builders:
        enabled = bool(FEATURE_BLOCKS.get(block_name, False))
        if not (enabled or PRECOMPUTE_DISABLED_BLOCKS):
            continue

        print('-' * 100)
        banner = (
            f'Building and USING block: {block_name} | birth={str(birth_dt_utc)[:10]}'
            if enabled
            else f'Precomputing (cache only) block: {block_name} | birth={str(birth_dt_utc)[:10]}'
        )
        print(banner)

        df_block = builder(df_market, birth_dt_utc=birth_dt_utc)
        if not enabled:
            # Built for the cache only; do not join it into the matrix.
            continue

        prefixed = df_block.rename(
            columns=lambda name: name if name == 'date' else f'{block_name}__{name}'
        )
        matrix = matrix.merge(prefixed, on='date', how='left')
        used_blocks.append(block_name)

    value_cols = [name for name in matrix.columns if name != 'date']
    matrix[value_cols] = matrix[value_cols].fillna(0)
    return matrix, used_blocks


In [91]:
# ------------------------------
# Dataset build (features + labels)
# ------------------------------

# Fail fast: everything below assumes the classic split.
if SPLIT_MODE != 'classic':
    raise ValueError('This notebook supports only classic split mode for now.')

# 1) Market data.
# Loaded once; shared by the feature builders and the label builder.
df_market = load_market_slice(
    start_date=START_DATE,
    end_date=END_DATE,
    use_cache=USE_CACHE,
    verbose=VERBOSE,
)

# 2) Feature matrix cache by birth datetime.
# In-memory memo for this session: birth datetime string -> feature pack dict
# as produced by `get_feature_matrix_for_birth_dt`.
FEATURE_MATRIX_CACHE: dict[str, dict] = {}


def get_feature_matrix_for_birth_dt(birth_dt_utc: str, verbose: bool = False) -> dict:
    """Return the feature pack for one birth datetime, building it at most once.

    The pack is memoised in ``FEATURE_MATRIX_CACHE`` under ``birth_dt_utc`` and
    holds the keys ``birth_dt_utc``, ``birth_date`` (first 10 characters),
    ``df_features`` and ``used_blocks``. The summary line is printed only when
    the pack is freshly built and ``verbose`` is true.
    """
    memoised = FEATURE_MATRIX_CACHE.get(birth_dt_utc)
    if memoised is not None:
        return memoised

    features, blocks = build_feature_matrix(df_market, birth_dt_utc=birth_dt_utc)
    birth_text = str(birth_dt_utc)
    pack = {
        'birth_dt_utc': birth_text,
        'birth_date': birth_text[:10],
        'df_features': features,
        'used_blocks': blocks,
    }
    FEATURE_MATRIX_CACHE[birth_dt_utc] = pack

    if verbose:
        n_feature_cols = sum(1 for name in features.columns if name != 'date')
        print(
            '[features]',
            f"birth={pack['birth_date']}",
            f"blocks={pack['used_blocks']}",
            f"rows={len(features)}",
            f"cols={n_feature_cols}",
        )

    return pack


def build_dataset_parts_for_gauss(
    gauss_window: int,
    gauss_std: float,
    birth_dt_utc: str,
    verbose: bool = False,
) -> dict:
    """Build the full dataset and its classic split for one label/birth setting.

    Features for ``birth_dt_utc`` are merged with labels built for the given
    Gaussian window/std, the ``close`` price is attached for diagnostics (it is
    excluded from the feature columns), and the rows are split into
    train/val/test using ``TRAIN_RATIO`` and ``VAL_RATIO``.

    Returns:
        Dict with the settings (``birth_dt_utc``, ``birth_date``,
        ``used_blocks``, ``gauss_window``, ``gauss_std``), the merged
        ``df_dataset``, the ``feature_cols`` list and the three split frames
        ``train_df`` / ``val_df`` / ``test_df`` (each with a fresh index).
    """
    feature_pack = get_feature_matrix_for_birth_dt(birth_dt_utc, verbose=verbose)

    labels = build_balanced_labels_for_gauss(
        df_market=df_market,
        gauss_window=int(gauss_window),
        gauss_std=float(gauss_std),
        label_cfg=LABEL_CFG,
        use_cache=USE_CACHE,
        verbose=verbose,
    )
    dataset = merge_features_with_labels(
        df_features=feature_pack['df_features'],
        df_labels=labels,
        verbose=verbose,
    )

    # Keep close for diagnostics only.
    close_prices = df_market[['date', 'close']].copy()
    close_prices['date'] = pd.to_datetime(close_prices['date'])
    dataset = pd.merge(dataset, close_prices, on='date', how='left')

    non_feature_cols = {'date', 'target', 'close'}
    feature_names = [name for name in dataset.columns if name not in non_feature_cols]

    split = make_classic_split(dataset, train_ratio=TRAIN_RATIO, val_ratio=VAL_RATIO)

    def _take(row_idx) -> pd.DataFrame:
        # Positional selection, detached from the dataset and re-indexed from 0.
        return dataset.iloc[row_idx].copy().reset_index(drop=True)

    birth_text = str(birth_dt_utc)
    return {
        'birth_dt_utc': birth_text,
        'birth_date': birth_text[:10],
        'used_blocks': list(feature_pack['used_blocks']),
        'gauss_window': int(gauss_window),
        'gauss_std': float(gauss_std),
        'df_dataset': dataset,
        'feature_cols': feature_names,
        'train_df': _take(split.train_idx),
        'val_df': _take(split.val_idx),
        'test_df': _take(split.test_idx),
    }


# Build default dataset once (used when sweep is disabled).
# Uses the configured birth datetime and the default Gaussian label settings.
default_parts = build_dataset_parts_for_gauss(
    GAUSS_WINDOW,
    GAUSS_STD,
    birth_dt_utc=BIRTH_DT_UTC,
    verbose=VERBOSE,
)
# Unpack into module-level names for use in later cells.
df_dataset = default_parts['df_dataset']
feature_cols = default_parts['feature_cols']
train_df = default_parts['train_df']
val_df = default_parts['val_df']
test_df = default_parts['test_df']

# Sanity summary: sizes of the splits and the share of rows with target == 1 in each.
print('Default birth:', {'birth_dt_utc': BIRTH_DT_UTC})
print('Default gauss:', {'window': GAUSS_WINDOW, 'std': GAUSS_STD})
print('Dataset rows:', len(df_dataset), 'Num features:', len(feature_cols))
print('Rows:', {'train': len(train_df), 'val': len(val_df), 'test': len(test_df)})
print('UP share:', {
    'train': float((train_df['target'] == 1).mean()),
    'val': float((val_df['target'] == 1).mean()),
    'test': float((test_df['target'] == 1).mean()),
})


📂 Loading from cache: research2_moon__market__2017-11-01__8953c00f.parquet
----------------------------------------------------------------------------------------------------
Building and USING block: classic_transit_phase | birth=2009-01-03
📂 Loading from cache: research2_transit_blocks__classic_transit_phase__2017-11-01_orb0.15__ad6493d7.parquet
----------------------------------------------------------------------------------------------------
Building and USING block: progressed_to_natal | birth=2009-01-03
📂 Loading from cache: research2_transit_blocks__progressed_to_natal__2017-11-01_orb0.15__03edcc86.parquet
----------------------------------------------------------------------------------------------------
Building and USING block: directed_to_natal | birth=2009-01-03
📂 Loading from cache: research2_transit_blocks__directed_to_natal__2017-11-01_orb0.15__206dc751.parquet
[features] birth=2009-01-03 blocks=['classic_transit_phase', 'progressed_to_natal', 'directed_to_

In [92]:
# ------------------------------
# Model helpers (classification only)
# ------------------------------

# Ranking of result rows by validation metrics: highest val_recall_min first,
# then smallest val_recall_gap, then highest val_mcc, then highest val_accuracy.
# VAL_SORT_ASC is the matching per-column `ascending` list for DataFrame.sort_values.
VAL_SORT_COLS = ['val_recall_min', 'val_recall_gap', 'val_mcc', 'val_accuracy']
VAL_SORT_ASC = [False, True, False, False]


def _expand_param_grid(grid: dict[str, list]) -> list[dict]:
    """Expand a simple dict-of-lists grid into a list of dicts."""
    if not grid:
        return [{}]
    keys = list(grid.keys())
    combos = product(*(grid[k] for k in keys))
    return [dict(zip(keys, vals)) for vals in combos]


def _make_pred_frame(df_part: pd.DataFrame, split_role: str, proba_up: np.ndarray) -> pd.DataFrame:
    out = df_part[['date', 'target']].copy().reset_index(drop=True)
    out['split_role'] = split_role
    out['pred_proba_up'] = np.asarray(proba_up, dtype=float)
    return out


def _xgb_params_with_defaults(model_params: dict | None = None) -> dict:
    params = {
        'early_stopping_rounds': 50,
        'n_estimators': 500,
        'max_depth': 6,
        'learning_rate': 0.03,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        'weight_power': 1.0,
        'sideways_penalty': 1.0,
    }
    if model_params:
        params.update(model_params)
    return params


def _rf_params_with_defaults(model_params: dict | None = None) -> dict:
    params = {
        'n_estimators': 800,
        'max_depth': 6,
        'min_samples_leaf': 8,
        'random_state': 42,
        'n_jobs': 1,
    }
    if model_params:
        params.update(model_params)
    return params


def train_predict_xgb(
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    test_df: pd.DataFrame,
    feature_cols_local: list[str],
    model_params: dict | None = None,
) -> pd.DataFrame:
    """Train an XGB model on the train split and score all three splits.

    The validation split is passed to ``train_xgb_model`` together with the
    training data. Hyperparameters come from ``model_params`` on top of the
    defaults of ``_xgb_params_with_defaults``.

    Returns:
        Concatenated prediction frames for train, val and test (in that
        order) with columns date, target, split_role and pred_proba_up.
    """
    def _features(frame: pd.DataFrame) -> np.ndarray:
        return frame[feature_cols_local].to_numpy(dtype=float)

    def _labels(frame: pd.DataFrame) -> np.ndarray:
        return frame['target'].to_numpy(dtype=int)

    X_train, y_train = _features(train_df), _labels(train_df)
    X_val, y_val = _features(val_df), _labels(val_df)
    X_test = _features(test_df)

    hp = _xgb_params_with_defaults(model_params)

    model = train_xgb_model(
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        feature_names=feature_cols_local,
        n_classes=2,
        device=XGB_DEVICE,
        verbose=False,
        early_stopping_rounds=int(hp['early_stopping_rounds']),
        n_estimators=int(hp['n_estimators']),
        max_depth=int(hp['max_depth']),
        learning_rate=float(hp['learning_rate']),
        colsample_bytree=float(hp['colsample_bytree']),
        subsample=float(hp['subsample']),
        weight_power=float(hp['weight_power']),
        sideways_penalty=float(hp['sideways_penalty']),
    )

    scored_parts = []
    for role, frame, matrix in (
        ('train', train_df, X_train),
        ('val', val_df, X_val),
        ('test', test_df, X_test),
    ):
        proba = predict_proba_up_safe(model=model, X=matrix)
        scored_parts.append(_make_pred_frame(frame, role, proba))

    return pd.concat(scored_parts, ignore_index=True)


def train_predict_rf(
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    test_df: pd.DataFrame,
    feature_cols_local: list[str],
    model_params: dict | None = None,
) -> pd.DataFrame:
    """Train a random forest on the train split and score all three splits.

    Training samples are weighted with ``class_weight='balanced'``. The
    validation split is only scored, never used for fitting. Hyperparameters
    come from ``model_params`` on top of ``_rf_params_with_defaults``.

    Returns:
        Concatenated prediction frames for train, val and test (in that
        order) with columns date, target, split_role and pred_proba_up, where
        the probability is column 1 of ``predict_proba``.
    """
    def _features(frame: pd.DataFrame) -> np.ndarray:
        return frame[feature_cols_local].to_numpy(dtype=float)

    X_train = _features(train_df)
    y_train = train_df['target'].to_numpy(dtype=int)
    X_val = _features(val_df)
    X_test = _features(test_df)

    hp = _rf_params_with_defaults(model_params)
    forest_kwargs = {
        name: int(hp[name])
        for name in ('n_estimators', 'max_depth', 'min_samples_leaf', 'random_state', 'n_jobs')
    }
    model = RandomForestClassifier(**forest_kwargs)

    balanced_weights = compute_sample_weight(class_weight='balanced', y=y_train)
    model.fit(X_train, y_train, sample_weight=balanced_weights)

    scored_parts = []
    for role, frame, matrix in (
        ('train', train_df, X_train),
        ('val', val_df, X_val),
        ('test', test_df, X_test),
    ):
        proba = model.predict_proba(matrix)[:, 1]
        scored_parts.append(_make_pred_frame(frame, role, proba))

    return pd.concat(scored_parts, ignore_index=True)


def predict_for_model(parts: dict, model_name: str, model_params: dict | None) -> pd.DataFrame:
    train_df_local = parts['train_df']
    val_df_local = parts['val_df']
    test_df_local = parts['test_df']
    feature_cols_local = parts['feature_cols']

    if model_name == 'xgb':
        return train_predict_xgb(
            train_df_local,
            val_df_local,
            test_df_local,
            feature_cols_local,
            model_params=model_params,
        )
    if model_name == 'rf':
        return train_predict_rf(
            train_df_local,
            val_df_local,
            test_df_local,
            feature_cols_local,
            model_params=model_params,
        )
    raise ValueError(f'Unsupported model: {model_name}')


def eval_row_from_pred(
    pred_all: pd.DataFrame,
    model_name: str,
    gap_penalty: float,
    prior_penalty: float,
    thresholds: np.ndarray | None = None,
    include_test: bool = True,
) -> dict:
    """Tune the decision threshold on validation and evaluate with it.

    The threshold is chosen by ``tune_threshold_with_balance`` on the rows with
    ``split_role == 'val'``. That same threshold is then applied to validation
    and, if ``include_test`` is true, to the test rows.

    Returns:
        Flat dict with the model name, the chosen threshold, its score and the
        two penalties, plus ``<role>_<metric>`` entries from
        ``compute_binary_metrics`` and ``<role>_p_value_vs_random`` for each
        evaluated role.
    """
    def _rows_for(role: str) -> pd.DataFrame:
        return pred_all[pred_all['split_role'] == role]

    def _targets_and_proba(frame: pd.DataFrame) -> tuple:
        return (
            frame['target'].to_numpy(dtype=int),
            frame['pred_proba_up'].to_numpy(dtype=float),
        )

    df_val = _rows_for('val')
    y_val, p_val = _targets_and_proba(df_val)

    t, score = tune_threshold_with_balance(
        y_val=y_val,
        proba_up=p_val,
        gap_penalty=float(gap_penalty),
        prior_penalty=float(prior_penalty),
        thresholds=thresholds,
    )
    cutoff = float(t)

    result = {
        'model': model_name,
        'val_threshold': cutoff,
        'val_threshold_score': float(score),
        'threshold_gap_penalty': float(gap_penalty),
        'threshold_prior_penalty': float(prior_penalty),
    }

    frames_by_role = {'val': df_val}
    if include_test:
        frames_by_role['test'] = _rows_for('test')

    for role, frame in frames_by_role.items():
        y_true, proba = _targets_and_proba(frame)
        y_hat = (proba >= cutoff).astype(int)

        metrics = compute_binary_metrics(y_true=y_true, y_pred=y_hat)
        significance = compute_statistical_significance(y_true=y_true, y_pred=y_hat, random_baseline=0.5)

        # Plain numbers are normalised to float; anything else is stored as-is.
        result.update({
            f'{role}_{name}': float(value) if isinstance(value, (float, int)) else value
            for name, value in metrics.items()
        })
        result[f'{role}_p_value_vs_random'] = float(significance['p_value_vs_random'])

    return result


def eval_classification_with_val_threshold(
    pred_all: pd.DataFrame,
    model_name: str,
    include_test: bool = True,
) -> dict:
    """Evaluate predictions using the notebook-level threshold settings.

    Thin wrapper around ``eval_row_from_pred`` that supplies
    ``THRESHOLD_GAP_PENALTY``, ``THRESHOLD_PRIOR_PENALTY`` and
    ``THRESHOLD_CANDIDATE_GRID`` from the configuration cell.

    Args:
        pred_all: Prediction frame passed through to ``eval_row_from_pred``.
        model_name: Label stored in the result row.
        include_test: Whether test metrics are computed as well.

    Returns:
        The metrics dict produced by ``eval_row_from_pred``.
    """
    return eval_row_from_pred(
        pred_all=pred_all,
        model_name=model_name,
        gap_penalty=THRESHOLD_GAP_PENALTY,
        prior_penalty=THRESHOLD_PRIOR_PENALTY,
        thresholds=THRESHOLD_CANDIDATE_GRID,
        include_test=include_test,
    )


def _attach_eval_metadata(row: dict, parts: dict, model_params: dict | None) -> dict:
    out = dict(row)
    out['birth_dt_utc'] = str(parts['birth_dt_utc'])
    out['birth_date'] = str(parts['birth_date'])
    out['gauss_window'] = int(parts['gauss_window'])
    out['gauss_std'] = float(parts['gauss_std'])

    safe_params = dict(model_params or {})
    out['model_params_json'] = json.dumps(safe_params, sort_keys=True)
    for k, v in safe_params.items():
        out[f'hp_{k}'] = v

    return out


def run_one_eval_with_pred(
    parts: dict,
    model_name: str,
    model_params: dict | None,
    include_test: bool,
) -> tuple[dict, pd.DataFrame]:
    """Train one model configuration, evaluate it and keep the raw predictions.

    Args:
        parts: Dataset-parts dict (splits, feature columns and metadata).
        model_name: Model identifier understood by ``predict_for_model``.
        model_params: Optional hyperparameter overrides.
        include_test: Whether test metrics are computed in addition to validation.

    Returns:
        ``(row, pred_all)``: the metrics row enriched with dataset and
        hyperparameter metadata, and the prediction frame for all splits.
    """
    pred_all = predict_for_model(parts=parts, model_name=model_name, model_params=model_params)
    row = eval_classification_with_val_threshold(
        pred_all=pred_all,
        model_name=model_name,
        include_test=include_test,
    )
    row = _attach_eval_metadata(row=row, parts=parts, model_params=model_params)
    return row, pred_all


def run_one_eval(
    parts: dict,
    model_name: str,
    model_params: dict | None,
    include_test: bool,
) -> dict:
    """Train and evaluate one model configuration, returning only the metrics row.

    Same as ``run_one_eval_with_pred`` but the prediction frame is discarded.
    """
    row, _ = run_one_eval_with_pred(
        parts=parts,
        model_name=model_name,
        model_params=model_params,
        include_test=include_test,
    )
    return row


In [93]:
# ------------------------------
# Run classification benchmark / sweep
# ------------------------------

def _val_rank_key(row: dict) -> tuple:
    """Sorting key for validation-based model selection (lower is better)."""
    return (
        -float(row['val_recall_min']),
        float(row['val_recall_gap']),
        -float(row['val_mcc']),
        -float(row['val_accuracy']),
    )


def _fmt_metrics(prefix: str, row: dict) -> str:
    """Compact but detailed metrics formatter."""
    return (
        f"{prefix}_acc={float(row[f'{prefix}_accuracy']):.4f} "
        f"{prefix}_bal_acc={float(row[f'{prefix}_balanced_accuracy']):.4f} "
        f"{prefix}_f1={float(row[f'{prefix}_f1_macro']):.4f} "
        f"{prefix}_mcc={float(row[f'{prefix}_mcc']):.4f} "
        f"{prefix}_prec_d={float(row[f'{prefix}_precision_down']):.4f} "
        f"{prefix}_prec_u={float(row[f'{prefix}_precision_up']):.4f} "
        f"{prefix}_rec_d={float(row[f'{prefix}_recall_down']):.4f} "
        f"{prefix}_rec_u={float(row[f'{prefix}_recall_up']):.4f} "
        f"{prefix}_rec_min={float(row[f'{prefix}_recall_min']):.4f} "
        f"{prefix}_gap={float(row[f'{prefix}_recall_gap']):.4f} "
        f"{prefix}_p={float(row[f'{prefix}_p_value_vs_random']):.6f}"
    )


def _cfg_str(row: dict) -> str:
    return (
        f"model={row['model']} birth={row.get('birth_date', 'na')} "
        f"gw={int(row['gauss_window'])} std={float(row['gauss_std']):.1f} "
        f"params={row['model_params_json']}"
    )


# Two mutually exclusive run modes, chosen by ENABLE_GAUSS_HYPER_SWEEP:
#   * off -> single run: train each model in MODEL_SET once on the
#            train_df / val_df / test_df frames defined outside this cell,
#            with model_params=None, and report val + test metrics;
#   * on  -> sweep: evaluate every (birth datetime, Gaussian window,
#            Gaussian std, model, params) combination, pick the best config
#            per model, re-run only those with include_test=True, and
#            optionally run a threshold-penalty sensitivity analysis.
# Both branches leave the final comparison table in `df_cls`.
if not ENABLE_GAUSS_HYPER_SWEEP:
    # Model name -> prediction frame returned by the train/predict helper.
    preds = {}

    if 'xgb' in MODEL_SET:
        print('Training XGB...')
        preds['xgb'] = train_predict_xgb(train_df, val_df, test_df, feature_cols, model_params=None)

    if 'rf' in MODEL_SET:
        print('Training RF...')
        preds['rf'] = train_predict_rf(train_df, val_df, test_df, feature_cols, model_params=None)

    # One metrics row per trained model.
    rows = []
    for name, df_pred in preds.items():
        rows.append(eval_classification_with_val_threshold(df_pred, model_name=name, include_test=True))

    # NOTE(review): if MODEL_SET contains neither 'xgb' nor 'rf', `rows` is empty
    # and sort_values on the missing columns will raise KeyError.
    df_cls = pd.DataFrame(rows).sort_values(VAL_SORT_COLS, ascending=VAL_SORT_ASC).reset_index(drop=True)
    print('Classification comparison (threshold tuned on validation):')
    display(df_cls[[
        'model', 'val_threshold',
        'val_accuracy', 'val_balanced_accuracy', 'val_f1_macro', 'val_mcc',
        'val_precision_down', 'val_precision_up',
        'val_recall_down', 'val_recall_up', 'val_recall_min', 'val_recall_gap', 'val_p_value_vs_random',
        'test_accuracy', 'test_balanced_accuracy', 'test_f1_macro', 'test_mcc',
        'test_precision_down', 'test_precision_up',
        'test_recall_down', 'test_recall_up', 'test_recall_min', 'test_recall_gap', 'test_p_value_vs_random',
    ]])

    # Plain-text echo of the same results. The single-run rows carry no
    # dataset metadata, so a config dict is assembled from the global settings
    # to reuse _cfg_str.
    for row in df_cls.to_dict(orient='records'):
        print('[single-run]', _cfg_str({
            'model': row['model'],
            'birth_date': str(BIRTH_DT_UTC)[:10],
            'gauss_window': GAUSS_WINDOW,
            'gauss_std': GAUSS_STD,
            'model_params_json': '{}',
        }))
        print('  ', _fmt_metrics('val', row))
        print('  ', _fmt_metrics('test', row))

else:
    # Model name -> list of parameter dicts expanded from that model's grid.
    model_param_sets = {}
    if 'xgb' in MODEL_SET:
        model_param_sets['xgb'] = _expand_param_grid(XGB_PARAM_GRID)
    if 'rf' in MODEL_SET:
        model_param_sets['rf'] = _expand_param_grid(RF_PARAM_GRID)

    # NOTE(review): this count ignores MAX_SWEEP_RUNS, so when the cap is set
    # the `done/total_runs` progress output stops short of the printed total.
    total_runs = len(BIRTH_DT_SWEEP_LIST) * len(GAUSS_WINDOWS) * len(GAUSS_STDS) * sum(len(v) for v in model_param_sets.values())
    print('Sweep total runs (validation stage):', total_runs)

    sweep_rows = []
    done = 0
    # Set once the MAX_SWEEP_RUNS cap is hit; checked at the top of every
    # enclosing loop so the whole nest unwinds.
    stop = False

    # Running best row and its rank key. Used only for the [best-now] progress
    # output below; the actual selection is done later via VAL_SORT_COLS.
    # NOTE(review): _val_rank_key and VAL_SORT_COLS / VAL_SORT_ASC define the
    # ranking separately - confirm they agree.
    best_global = None
    best_global_key = None

    for birth_dt_utc in BIRTH_DT_SWEEP_LIST:
        if stop:
            break
        for gauss_window in GAUSS_WINDOWS:
            if stop:
                break
            for gauss_std in GAUSS_STDS:
                if stop:
                    break

                # Dataset parts are built once per (birth, window, std) and
                # shared by every model / parameter set evaluated below.
                parts = build_dataset_parts_for_gauss(
                    int(gauss_window),
                    float(gauss_std),
                    birth_dt_utc=birth_dt_utc,
                    verbose=False,
                )

                for model_name, param_list in model_param_sets.items():
                    if stop:
                        break
                    for params in param_list:
                        # include_test=False: the selection stage is meant to
                        # look at validation metrics only.
                        row = run_one_eval(parts, model_name, params, include_test=False)
                        sweep_rows.append(row)
                        done += 1

                        # Smaller key == better (see _val_rank_key).
                        current_key = _val_rank_key(row)
                        if best_global_key is None or current_key < best_global_key:
                            best_global_key = current_key
                            best_global = dict(row)

                        print(f'[sweep-run] {done}/{total_runs} {_cfg_str(row)}')
                        print('  ', _fmt_metrics('val', row))
                        # Always true at this point: best_global is assigned
                        # on the first run at the latest.
                        if best_global is not None:
                            print(f"[best-now] {_cfg_str(best_global)}")
                            print('  ', _fmt_metrics('val', best_global))

                        if MAX_SWEEP_RUNS is not None and done >= int(MAX_SWEEP_RUNS):
                            stop = True
                            break

    # Reached when there is nothing to iterate over (e.g. no model selected or
    # an empty sweep list).
    if not sweep_rows:
        raise RuntimeError('Sweep produced no rows.')

    # Full validation leaderboard, best first according to VAL_SORT_COLS.
    df_sweep_val = pd.DataFrame(sweep_rows).sort_values(VAL_SORT_COLS, ascending=VAL_SORT_ASC).reset_index(drop=True)

    print('Top configs by VALIDATION (selection stage, no test used):')
    preview_cols = [
        'model', 'birth_date', 'birth_dt_utc', 'gauss_window', 'gauss_std', 'model_params_json',
        'val_threshold',
        'val_accuracy', 'val_balanced_accuracy', 'val_f1_macro', 'val_mcc',
        'val_precision_down', 'val_precision_up',
        'val_recall_down', 'val_recall_up', 'val_recall_min', 'val_recall_gap', 'val_p_value_vs_random',
    ]
    display(df_sweep_val[preview_cols].head(50))

    # Pick the top-ranked row for each model that appears in the sweep.
    best_val_rows = []
    for model_name in sorted(df_sweep_val['model'].unique()):
        df_model = df_sweep_val[df_sweep_val['model'] == model_name].copy()
        best_row = df_model.sort_values(VAL_SORT_COLS, ascending=VAL_SORT_ASC).iloc[0].to_dict()
        best_val_rows.append(best_row)

    df_best_val = pd.DataFrame(best_val_rows).sort_values(VAL_SORT_COLS, ascending=VAL_SORT_ASC).reset_index(drop=True)

    print('Selected best config per model (based on validation only):')
    display(df_best_val[preview_cols])

    # Final stage: rebuild the dataset and re-run each selected config, this
    # time with include_test=True.
    final_rows = []
    # Keeps each final config together with its prediction frame so the
    # threshold sensitivity analysis below can reuse it.
    final_pred_records = []

    for _, best in df_best_val.iterrows():
        # Recover the config from the selected row; the parameters are restored
        # from their JSON form.
        model_name = str(best['model'])
        birth_dt_utc = str(best['birth_dt_utc'])
        gauss_window = int(best['gauss_window'])
        gauss_std = float(best['gauss_std'])
        params = json.loads(str(best['model_params_json']))

        parts = build_dataset_parts_for_gauss(
            gauss_window,
            gauss_std,
            birth_dt_utc=birth_dt_utc,
            verbose=False,
        )
        final_row, pred_all = run_one_eval_with_pred(parts, model_name, params, include_test=True)
        final_rows.append(final_row)

        final_pred_records.append({
            'model': model_name,
            'birth_date': str(parts['birth_date']),
            'birth_dt_utc': str(parts['birth_dt_utc']),
            'gauss_window': int(parts['gauss_window']),
            'gauss_std': float(parts['gauss_std']),
            'model_params_json': final_row['model_params_json'],
            'pred_all': pred_all,
        })

        print(f"[final-test] {_cfg_str(final_row)}")
        print('  ', _fmt_metrics('val', final_row))
        print('  ', _fmt_metrics('test', final_row))

    # Still ordered by validation metrics; test metrics are reported, not
    # used for ranking.
    df_cls = pd.DataFrame(final_rows).sort_values(VAL_SORT_COLS, ascending=VAL_SORT_ASC).reset_index(drop=True)

    print('Final model comparison (selection on val, evaluation on test):')
    display(df_cls[[
        'model', 'birth_date', 'birth_dt_utc', 'gauss_window', 'gauss_std', 'model_params_json', 'val_threshold',
        'val_accuracy', 'val_balanced_accuracy', 'val_f1_macro', 'val_mcc',
        'val_precision_down', 'val_precision_up',
        'val_recall_down', 'val_recall_up', 'val_recall_min', 'val_recall_gap', 'val_p_value_vs_random',
        'test_accuracy', 'test_balanced_accuracy', 'test_f1_macro', 'test_mcc',
        'test_precision_down', 'test_precision_up',
        'test_recall_down', 'test_recall_up', 'test_recall_min', 'test_recall_gap', 'test_p_value_vs_random',
    ]])

    # Optional: re-evaluate the stored predictions of each final config under
    # every (gap_penalty, prior_penalty) pair. The models are not retrained.
    if ENABLE_THRESHOLD_SENSITIVITY:
        sensitivity_rows = []
        curve_rows = []

        for rec in final_pred_records:
            # Validation slice of the stored predictions, used for the
            # per-threshold curve below.
            df_val_pred = rec['pred_all'][rec['pred_all']['split_role'] == 'val']
            y_val_curve = df_val_pred['target'].to_numpy(dtype=int)
            p_val_curve = df_val_pred['pred_proba_up'].to_numpy(dtype=float)

            for gap_penalty in THRESHOLD_GAP_PENALTY_GRID:
                for prior_penalty in THRESHOLD_PRIOR_PENALTY_GRID:
                    sr = eval_row_from_pred(
                        pred_all=rec['pred_all'],
                        model_name=rec['model'],
                        gap_penalty=float(gap_penalty),
                        prior_penalty=float(prior_penalty),
                        thresholds=THRESHOLD_CANDIDATE_GRID,
                        include_test=True,
                    )
                    # Tag the row by hand with the config it belongs to.
                    sr['birth_date'] = rec['birth_date']
                    sr['birth_dt_utc'] = rec['birth_dt_utc']
                    sr['gauss_window'] = rec['gauss_window']
                    sr['gauss_std'] = rec['gauss_std']
                    sr['model_params_json'] = rec['model_params_json']
                    sensitivity_rows.append(sr)

                    df_curve = evaluate_threshold_grid(
                        y_true=y_val_curve,
                        proba_up=p_val_curve,
                        gap_penalty=float(gap_penalty),
                        prior_penalty=float(prior_penalty),
                        thresholds=THRESHOLD_CANDIDATE_GRID,
                    )
                    if not df_curve.empty:
                        # Keeps the first THRESHOLD_CURVE_TOP_K rows; presumably
                        # the grid comes back sorted best-first - TODO confirm
                        # against evaluate_threshold_grid.
                        df_curve = df_curve.head(int(THRESHOLD_CURVE_TOP_K)).copy()
                        df_curve['model'] = rec['model']
                        df_curve['birth_date'] = rec['birth_date']
                        df_curve['birth_dt_utc'] = rec['birth_dt_utc']
                        df_curve['gauss_window'] = rec['gauss_window']
                        df_curve['gauss_std'] = rec['gauss_std']
                        df_curve['model_params_json'] = rec['model_params_json']
                        df_curve['threshold_gap_penalty'] = float(gap_penalty)
                        df_curve['threshold_prior_penalty'] = float(prior_penalty)
                        curve_rows.append(df_curve)

        # Group by config first, then rank the penalty combos inside each group
        # by validation metrics (recall_min desc, recall_gap asc, mcc desc,
        # accuracy desc).
        # NOTE(review): with an empty penalty grid `sensitivity_rows` is empty
        # and this sort_values will raise KeyError.
        df_threshold_sensitivity = pd.DataFrame(sensitivity_rows).sort_values(
            ['model', 'birth_date', 'gauss_window', 'gauss_std', 'val_recall_min', 'val_recall_gap', 'val_mcc', 'val_accuracy'],
            ascending=[True, True, True, True, False, True, False, False],
        ).reset_index(drop=True)

        print('Threshold penalty sensitivity (no retrain; threshold retuned on val):')
        display(df_threshold_sensitivity[[
            'model', 'birth_date', 'birth_dt_utc', 'gauss_window', 'gauss_std', 'model_params_json',
            'threshold_gap_penalty', 'threshold_prior_penalty', 'val_threshold', 'val_threshold_score',
            'val_accuracy', 'val_mcc', 'val_recall_min', 'val_recall_gap',
            'test_accuracy', 'test_mcc', 'test_recall_min', 'test_recall_gap', 'test_p_value_vs_random',
        ]])

        # Sort globally by the validation ranking, then keep the first row per
        # config - i.e. the best penalty combo for each selected config.
        df_threshold_sensitivity_best = (
            df_threshold_sensitivity
            .sort_values(VAL_SORT_COLS, ascending=VAL_SORT_ASC)
            .drop_duplicates(subset=['model', 'birth_dt_utc', 'gauss_window', 'gauss_std', 'model_params_json'])
            .reset_index(drop=True)
        )

        print('Best threshold-penalty combo per selected final model config:')
        display(df_threshold_sensitivity_best[[
            'model', 'birth_date', 'gauss_window', 'gauss_std',
            'threshold_gap_penalty', 'threshold_prior_penalty', 'val_threshold', 'val_threshold_score',
            'val_accuracy', 'val_mcc', 'val_recall_min', 'val_recall_gap',
            'test_accuracy', 'test_mcc', 'test_recall_min', 'test_recall_gap', 'test_p_value_vs_random',
        ]])

        if curve_rows:
            # Stack the per-combo candidate tables; within each config and
            # penalty pair, order by score (descending) then threshold.
            df_threshold_curve_top = (
                pd.concat(curve_rows, ignore_index=True)
                .sort_values(
                    [
                        'model', 'birth_date', 'gauss_window', 'gauss_std',
                        'threshold_gap_penalty', 'threshold_prior_penalty',
                        'score', 'threshold',
                    ],
                    ascending=[True, True, True, True, True, True, False, True],
                )
                .reset_index(drop=True)
            )

            print('Top threshold candidates by validation objective score:')
            display(df_threshold_curve_top[[
                'model', 'birth_date', 'birth_dt_utc', 'gauss_window', 'gauss_std', 'model_params_json',
                'threshold_gap_penalty', 'threshold_prior_penalty', 'threshold', 'score',
                'recall_min', 'recall_gap', 'prior_gap', 'pred_up_share', 'true_up_share',
                'accuracy', 'balanced_accuracy', 'mcc', 'f1_macro',
            ]])


Sweep total runs (validation stage): 270
[sweep-run] 1/270 model=xgb birth=2009-01-03 gw=151 std=30.0 params={"colsample_bytree": 0.7, "early_stopping_rounds": 75, "learning_rate": 0.01, "max_depth": 3, "n_estimators": 30, "sideways_penalty": 1.0, "subsample": 0.7, "weight_power": 1.0}
   val_acc=0.4677 val_bal_acc=0.4843 val_f1=0.4156 val_mcc=-0.0412 val_prec_d=0.4648 val_prec_u=0.4811 val_rec_d=0.8077 val_rec_u=0.1609 val_rec_min=0.1609 val_gap=0.6468 val_p=0.948375
[best-now] model=xgb birth=2009-01-03 gw=151 std=30.0 params={"colsample_bytree": 0.7, "early_stopping_rounds": 75, "learning_rate": 0.01, "max_depth": 3, "n_estimators": 30, "sideways_penalty": 1.0, "subsample": 0.7, "weight_power": 1.0}
   val_acc=0.4677 val_bal_acc=0.4843 val_f1=0.4156 val_mcc=-0.0412 val_prec_d=0.4648 val_prec_u=0.4811 val_rec_d=0.8077 val_rec_u=0.1609 val_rec_min=0.1609 val_gap=0.6468 val_p=0.948375
[sweep-run] 2/270 model=xgb birth=2009-01-03 gw=151 std=30.0 params={"colsample_bytree": 0.7, "early_s

Unnamed: 0,model,birth_date,birth_dt_utc,gauss_window,gauss_std,model_params_json,val_threshold,val_accuracy,val_balanced_accuracy,val_f1_macro,val_mcc,val_precision_down,val_precision_up,val_recall_down,val_recall_up,val_recall_min,val_recall_gap,val_p_value_vs_random
0,xgb,2009-01-03,2009-01-03T18:15:05Z,201,30.0,"{""colsample_bytree"": 0.9, ""early_stopping_roun...",0.47,0.560531,0.560477,0.560139,0.120799,0.535117,0.585526,0.559441,0.561514,0.559441,0.002074,0.001667
1,xgb,2009-01-03,2009-01-03T18:15:05Z,201,30.0,"{""colsample_bytree"": 0.9, ""early_stopping_roun...",0.47,0.560531,0.560477,0.560139,0.120799,0.535117,0.585526,0.559441,0.561514,0.559441,0.002074,0.001667
2,xgb,2009-01-03,2009-01-03T18:15:05Z,151,30.0,"{""colsample_bytree"": 0.9, ""early_stopping_roun...",0.47,0.558872,0.559071,0.558599,0.117986,0.533113,0.584718,0.562937,0.555205,0.555205,0.007732,0.002162
3,xgb,2009-01-03,2009-01-03T18:15:05Z,151,30.0,"{""colsample_bytree"": 0.9, ""early_stopping_roun...",0.47,0.562189,0.562568,0.561985,0.124974,0.536184,0.588629,0.56993,0.555205,0.555205,0.014725,0.001277
4,xgb,2009-01-03,2009-01-03T18:15:05Z,151,30.0,"{""colsample_bytree"": 0.9, ""early_stopping_roun...",0.47,0.562189,0.562568,0.561985,0.124974,0.536184,0.588629,0.56993,0.555205,0.555205,0.014725,0.001277
5,xgb,2009-01-03,2009-01-03T18:15:05Z,151,30.0,"{""colsample_bytree"": 0.8, ""early_stopping_roun...",0.48,0.558872,0.559242,0.558667,0.118331,0.532895,0.585284,0.566434,0.55205,0.55205,0.014383,0.002162
6,xgb,2009-01-03,2009-01-03T18:15:05Z,201,30.0,"{""colsample_bytree"": 0.9, ""early_stopping_roun...",0.47,0.552239,0.551394,0.551339,0.102727,0.527586,0.57508,0.534965,0.567823,0.534965,0.032858,0.005757
7,xgb,2009-01-03,2009-01-03T18:15:05Z,151,30.0,"{""colsample_bytree"": 0.8, ""early_stopping_roun...",0.48,0.575456,0.57775,0.575361,0.15581,0.546012,0.610108,0.622378,0.533123,0.533123,0.089255,0.000121
8,xgb,2009-01-03,2009-01-03T18:15:05Z,151,30.0,"{""colsample_bytree"": 0.7, ""early_stopping_roun...",0.47,0.537313,0.536686,0.536579,0.073304,0.511945,0.56129,0.524476,0.548896,0.524476,0.02442,0.036537
9,xgb,2009-01-03,2009-01-03T18:15:05Z,151,30.0,"{""colsample_bytree"": 0.7, ""early_stopping_roun...",0.47,0.537313,0.536686,0.536579,0.073304,0.511945,0.56129,0.524476,0.548896,0.524476,0.02442,0.036537


Selected best config per model (based on validation only):


Unnamed: 0,model,birth_date,birth_dt_utc,gauss_window,gauss_std,model_params_json,val_threshold,val_accuracy,val_balanced_accuracy,val_f1_macro,val_mcc,val_precision_down,val_precision_up,val_recall_down,val_recall_up,val_recall_min,val_recall_gap,val_p_value_vs_random
0,xgb,2009-01-03,2009-01-03T18:15:05Z,201,30.0,"{""colsample_bytree"": 0.9, ""early_stopping_roun...",0.47,0.560531,0.560477,0.560139,0.120799,0.535117,0.585526,0.559441,0.561514,0.559441,0.002074,0.001667


[final-test] model=xgb birth=2009-01-03 gw=201 std=30.0 params={"colsample_bytree": 0.9, "early_stopping_rounds": 75, "learning_rate": 0.01, "max_depth": 3, "n_estimators": 100, "sideways_penalty": 1.0, "subsample": 0.8, "weight_power": 1.0}
   val_acc=0.5605 val_bal_acc=0.5605 val_f1=0.5601 val_mcc=0.1208 val_prec_d=0.5351 val_prec_u=0.5855 val_rec_d=0.5594 val_rec_u=0.5615 val_rec_min=0.5594 val_gap=0.0021 val_p=0.001667
   test_acc=0.5232 test_bal_acc=0.5288 test_f1=0.5154 test_mcc=0.0601 test_prec_d=0.5594 test_prec_u=0.5034 test_rec_d=0.3817 test_rec_u=0.6759 test_rec_min=0.3817 test_gap=0.2942 test_p=0.086323
Final model comparison (selection on val, evaluation on test):


Unnamed: 0,model,birth_date,birth_dt_utc,gauss_window,gauss_std,model_params_json,val_threshold,val_accuracy,val_balanced_accuracy,val_f1_macro,val_mcc,val_precision_down,val_precision_up,val_recall_down,val_recall_up,val_recall_min,val_recall_gap,val_p_value_vs_random,test_accuracy,test_balanced_accuracy,test_f1_macro,test_mcc,test_precision_down,test_precision_up,test_recall_down,test_recall_up,test_recall_min,test_recall_gap,test_p_value_vs_random
0,xgb,2009-01-03,2009-01-03T18:15:05Z,201,30.0,"{""colsample_bytree"": 0.9, ""early_stopping_roun...",0.47,0.560531,0.560477,0.560139,0.120799,0.535117,0.585526,0.559441,0.561514,0.559441,0.002074,0.001667,0.52323,0.528763,0.515388,0.060105,0.559375,0.503425,0.381663,0.675862,0.381663,0.294199,0.086323


In [94]:
# Show the full validation sweep table. `df_sweep_val` is only assigned in the
# ENABLE_GAUSS_HYPER_SWEEP branch, so this cell needs that branch to have run.
df_sweep_val

Unnamed: 0,model,val_threshold,val_threshold_score,threshold_gap_penalty,threshold_prior_penalty,val_accuracy,val_balanced_accuracy,val_mcc,val_f1_macro,val_precision_down,val_precision_up,val_recall_down,val_recall_up,val_recall_min,val_recall_gap,val_support,val_p_value_vs_random,birth_dt_utc,birth_date,gauss_window,gauss_std,model_params_json,hp_n_estimators,hp_max_depth,hp_learning_rate,hp_colsample_bytree,hp_subsample,hp_early_stopping_rounds,hp_weight_power,hp_sideways_penalty
0,xgb,0.47,0.557844,0.25,0.05,0.560531,0.560477,0.120799,0.560139,0.535117,0.585526,0.559441,0.561514,0.559441,0.002074,603.0,0.001667,2009-01-03T18:15:05Z,2009-01-03,201,30.0,"{""colsample_bytree"": 0.9, ""early_stopping_roun...",100,3,0.01,0.9,0.8,75,1.0,1.0
1,xgb,0.47,0.557844,0.25,0.05,0.560531,0.560477,0.120799,0.560139,0.535117,0.585526,0.559441,0.561514,0.559441,0.002074,603.0,0.001667,2009-01-03T18:15:05Z,2009-01-03,201,30.0,"{""colsample_bytree"": 0.9, ""early_stopping_roun...",150,3,0.01,0.9,0.8,75,1.0,1.0
2,xgb,0.47,0.551945,0.25,0.05,0.558872,0.559071,0.117986,0.558599,0.533113,0.584718,0.562937,0.555205,0.555205,0.007732,603.0,0.002162,2009-01-03T18:15:05Z,2009-01-03,151,30.0,"{""colsample_bytree"": 0.9, ""early_stopping_roun...",75,3,0.01,0.9,0.8,75,1.0,1.0
3,xgb,0.47,0.550031,0.25,0.05,0.562189,0.562568,0.124974,0.561985,0.536184,0.588629,0.569930,0.555205,0.555205,0.014725,603.0,0.001277,2009-01-03T18:15:05Z,2009-01-03,151,30.0,"{""colsample_bytree"": 0.9, ""early_stopping_roun...",100,3,0.01,0.9,0.8,75,1.0,1.0
4,xgb,0.47,0.550031,0.25,0.05,0.562189,0.562568,0.124974,0.561985,0.536184,0.588629,0.569930,0.555205,0.555205,0.014725,603.0,0.001277,2009-01-03T18:15:05Z,2009-01-03,151,30.0,"{""colsample_bytree"": 0.9, ""early_stopping_roun...",150,3,0.01,0.9,0.8,75,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,xgb,0.05,-0.273715,0.25,0.05,0.525705,0.500000,0.000000,0.344565,0.000000,0.525705,0.000000,1.000000,0.000000,1.000000,603.0,0.110896,2009-01-03T18:15:05Z,2009-01-03,201,30.0,"{""colsample_bytree"": 0.8, ""early_stopping_roun...",100,3,0.01,0.8,0.9,75,1.0,1.0
266,xgb,0.05,-0.273715,0.25,0.05,0.525705,0.500000,0.000000,0.344565,0.000000,0.525705,0.000000,1.000000,0.000000,1.000000,603.0,0.110896,2009-01-03T18:15:05Z,2009-01-03,201,30.0,"{""colsample_bytree"": 0.7, ""early_stopping_roun...",100,4,0.01,0.7,0.9,75,1.0,1.0
267,xgb,0.05,-0.273715,0.25,0.05,0.525705,0.500000,0.000000,0.344565,0.000000,0.525705,0.000000,1.000000,0.000000,1.000000,603.0,0.110896,2009-01-03T18:15:05Z,2009-01-03,201,30.0,"{""colsample_bytree"": 0.7, ""early_stopping_roun...",150,3,0.01,0.7,0.9,75,1.0,1.0
268,xgb,0.05,-0.273715,0.25,0.05,0.525705,0.500000,0.000000,0.344565,0.000000,0.525705,0.000000,1.000000,0.000000,1.000000,603.0,0.110896,2009-01-03T18:15:05Z,2009-01-03,201,30.0,"{""colsample_bytree"": 0.8, ""early_stopping_roun...",150,3,0.01,0.8,0.9,75,1.0,1.0


## Notes

- This notebook is intentionally classification-first and classic-split only.
- Houses are not used.
- To test the optional blocks, set the following in the config:
  - `FEATURE_BLOCKS['progressed_to_natal'] = True`
  - `FEATURE_BLOCKS['directed_to_natal'] = True`
- If you only want to precompute/cache optional blocks, keep them `False` and set `PRECOMPUTE_DISABLED_BLOCKS = True`.
- To include aspects between transit planets, set `CLASSIC_INCLUDE_TRANSIT_PAIR_ASPECTS = True`.
- To run a birthdate sweep, enable `ENABLE_BIRTHDATE_SWEEP` and edit `BIRTHDATE_CANDIDATES`.
- To run the Gaussian + hyperparameter sweep, set `ENABLE_GAUSS_HYPER_SWEEP = True` and edit:
  - `GAUSS_WINDOWS`, `GAUSS_STDS`
  - `XGB_PARAM_GRID`, `RF_PARAM_GRID`
- Model selection uses the validation split only; the test split is used only to evaluate the final selected configs.
- Threshold sensitivity is controlled by `ENABLE_THRESHOLD_SENSITIVITY`, `THRESHOLD_GAP_PENALTY_GRID`, `THRESHOLD_PRIOR_PENALTY_GRID`, `THRESHOLD_CANDIDATE_GRID`, `THRESHOLD_CURVE_TOP_K`.
