# 🔬 Body Ablation Research

Grid search over:
- **Gauss params**: window × std
- **Coord modes**: geo, helio, both
- **Body exclusions**: top performers from single-body study

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from datetime import datetime
from itertools import product

# RESEARCH imports
from RESEARCH.data_loader import load_market_data
from RESEARCH.labeling import create_balanced_labels
from RESEARCH.astro_engine import (
    init_ephemeris,
    calculate_bodies_for_dates_multi,
    calculate_aspects_for_dates,
    calculate_phases_for_dates,
)
from RESEARCH.features import build_full_features, merge_features_with_labels
from RESEARCH.model_training import (
    split_dataset,
    prepare_xy,
    train_xgb_model,
    tune_threshold,
    predict_with_threshold,
    check_cuda_available,
)
from RESEARCH.evaluation import evaluate_model_full, compare_models

## Configuration

In [None]:
# Model params (fixed)
# These are held constant across the whole grid; only the GRID_PARAMS axes
# and the excluded bodies vary between runs.
MODEL_PARAMS = {
    'n_estimators': 500,
    'max_depth': 6,
    'learning_rate': 0.03,
    'colsample_bytree': 0.6,
    'subsample': 0.8,
}

# GRID SEARCH PARAMETERS
# Each list is one axis of the grid; the search space is the Cartesian
# product of all four axes combined with ABLATION_BODIES.
GRID_PARAMS = {
    'gauss_windows': [150, 200, 250],
    'gauss_stds': [50.0, 70.0, 90.0],
    'coord_modes': ['geo', 'helio', 'both'],  # Added helio!
    'orb_mults': [0.1],
}

# Best bodies to exclude (from single-body ablation study)
# Top 5: MeanNode, Pluto, Saturn, Venus, Neptune
# Each entry is one exclusion set; the empty list is the no-exclusion baseline.
ABLATION_BODIES = [
    [],  # Baseline
    ['MeanNode'],
    ['Pluto'],
    ['Saturn'],
    ['Venus'],
    ['Neptune'],
    ['MeanNode', 'Pluto'],
    ['MeanNode', 'Saturn'],
    ['MeanNode', 'Venus'],
    ['Pluto', 'Saturn'],
    ['Pluto', 'Venus'],
]

# Calculate total
# Every grid axis is counted, including orb_mults, so this number stays equal
# to the size of the full parameter product even when more orb multipliers
# are added to GRID_PARAMS.
n_combos = (
    len(GRID_PARAMS['coord_modes'])
    * len(GRID_PARAMS['gauss_windows'])
    * len(GRID_PARAMS['gauss_stds'])
    * len(GRID_PARAMS['orb_mults'])
    * len(ABLATION_BODIES)
)
print(f'Total combinations: {n_combos}')

## Helper Functions

In [None]:
def train_and_evaluate(
    df_market, df_bodies, geo_by_date, settings,
    gauss_window, gauss_std, orb_mult,
    exclude_bodies=None, device='cpu', verbose=False,
):
    """Run one full labels -> features -> train -> threshold -> test pass.

    Args:
        df_market: Market data handed to ``create_balanced_labels``.
        df_bodies: Body table handed to ``build_full_features``.
        geo_by_date: Input for both the aspect and the phase calculation.
        settings: Settings object forwarded to the aspect calculation.
        gauss_window: ``gauss_window`` argument for label creation.
        gauss_std: ``gauss_std`` argument for label creation.
        orb_mult: ``orb_mult`` argument for the aspect calculation.
        exclude_bodies: Forwarded to ``build_full_features`` unchanged.
        device: Forwarded to ``train_xgb_model``.
        verbose: Forwarded to ``tune_threshold``.

    Returns:
        ``None`` when the merged dataset has fewer than 100 rows. Otherwise a
        dict holding the trained model, the tuned threshold, the feature
        count, test-split metrics (``recall_min``, ``recall_gap``,
        ``recall_down``, ``recall_up``, ``balanced_accuracy``, ``mcc``,
        ``f1_macro``) and the raw ``y_test`` / ``y_pred`` / ``test_dates``.
    """
    # Labels first, then the astro-derived inputs, then the feature matrix.
    labels = create_balanced_labels(
        df_market, gauss_window=gauss_window, gauss_std=gauss_std,
    )
    aspects = calculate_aspects_for_dates(
        geo_by_date, settings, orb_mult=orb_mult, progress=False,
    )
    phases = calculate_phases_for_dates(geo_by_date, progress=False)
    features = build_full_features(
        df_bodies, aspects, df_phases=phases, exclude_bodies=exclude_bodies,
    )

    dataset = merge_features_with_labels(features, labels)
    if len(dataset) < 100:
        # Too few rows to train on; callers must handle the None result.
        return None

    # Train / validation / test partitions; every column other than
    # 'date' and 'target' is used as a feature.
    train_df, val_df, test_df = split_dataset(dataset)
    non_feature_cols = ('date', 'target')
    feature_cols = [col for col in dataset.columns if col not in non_feature_cols]
    (X_train, y_train), (X_val, y_val), (X_test, y_test) = (
        prepare_xy(part, feature_cols) for part in (train_df, val_df, test_df)
    )

    model = train_xgb_model(
        X_train, y_train, X_val, y_val, feature_cols,
        n_classes=2, device=device, **MODEL_PARAMS,
    )

    # The threshold is tuned on the validation split, then applied to test.
    threshold, _ = tune_threshold(
        model, X_val, y_val, metric='recall_min', verbose=verbose,
    )
    y_pred = predict_with_threshold(model, X_test, threshold=threshold)

    from sklearn.metrics import classification_report, balanced_accuracy_score, matthews_corrcoef

    # Class 0 is reported as 'DOWN', class 1 as 'UP'.
    report = classification_report(
        y_test, y_pred,
        labels=[0, 1], target_names=['DOWN', 'UP'],
        output_dict=True, zero_division=0,
    )
    recall_down = report['DOWN']['recall']
    recall_up = report['UP']['recall']
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    return {
        'model': model,
        'threshold': threshold,
        'n_features': len(feature_cols),
        'recall_min': min(recall_down, recall_up),
        'recall_gap': abs(recall_down - recall_up),
        'recall_down': recall_down,
        'recall_up': recall_up,
        'balanced_accuracy': balanced_acc,
        'mcc': mcc,
        'f1_macro': report['macro avg']['f1-score'],
        'y_test': y_test,
        'y_pred': y_pred,
        'test_dates': test_df['date'].reset_index(drop=True),
    }

## 1. Load Data & Initialize

In [None]:
# Check device
# Only the second element of the returned pair is kept.
_, device = check_cuda_available()
print(f'Device: {device}')

# Load market data, keeping only rows dated 2017-11-01 or later
df_market = load_market_data()
on_or_after_start = df_market['date'] >= '2017-11-01'
df_market = df_market.loc[on_or_after_start].reset_index(drop=True)
print(f'Market data: {len(df_market)} rows')

# Initialize ephemeris and list the names of the configured bodies
settings = init_ephemeris()
print(f'Bodies: {[b.name for b in settings.bodies]}')

## 2. Pre-calculate Body Positions

In [None]:
# Pre-calculate bodies for each coord mode
# Maps coord mode -> (df_bodies, geo_by_date, helio_by_date) so later cells
# can look positions up instead of recomputing them.
cached_bodies = {}

for coord_mode in GRID_PARAMS['coord_modes']:
    print(f'\nüìç Calculating bodies for {coord_mode}...')
    positions = calculate_bodies_for_dates_multi(
        df_market['date'],
        settings,
        coord_mode=coord_mode,
        progress=True,
    )
    # The calculator returns three values; store them as one tuple per mode.
    df_bodies, geo_by_date, helio_by_date = positions
    cached_bodies[coord_mode] = (df_bodies, geo_by_date, helio_by_date)
    print(f'  ‚Üí {len(df_bodies)} records, {len(df_bodies.columns)} columns')

print('\n‚úÖ All body positions cached!')

## 3. Baseline Evaluation

In [None]:
# Baseline: coord=both, gauss=(200, 70), no exclusion
# The helio positions stored in the cache are not needed here.
df_bodies, geo_by_date, _ = cached_bodies['both']

baseline = train_and_evaluate(
    df_market, df_bodies, geo_by_date, settings,
    gauss_window=200, gauss_std=70.0, orb_mult=0.1,
    exclude_bodies=None, device=device, verbose=True
)

# train_and_evaluate returns None when the merged dataset has fewer than
# 100 rows. Fail with an explicit message instead of the opaque
# "'NoneType' object is not subscriptable" the prints below would raise.
if baseline is None:
    raise RuntimeError(
        'Baseline run returned no result: the merged feature/label dataset '
        'has fewer than 100 rows.'
    )

print('\n' + '='*60)
print('üìä BASELINE RESULTS')
print('='*60)
print(f"R_MIN:   {baseline['recall_min']:.4f}")
print(f"BAL_ACC: {baseline['balanced_accuracy']:.4f}")
print(f"MCC:     {baseline['mcc']:.4f}")

# Show full evaluation
evaluate_model_full(
    baseline['y_test'], baseline['y_pred'],
    dates=baseline['test_dates'],
    title='BASELINE (both, W=200, S=70, no exclusion)'
)

## 4. Full Grid Search

In [None]:
# Generate all combinations
# One tuple per run: (coord mode, gauss window, gauss std, orb mult, excluded bodies).
combos = list(product(
    GRID_PARAMS['coord_modes'],
    GRID_PARAMS['gauss_windows'],
    GRID_PARAMS['gauss_stds'],
    GRID_PARAMS['orb_mults'],
    ABLATION_BODIES,
))

print(f'Total combinations: {len(combos)}')

# Run grid search
results = []

for i, combo in enumerate(combos):
    coord, gw, gs, orb, excl = combo

    # Human-readable label for the exclusion set ('none' for the empty set).
    if excl:
        excl_str = ','.join(excl)
    else:
        excl_str = 'none'

    # Get cached bodies (helio positions are not used by the evaluation)
    df_bodies, geo_by_date, _ = cached_bodies[coord]

    # Train and evaluate; an empty exclusion list is passed as None
    result = train_and_evaluate(
        df_market, df_bodies, geo_by_date, settings,
        gauss_window=gw, gauss_std=gs, orb_mult=orb,
        exclude_bodies=excl or None,
        device=device, verbose=False
    )

    # A None result means the run was skipped; it contributes no row.
    if result is None:
        continue

    # Grid coordinates first, then the scalar metrics copied from the result.
    row = {
        'coord_mode': coord,
        'gauss_window': gw,
        'gauss_std': gs,
        'orb_mult': orb,
        'exclude_bodies': excl_str,
    }
    row.update(
        (key, result[key])
        for key in (
            'n_features',
            'recall_min',
            'recall_gap',
            'balanced_accuracy',
            'mcc',
            'f1_macro',
            'threshold',
        )
    )
    results.append(row)

    # Print progress every 10 combinations and on the final one
    is_tenth = (i + 1) % 10 == 0
    is_last = i == len(combos) - 1
    if is_tenth or is_last:
        print(f'[{i+1:3d}/{len(combos)}] {coord:5s} W={gw} S={gs:.0f} excl={excl_str:20s} ‚Üí R_MIN={result["recall_min"]:.3f} MCC={result["mcc"]:.3f}')

results_df = pd.DataFrame(results)
print(f'\n‚úÖ Grid search complete! {len(results_df)} results')

## 5. Results Analysis

In [None]:
# Sort by R_MIN (highest first); later cells read the top row of this frame
results_df = results_df.sort_values(by='recall_min', ascending=False)

rule = '=' * 80
print(rule)
print('üìä TOP 20 RESULTS (by R_MIN)')
print(rule)
display(results_df.head(20))

In [None]:
# Best by coord mode
print('\nüìä BEST BY COORD MODE:')
for coord in GRID_PARAMS['coord_modes']:
    rows_for_coord = results_df[results_df['coord_mode'] == coord]

    # A coord mode can have no rows if all of its runs were skipped;
    # report that instead of failing on .iloc[0].
    if rows_for_coord.empty:
        print(f"  {coord:5s}: no results")
        continue

    # Sort here rather than relying on results_df having been sorted by an
    # earlier cell. The stable sort keeps the existing order among ties.
    best_for_coord = rows_for_coord.sort_values(
        'recall_min', ascending=False, kind='stable'
    ).iloc[0]
    print(f"  {coord:5s}: R_MIN={best_for_coord['recall_min']:.4f} MCC={best_for_coord['mcc']:.4f} excl={best_for_coord['exclude_bodies']}")

In [None]:
# Best by body exclusion
# Each metric's maximum is taken independently per exclusion set, so the
# three values in a row may come from different grid runs.
print('\nüìä BEST BY BODY EXCLUSION:')
metric_columns = ['recall_min', 'mcc', 'balanced_accuracy']
per_exclusion = results_df.groupby('exclude_bodies')[metric_columns]
best_by_excl = per_exclusion.agg('max').sort_values('recall_min', ascending=False)
display(best_by_excl)

## 6. Best Model Evaluation

In [None]:
# Get best configuration
# Takes the first row of results_df. That row is the highest-recall_min run
# only because results_df was sorted descending by recall_min in the
# results-analysis cell; re-run that cell first if results_df was rebuilt.
best = results_df.iloc[0]

# Grid coordinates of the winning run, followed by its headline metrics.
print('='*60)
print('üèÜ BEST CONFIGURATION')
print('='*60)
print(f"Coord mode:     {best['coord_mode']}")
print(f"Gauss window:   {best['gauss_window']}")
print(f"Gauss std:      {best['gauss_std']}")
print(f"Orb mult:       {best['orb_mult']}")
print(f"Exclude bodies: {best['exclude_bodies']}")
print(f"Features:       {best['n_features']}")
print('-'*40)
print(f"R_MIN:          {best['recall_min']:.4f}")
print(f"BAL_ACC:        {best['balanced_accuracy']:.4f}")
print(f"MCC:            {best['mcc']:.4f}")

In [None]:
# Re-train best model with verbose output
# The grid search keeps only scalar metrics, so the winning configuration is
# trained again here to obtain its predictions and test dates.
coord_mode = best['coord_mode']
df_bodies, geo_by_date, _ = cached_bodies[coord_mode]

# 'exclude_bodies' holds a comma-joined string, with 'none' meaning no exclusion.
if best['exclude_bodies'] == 'none':
    excl = None
else:
    excl = best['exclude_bodies'].split(',')

# Cast the values read back from the results row to plain int/float.
best_params = {
    'gauss_window': int(best['gauss_window']),
    'gauss_std': float(best['gauss_std']),
    'orb_mult': float(best['orb_mult']),
}

best_result = train_and_evaluate(
    df_market, df_bodies, geo_by_date, settings,
    exclude_bodies=excl,
    device=device,
    verbose=True,
    **best_params,
)

# Full evaluation with plots
evaluate_model_full(
    best_result['y_test'],
    best_result['y_pred'],
    dates=best_result['test_dates'],
    title=f"BEST: {best['coord_mode']} W={best['gauss_window']} S={best['gauss_std']} excl={best['exclude_bodies']}",
)

## 7. Save Results

In [None]:
# Save to CSV
from pathlib import Path

# A per-run timestamp keeps earlier result files from being overwritten.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
results_path = f'RESEARCH/reports/grid_search_{timestamp}.csv'

# to_csv does not create missing directories, so make sure the report
# folder exists before writing; this is a no-op when it is already there.
Path(results_path).parent.mkdir(parents=True, exist_ok=True)

results_df.to_csv(results_path, index=False)
print(f'üíæ Results saved to: {results_path}')