# üß† Deep Model Tuning for Bitcoin Birth DATE

We are testing a simple idea: if the model is more complex, maybe it can "digest" the astrological transits to the natal chart better.

Date used as the "birth" of Bitcoin: **2009-10-10** (Economic Birth / First Rate)
Features used:
- Transits to the natal chart
- Aspects between transits
- Phases (NO houses)

Plain words:
- "Transits" are where the planets are on each day.
- "Natal" means the sky position on the chosen birth date.
- "Aspects" are angle-based relationships between bodies.
- "Phases" are cycle positions (like Moon phases, but for other bodies too).


In [9]:
# Basic Python tools
import sys
from pathlib import Path

# Data tools
import pandas as pd
import numpy as np

# Grid search helper
from itertools import product

# Progress bar so we can see the loop is working
from tqdm import tqdm

# Dates and time
from datetime import datetime, date, timezone

# Metrics to judge the model
from sklearn.metrics import classification_report, matthews_corrcoef

# Make sure the project is on Python path
PROJECT_ROOT = Path("/home/rut/ostrofun")
sys.path.insert(0, str(PROJECT_ROOT))

# Local project modules (they contain our astro + ML pipeline)
from RESEARCH.config import cfg
from RESEARCH.data_loader import load_market_data
from RESEARCH.labeling import create_balanced_labels
from RESEARCH.astro_engine import (
    init_ephemeris,
    calculate_bodies_for_dates_multi,
    calculate_aspects_for_dates,
    calculate_transits_for_dates,
    calculate_phases_for_dates,
    get_natal_bodies,
)
from RESEARCH.features import build_full_features, merge_features_with_labels
from RESEARCH.model_training import (
    split_dataset,
    prepare_xy,
    train_xgb_model,
    tune_threshold,
    predict_with_threshold,
    check_cuda_available,
)


In [10]:
# ------------------------------------------------------------
# Config
# ------------------------------------------------------------
# The "birth date" we treat as Bitcoin's natal chart date
TARGET_DATE = date(2009, 10, 10)
print(f"üß† Tuning for Birth Date: {TARGET_DATE}")

# Astro feature settings
ASTRO_CONFIG = {
    # Use both geocentric and heliocentric coordinates
    "coord_mode": "both",
    # Orb multiplier (how wide we allow an aspect to count)
    "orb_mult": 0.1,
    # Window and std for Gaussian labeling (smooth target labels)
    "gauss_window": 200,
    "gauss_std": 70.0,
    # If we want to exclude some bodies, we can put them here
    "exclude_bodies": None,
}

# ------------------------------------------------------------
# Deep Grid Search Space
# ------------------------------------------------------------
# We try many model sizes and settings to see what works best.
# Think of it like trying many oven temperatures to bake a cake
# and picking the one that tastes best.
PARAM_GRID = {
    "n_estimators": [500, 900, 1300],
    "max_depth": [6, 8, 10],  # deeper trees = more complex model
    "learning_rate": [0.05, 0.03],
    "colsample_bytree": [0.6, 0.8],
    "subsample": [0.8],
}


üß† Tuning for Birth Date: 2009-10-10


In [11]:
# ------------------------------------------------------------
# 1. Prepare Data
# ------------------------------------------------------------
print("Loading data...")

# Load market data (price history)
df_market = load_market_data()

# Use recent years only (we do not train on very early data)
df_market = df_market[df_market["date"] >= "2017-11-01"].reset_index(drop=True)

# Create balanced labels using a smooth Gaussian window
# This means the target is not just 0/1 noise, it is smoothed in time
# so the model can learn more stable patterns.
df_labels = create_balanced_labels(
    df_market,
    ASTRO_CONFIG["gauss_window"],
    ASTRO_CONFIG["gauss_std"],
)

# Initialize ephemeris (astronomy data engine)
settings = init_ephemeris()

# Check if GPU is available for faster training
_, device = check_cuda_available()

print("Calculating astro...")

# Calculate positions of all bodies for each date
# We ask for both geocentric and heliocentric modes.
df_bodies, geo_by_date, helio_by_date = calculate_bodies_for_dates_multi(
    df_market["date"],
    settings,
    coord_mode="both",
)

# Use geocentric positions as the base for aspects/transits
bodies_by_date = geo_by_date

# Calculate phases (cycle positions) for each date
# Phases are like "where in the cycle" each body is.
df_phases = calculate_phases_for_dates(bodies_by_date)

# ------------------------------------------------------------
# 2. Build Natal Features
# ------------------------------------------------------------
print(f"Building natal features for {TARGET_DATE}...")

# We use noon time to avoid timezone edge cases
natal_dt_str = f"{TARGET_DATE.isoformat()}T12:00:00"

# Get natal positions (sky map on the birth date)
natal_bodies = get_natal_bodies(natal_dt_str, settings)

# Transits: how the current sky relates to the natal sky
# These are the core "natal transit" features
df_transits = calculate_transits_for_dates(
    bodies_by_date,
    natal_bodies,
    settings,
    orb_mult=ASTRO_CONFIG["orb_mult"],
)

# Aspects between transiting bodies (baseline features)
df_aspects = calculate_aspects_for_dates(
    bodies_by_date,
    settings,
    orb_mult=ASTRO_CONFIG["orb_mult"],
)

# ------------------------------------------------------------
# 3. Full Dataset
# ------------------------------------------------------------
print("Merging dataset...")

# Combine all feature blocks into one wide table
# We include both baseline aspects and natal transits.
df_features = build_full_features(
    df_bodies,
    df_aspects,
    df_transits=df_transits,
    df_phases=df_phases,
    include_pair_aspects=True,    # baseline aspects
    include_transit_aspects=True,  # natal transits
)

# Merge features with labels to make the final training dataset
df_dataset = merge_features_with_labels(df_features, df_labels)

print(f"Dataset Shape: {df_dataset.shape}")
print(f"Columns: {len(df_dataset.columns)}")


  df = pd.read_sql_query(query, conn, params=params)


Loading data...
Loaded 5677 rows from DB for subject=btc
Date range: 2010-07-18 -> 2026-01-31
Labels created: 2814 samples
  UP: 1368 (48.6%)
  DOWN: 1446 (51.4%)
  Date range: 2017-11-01 -> 2025-07-15
Calculating astro...
üìç –†–∞—Å—á—ë—Ç –ì–ï–û–¶–ï–ù–¢–†–ò–ß–ï–°–ö–ò–• –∫–æ–æ—Ä–¥–∏–Ω–∞—Ç (–ó–µ–º–ª—è –≤ —Ü–µ–Ω—Ç—Ä–µ)...


Calculating bodies: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3014/3014 [00:00<00:00, 15803.68it/s]


‚òÄÔ∏è –†–∞—Å—á—ë—Ç –ì–ï–õ–ò–û–¶–ï–ù–¢–†–ò–ß–ï–°–ö–ò–• –∫–æ–æ—Ä–¥–∏–Ω–∞—Ç (–°–æ–ª–Ω—Ü–µ –≤ —Ü–µ–Ω—Ç—Ä–µ)...


Calculating bodies: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3014/3014 [00:00<00:00, 28115.32it/s]


‚úÖ –û–±—ä–µ–¥–∏–Ω–µ–Ω–æ: 78364 –∑–∞–ø–∏—Å–µ–π –∏–∑ 2 —Å–∏—Å—Ç–µ–º –∫–æ–æ—Ä–¥–∏–Ω–∞—Ç


Calculating phases & elongations: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3014/3014 [00:00<00:00, 222548.28it/s]


‚úÖ –†–∞—Å—Å—á–∏—Ç–∞–Ω–æ 3014 –¥–Ω–µ–π: —Ñ–∞–∑–∞ –õ—É–Ω—ã + —ç–ª–æ–Ω–≥–∞—Ü–∏–∏ –ø–ª–∞–Ω–µ—Ç
Building natal features for 2009-10-10...


Calculating transits (orb=0.1): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3014/3014 [00:00<00:00, 31142.85it/s]
Calculating aspects (orb=0.1): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3014/3014 [00:00<00:00, 61256.83it/s]


Merging dataset...
Merged dataset: 3014 samples (ALL days, forward-filled)
Features: 2040
Dataset Shape: (3014, 2042)
Columns: 2042


In [12]:
# ------------------------------------------------------------
# 4. Grid Search
# ------------------------------------------------------------
print("üöÄ Starting Deep Grid Search...")

# Split into train / validation / test so we can judge honestly
train_df, val_df, test_df = split_dataset(df_dataset)

# Features are all columns except date and target label
feature_cols = [c for c in df_dataset.columns if c not in ["date", "target"]]

# Convert dataframes into X (features) and y (labels)
X_train, y_train = prepare_xy(train_df, feature_cols)
X_val, y_val = prepare_xy(val_df, feature_cols)
X_test, y_test = prepare_xy(test_df, feature_cols)

results = []
keys = PARAM_GRID.keys()
combinations = list(product(*PARAM_GRID.values()))

# Try every combination of parameters
for vals in tqdm(combinations, desc="Grid Search"):
    params = dict(zip(keys, vals))

    # Train a model with these parameters
    model = train_xgb_model(
        X_train,
        y_train,
        X_val,
        y_val,
        feature_cols,
        n_classes=2,
        device=device,
        early_stopping_rounds=50,
        verbose=False,
        **params,
    )

    # Choose the best classification threshold on validation data
    # We use recall_min to keep both classes balanced.
    best_t, _ = tune_threshold(model, X_val, y_val, metric="recall_min")

    # Predict on test data using that threshold
    y_test_pred = predict_with_threshold(model, X_test, threshold=best_t)

    # Evaluate with two metrics:
    # R_MIN = minimum recall of the two classes (fairness)
    # MCC = Matthews correlation coefficient (balanced quality)
    report = classification_report(
        y_test,
        y_test_pred,
        output_dict=True,
        zero_division=0,
    )
    r_min = min(report["0"]["recall"], report["1"]["recall"])
    mcc = matthews_corrcoef(y_test, y_test_pred)

    # Save the result
    res_row = params.copy()
    res_row["R_MIN"] = r_min
    res_row["MCC"] = mcc
    results.append(res_row)


üöÄ Starting Deep Grid Search...
Split: Train=2109, Val=452, Test=453


Grid Search:   3%|‚ñé         | 1/36 [00:01<00:36,  1.03s/it]

üéØ Best threshold=0.50, RECALL_MIN=0.4109, gap=0.1823


Grid Search:   6%|‚ñå         | 2/36 [00:01<00:30,  1.11it/s]

üéØ Best threshold=0.50, RECALL_MIN=0.4746, gap=0.1291


Grid Search:   8%|‚ñä         | 3/36 [00:02<00:28,  1.15it/s]

üéØ Best threshold=0.50, RECALL_MIN=0.4109, gap=0.1823


Grid Search:  11%|‚ñà         | 4/36 [00:03<00:29,  1.08it/s]

üéØ Best threshold=0.48, RECALL_MIN=0.5382, gap=0.0437


Grid Search:  14%|‚ñà‚ñç        | 5/36 [00:04<00:27,  1.11it/s]

üéØ Best threshold=0.48, RECALL_MIN=0.5455, gap=0.0817


Grid Search:  17%|‚ñà‚ñã        | 6/36 [00:05<00:27,  1.11it/s]

üéØ Best threshold=0.47, RECALL_MIN=0.4218, gap=0.2222


Grid Search:  19%|‚ñà‚ñâ        | 7/36 [00:06<00:25,  1.14it/s]

üéØ Best threshold=0.49, RECALL_MIN=0.5455, gap=0.0817


Grid Search:  22%|‚ñà‚ñà‚ñè       | 8/36 [00:07<00:25,  1.09it/s]

üéØ Best threshold=0.49, RECALL_MIN=0.5706, gap=0.0876


Grid Search:  25%|‚ñà‚ñà‚ñå       | 9/36 [00:08<00:25,  1.05it/s]

üéØ Best threshold=0.48, RECALL_MIN=0.5455, gap=0.0817


Grid Search:  28%|‚ñà‚ñà‚ñä       | 10/36 [00:09<00:25,  1.02it/s]

üéØ Best threshold=0.47, RECALL_MIN=0.4218, gap=0.2222


Grid Search:  31%|‚ñà‚ñà‚ñà       | 11/36 [00:10<00:24,  1.01it/s]

üéØ Best threshold=0.49, RECALL_MIN=0.5455, gap=0.0817


Grid Search:  33%|‚ñà‚ñà‚ñà‚ñé      | 12/36 [00:11<00:24,  1.01s/it]

üéØ Best threshold=0.48, RECALL_MIN=0.4400, gap=0.2041


Grid Search:  36%|‚ñà‚ñà‚ñà‚ñå      | 13/36 [00:12<00:22,  1.00it/s]

üéØ Best threshold=0.50, RECALL_MIN=0.4109, gap=0.1823


Grid Search:  39%|‚ñà‚ñà‚ñà‚ñâ      | 14/36 [00:13<00:20,  1.05it/s]

üéØ Best threshold=0.50, RECALL_MIN=0.4746, gap=0.1291


Grid Search:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 15/36 [00:14<00:19,  1.10it/s]

üéØ Best threshold=0.50, RECALL_MIN=0.4109, gap=0.1823


Grid Search:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 16/36 [00:15<00:19,  1.01it/s]

üéØ Best threshold=0.48, RECALL_MIN=0.5382, gap=0.0437


Grid Search:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 17/36 [00:16<00:18,  1.03it/s]

üéØ Best threshold=0.48, RECALL_MIN=0.5455, gap=0.0817


Grid Search:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 18/36 [00:17<00:17,  1.05it/s]

üéØ Best threshold=0.47, RECALL_MIN=0.4218, gap=0.2222


Grid Search:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 19/36 [00:17<00:15,  1.07it/s]

üéØ Best threshold=0.49, RECALL_MIN=0.5455, gap=0.0817


Grid Search:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 20/36 [00:18<00:14,  1.07it/s]

üéØ Best threshold=0.49, RECALL_MIN=0.5706, gap=0.0876


Grid Search:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 21/36 [00:19<00:14,  1.04it/s]

üéØ Best threshold=0.48, RECALL_MIN=0.5455, gap=0.0817


Grid Search:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 22/36 [00:20<00:13,  1.01it/s]

üéØ Best threshold=0.47, RECALL_MIN=0.4218, gap=0.2222


Grid Search:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 23/36 [00:21<00:13,  1.00s/it]

üéØ Best threshold=0.49, RECALL_MIN=0.5455, gap=0.0817


Grid Search:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 24/36 [00:23<00:12,  1.01s/it]

üéØ Best threshold=0.48, RECALL_MIN=0.4400, gap=0.2041


Grid Search:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 25/36 [00:23<00:10,  1.05it/s]

üéØ Best threshold=0.50, RECALL_MIN=0.4109, gap=0.1823


Grid Search:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 26/36 [00:24<00:09,  1.10it/s]

üéØ Best threshold=0.50, RECALL_MIN=0.4746, gap=0.1291


Grid Search:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 27/36 [00:25<00:07,  1.16it/s]

üéØ Best threshold=0.50, RECALL_MIN=0.4109, gap=0.1823


Grid Search:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 28/36 [00:27<00:10,  1.35s/it]

üéØ Best threshold=0.48, RECALL_MIN=0.5382, gap=0.0437


Grid Search:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 29/36 [00:28<00:08,  1.21s/it]

üéØ Best threshold=0.48, RECALL_MIN=0.5455, gap=0.0817


Grid Search:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 30/36 [00:29<00:06,  1.13s/it]

üéØ Best threshold=0.47, RECALL_MIN=0.4218, gap=0.2222


Grid Search:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 31/36 [00:30<00:05,  1.05s/it]

üéØ Best threshold=0.49, RECALL_MIN=0.5455, gap=0.0817


Grid Search:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 32/36 [00:31<00:04,  1.03s/it]

üéØ Best threshold=0.49, RECALL_MIN=0.5706, gap=0.0876


Grid Search:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 33/36 [00:32<00:03,  1.02s/it]

üéØ Best threshold=0.48, RECALL_MIN=0.5455, gap=0.0817


Grid Search:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 34/36 [00:33<00:02,  1.04s/it]

üéØ Best threshold=0.47, RECALL_MIN=0.4218, gap=0.2222


Grid Search:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 35/36 [00:34<00:01,  1.03s/it]

üéØ Best threshold=0.49, RECALL_MIN=0.5455, gap=0.0817


Grid Search: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 36/36 [00:35<00:00,  1.01it/s]

üéØ Best threshold=0.48, RECALL_MIN=0.4400, gap=0.2041





In [13]:
# ------------------------------------------------------------
# 5. Analysis
# ------------------------------------------------------------
# Sort by the metric we care about most (R_MIN)
df_res = pd.DataFrame(results).sort_values("R_MIN", ascending=False)

print("
üèÜ TOP 10 MODELS:")
print(df_res.head(10))

# Best row is the first after sorting
best = df_res.iloc[0]
print(f"
ü•á WINNER PARAMS:")
print(best.to_dict())

# Compare with a known baseline score
baseline_rmin = 0.587
if best["R_MIN"] > baseline_rmin:
    print(
        f"
üöÄ SUCCESS! Deep model beat baseline! "
        f"({best['R_MIN']:.3f} > {baseline_rmin})"
    )
else:
    print(
        f"
üíÄ FAILURE. Still can't beat baseline. "
        f"({best['R_MIN']:.3f} <= {baseline_rmin})"
    )
    print("Hypothesis: Natal features are just noise.")



üèÜ TOP 10 MODELS:
    n_estimators  max_depth  learning_rate  colsample_bytree  subsample  \
2            500          6           0.03               0.6        0.8   
14           900          6           0.03               0.6        0.8   
26          1300          6           0.03               0.6        0.8   
0            500          6           0.05               0.6        0.8   
24          1300          6           0.05               0.6        0.8   
12           900          6           0.05               0.6        0.8   
34          1300         10           0.03               0.6        0.8   
22           900         10           0.03               0.6        0.8   
10           500         10           0.03               0.6        0.8   
30          1300          8           0.03               0.6        0.8   

       R_MIN       MCC  
2   0.602941  0.315097  
14  0.602941  0.315097  
26  0.602941  0.315097  
0   0.597059  0.309950  
24  0.597059  0.309950  
12 

In [16]:
# ------------------------------------------------------------
# 6. Export Best Model for Production Service
# ------------------------------------------------------------
import joblib

# Retrain the best model so we can save a clean final version
best_params = best.to_dict()
print(f"üîÑ Retraining best model with params: {best_params}")

# Keep only XGBoost parameters and fix types
# (some numbers are floats in the table but should be ints)
xgb_params = {}
for k, v in best_params.items():
    if k in ["R_MIN", "MCC"]:
        continue
    if k in ["n_estimators", "max_depth"]:
        xgb_params[k] = int(v)
    else:
        xgb_params[k] = v

print(f"üìä XGBoost params (fixed types): {xgb_params}")

# Train final model with best params
final_model = train_xgb_model(
    X_train,
    y_train,
    X_val,
    y_val,
    feature_cols,
    n_classes=2,
    device=device,
    early_stopping_rounds=50,
    verbose=False,
    **xgb_params,
)

# Prepare a package for export
# We save both the model and the context needed to reuse it later.
model_data = {
    "model": final_model,
    "feature_names": feature_cols,
    "config": {
        # Astro config
        "birth_date": str(TARGET_DATE),
        "coord_mode": ASTRO_CONFIG["coord_mode"],
        "orb_mult": ASTRO_CONFIG["orb_mult"],
        "gauss_window": ASTRO_CONFIG["gauss_window"],
        "gauss_std": ASTRO_CONFIG["gauss_std"],
        "exclude_bodies": ASTRO_CONFIG.get("exclude_bodies"),
        # XGBoost params
        **xgb_params,
        # Metrics
        "r_min": float(best["R_MIN"]),
        "mcc": float(best["MCC"]),
    },
}

# Save to models_artifacts
output_path = "../models_artifacts/btc_astro_predictor.joblib"
joblib.dump(model_data, output_path)
print(f"‚úÖ Model exported to: {output_path}")
print(f"   Features: {len(feature_cols)}")
print(f"   R_MIN: {best['R_MIN']:.3f}")
print(f"   MCC: {best['MCC']:.3f}")


üîÑ Retraining best model with params: {'n_estimators': 500.0, 'max_depth': 6.0, 'learning_rate': 0.03, 'colsample_bytree': 0.6, 'subsample': 0.8, 'R_MIN': 0.6029411764705882, 'MCC': 0.3150965594739174}
üìä XGBoost params (fixed types): {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.03, 'colsample_bytree': 0.6, 'subsample': 0.8}
‚úÖ Model exported to: ../models_artifacts/btc_astro_predictor.joblib
   Features: 2040
   R_MIN: 0.603
   MCC: 0.315
