# FD001 RUL Prediction — Config 4: Per-Cycle + RT Geometry + ORTHON

**Target benchmark:** 11.72 RMSE (the best prior result)

**Architecture:**
- Layer 1: Cycle number (1)
- Layer 2: Raw sensors — 15 varying (15)
- Layer 3: Rolling stats — 15 sensors × 5 windows × 5 stats (375)
- Layer 4: Delta features — sensor[t] - sensor[t-1] (15)
- Layer 5: RT geometry — fleet healthy baseline, per cycle (5)
- Layer 6: ORTHON features — eigendecomp derivatives + homology + trajectory, asof-joined from ml/ (window → cycle)

**Models:**
- Run A: Ridge / RandomForest / GradientBoosting (comparable to v2)
- Run B: LGB + XGB + HistGBM → RidgeCV stacking (matches 11.72 architecture)

**Key constraint:** Fleet baseline fitted on TRAIN only, applied identically to TRAIN and TEST.
All features are strictly backward-looking (no future lookahead).

In [1]:
import numpy as np
import polars as pl
import pandas as pd
import json
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Paths
TRAIN_BASE = Path('/Users/jasonrudder/domains/cmapss/FD_001/Train')
TEST_BASE  = Path('/Users/jasonrudder/domains/cmapss/FD_001/Test')
TRAIN_ML   = TRAIN_BASE / 'output_time/ml'
TEST_ML    = TEST_BASE  / 'output_time/ml'
RUL_PATH   = Path('/Users/jasonrudder/domains/cmapss/FD_001/RUL_FD001.txt')

# The 15 sensors used in the 11.72 benchmark (excluding 7 constants + op1/op2 operational settings)
SENSORS_15 = ['BPR', 'NRc', 'NRf', 'Nc', 'Nf', 'P15', 'P30', 'Ps30',
               'T24', 'T30', 'T50', 'W31', 'W32', 'htBleed', 'phi']

ROLL_WINDOWS  = [5, 10, 15, 20, 30]
EARLY_LIFE    = 30     # first N cycles define fleet healthy baseline
N_PCA         = 10     # PCA components for fleet baseline
RUL_CAP       = 125
N_FOLDS       = 5
SEED          = 42

print('Config ready.')
print(f'  {len(SENSORS_15)} sensors, {len(ROLL_WINDOWS)} roll windows')
print(f'  Rolling features: {len(SENSORS_15) * len(ROLL_WINDOWS) * 5} = {len(SENSORS_15)} × {len(ROLL_WINDOWS)} × 5 stats')

Config ready.
  15 sensors, 5 roll windows
  Rolling features: 375 = 15 × 5 × 5 stats


## 1. Load Observations → Wide Format

In [3]:
def load_wide(obs_path: Path, sensors: list) -> pd.DataFrame:
    """Load observations.parquet, filter to sensors, pivot to one row per (cohort, cycle)."""
    obs = pl.read_parquet(str(obs_path))
    obs_s = obs.filter(pl.col('signal_id').is_in(sensors))
    wide = (
        obs_s
        .pivot(index=['cohort', 'signal_0'], on='signal_id', values='value',
               aggregate_function='first')
        .sort(['cohort', 'signal_0'])
    )
    df = wide.to_pandas().rename(columns={'signal_0': 'cycle'})
    return df

print('Loading train...')
train_wide = load_wide(TRAIN_BASE / 'observations.parquet', SENSORS_15)
print(f'  Train: {train_wide.shape} — {train_wide["cohort"].nunique()} engines')

print('Loading test...')
test_wide = load_wide(TEST_BASE / 'observations.parquet', SENSORS_15)
print(f'  Test:  {test_wide.shape} — {test_wide["cohort"].nunique()} engines')

Loading train...
  Train: (20631, 17) — 100 engines
Loading test...
  Test:  (13096, 17) — 100 engines


## 2. RUL Labels

In [4]:
def add_train_rul(df: pd.DataFrame, cap: int = 125) -> pd.DataFrame:
    """RUL = max_cycle_per_engine - current_cycle, capped."""
    df = df.copy()
    max_cycle = df.groupby('cohort')['cycle'].transform('max')
    df['RUL'] = np.clip((max_cycle - df['cycle']).values, 0, cap)
    return df

train_wide = add_train_rul(train_wide, RUL_CAP)

# Load test ground truth (ordered by engine number, 1-indexed)
rul_gt = np.loadtxt(str(RUL_PATH))
print(f'Train RUL: {train_wide["RUL"].min():.0f} – {train_wide["RUL"].max():.0f} ({len(train_wide)} rows)')
print(f'Test GT RUL: {rul_gt.min():.0f} – {rul_gt.max():.0f} ({len(rul_gt)} engines)')

Train RUL: 0 – 125 (20631 rows)
Test GT RUL: 7 – 145 (100 engines)


## 3. Per-Cycle Features (Rolling + Delta)

In [5]:
def add_per_cycle_features(df: pd.DataFrame, sensors: list, windows: list) -> pd.DataFrame:
    """Add rolling stats and delta features. Processes per engine to prevent bleed."""
    df = df.sort_values(['cohort', 'cycle']).copy()
    results = []
    for cohort_id, grp in df.groupby('cohort', sort=False):
        grp = grp.reset_index(drop=True)
        # Rolling stats (5 stats × 5 windows × 15 sensors = 375)
        new_cols = {}
        for win in windows:
            r = grp[sensors].rolling(win, min_periods=1)
            means = r.mean()
            stds  = r.std(ddof=0).fillna(0.0)
            mins  = r.min()
            maxs  = r.max()
            for s in sensors:
                new_cols[f'roll_{s}_mean_{win}']  = means[s].values
                new_cols[f'roll_{s}_std_{win}']   = stds[s].values
                new_cols[f'roll_{s}_min_{win}']   = mins[s].values
                new_cols[f'roll_{s}_max_{win}']   = maxs[s].values
                new_cols[f'roll_{s}_range_{win}'] = (maxs[s] - mins[s]).values
        for s in sensors:
            new_cols[f'delta_{s}'] = grp[s].diff(1).fillna(0.0).values
        grp = pd.concat([grp, pd.DataFrame(new_cols, index=grp.index)], axis=1)
        results.append(grp)
    return pd.concat(results, ignore_index=True)

print('Building train per-cycle features...')
train_feat = add_per_cycle_features(train_wide, SENSORS_15, ROLL_WINDOWS)
n_per_cycle = 1 + len(SENSORS_15) + len(SENSORS_15)*len(ROLL_WINDOWS)*5 + len(SENSORS_15)  # cycle+sensors+rolling+delta
print(f'  Train: {train_feat.shape} — expect ~{n_per_cycle} feature columns')

print('Building test per-cycle features...')
test_feat = add_per_cycle_features(test_wide, SENSORS_15, ROLL_WINDOWS)
print(f'  Test:  {test_feat.shape}')

Building train per-cycle features...


  Train: (20631, 408) — expect ~406 feature columns
Building test per-cycle features...


  Test:  (13096, 407)


## 4. Fleet Baseline + RT Geometry

Fleet centroid = mean of the first `EARLY_LIFE` cycles pooled across ALL training engines.  
Fitted on TRAIN only. Applied identically to train and test (no leakage).

5 RT geometry features per cycle:
- `rt_centroid_dist` — L2 distance from fleet mean (in standardized space)
- `rt_centroid_dist_norm` — distance normalized by √n_sensors
- `rt_pc1_proj` — projection onto primary degradation axis
- `rt_pc2_proj` — projection onto secondary axis
- `rt_mahalanobis` — eigenvalue-weighted distance

In [6]:
# Build fleet baseline from early-life train cycles
early_data = train_wide[train_wide['cycle'] <= EARLY_LIFE][SENSORS_15].dropna()
fleet_scaler = StandardScaler()
early_scaled = fleet_scaler.fit_transform(early_data.values)
fleet_pca = PCA(n_components=N_PCA, random_state=SEED)
fleet_pca.fit(early_scaled)

n_early_engines = train_wide[train_wide['cycle'] <= EARLY_LIFE]['cohort'].nunique()
print(f'Fleet baseline: {len(early_data):,} early-life cycles from {n_early_engines} engines')
print(f'PC1 explains {fleet_pca.explained_variance_ratio_[0]:.1%}, '
      f'PC2 {fleet_pca.explained_variance_ratio_[1]:.1%}, '
      f'top-5 cumulative {fleet_pca.explained_variance_ratio_[:5].sum():.1%}')

Fleet baseline: 3,100 early-life cycles from 100 engines
PC1 explains 48.2%, PC2 7.8%, top-5 cumulative 71.7%


In [7]:
def compute_rt_geometry(df: pd.DataFrame, sensors: list,
                        scaler: StandardScaler, pca: PCA) -> pd.DataFrame:
    """Compute 5 RT geometry features per row using the pre-fitted fleet baseline."""
    X = df[sensors].values.astype(np.float64)
    X = np.nan_to_num(X, nan=0.0)
    X_scaled = scaler.transform(X)

    # Centroid distance (fleet centroid = origin in scaled space)
    centroid_dist      = np.linalg.norm(X_scaled, axis=1)
    centroid_dist_norm = centroid_dist / np.sqrt(X_scaled.shape[1])

    # PC projections
    X_proj    = pca.transform(X_scaled)        # (n, n_pcs)
    pc1_proj  = X_proj[:, 0]
    pc2_proj  = X_proj[:, 1] if pca.n_components_ > 1 else np.zeros(len(X_scaled))

    # Mahalanobis approximation
    lambdas   = np.maximum(pca.explained_variance_, 1e-10)
    mahal     = np.sqrt(np.sum(X_proj**2 / lambdas, axis=1))

    return pd.DataFrame({
        'rt_centroid_dist':      centroid_dist,
        'rt_centroid_dist_norm': centroid_dist_norm,
        'rt_pc1_proj':           pc1_proj,
        'rt_pc2_proj':           pc2_proj,
        'rt_mahalanobis':        mahal,
    }, index=df.index)

print('Computing train RT geometry...')
train_rt = compute_rt_geometry(train_feat, SENSORS_15, fleet_scaler, fleet_pca)
print(f'  centroid_dist: mean={train_rt["rt_centroid_dist"].mean():.3f}, '
      f'max={train_rt["rt_centroid_dist"].max():.3f}')

print('Computing test RT geometry (same fleet baseline)...')
test_rt = compute_rt_geometry(test_feat, SENSORS_15, fleet_scaler, fleet_pca)
print(f'  centroid_dist: mean={test_rt["rt_centroid_dist"].mean():.3f}, '
      f'max={test_rt["rt_centroid_dist"].max():.3f}')

Computing train RT geometry...
  centroid_dist: mean=5.783, max=34.758
Computing test RT geometry (same fleet baseline)...
  centroid_dist: mean=4.043, max=17.341


## 5. ORTHON Features — asof Join (window → cycle)

Cohort-level features from `ml/` have one row per (cohort, window).  
`signal_0_end` = last cycle in that window.

For each (cohort, cycle): take the ORTHON row with the largest `signal_0_end ≤ cycle`.  
This is strictly backward-looking — we only use context that was available before cycle t.

In [8]:
# ORTHON files that have signal_0_end for asof join
ORTHON_COHORT_FILES = [
    ('ml_eigendecomp_derivatives.parquet', 'ed'),  # eigendecomp + d1/d2
    ('ml_persistent_homology.parquet',     'ph'),  # topological features
]

# Select centroid columns: key physical features + d1/d2 (avoid 300-column blast)
CENTROID_COLS_SELECT = [
    # Trend & regime
    'centroid_trend_slope', 'centroid_trend_r2', 'centroid_trend_cusum_range',
    'centroid_variance_growth_rate', 'centroid_variance_growth_ratio',
    # Complexity / chaos
    'centroid_hurst_exponent', 'centroid_lyapunov_exponent',
    'centroid_complexity_sample_entropy', 'centroid_complexity_permutation_entropy',
    'centroid_correlation_dimension_value',
    # Frequency
    'centroid_hilbert_freq_drift', 'centroid_spectral_entropy', 'centroid_spectral_slope',
    # Spread
    'dispersion_mean', 'dispersion_max', 'dispersion_std',
    # Attractor
    'centroid_rqa_determinism', 'centroid_rqa_recurrence_rate',
]

def load_orthon_cohort(ml_dir: Path) -> pd.DataFrame:
    """Load and merge cohort-level ORTHON features that have signal_0_end."""
    frames = []

    # Eigendecomp derivatives (eigendecomp + d1/d2)
    p = ml_dir / 'ml_eigendecomp_derivatives.parquet'
    if p.exists():
        df = pl.read_parquet(str(p)).to_pandas()
        drop = ['signal_0_start', 'signal_0_center', 'n_signals', 'n_features', 'n_features_valid',
                'window_index']
        df = df.drop(columns=[c for c in drop if c in df.columns])
        df.columns = [f'ed_{c}' if c not in ['cohort', 'signal_0_end'] else c for c in df.columns]
        frames.append(df)
        print(f'  eigendecomp_derivatives: {len(df)} rows, {df.shape[1]-2} features')

    # Persistent homology
    p = ml_dir / 'ml_persistent_homology.parquet'
    if p.exists():
        df = pl.read_parquet(str(p)).to_pandas()
        drop = ['window_index', 'n_points']
        df = df.drop(columns=[c for c in drop if c in df.columns])
        df.columns = [f'ph_{c}' if c not in ['cohort', 'signal_0_end'] else c for c in df.columns]
        frames.append(df)
        print(f'  persistent_homology: {len(df)} rows, {df.shape[1]-2} features')

    # Centroid (curated selection + d1/d2)
    p = ml_dir / 'ml_centroid_derivatives.parquet'
    if p.exists():
        df = pl.read_parquet(str(p)).to_pandas()
        # Build list: base + d1 + d2 for selected columns, plus signal_0_end, cohort
        keep_base = [c for c in CENTROID_COLS_SELECT if c in df.columns]
        keep_d1   = [f'{c}_d1' for c in keep_base if f'{c}_d1' in df.columns]
        keep_d2   = [f'{c}_d2' for c in keep_base if f'{c}_d2' in df.columns]
        keep_all  = ['cohort', 'signal_0_end'] + keep_base + keep_d1 + keep_d2
        df = df[[c for c in keep_all if c in df.columns]]
        df.columns = [f'cv_{c}' if c not in ['cohort', 'signal_0_end'] else c for c in df.columns]
        frames.append(df)
        print(f'  centroid (curated): {len(df)} rows, {df.shape[1]-2} features')

    if not frames:
        return None

    # Merge all on (cohort, signal_0_end)
    merged = frames[0]
    for f in frames[1:]:
        merged = merged.merge(f, on=['cohort', 'signal_0_end'], how='outer')
    return merged.sort_values(['cohort', 'signal_0_end'])

print('Loading train ORTHON cohort features...')
train_orthon = load_orthon_cohort(TRAIN_ML)
print(f'  Combined: {train_orthon.shape}')

print('Loading test ORTHON cohort features...')
test_orthon = load_orthon_cohort(TEST_ML)
print(f'  Combined: {test_orthon.shape}')

Loading train ORTHON cohort features...
  eigendecomp_derivatives: 789 rows, 131 features
  persistent_homology: 789 rows, 8 features
  centroid (curated): 789 rows, 54 features
  Combined: (789, 195)
Loading test ORTHON cohort features...
  eigendecomp_derivatives: 675 rows, 131 features
  persistent_homology: 675 rows, 8 features
  centroid (curated): 675 rows, 54 features
  Combined: (675, 195)


In [9]:
## 5b. Fleet Normalization of ORTHON Features
# Per-cohort delta normalization:
#   1. For each cohort, record first-window values as that engine's healthy baseline
#   2. For each window: delta = current - first_window  (drift from own healthy state)
#   3. Scale by fleet_std (from first windows of all training engines)
#
# This removes inter-engine absolute differences while preserving intra-engine dynamics.
# Same principle as RT geometry: "how far has this engine drifted from where it started?"
#
# prim_ and traj_ broadcast features are DROPPED — they are static per-engine fingerprints
# with no time dimension and cannot be fleet-normalized meaningfully.

def normalize_orthon_per_cohort(train_df, test_df):
    """
    Per-cohort delta normalization for ORTHON cohort-level features.
    Fleet std is computed from first windows of training engines (healthy reference).
    """
    key_cols = ['cohort', 'signal_0_end']
    feat_cols = [c for c in train_df.columns if c not in key_cols]

    # Fleet first-window: one row per training engine at its earliest observed window
    first_idx  = train_df.groupby('cohort')['signal_0_end'].idxmin()
    fleet_first = train_df.loc[first_idx].set_index('cohort')[feat_cols]
    fleet_std   = fleet_first.std().clip(lower=1e-8)

    def apply_delta(df, is_test=False):
        result = df.copy()
        for cohort_id, grp in df.groupby('cohort', sort=False):
            if cohort_id in fleet_first.index:
                baseline = fleet_first.loc[cohort_id, feat_cols].values
            else:
                # Test engine not in train: use its own first window as baseline
                own_first = grp.loc[grp['signal_0_end'].idxmin(), feat_cols].values
                baseline  = own_first
            delta = grp[feat_cols].values - baseline
            result.loc[grp.index, feat_cols] = delta / fleet_std.values
        return result

    train_norm = apply_delta(train_df)
    test_norm  = apply_delta(test_df, is_test=True)
    return train_norm, test_norm, fleet_first, fleet_std

print('Normalizing train ORTHON features (per-cohort delta from first window)...')
train_orthon_norm, test_orthon_norm, fleet_first_orthon, fleet_std_orthon = \
    normalize_orthon_per_cohort(train_orthon, test_orthon)

# Sanity check: first-window deltas should be ~0 for training engines
first_idx  = train_orthon_norm.groupby('cohort')['signal_0_end'].idxmin()
first_vals = train_orthon_norm.loc[first_idx]
feat_cols_check = [c for c in train_orthon_norm.columns if c not in ['cohort','signal_0_end']]
mean_first = first_vals[feat_cols_check].mean().abs().mean()
print(f'  Mean |first-window delta| (should be ~0): {mean_first:.6f}')
print(f'  ORTHON norm shape: {train_orthon_norm.shape}')
# Show a few feature ranges after normalization
for f in ['ed_effective_dim', 'ed_eigenvalue_0', 'ph_betti_1']:
    if f in train_orthon_norm.columns:
        tr_r = train_orthon_norm[f].agg(['min','max','std'])
        te_r = test_orthon_norm[f].agg(['min','max','std'])
        print(f'  {f:30s}: train=[{tr_r["min"]:.2f},{tr_r["max"]:.2f}] std={tr_r["std"]:.2f}  ' +
              f'test=[{te_r["min"]:.2f},{te_r["max"]:.2f}] std={te_r["std"]:.2f}')

Normalizing train ORTHON features (per-cohort delta from first window)...


  Mean |first-window delta| (should be ~0): 0.000000
  ORTHON norm shape: (789, 195)
  ed_effective_dim              : train=[-4.48,2.49] std=1.12  test=[-4.26,3.31] std=1.53
  ed_eigenvalue_0               : train=[0.00,6.56] std=1.52  test=[-3.13,6.74] std=1.95
  ph_betti_1                    : train=[0.00,0.00] std=0.00  test=[0.00,0.00] std=0.00


In [10]:
def asof_join_orthon(cycle_df: pd.DataFrame, orthon_df: pd.DataFrame) -> pd.DataFrame:
    """Asof join per cohort: cycles reset (0-361) per engine so merge_asof needs per-group processing."""
    orthon_cols = [c for c in orthon_df.columns if c not in ['cohort', 'signal_0_end']]
    results = []
    for cohort_id, grp in cycle_df.groupby('cohort', sort=False):
        grp_s = grp.sort_values('cycle').reset_index(drop=True)
        co    = orthon_df[orthon_df['cohort'] == cohort_id].sort_values('signal_0_end')
        if len(co) == 0:
            for col in orthon_cols:
                grp_s[col] = np.nan
            results.append(grp_s)
            continue
        merged = pd.merge_asof(
            grp_s,
            co.drop(columns=['cohort']),
            left_on='cycle',
            right_on='signal_0_end',
            direction='backward'
        ).drop(columns=['signal_0_end'])
        results.append(merged)
    result = pd.concat(results, ignore_index=True)
    n_nan  = result[orthon_cols[0]].isna().sum() if orthon_cols else 0
    print(f'  asof join: {n_nan:,} rows ({n_nan/len(result):.1%}) with no prior window')
    return result

print('Asof joining train ORTHON...')
train_feat_orthon = asof_join_orthon(train_feat, train_orthon_norm)
print(f'  Result: {train_feat_orthon.shape}')

print('Asof joining test ORTHON...')
test_feat_orthon = asof_join_orthon(test_feat, test_orthon_norm)
print(f'  Result: {test_feat_orthon.shape}')

Asof joining train ORTHON...
  asof join: 4,300 rows (20.8%) with no prior window
  Result: (20631, 601)
Asof joining test ORTHON...


  asof join: 3,100 rows (23.7%) with no prior window
  Result: (13096, 600)


## 6. ORTHON Broadcast Features (signal-level, whole-cohort)

Signal primitives (Hurst, entropy) have no time dimension — they are computed over the full series  
per signal per cohort. Pivot wide (one column per signal per feature), then broadcast to all cycles.

In [11]:
def load_orthon_signal_broadcast(ml_dir: Path, sensors: list) -> pd.DataFrame:
    """Load signal-level ORTHON features, aggregate across signals, return one row per cohort."""
    p = ml_dir / 'ml_signal_primitives.parquet'
    if not p.exists():
        return None
    df = pl.read_parquet(str(p)).to_pandas()
    # Filter to varying sensors only
    df = df[df['signal_id'].isin(sensors)]
    feat_cols = [c for c in df.columns if c not in ['cohort', 'signal_id']]
    # Aggregate: mean + std across signals per cohort
    agg = df.groupby('cohort')[feat_cols].agg(['mean', 'std']).reset_index()
    agg.columns = ['cohort'] + [f'prim_{c[0]}_{c[1]}' for c in agg.columns[1:]]
    print(f'  signal_primitives broadcast: {agg.shape[1]-1} features for {len(agg)} cohorts')
    return agg

def load_orthon_trajectory(ml_dir: Path) -> pd.DataFrame:
    """Load trajectory match scores — one row per cohort, broadcast to all cycles."""
    p = ml_dir / 'ml_trajectory_match.parquet'
    if not p.exists():
        return None
    df = pl.read_parquet(str(p)).to_pandas()
    feat_cols = [c for c in df.columns if c not in ['cohort', 'trajectory_id', 'n_windows']]
    df = df.groupby('cohort')[feat_cols].mean().reset_index()
    df.columns = ['cohort'] + [f'traj_{c}' for c in feat_cols]
    print(f'  trajectory_match broadcast: {df.shape[1]-1} features for {len(df)} cohorts')
    return df

def broadcast_join(cycle_df: pd.DataFrame, broadcast_df: pd.DataFrame) -> pd.DataFrame:
    """Join a per-cohort feature table to the per-cycle DataFrame."""
    return cycle_df.merge(broadcast_df, on='cohort', how='left')

print('Loading broadcast features (train)...')
train_prim = load_orthon_signal_broadcast(TRAIN_ML, SENSORS_15)
train_traj = load_orthon_trajectory(TRAIN_ML)

print('Loading broadcast features (test)...')
test_prim  = load_orthon_signal_broadcast(TEST_ML, SENSORS_15)
test_traj  = load_orthon_trajectory(TEST_ML)

Loading broadcast features (train)...
  signal_primitives broadcast: 16 features for 100 cohorts
  trajectory_match broadcast: 4 features for 100 cohorts
Loading broadcast features (test)...
  signal_primitives broadcast: 16 features for 100 cohorts
  trajectory_match broadcast: 4 features for 95 cohorts


## 7. Assemble Full Feature Matrix

In [12]:
def assemble_features(feat_df, rt_df) -> pd.DataFrame:
    """Combine per-cycle features + RT geometry. ORTHON already joined into feat_df.
    NOTE: prim_ and traj_ broadcast features are excluded — they are static per-engine
    fingerprints that encode engine identity, not degradation.
    """
    return pd.concat([feat_df.reset_index(drop=True), rt_df.reset_index(drop=True)], axis=1)

print('Assembling train...')
train_full = assemble_features(train_feat_orthon, train_rt)
print(f'  Train full: {train_full.shape}')

print('Assembling test...')
test_full = assemble_features(test_feat_orthon, test_rt)
print(f'  Test full:  {test_full.shape}')

# Feature groups for ablation
META_COLS  = ['cohort', 'cycle', 'RUL']
SENSOR_COLS  = SENSORS_15
ROLL_COLS    = [c for c in train_full.columns if c.startswith('roll_')]
DELTA_COLS   = [c for c in train_full.columns if c.startswith('delta_')]
RT_COLS      = [c for c in train_full.columns if c.startswith('rt_')]
ORTHON_COLS  = [c for c in train_full.columns if c.startswith(('ed_','ph_','cv_','prim_','traj_'))]
# Deduplicate while preserving order (prevents LGB feature importance mismatch)
_seen = set()
ALL_FEAT_COLS = []
for _c in ['cycle'] + SENSOR_COLS + ROLL_COLS + DELTA_COLS + RT_COLS + ORTHON_COLS:
    if _c in train_full.columns and _c not in _seen:
        ALL_FEAT_COLS.append(_c)
        _seen.add(_c)

print(f'\nFeature groups:')
print(f'  cycle:   1')
print(f'  sensors: {len(SENSOR_COLS)}')
print(f'  rolling: {len(ROLL_COLS)}')
print(f'  delta:   {len(DELTA_COLS)}')
print(f'  RT geom: {len(RT_COLS)}')
print(f'  ORTHON:  {len(ORTHON_COLS)}')
print(f'  TOTAL:   {len(ALL_FEAT_COLS)}')

Assembling train...
  Train full: (20631, 606)
Assembling test...
  Test full:  (13096, 605)

Feature groups:
  cycle:   1
  sensors: 15
  rolling: 375
  delta:   15
  RT geom: 5
  ORTHON:  193
  TOTAL:   604


## 8. Prepare Matrices

In [13]:
def prepare_matrix(df, feat_cols, rul_col='RUL', imputer=None, fit_imputer=True):
    """Extract X, y from DataFrame. Handles inf/nan. Returns (X, y, groups, imputer)."""
    X = df[feat_cols].values.astype(np.float64)
    X = np.where(np.isinf(X), np.nan, X)
    if fit_imputer:
        imputer = SimpleImputer(strategy='median')
        X = imputer.fit_transform(X)
    else:
        X = imputer.transform(X)
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
    y = df[rul_col].values.astype(np.float64) if rul_col in df.columns else None
    groups = df['cohort'].values
    return X, y, groups, imputer

print('Preparing train matrix...')
X_train, y_train, groups_train, imputer = prepare_matrix(train_full, ALL_FEAT_COLS)
print(f'  X_train: {X_train.shape}')

# Test: get last cycle per engine (prediction point)
test_cohorts_sorted = sorted(test_full['cohort'].unique(),
                              key=lambda x: int(x.split('_')[-1]))
last_idx = [test_full[test_full['cohort'] == c]['cycle'].idxmax() for c in test_cohorts_sorted]
test_last = test_full.loc[last_idx].reset_index(drop=True)
X_test, _, _, _ = prepare_matrix(test_last, ALL_FEAT_COLS, rul_col='RUL',
                                  imputer=imputer, fit_imputer=False)
y_test = np.clip(rul_gt, 0, RUL_CAP)
print(f'  X_test:  {X_test.shape} ({len(test_cohorts_sorted)} engines)')

Preparing train matrix...


  X_train: (20631, 467)
  X_test:  (100, 467) (100 engines)


## 9. Run A — Standard Models (Ridge / RF / GB)

Same architecture as v2 for apple-to-apple comparison.

In [14]:
def phm08(pred, true):
    d = pred - true
    return float(np.sum(np.where(d < 0, np.exp(-d/13) - 1, np.exp(d/10) - 1)))

def evaluate(name, y_true, y_pred, label=''):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae  = mean_absolute_error(y_true, y_pred)
    r2   = r2_score(y_true, y_pred)
    phm  = phm08(y_pred, y_true)
    bias = np.mean(y_pred - y_true)
    print(f'  {name:25s} {label}  RMSE={rmse:.2f}  MAE={mae:.2f}  R²={r2:.4f}  PHM={phm:,.0f}  bias={bias:+.2f}')
    return {'rmse': rmse, 'mae': mae, 'r2': r2, 'phm08': phm, 'bias': bias}

gkf = GroupKFold(n_splits=N_FOLDS)
scaler_A = StandardScaler()
X_train_s = scaler_A.fit_transform(X_train)
X_test_s  = scaler_A.transform(X_test)

# Run A: Ridge only (fast comparison with v2 Ridge=25.75)
resultsA_cv   = {}
resultsA_test = {}
print('=== Run A — Ridge (OOF + Test) ===')
ridge = Ridge(alpha=1.0)
oof_r = np.clip(cross_val_predict(ridge, X_train_s, y_train, groups=groups_train, cv=gkf, n_jobs=-1), 0, RUL_CAP)
resultsA_cv['ridge'] = evaluate('ridge', y_train, oof_r, '[OOF]')
ridge.fit(X_train_s, y_train)
pred_r = np.clip(ridge.predict(X_test_s), 0, RUL_CAP)
resultsA_test['ridge'] = evaluate('ridge', y_test, pred_r, '[TEST]')

=== Run A — Ridge (OOF + Test) ===


  ridge                     [OOF]  RMSE=15.08  MAE=11.09  R²=0.8690  PHM=76,991  bias=-0.43
  ridge                     [TEST]  RMSE=17.12  MAE=13.85  R²=0.8174  PHM=432  bias=-0.49


## 10. Run B — Stacking Ensemble (LGB + XGB + Hist → RidgeCV)

Matches the architecture that produced the 11.72 benchmark.

In [15]:
# Base learners
lgb_model  = LGBMRegressor(n_estimators=500, max_depth=6, learning_rate=0.05,
                            num_leaves=63, subsample=0.8, colsample_bytree=0.8,
                            min_child_samples=20, random_state=SEED, n_jobs=-1,
                            verbose=-1)
xgb_model  = XGBRegressor(n_estimators=500, max_depth=6, learning_rate=0.05,
                           subsample=0.8, colsample_bytree=0.8,
                           min_child_weight=5, random_state=SEED, n_jobs=-1,
                           verbosity=0)
hist_model = HistGradientBoostingRegressor(max_iter=500, max_depth=6, learning_rate=0.05,
                                            min_samples_leaf=20, random_state=SEED)

# Generate OOF predictions — use n_estimators=300 for speed, 500 for final models
print('Generating OOF predictions for stacking meta-features...')
lgb_oof_m  = LGBMRegressor(n_estimators=300, max_depth=6, learning_rate=0.05, num_leaves=63, subsample=0.8, colsample_bytree=0.8, min_child_samples=20, random_state=SEED, n_jobs=-1, verbose=-1)
xgb_oof_m  = XGBRegressor(n_estimators=300, max_depth=6, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, min_child_weight=5, random_state=SEED, n_jobs=-1, verbosity=0)
hist_oof_m = HistGradientBoostingRegressor(max_iter=300, max_depth=6, learning_rate=0.05, min_samples_leaf=20, random_state=SEED)
lgb_oof  = cross_val_predict(lgb_oof_m,  X_train, y_train, groups=groups_train, cv=gkf, n_jobs=1)
xgb_oof  = cross_val_predict(xgb_oof_m,  X_train, y_train, groups=groups_train, cv=gkf, n_jobs=1)
hist_oof = cross_val_predict(hist_oof_m, X_train, y_train, groups=groups_train, cv=gkf, n_jobs=1)

lgb_oof  = np.clip(lgb_oof,  0, RUL_CAP)
xgb_oof  = np.clip(xgb_oof,  0, RUL_CAP)
hist_oof = np.clip(hist_oof, 0, RUL_CAP)

print('\n=== Run B — OOF (individual base learners) ===')
evaluate('lgb',  y_train, lgb_oof,  '[OOF]')
evaluate('xgb',  y_train, xgb_oof,  '[OOF]')
evaluate('hist', y_train, hist_oof, '[OOF]')

# Meta-learner on OOF
meta_X_oof = np.column_stack([lgb_oof, xgb_oof, hist_oof])
meta = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0, 100.0])
meta.fit(meta_X_oof, y_train)
stack_oof = np.clip(meta.predict(meta_X_oof), 0, RUL_CAP)
oof_gap = np.sqrt(mean_squared_error(y_train, stack_oof))

print(f'\nMeta RidgeCV selected alpha: {meta.alpha_}')
print(f'Meta weights: LGB={meta.coef_[0]:.3f}, XGB={meta.coef_[1]:.3f}, Hist={meta.coef_[2]:.3f}')
evaluate('stacking', y_train, stack_oof, '[OOF]')

Generating OOF predictions for stacking meta-features...



=== Run B — OOF (individual base learners) ===
  lgb                       [OOF]  RMSE=13.86  MAE=9.32  R²=0.8893  PHM=78,324  bias=-0.78
  xgb                       [OOF]  RMSE=13.79  MAE=9.24  R²=0.8904  PHM=78,459  bias=-0.77
  hist                      [OOF]  RMSE=13.92  MAE=9.24  R²=0.8885  PHM=84,713  bias=-0.60

Meta RidgeCV selected alpha: 100.0
Meta weights: LGB=0.313, XGB=0.472, Hist=0.218
  stacking                  [OOF]  RMSE=13.70  MAE=9.03  R²=0.8920  PHM=80,711  bias=-0.07


{'rmse': np.float64(13.697541467451435),
 'mae': 9.027777065344507,
 'r2': 0.8919605907854676,
 'phm08': 80710.6356413961,
 'bias': np.float64(-0.07438422582058062)}

In [16]:
# Train final base learners on full train set
print('Training final base learners on full train set...')
lgb_final  = lgb_model.__class__(**lgb_model.get_params()).fit(X_train,  y_train)
xgb_final  = xgb_model.__class__(**xgb_model.get_params()).fit(X_train,  y_train)
hist_final = hist_model.__class__(**hist_model.get_params()).fit(X_train, y_train)

# Test predictions
lgb_test  = np.clip(lgb_final.predict(X_test),   0, RUL_CAP)
xgb_test  = np.clip(xgb_final.predict(X_test),   0, RUL_CAP)
hist_test = np.clip(hist_final.predict(X_test),   0, RUL_CAP)
meta_test = np.column_stack([lgb_test, xgb_test, hist_test])
stack_test = np.clip(meta.predict(meta_test), 0, RUL_CAP)

print('\n=== Run B — Test (individual base learners) ===')
resultsB_test = {}
resultsB_test['lgb']  = evaluate('lgb',      y_test, lgb_test,   '[TEST]')
resultsB_test['xgb']  = evaluate('xgb',      y_test, xgb_test,   '[TEST]')
resultsB_test['hist'] = evaluate('hist',      y_test, hist_test,  '[TEST]')
resultsB_test['stacking'] = evaluate('stacking', y_test, stack_test, '[TEST]')

test_rmse = resultsB_test['stacking']['rmse']
oof_rmse  = oof_gap
gap       = test_rmse - oof_rmse
print(f'\nOOF RMSE: {oof_rmse:.2f}  Test RMSE: {test_rmse:.2f}  Gap: {gap:+.2f}')

Training final base learners on full train set...



=== Run B — Test (individual base learners) ===
  lgb                       [TEST]  RMSE=16.29  MAE=11.45  R²=0.8347  PHM=411  bias=-2.70
  xgb                       [TEST]  RMSE=17.02  MAE=11.96  R²=0.8197  PHM=460  bias=-2.55
  hist                      [TEST]  RMSE=17.13  MAE=12.04  R²=0.8173  PHM=472  bias=-2.35
  stacking                  [TEST]  RMSE=16.60  MAE=11.59  R²=0.8284  PHM=440  bias=-1.87

OOF RMSE: 13.70  Test RMSE: 16.60  Gap: +2.90


## 11. Feature Importance (LGB on full train)

In [17]:
importances = lgb_final.feature_importances_
feat_names  = np.array(ALL_FEAT_COLS)
order = np.argsort(importances)[::-1]

print(f'{"Rank":>4s}  {"Feature":>55s}  {"Importance":>10s}  {"Layer":>8s}')
print('-' * 82)
for rank, i in enumerate(order[:30]):
    name = feat_names[i]
    if name == 'cycle':              layer = 'cycle'
    elif name in SENSORS_15:         layer = 'sensor'
    elif name.startswith('roll_'):   layer = 'rolling'
    elif name.startswith('delta_'):  layer = 'delta'
    elif name.startswith('rt_'):     layer = 'RT_geom'
    elif name.startswith('ed_'):     layer = 'eigenD'
    elif name.startswith('ph_'):     layer = 'homology'
    elif name.startswith('cv_'):     layer = 'centroid'
    elif name.startswith('prim_'):   layer = 'primitives'
    elif name.startswith('traj_'):   layer = 'traj'
    else:                            layer = '?'
    print(f'{rank+1:>4d}  {name:>55s}  {importances[i]:>10.0f}  {layer:>8s}')

print(f'feat_names: {len(feat_names)}, importances: {len(importances)}')
# Sync feat_names to importances length in case of internal LGB deduplication
feat_names_trim = feat_names[:len(importances)]

# Layer-level total importance
print('\n=== Layer importance (total) ===')
layers = {
    'cycle':    ['cycle'],
    'sensors':  SENSORS_15,
    'rolling':  ROLL_COLS,
    'delta':    DELTA_COLS,
    'RT_geom':  RT_COLS,
    'ORTHON':   ORTHON_COLS,
}
total_imp = importances.sum()
for layer_name, cols in layers.items():
    cols_set = set(cols)
    idx = [i for i, c in enumerate(feat_names_trim) if c in cols_set]
    layer_imp = importances[idx].sum() if idx else 0
    print(f'  {layer_name:12s}: {layer_imp:8.0f}  ({layer_imp/total_imp:.1%})')

Rank                                                  Feature  Importance     Layer
----------------------------------------------------------------------------------
   1                                                    cycle         819     cycle
   2                                   ed_condition_number_d1         461    eigenD
   3                                          ed_ratio_2_1_d2         309    eigenD
   4                                          ed_ratio_2_1_d1         270    eigenD
   5                      ed_eigenvalue_entropy_normalized_d2         233    eigenD
   6                                      ed_condition_number         232    eigenD
   7                                        ed_total_variance         230    eigenD
   8                                         rt_centroid_dist         224   RT_geom
   9                                     ed_total_variance_d2         220    eigenD
  10                      ed_eigenvalue_entropy_normalized_d1         185    

## 12. Ablation — What Does Each Layer Add?

In [18]:
def quick_lgb(X_tr, y_tr, X_te, y_te, groups, gkf, rul_cap=125):
    """Quick LGB OOF + test eval."""
    model = LGBMRegressor(n_estimators=300, max_depth=6, learning_rate=0.05,
                          random_state=SEED, n_jobs=-1, verbose=-1)
    oof = np.clip(cross_val_predict(model, X_tr, y_tr, groups=groups, cv=gkf, n_jobs=1), 0, rul_cap)
    model.fit(X_tr, y_tr)
    test_pred = np.clip(model.predict(X_te), 0, rul_cap)
    oof_rmse  = np.sqrt(mean_squared_error(y_tr, oof))
    test_rmse = np.sqrt(mean_squared_error(y_te, test_pred))
    return oof_rmse, test_rmse, test_rmse - oof_rmse

ablation_sets = [
    ('cycle + sensors',                  ['cycle'] + SENSORS_15),
    ('+ rolling (CSV baseline)',          ['cycle'] + SENSORS_15 + ROLL_COLS + DELTA_COLS),
    ('+ RT geometry',                     ['cycle'] + SENSORS_15 + ROLL_COLS + DELTA_COLS + RT_COLS),
    ('+ ORTHON only (no RT)',             ['cycle'] + SENSORS_15 + ROLL_COLS + DELTA_COLS + ORTHON_COLS),
    ('Config 4: + RT + ORTHON',           ALL_FEAT_COLS),
]

print(f'{"Configuration":45s}  {"OOF":>7s}  {"Test":>7s}  {"Gap":>7s}')
print('-' * 70)
ablation_results = {}
for label, cols in ablation_sets:
    cols_avail = [c for c in cols if c in train_full.columns]
    X_tr_abl, y_tr_abl, grps, imp_abl = prepare_matrix(train_full, cols_avail)
    X_te_abl, _, _, _                  = prepare_matrix(test_last, cols_avail, imputer=imp_abl, fit_imputer=False)
    oof_r, tst_r, gap_r = quick_lgb(X_tr_abl, y_train, X_te_abl, y_test, groups_train, gkf)
    print(f'{label:45s}  {oof_r:7.2f}  {tst_r:7.2f}  {gap_r:+7.2f}')
    ablation_results[label] = {'oof_rmse': oof_r, 'test_rmse': tst_r, 'gap': gap_r}

Configuration                                      OOF     Test      Gap
----------------------------------------------------------------------


cycle + sensors                                  16.73    17.45    +0.72


+ rolling (CSV baseline)                         15.30    16.36    +1.07


+ RT geometry                                    15.42    15.15    -0.27


+ ORTHON only (no RT)                            13.84    17.36    +3.52


Config 4: + RT + ORTHON                          13.80    16.48    +2.68


## 13. Summary Table

In [19]:
print('=' * 80)
print('RESULTS SUMMARY — FD001 RUL Prediction')
print('=' * 80)
print(f'{"Config":40s}  {"RMSE":>7s}  {"PHM08":>9s}  {"Gap":>7s}  {"Features"}')
print('-' * 80)

benchmarks = [
    ('CSV Standalone (prior)',             12.16, 224,  '+1.05', '~1,044'),
    ('ORTHON Alone v2 (prior)',            16.31, 382,  '+1.07', '14'),
    ('ORTHON+CSV v3 window-level (prior)', 15.03, 374,  '+0.92', '38'),
    ('RT Geometry baseline (prior 11.72)', 11.72, 188,  '-0.70', '413'),
]
for name, rmse, phm, gap, feats in benchmarks:
    print(f'{name:40s}  {rmse:7.2f}  {phm:9,}  {gap:>7s}  {feats}')

print('-' * 80)
# Run A best
best_A = min(resultsA_test, key=lambda k: resultsA_test[k]['rmse'])
r = resultsA_test[best_A]
gap_A = r['rmse'] - min(resultsA_cv[best_A]['rmse'], r['rmse'])  # approximate
print(f'{"Config 4 Run A (" + best_A + ")": <40s}  {r["rmse"]:7.2f}  {r["phm08"]:9,.0f}  {r["bias"]:+7.2f}  {len(ALL_FEAT_COLS)}')

# Run B stacking
r = resultsB_test['stacking']
gap_B = test_rmse - oof_rmse
print(f'{"Config 4 Run B (stacking)":40s}  {r["rmse"]:7.2f}  {r["phm08"]:9,.0f}  {gap_B:+7.2f}  {len(ALL_FEAT_COLS)}')
print('=' * 80)

best_rmse = r['rmse']
vs_benchmark = best_rmse - 11.72
print(f'\nConfig 4 stacking vs 11.72 benchmark: {vs_benchmark:+.2f} RMSE ({"BEAT" if vs_benchmark < 0 else "missed by"} {abs(vs_benchmark):.2f})')

RESULTS SUMMARY — FD001 RUL Prediction
Config                                       RMSE      PHM08      Gap  Features
--------------------------------------------------------------------------------
CSV Standalone (prior)                      12.16        224    +1.05  ~1,044
ORTHON Alone v2 (prior)                     16.31        382    +1.07  14
ORTHON+CSV v3 window-level (prior)          15.03        374    +0.92  38
RT Geometry baseline (prior 11.72)          11.72        188    -0.70  413
--------------------------------------------------------------------------------
Config 4 Run A (ridge)                      17.12        432    -0.49  604
Config 4 Run B (stacking)                   16.60        440    +2.90  604

Config 4 stacking vs 11.72 benchmark: +4.88 RMSE (missed by 4.88)


## 14. Save Results

In [20]:
out_dir = TRAIN_BASE / 'output_time/ml_results_config4'
out_dir.mkdir(exist_ok=True)

summary = {
    'experiment': 'config4_per_cycle_rt_orthon',
    'dataset':    'FD001',
    'n_train_rows': int(X_train.shape[0]),
    'n_features': int(X_train.shape[1]),
    'feature_groups': {
        'cycle': 1,
        'sensors': len(SENSOR_COLS),
        'rolling': len(ROLL_COLS),
        'delta': len(DELTA_COLS),
        'rt_geometry': len(RT_COLS),
        'orthon': len(ORTHON_COLS),
    },
    'fleet_baseline': {
        'early_life_cycles': EARLY_LIFE,
        'n_pca_components': N_PCA,
        'pc1_explained': float(fleet_pca.explained_variance_ratio_[0]),
    },
    'run_a_test': resultsA_test,
    'run_b_test': resultsB_test,
    'run_b_oof_rmse': float(oof_rmse),
    'run_b_gap': float(gap_B),
    'ablation': ablation_results,
    'benchmark_11_72': 11.72,
    'delta_vs_benchmark': float(vs_benchmark),
    'beat_benchmark': bool(vs_benchmark < 0),
}

with open(out_dir / 'summary.json', 'w') as f:
    json.dump(summary, f, indent=2, default=str)

print(f'Saved to {out_dir}/summary.json')
print(json.dumps({k: v for k, v in summary.items() if k not in ['ablation']}, indent=2, default=str))

Saved to /Users/jasonrudder/domains/cmapss/FD_001/Train/output_time/ml_results_config4/summary.json
{
  "experiment": "config4_per_cycle_rt_orthon",
  "dataset": "FD001",
  "n_train_rows": 20631,
  "n_features": 467,
  "feature_groups": {
    "cycle": 1,
    "sensors": 15,
    "rolling": 375,
    "delta": 15,
    "rt_geometry": 5,
    "orthon": 193
  },
  "fleet_baseline": {
    "early_life_cycles": 30,
    "n_pca_components": 10,
    "pc1_explained": 0.482018984132283
  },
  "run_a_test": {
    "ridge": {
      "rmse": 17.123635422213194,
      "mae": 13.85078847032697,
      "r2": 0.8174077935615021,
      "phm08": 432.27172593579684,
      "bias": -0.4859616346492115
    }
  },
  "run_b_test": {
    "lgb": {
      "rmse": 16.2924759507697,
      "mae": 11.452509543046444,
      "r2": 0.8347031915108756,
      "phm08": 411.04611077442473,
      "bias": -2.6957856208428486
    },
    "xgb": {
      "rmse": 17.016990136501455,
      "mae": 11.963552231788634,
      "r2": 0.819675064533