
# 03 — Train & Evaluate (RUL)

Supports three algorithms via a parameter:
- `algo = "xgboost"` (tabular)
- `algo = "lstm"` (sequence)
- `algo = "cnn"` (sequence)

This notebook will:
- Load processed features & labels
- For sequence models, slice each machine's time series into fixed-length sliding windows
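- Train the model and compute metrics on the test set: RMSE, MAE, R², NASA Score, and (optionally) Silhouette Coefficient, which is recorded as NaN unless feature vectors are passed to `compute_metrics`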
- Log results to `experiments/RUL/runs.csv` using `log_run(...)`
    

In [None]:

from pathlib import Path
import json, math
import numpy as np
import pandas as pd
from datetime import datetime

# Paths
# When the working directory is named "RUL" (any case) the project root is
# taken two levels up; otherwise the working directory itself is the root.
NB_PATH = Path.cwd()
if NB_PATH.name.lower() == 'rul':
    ROOT = NB_PATH.parents[1]
else:
    ROOT = NB_PATH
DATA_DIR = ROOT / 'data'
PROCESSED_DIR = DATA_DIR / 'processed'
EXPERIMENTS_DIR = ROOT / 'experiments' / 'RUL'

# Load datasets
# Feature frames first (train, val, test), then the matching 'RUL' label
# columns as NumPy arrays, in the same split order.
Xtr, Xva, Xte = (
    pd.read_parquet(PROCESSED_DIR / fname)
    for fname in ('RUL_X_train.parquet', 'RUL_X_val.parquet', 'RUL_X_test.parquet')
)
ytr, yva, yte = (
    pd.read_parquet(PROCESSED_DIR / fname)['RUL'].values
    for fname in ('RUL_y_train.parquet', 'RUL_y_val.parquet', 'RUL_y_test.parquet')
)

feature_shapes = (Xtr.shape, Xva.shape, Xte.shape)
label_shapes = (ytr.shape, yva.shape, yte.shape)
print("Loaded shapes:", *feature_shapes, "|", *label_shapes)
    

In [None]:

# --- Run configuration: edit this cell to switch model or windowing
# Model family to train; one of: "xgboost", "lstm", "cnn".
algo = "xgboost"
random_seed = 42

# Sliding-window settings, only used by the sequence models:
# `window` rows per sample, `stride` rows between consecutive window starts.
window, stride = 20, 1

# Names of the machine-identifier and time columns in the feature frames.
id_col = 'machine_id'
time_col = 'timestamp'
    

In [None]:

# --- Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import silhouette_score

def nasa_score(y_true, y_pred, c=10.0, c_early=13.0):
    """Return the mean asymmetric exponential RUL penalty (NASA / PHM08 score).

    With ``d = y_pred - y_true`` each sample contributes::

        exp(d / c) - 1          if d >= 0   (late: RUL over-estimated)
        exp(-d / c_early) - 1   if d <  0   (early: RUL under-estimated)

    Because ``c < c_early`` by default, a late prediction is penalised more
    heavily than an early one with the same absolute error. Every term is
    non-negative and grows exponentially with the size of the error.

    The per-sample penalties are averaged (not summed) so that scores stay
    comparable across test sets of different sizes; multiply by the number of
    samples to obtain the summed form.

    Args:
        y_true: array-like of true RUL values.
        y_pred: array-like of predicted RUL values, same shape as ``y_true``.
        c: scale constant for late predictions (``d >= 0``).
        c_early: scale constant for early predictions (``d < 0``).

    Returns:
        The mean penalty as a Python float; ``nan`` for empty input.
    """
    d = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
    if d.size == 0:
        return float('nan')
    # Both branches are evaluated by np.where, but the branch that is *not*
    # selected always has a non-positive exponent, so it cannot overflow.
    penalty = np.where(d >= 0, np.exp(d / c) - 1.0, np.exp(-d / c_early) - 1.0)
    return float(np.mean(penalty))

def compute_metrics(y_true, y_pred, X_for_sil=None):
    """Compute the regression metrics recorded for a run.

    Args:
        y_true: true RUL values.
        y_pred: predicted RUL values, aligned with ``y_true``.
        X_for_sil: optional feature matrix with one row per entry of
            ``y_true``. When given, a silhouette score is computed for the
            samples grouped by true RUL (see below).

    Returns:
        dict with keys ``'rmse'``, ``'mae'``, ``'r2'``, ``'nasa'`` and
        ``'silhouette'``. ``'silhouette'`` is ``nan`` when ``X_for_sil`` is
        None, when ``y_true`` has a single distinct value, or when the
        silhouette computation fails (a ``RuntimeWarning`` is emitted in the
        failure case).
    """
    import warnings

    rmse = math.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    nasa = nasa_score(y_true, y_pred)

    # Silhouette is best-effort: bin the true RUL at its 33% / 66% quantiles
    # (up to three groups) and score how well those groups separate in
    # feature space.
    sil = np.nan
    if X_for_sil is not None and len(np.unique(y_true)) > 1:
        try:
            bins = np.quantile(y_true, [0.33, 0.66])
            labels = np.digitize(y_true, bins)
            sil = silhouette_score(X_for_sil, labels)
        except Exception as exc:
            # Keep the run alive, but do not hide why the value is missing.
            warnings.warn(
                f"silhouette score unavailable, recording NaN: {exc!r}",
                RuntimeWarning,
                stacklevel=2,
            )
    return {'rmse': rmse, 'mae': mae, 'r2': r2, 'nasa': nasa, 'silhouette': sil}
    

In [None]:

# --- Helper: sequence windowing per machine (avoid leakage)
def build_windows(X, y, window=20, stride=1, id_col='machine_id', time_col='timestamp'):
    """Cut per-machine sliding windows out of a feature frame.

    Rows are grouped by ``id_col`` so that no window spans two machines.
    Within a group, rows are used in the order they appear in ``X``; this
    function does not sort, so callers should pass rows already ordered by
    time.

    Args:
        X: DataFrame of features that also contains ``id_col``. Its index may
            be arbitrary; alignment with ``y`` is by row position.
        y: array-like of targets, one per row of ``X`` (same row order).
        window: number of consecutive rows per window (>= 1).
        stride: number of rows between consecutive window starts (>= 1).
        id_col: name of the machine-identifier column.
        time_col: name of the time column; excluded from the features.

    Returns:
        ``(Xs, ys, feats)`` where ``Xs`` has shape
        ``(n_windows, window, n_features)``, ``ys`` holds the target of the
        last row of each window, and ``feats`` lists the feature column names
        (every column except ``id_col``, ``time_col`` and ``'__line'``).

    Raises:
        ValueError: if ``window`` or ``stride`` is smaller than 1, or if
            ``y`` does not have one entry per row of ``X``.
    """
    if window < 1 or stride < 1:
        raise ValueError(f"window and stride must be >= 1, got window={window}, stride={stride}")
    y = np.asarray(y)
    if len(y) != len(X):
        raise ValueError(f"y has {len(y)} entries but X has {len(X)} rows")

    non_features = {id_col, time_col, '__line'}
    feats = [c for c in X.columns if c not in non_features]

    # y is a positional array, so replace whatever index X carries with
    # 0..n-1; after this, index labels and row positions coincide.
    frame = X.reset_index(drop=True)

    Xs, ys = [], []
    for _, grp in frame.groupby(id_col):
        rows = grp.index.to_numpy()
        # Convert once per group instead of once per window.
        grp_feats = grp[feats].to_numpy()
        grp_y = y[rows]
        for start in range(0, len(rows) - window + 1, stride):
            end = start + window
            Xs.append(grp_feats[start:end])  # (window, features)
            ys.append(grp_y[end - 1])        # predict RUL at window end
    Xs = np.stack(Xs, axis=0) if Xs else np.empty((0, window, len(feats)))
    ys = np.array(ys)
    return Xs, ys, feats
    

In [None]:

# --- Train/Eval by algo
# Fits the model selected by `algo` on the training split, monitors the
# validation split, predicts on the test split and stores the predictions and
# the feature column names in `results`.
results = {}

if algo == "xgboost":
    # Tabular: each row of X is one sample, already aligned with its label.
    import xgboost as xgb
    non_feature_cols = {id_col, time_col, '__line'}
    feats = [c for c in Xtr.columns if c not in non_feature_cols]
    model = xgb.XGBRegressor(
        n_estimators=500, max_depth=6, learning_rate=0.05,
        subsample=0.9, colsample_bytree=0.9, random_state=random_seed, n_jobs=-1
    )
    model.fit(Xtr[feats], ytr, eval_set=[(Xva[feats], yva)], verbose=False)
    pred = model.predict(Xte[feats])

elif algo in ("lstm", "cnn"):
    import random
    import tensorflow as tf
    from tensorflow import keras

    # Seed every RNG that weight initialisation and batch shuffling may draw
    # from, so that the seed logged with the run actually reproduces it.
    random.seed(random_seed)
    np.random.seed(random_seed)
    tf.random.set_seed(random_seed)

    # Sequence models consume per-machine sliding windows.
    Xtr_w, ytr_w, feats = build_windows(Xtr, ytr, window=window, stride=stride, id_col=id_col)
    Xva_w, yva_w, _    = build_windows(Xva, yva, window=window, stride=stride, id_col=id_col)
    Xte_w, yte_w, _    = build_windows(Xte, yte, window=window, stride=stride, id_col=id_col)
    if len(Xtr_w) == 0 or len(Xte_w) == 0:
        raise ValueError(
            f"No windows of length {window} could be built for the train or test split; "
            "reduce `window`."
        )

    # Only the sequence encoder differs between the two models; the input
    # layer and the dense regression head are shared.
    if algo == "lstm":
        encoder = [keras.layers.LSTM(64, return_sequences=False)]
    else:
        encoder = [
            keras.layers.Conv1D(64, kernel_size=3, padding='same', activation='relu'),
            keras.layers.Conv1D(64, kernel_size=3, padding='same', activation='relu'),
            keras.layers.GlobalAveragePooling1D(),
        ]
    model = keras.Sequential([
        keras.layers.Input(shape=(window, len(feats))),
        *encoder,
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(1)
    ])
    model.compile(optimizer=keras.optimizers.Adam(1e-3), loss='mse')
    model.fit(Xtr_w, ytr_w, validation_data=(Xva_w, yva_w), epochs=20, batch_size=128, verbose=1)
    # Keras returns shape (n, 1); flatten to match the 1-D label array.
    pred = model.predict(Xte_w).ravel()

else:
    raise ValueError("Unknown algo. Use one of: xgboost, lstm, cnn")

results['pred'] = pred
results['feats'] = feats
    

In [None]:

# --- Evaluate & log
from math import sqrt
import json
metrics = {}  # placeholder; not read in this cell
if algo in {"lstm", "cnn"}:
    # Sequence models predict one value per window, so the ground truth must
    # be the label at the end of each test window. Rebuild the windows with
    # the same settings used for training to get labels aligned with `pred`.
    Xte_w, yte_w, _ = build_windows(Xte, yte, window=window, stride=stride, id_col=id_col)
    y_true = yte_w
else:
    # Tabular model: one prediction per test row.
    y_true = yte

y_pred = results['pred']
# A length mismatch usually means `results` is stale, e.g. `algo`, `window`
# or `stride` was changed without re-running the training cell.
if len(y_true) != len(y_pred):
    raise ValueError(
        f"Ground truth has {len(y_true)} samples but predictions have {len(y_pred)}; "
        "re-run the training cell with the current settings."
    )
m = compute_metrics(y_true, y_pred, X_for_sil=None)
m
    

In [None]:

# --- Log to experiments/RUL/runs.csv
import pandas as pd, json
from datetime import datetime
# Make sure the experiments folder exists before touching the CSV, so the
# first run on a fresh checkout does not fail.
EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)
runs_csv = EXPERIMENTS_DIR / 'runs.csv'
if not runs_csv.exists():
    # Create the file with just the header row.
    pd.DataFrame(columns=[
        'timestamp','task','algo','line','seed','params_json',
        'rmse','mae','r2','nasa','silhouette','dataset_hash','notes','artifact_dir'
    ]).to_csv(runs_csv, index=False)

row = {
    'timestamp': datetime.utcnow().isoformat(),   # naive UTC timestamp
    'task': 'RUL',
    'algo': algo,
    'line': -1,                 # set if training per-line
    'seed': random_seed,        # the seed configured for this run
    'params_json': json.dumps({'algo': algo, 'window': int(window), 'stride': int(stride)}),
    'rmse': m.get('rmse'),
    'mae': m.get('mae'),
    'r2': m.get('r2'),
    'nasa': m.get('nasa'),
    'silhouette': m.get('silhouette'),
    'dataset_hash': '',
    'notes': '',
    'artifact_dir': ''
}
# Append by reading the whole log, concatenating the new row and rewriting.
df_runs = pd.read_csv(runs_csv)
df_runs = pd.concat([df_runs, pd.DataFrame([row])], ignore_index=True)
df_runs.to_csv(runs_csv, index=False)
print("Logged:", runs_csv)
    