
# 03 — Train & Evaluate (CoF)

Supports three algorithms via a parameter:
- `algo = "xgboost"`
- `algo = "lstm"`
- `algo = "cnn"`

- Loads processed features & labels
- For sequence models, slices each machine's time series into fixed-length windows
- Trains the model and computes metrics: **F1, Recall, ROC-AUC**
- Logs results to `experiments/CoF/runs.csv`


In [None]:

from pathlib import Path
import json, math
import numpy as np
import pandas as pd
from datetime import datetime

# Resolve project directories relative to the notebook's working directory.
NB_PATH = Path.cwd()
if NB_PATH.name.lower() == 'cof':
    # Running from a folder named "CoF": the project root is two levels up.
    ROOT = NB_PATH.parents[1]
else:
    # Otherwise the working directory itself is treated as the project root.
    ROOT = NB_PATH
DATA_DIR = ROOT / 'data'
PROCESSED_DIR = DATA_DIR / 'processed'
EXPERIMENTS_DIR = ROOT / 'experiments' / 'CoF'

# Feature frames for the train / validation / test splits.
Xtr, Xva, Xte = (
    pd.read_parquet(PROCESSED_DIR / f'CoF_X_{split}.parquet')
    for split in ('train', 'val', 'test')
)

# Matching label vectors: the 'CoF' column of each label file as a NumPy array.
ytr, yva, yte = (
    pd.read_parquet(PROCESSED_DIR / f'CoF_y_{split}.parquet')['CoF'].values
    for split in ('train', 'val', 'test')
)

print(
    "Loaded shapes:",
    *(frame.shape for frame in (Xtr, Xva, Xte)),
    "|",
    *(labels.shape for labels in (ytr, yva, yte)),
)


In [None]:

# --- Run configuration: edit these values before running the cells below
algo = "xgboost"        # model to train: "xgboost", "lstm" or "cnn"
random_seed = 42        # passed to the XGBoost classifier as random_state

# Windowing for the sequence models: rows per window and the step between
# consecutive window starts.
window, stride = 20, 1

# Names of the machine-identifier and time columns in the feature frames.
id_col = 'machine_id'
time_col = 'timestamp'


In [None]:

# --- Metrics
from sklearn.metrics import f1_score, recall_score, roc_auc_score, confusion_matrix, precision_recall_curve, roc_curve

def compute_metrics_cls(y_true, y_prob, threshold=0.5):
    """Compute F1, recall, ROC-AUC and the confusion matrix for binary scores.

    Args:
        y_true: Ground-truth binary labels (0/1), one per sample.
        y_prob: Predicted positive-class scores, same length as ``y_true``.
            Any array-like is accepted (list, NumPy array, ...).
        threshold: Scores ``>= threshold`` are counted as positive predictions.

    Returns:
        dict with keys:
            'f1', 'recall': metrics of the thresholded predictions
                (0 when undefined, via ``zero_division=0``).
            'roc_auc': ROC-AUC of the raw scores, or NaN when sklearn rejects
                the input (e.g. only one class present in ``y_true``).
            'cm': confusion matrix as a nested list, always 2x2 with rows and
                columns ordered as labels [0, 1].
    """
    # Accept plain lists as well as arrays; a list does not support `>=`.
    y_prob = np.asarray(y_prob)
    y_pred = (y_prob >= threshold).astype(int)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    try:
        auc = roc_auc_score(y_true, y_prob)
    except ValueError:
        # ROC-AUC is undefined for this input (e.g. a single class in y_true).
        auc = float('nan')
    # Pin the label set so the matrix stays 2x2 even if a class is absent from
    # both y_true and y_pred; otherwise sklearn would return a 1x1 matrix.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    return {'f1': f1, 'recall': rec, 'roc_auc': auc, 'cm': cm.tolist()}


In [None]:

# --- Helper: sequence windowing per machine (avoid leakage)
def build_windows(X, y, window=20, stride=1, id_col='machine_id', time_col='timestamp'):
    """Slice each machine's rows into fixed-length windows for sequence models.

    Windows are built inside one ``id_col`` group at a time, so no window ever
    mixes rows from two machines. Rows are used in the order they appear in
    ``X``; they are not re-sorted here.
    NOTE(review): this presumes X is already in chronological order within
    each machine -- confirm against the preprocessing step.

    Args:
        X: DataFrame containing ``id_col`` plus the feature columns.
        y: Array-like of labels aligned with the rows of ``X`` by position
            (``y[i]`` belongs to the i-th row of ``X``).
        window: Number of consecutive rows per window (>= 1).
        stride: Step between the starts of consecutive windows (>= 1).
        id_col: Column identifying the machine / group.
        time_col: Time column; excluded from the features.

    Returns:
        Xs: array of shape (n_windows, window, n_features).
        ys: array of shape (n_windows,) holding the label of the last row of
            each window.
        feats: list of feature column names, i.e. every column of ``X`` except
            ``id_col``, ``time_col`` and '__line'.

    Raises:
        ValueError: if ``window`` or ``stride`` is not positive, or if ``X``
            and ``y`` differ in length.
    """
    if window < 1 or stride < 1:
        raise ValueError("window and stride must be positive integers")
    y = np.asarray(y)
    if len(y) != len(X):
        raise ValueError(f"X has {len(X)} rows but y has {len(y)} labels")

    feats = [c for c in X.columns if c not in (id_col, time_col, '__line')]
    # Convert the features once instead of slicing the DataFrame per window.
    feat_values = X[feats].to_numpy()

    # Group *positional* row numbers by machine. y is indexed by position, so
    # X's index labels must not be used to look labels up: they only coincide
    # with positions when X has a default RangeIndex.
    row_pos = pd.Series(np.arange(len(X)))
    group_keys = X[id_col].reset_index(drop=True)

    Xs, ys = [], []
    for _, grp in row_pos.groupby(group_keys):
        pos = grp.to_numpy()
        for start in range(0, len(pos) - window + 1, stride):
            win = pos[start:start + window]
            Xs.append(feat_values[win])        # (window, features)
            ys.append(y[win[-1]])              # label at window end
    Xs = np.stack(Xs, axis=0) if Xs else np.empty((0, window, len(feats)))
    ys = np.array(ys, dtype=y.dtype)
    return Xs, ys, feats


In [None]:

# --- Train/Eval by algo
# Fills `results` with the test-set probabilities ('prob') and the feature
# names used ('feats'). The sequence branch also defines `yte_w`, the
# per-window test labels that the evaluation cells use as ground truth.
results = {}
if algo == "xgboost":
    import xgboost as xgb

    feats = [c for c in Xtr.columns if c not in (id_col, time_col, '__line')]
    # Up-weight positives by the negative/positive ratio (never below 1); the
    # inner max() avoids dividing by zero when the training set has no positives.
    scale_pos_weight = max(1.0, (len(ytr) - ytr.sum()) / max(1.0, ytr.sum()))  # handle imbalance
    model = xgb.XGBClassifier(
        n_estimators=600, max_depth=6, learning_rate=0.05,
        subsample=0.9, colsample_bytree=0.9, random_state=random_seed,
        scale_pos_weight=scale_pos_weight, n_jobs=-1, eval_metric='logloss'
    )
    # The validation split is passed as an eval set; no early stopping is
    # configured here.
    model.fit(Xtr[feats], ytr, eval_set=[(Xva[feats], yva)], verbose=False)
    prob = model.predict_proba(Xte[feats])[:, 1]
    results['prob'] = prob
    results['feats'] = feats

elif algo in ("lstm", "cnn"):
    import tensorflow as tf
    from tensorflow import keras

    # Apply the configured seed to NumPy and TensorFlow so that the seed
    # recorded for the run is the one that actually drove training.
    # NOTE(review): some GPU kernels may remain nondeterministic -- confirm if
    # bit-exact reproducibility is required.
    np.random.seed(random_seed)
    tf.random.set_seed(random_seed)

    Xtr_w, ytr_w, feats = build_windows(Xtr, ytr, window=window, stride=stride, id_col=id_col)
    Xva_w, yva_w, _ = build_windows(Xva, yva, window=window, stride=stride, id_col=id_col)
    Xte_w, yte_w, _ = build_windows(Xte, yte, window=window, stride=stride, id_col=id_col)

    # Fail early with a readable message instead of an opaque Keras error.
    for split_name, windows in (("train", Xtr_w), ("val", Xva_w), ("test", Xte_w)):
        if len(windows) == 0:
            raise ValueError(
                f"No {split_name} windows were produced: "
                f"every machine has fewer than window={window} rows"
            )

    # Only the sequence encoder differs between the two models; the input
    # layer, dense head, compile and fit settings are shared.
    if algo == "lstm":
        encoder = [keras.layers.LSTM(64, return_sequences=False)]
    else:
        encoder = [
            keras.layers.Conv1D(64, kernel_size=3, padding='same', activation='relu'),
            keras.layers.Conv1D(64, kernel_size=3, padding='same', activation='relu'),
            keras.layers.GlobalAveragePooling1D(),
        ]
    model = keras.Sequential([
        keras.layers.Input(shape=(window, len(feats))),
        *encoder,
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer=keras.optimizers.Adam(1e-3), loss='binary_crossentropy', metrics=['Recall'])
    model.fit(Xtr_w, ytr_w, validation_data=(Xva_w, yva_w), epochs=20, batch_size=128, verbose=1)
    # predict() returns one sigmoid output per window; flatten to 1-D.
    prob = model.predict(Xte_w).ravel()
    results['prob'] = prob
    results['feats'] = feats

else:
    raise ValueError("Unknown algo. Use one of: xgboost, lstm, cnn")


In [None]:

# --- Evaluate & log
# XGBoost scores every test row, so its ground truth is `yte`; the sequence
# models score one window each, so theirs is the per-window labels `yte_w`.
if algo == 'xgboost':
    y_true = yte
else:
    y_true = yte_w
y_prob = results['prob']

# Metrics at the 0.5 decision threshold; the bare name on the last line makes
# the notebook display the resulting dict.
m = compute_metrics_cls(y_true, y_prob, threshold=0.5)
m


In [None]:

# --- Log to experiments/CoF/runs.csv
# Appends one row describing this run (algorithm, seed, headline metrics) to
# the shared runs log, creating the directory and file on first use.
import json
import pandas as pd
from datetime import datetime, timezone

RUN_COLUMNS = [
    'timestamp', 'task', 'algo', 'line', 'seed', 'params_json',
    'f1', 'recall', 'roc_auc', 'dataset_hash', 'notes', 'artifact_dir',
]

runs_csv = EXPERIMENTS_DIR / 'runs.csv'
# Writing the CSV fails if the experiments folder has not been created yet.
EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)

row = {
    # UTC time without an offset suffix: the same text format as the
    # deprecated datetime.utcnow().isoformat(), so older rows stay comparable.
    'timestamp': datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
    'task': 'CoF',
    'algo': algo,
    'line': -1,                 # placeholder; this notebook does not set it
    'seed': random_seed,        # record the configured seed, not a literal
    'params_json': json.dumps({'algo': algo, 'window': int(window)}),
    'f1': m.get('f1'),
    'recall': m.get('recall'),
    'roc_auc': m.get('roc_auc'),
    'dataset_hash': '',
    'notes': '',
    'artifact_dir': ''
}
row_df = pd.DataFrame([row], columns=RUN_COLUMNS)

if runs_csv.exists():
    # Keep every earlier run and add this one at the end.
    df_runs = pd.concat([pd.read_csv(runs_csv), row_df], ignore_index=True)
else:
    # First run: the log consists of just this row under the standard header.
    df_runs = row_df
df_runs.to_csv(runs_csv, index=False)
print("Logged:", runs_csv)


In [None]:

# --- Curves & Confusion Matrix
# Draws the ROC curve, the precision-recall curve and the confusion matrix at
# a 0.5 threshold for the test-set predictions stored in `results`.
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve, ConfusionMatrixDisplay, confusion_matrix

# Re-read the inputs here so this cell does not depend on variables left
# behind by the evaluation cell.
y_prob = results['prob']
y_true = yte if algo=='xgboost' else yte_w

# ROC curve, with the diagonal as the chance-level reference
fpr, tpr, _ = roc_curve(y_true, y_prob)
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], '--')
ax.set_title('CoF — ROC Curve')
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
fig.tight_layout()
plt.show()

# PR curve
prec, rec, _ = precision_recall_curve(y_true, y_prob)
fig, ax = plt.subplots()
ax.plot(rec, prec)
ax.set_title('CoF — Precision-Recall Curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
fig.tight_layout()
plt.show()

# Confusion matrix at 0.5. The label set is pinned so the matrix is always
# 2x2, even when one class is missing from the labels and the predictions.
cm = confusion_matrix(y_true, (y_prob >= 0.5).astype(int), labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
# Draw onto explicit axes: without `ax`, plot() creates its own figure, and a
# preceding plt.figure() would be left behind as an empty extra plot.
fig, ax = plt.subplots()
disp.plot(ax=ax, values_format='d')
ax.set_title('CoF — Confusion Matrix @0.5')
fig.tight_layout()
plt.show()
