
# 02 — Feature Engineering (CoF, Combined Line 10 & Line 20)

This notebook builds **Chance of Failure (CoF)** features from **both Line 10 and Line 20**, even if they have **different sensor columns**:

- Robust project-root detection
- Per-line, **batched** feature engineering with `float32` (memory-safe)
- **Union schema** alignment so both lines share the same feature columns
- **Time split per line (80/10/10)**, then concatenated into combined train/val/test
- **No scaling** required (tree models handle raw/NaN well). You can enable scaling later if needed.


In [1]:

from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
import joblib
import shutil

NB_PATH = Path.cwd()

def find_project_root(start: Path) -> Path:
    cur = start
    for _ in range(6):  # look up to 6 levels up
        if (cur / "requirements.txt").exists() or (cur / "configs").exists():
            return cur
        cur = cur.parent
    # Fallback: assume <repo>/notebooks/<task> structure
    try:
        i = [p.name.lower() for p in start.parents].index("notebooks")
        return start.parents[i+1]
    except ValueError:
        return start  # last resort

ROOT = find_project_root(NB_PATH)
DATA_DIR = ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed"
DATA_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

print("ROOT:", ROOT)
print("PROCESSED_DIR:", PROCESSED_DIR)

# --- If a mislabeled file exists under notebooks/<task>/data/processed, relocate it
wrong_proc = NB_PATH / "data" / "processed"
if wrong_proc.exists():
    for name in ["cof_labeled.parquet", "cof_labeled.csv", "rul_labeled.parquet", "rul_labeled.csv"]:
        src = wrong_proc / name
        if src.exists():
            dst = PROCESSED_DIR / name
            try:
                shutil.move(str(src), str(dst))
                print(f"Moved misplaced file → {dst}")
            except Exception as e:
                print("Relocation warning:", e)


ROOT: d:\Richard Files\WORK\pdm-project
PROCESSED_DIR: d:\Richard Files\WORK\pdm-project\data\processed


In [2]:

# --- Config
id_col   = "machine_id"
time_col = "timestamp"
target   = "CoF"      # binary label (0/1)

# Expect a labeled merged file from 01 (may or may not include __line)
LABELED   = PROCESSED_DIR / "cof_labeled.parquet"
CSV_FALLB = PROCESSED_DIR / "cof_labeled.csv"

# Per-line shard output
SHARDS_L10 = PROCESSED_DIR / "shards_CoF_L10"
SHARDS_L20 = PROCESSED_DIR / "shards_CoF_L20"
for p in [SHARDS_L10, SHARDS_L20]:
    p.mkdir(parents=True, exist_ok=True)

print("Labeled path:", LABELED)


Labeled path: d:\Richard Files\WORK\pdm-project\data\processed\cof_labeled.parquet


In [3]:

# --- Load labeled dataset (robust + fallback)
if not LABELED.exists() and not CSV_FALLB.exists():
    raise FileNotFoundError(
        f"Missing labeled file:\n  {LABELED}\n(or CSV fallback)\n"
        "→ Run notebooks/CoF/01_eda_data_prep_CoF.ipynb to generate it."
    )

try:
    if LABELED.exists():
        df = pd.read_parquet(LABELED)
        src = LABELED
    else:
        df = pd.read_csv(CSV_FALLB)
        src = CSV_FALLB
except Exception as e:
    # Parquet engine missing? Fall back to CSV if available
    if "pyarrow" in str(e).lower() and CSV_FALLB.exists():
        df = pd.read_csv(CSV_FALLB)
        src = CSV_FALLB
    else:
        raise

print("Loaded:", df.shape, "| From:", src)


Loaded: (699840, 222) | From: d:\Richard Files\WORK\pdm-project\data\processed\cof_labeled.parquet


In [4]:

# --- Normalize schema: Mesin→machine_id, Timestamp→timestamp; ensure datetime; ensure __line
df = df.rename(columns={"Mesin":"machine_id", "Timestamp":"timestamp"})
df[time_col] = pd.to_datetime(df[time_col], errors="coerce")

# Ensure __line present; try to infer if missing
if "__line" not in df.columns:
    if "Line" in df.columns:
        df["__line"] = df["Line"]
    else:
        df["__line"] = np.nan  # if unknown, still works as single group

# Ensure target present
if target not in df.columns:
    if "Breakdown" in df.columns:
        df[target] = (df["Breakdown"].astype(float) > 0).astype(int)
    else:
        raise RuntimeError(f"Target column '{target}' not found and cannot infer from 'Breakdown'.")

# Convert object-like sensor cols to numeric where possible
non_feature = {id_col, time_col, "__line", target, "Breakdown", "RUL"}
for c in df.columns:
    if c not in non_feature and df[c].dtype == "object":
        df[c] = pd.to_numeric(df[c], errors="coerce")

df = df.sort_values([id_col, time_col]).reset_index(drop=True)

print("Columns:", len(df.columns))
print("Lines present (unique __line):", df["__line"].dropna().unique()[:10])
print(df[[id_col, time_col, "__line", target]].head())


Columns: 222
Lines present (unique __line): [10 20]
   machine_id           timestamp  __line  CoF
0          10 2025-01-01 00:00:00      10    0
1          10 2025-01-01 00:01:00      10    0
2          10 2025-01-01 00:02:00      10    0
3          10 2025-01-01 00:03:00      10    0
4          10 2025-01-01 00:04:00      10    0


In [5]:

# --- Memory-safe feature engineering helpers
import gc
import numpy as np
import pandas as pd

LAGS  = (1,)     # keep light; expand later
WINS  = (3,)     # keep light; expand later
BATCH = 32       # smaller batch = lower peak RAM
COVERAGE_MIN = 0.85   # require ≥85% non-null per raw sensor
MAX_FEATS     = 800   # hard cap to avoid explosion

def _raw_sensor_list(df_line, id_col="machine_id", time_col="timestamp", target="CoF"):
    """Return ONLY raw numeric sensors (no engineered suffixes)."""
    skip = {id_col, time_col, "__line", target, "Breakdown", "RUL"}
    cand = [c for c in df_line.select_dtypes(include="number").columns if c not in skip]
    # exclude previously engineered columns if cell is re-run
    raw = [c for c in cand if ("_lag" not in c and "_r" not in c)]
    return raw

def _prune_by_coverage(df_line, feats, threshold=COVERAGE_MIN, limit=MAX_FEATS):
    if not feats:
        return []
    cov = df_line[feats].notna().mean().sort_values(ascending=False)
    keep = cov[cov >= threshold].index.tolist()
    if not keep:  # if too strict, soften to top-N by coverage
        keep = cov.index.tolist()
    if limit:
        keep = keep[:limit]
    return keep

def _chunked(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i+n]

def engineer_per_line_to_shards(df_line: pd.DataFrame, out_dir: Path):
    id_col, time_col, target = "machine_id", "timestamp", "CoF"
    shard_paths = []

    # 1) pick RAW sensors only
    feats_base_raw = _raw_sensor_list(df_line, id_col, time_col, target)
    feats_base = _prune_by_coverage(df_line, feats_base_raw, COVERAGE_MIN, MAX_FEATS)
    print(f"Raw sensors available: {len(feats_base_raw)} | after coverage/cap: {len(feats_base)}")
    if len(feats_base_raw) != len(feats_base):
        print("ℹ️ Pruned low-coverage or excess sensors to keep memory stable.")

    # 2) group per machine
    for gid, g0 in df_line.groupby(id_col, sort=True):
        g = g0[[id_col, time_col, "__line", target]].copy()
        g[time_col] = pd.to_datetime(g[time_col], errors="coerce")

        # 3) engineer in batches with minimal concat
        for cols in _chunked(feats_base, BATCH):
            block = g0[cols].astype("float32").copy()
            to_concat = [g]  # always include current g (left side)

            # lags
            for L in LAGS:
                lagged = block.shift(L)
                lagged.columns = [f"{c}_lag{L}" for c in cols]
                to_concat.append(lagged)

            # rolling mean (add std later if RAM allows)
            for w in WINS:
                roll_mean = block.rolling(w, min_periods=w).mean()
                roll_mean.columns = [f"{c}_r{w}_mean" for c in cols]
                to_concat.append(roll_mean)

            # single concat per batch (faster & less RAM than many concats)
            g = pd.concat(to_concat, axis=1)
            del block, to_concat
            gc.collect()

        # 4) warm-up trim instead of global dropna
        warmup = max(LAGS or (0,)) + max(WINS or (1,)) - 1
        if warmup > 0 and len(g) > warmup:
            g = g.iloc[warmup:].copy()

        # 5) gentle fill for features only (never touch target)
        feat_cols_all = [c for c in g.columns if c not in {id_col, time_col, "__line", target}]
        if feat_cols_all:
            g[feat_cols_all] = g[feat_cols_all].ffill().bfill()
            # drop rows only if ALL engineered features are NaN
            g = g.dropna(subset=feat_cols_all, how="all")

        if len(g) == 0:
            print(f"⚠️ Empty after warm-up/ffill: machine={gid} — skipping")
            continue

        out_path = out_dir / f"part_{id_col}_{gid}.parquet"
        g.to_parquet(out_path, index=False)
        shard_paths.append(out_path)
        print(f"✓ {out_path.name}: rows={len(g)}, feats={len(feat_cols_all)}")

        del g
        gc.collect()

    return shard_paths


In [7]:

# --- Engineer per line → shards
has_lines = df["__line"].notna().any()
paths_L10, paths_L20 = [], []

if has_lines:
    if (df["__line"] == 10).any():
        paths_L10 = engineer_per_line_to_shards(df[df["__line"] == 10], SHARDS_L10)
    if (df["__line"] == 20).any():
        paths_L20 = engineer_per_line_to_shards(df[df["__line"] == 20], SHARDS_L20)
else:
    df["__line"] = 10
    paths_L10 = engineer_per_line_to_shards(df, SHARDS_L10)

print("Shards L10:", len(paths_L10))
print("Shards L20:", len(paths_L20))


Raw sensors available: 217 | after coverage/cap: 68
ℹ️ Pruned low-coverage or excess sensors to keep memory stable.
✓ part_machine_id_10.parquet: rows=349917, feats=136
Raw sensors available: 217 | after coverage/cap: 74
ℹ️ Pruned low-coverage or excess sensors to keep memory stable.
✓ part_machine_id_20.parquet: rows=349917, feats=148
Shards L10: 1
Shards L20: 1


In [8]:

# --- Build union schema from a sample of shards
ignore = {id_col, time_col, "__line", target}
def collect_feats(paths, k=5):
    feats = set()
    for p in sorted(paths)[:k]:
        g = pd.read_parquet(p)
        feats |= set([c for c in g.columns if c not in ignore])
    return feats

feat10 = collect_feats(paths_L10) if paths_L10 else set()
feat20 = collect_feats(paths_L20) if paths_L20 else set()
ALL_FEATS = sorted(feat10 | feat20)
print("Union features:", len(ALL_FEATS))


Union features: 282


In [9]:

# --- Align shard to union schema
def align_union(g: pd.DataFrame, all_feats, target: str) -> pd.DataFrame:
    g = g.copy()
    for c in all_feats:
        if c not in g.columns:
            g[c] = pd.NA
    cols = [id_col, time_col, "__line"] + all_feats + [target]
    return g[cols]


In [10]:

# --- Compute time cutoffs per line (80/10/10)
def cutoffs_for(paths):
    if not paths: 
        return None, None
    meta = []
    for p in paths:
        gg = pd.read_parquet(p, columns=[time_col])
        gg[time_col] = pd.to_datetime(gg[time_col], errors="coerce")
        meta.append(gg)
    meta = pd.concat(meta, ignore_index=True).dropna(subset=[time_col])
    if meta.empty:
        return None, None
    q1 = meta[time_col].quantile(0.80)
    q2 = meta[time_col].quantile(0.90)
    return q1, q2

cut10_tr, cut10_va = cutoffs_for(paths_L10)
cut20_tr, cut20_va = cutoffs_for(paths_L20)
print("Cutoffs L10:", cut10_tr, "|", cut10_va)
print("Cutoffs L20:", cut20_tr, "|", cut20_va)


Cutoffs L10: 2025-07-14 09:35:48 | 2025-08-07 16:47:24
Cutoffs L20: 2025-07-14 09:35:48 | 2025-08-07 16:47:24


In [11]:

# --- Helper to append parquet
from pathlib import Path
def append_parquet(df, path: Path):
    if path.exists():
        old = pd.read_parquet(path)
        pd.concat([old, df], ignore_index=True).to_parquet(path, index=False)
    else:
        df.to_parquet(path, index=False)


In [12]:

# --- Stream & split per line, write *_L10.parquet / *_L20.parquet
Xtr_L10 = PROCESSED_DIR / "CoF_X_train_L10.parquet"
Xva_L10 = PROCESSED_DIR / "CoF_X_val_L10.parquet"
Xte_L10 = PROCESSED_DIR / "CoF_X_test_L10.parquet"
ytr_L10 = PROCESSED_DIR / "CoF_y_train_L10.parquet"
yva_L10 = PROCESSED_DIR / "CoF_y_val_L10.parquet"
yte_L10 = PROCESSED_DIR / "CoF_y_test_L10.parquet"

Xtr_L20 = PROCESSED_DIR / "CoF_X_train_L20.parquet"
Xva_L20 = PROCESSED_DIR / "CoF_X_val_L20.parquet"
Xte_L20 = PROCESSED_DIR / "CoF_X_test_L20.parquet"
ytr_L20 = PROCESSED_DIR / "CoF_y_train_L20.parquet"
yva_L20 = PROCESSED_DIR / "CoF_y_val_L20.parquet"
yte_L20 = PROCESSED_DIR / "CoF_y_test_L20.parquet"

for p in [Xtr_L10, Xva_L10, Xte_L10, ytr_L10, yva_L10, yte_L10,
          Xtr_L20, Xva_L20, Xte_L20, ytr_L20, yva_L20, yte_L20]:
    if p.exists():
        p.unlink()

def route_and_write(paths, cut_tr, cut_va, tag):
    if not paths or cut_tr is None or cut_va is None:
        print(f"Skipping line {tag}: missing paths or cutoffs")
        return

    for p in paths:
        g = pd.read_parquet(p)
        g = align_union(g, ALL_FEATS, target)

        mask_tr = g[time_col] <= cut_tr
        mask_va = (g[time_col] > cut_tr) & (g[time_col] <= cut_va)
        mask_te = g[time_col] > cut_va

        if tag == "L10":
            append_parquet(g.loc[mask_tr], Xtr_L10); append_parquet(g.loc[mask_va], Xva_L10); append_parquet(g.loc[mask_te], Xte_L10)
            append_parquet(pd.DataFrame({target: g.loc[mask_tr, target].values}), ytr_L10)
            append_parquet(pd.DataFrame({target: g.loc[mask_va, target].values}), yva_L10)
            append_parquet(pd.DataFrame({target: g.loc[mask_te, target].values}), yte_L10)
        else:
            append_parquet(g.loc[mask_tr], Xtr_L20); append_parquet(g.loc[mask_va], Xva_L20); append_parquet(g.loc[mask_te], Xte_L20)
            append_parquet(pd.DataFrame({target: g.loc[mask_tr, target].values}), ytr_L20)
            append_parquet(pd.DataFrame({target: g.loc[mask_va, target].values}), yva_L20)
            append_parquet(pd.DataFrame({target: g.loc[mask_te, target].values}), yte_L20)

route_and_write(paths_L10, cut10_tr, cut10_va, "L10")
route_and_write(paths_L20, cut20_tr, cut20_va, "L20")

print("Per-line splits written (if available).")


  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = p

Per-line splits written (if available).


In [13]:

# --- Merge per-line splits → combined
def concat_splits(base_name):
    files = list(PROCESSED_DIR.glob(f"{base_name}_L*.parquet"))
    if not files:
        return None
    dfs = [pd.read_parquet(f) for f in files if f.exists()]
    if not dfs:
        return None
    out = pd.concat(dfs, ignore_index=True)
    out_path = PROCESSED_DIR / f"{base_name}.parquet"
    out.to_parquet(out_path, index=False)
    print(f"Merged {len(dfs)} → {out_path.name} ({len(out)} rows)")
    return out_path

for nm in ["CoF_X_train", "CoF_X_val", "CoF_X_test",
           "CoF_y_train", "CoF_y_val", "CoF_y_test"]:
    concat_splits(nm)


  out = pd.concat(dfs, ignore_index=True)


Merged 2 → CoF_X_train.parquet (559866 rows)


  out = pd.concat(dfs, ignore_index=True)


Merged 2 → CoF_X_val.parquet (69984 rows)


  out = pd.concat(dfs, ignore_index=True)


Merged 2 → CoF_X_test.parquet (69984 rows)
Merged 2 → CoF_y_train.parquet (559866 rows)
Merged 2 → CoF_y_val.parquet (69984 rows)
Merged 2 → CoF_y_test.parquet (69984 rows)


In [14]:

# --- Diagnostics: positive counts per split
for nm in ["CoF_y_train", "CoF_y_val", "CoF_y_test"]:
    p = PROCESSED_DIR / f"{nm}.parquet"
    if p.exists():
        y = pd.read_parquet(p)
        pos = int((y[target] == 1).sum())
        total = len(y)
        rate = (pos/total)*100 if total>0 else 0
        print(f"{nm}: positives={pos} / {total} ({rate:.2f}%)")


CoF_y_train: positives=930 / 559866 (0.17%)
CoF_y_val: positives=0 / 69984 (0.00%)
CoF_y_test: positives=10260 / 69984 (14.66%)
