# 02 — Feature Engineering for RUL (Combined Line 10 & 20)

This notebook:
- Loads the RUL-labeled dataset
- Engineers time-series features per line (Line 10 and 20)
- Uses sharding + union schema
- Splits train/val/test per line using failure-aware time cutoffs
- Saves combined RUL train/val/test splits to `data/processed/`.

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import gc

NB_PATH = Path.cwd()

def find_project_root(start: Path) -> Path:
    cur = start
    for _ in range(6):
        if (cur / "requirements.txt").exists() or (cur / "configs").exists() or (cur / "data").exists():
            return cur
        cur = cur.parent
    try:
        i = [p.name.lower() for p in start.parents].index("notebooks")
        return start.parents[i+1]
    except ValueError:
        return start

ROOT = find_project_root(NB_PATH)
DATA_DIR = ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

print("ROOT:", ROOT)
print("PROCESSED_DIR:", PROCESSED_DIR)

ROOT: d:\Documents\Thesis\pdm-project
PROCESSED_DIR: d:\Documents\Thesis\pdm-project\data\processed


In [2]:
id_col   = "machine_id"
time_col = "timestamp"
target   = "RUL"

LABELED   = PROCESSED_DIR / "rul_labeled.parquet"
CSV_FALLB = PROCESSED_DIR / "rul_labeled.csv"

SHARDS_L10 = PROCESSED_DIR / "shards_RUL_L10"
SHARDS_L20 = PROCESSED_DIR / "shards_RUL_L20"
for p in [SHARDS_L10, SHARDS_L20]:
    p.mkdir(parents=True, exist_ok=True)

if not LABELED.exists() and not CSV_FALLB.exists():
    raise FileNotFoundError("Missing labeled RUL file. Run 01_eda_data_prep_RUL.ipynb first.")

try:
    if LABELED.exists():
        df = pd.read_parquet(LABELED)
        src = LABELED
    else:
        df = pd.read_csv(CSV_FALLB)
        src = CSV_FALLB
except Exception as e:
    if "pyarrow" in str(e).lower() and CSV_FALLB.exists():
        df = pd.read_csv(CSV_FALLB)
        src = CSV_FALLB
    else:
        raise

print("Loaded:", df.shape, "| From:", src)

Loaded: (656762, 216) | From: d:\Documents\Thesis\pdm-project\data\processed\rul_labeled.parquet


In [3]:
df = df.rename(columns={"Mesin":"machine_id", "Timestamp":"timestamp"})
df[time_col] = pd.to_datetime(df[time_col], errors="coerce")

if "__line" not in df.columns:
    if "Line" in df.columns:
        df["__line"] = df["Line"]
    else:
        df["__line"] = np.nan

df[target] = pd.to_numeric(df[target], errors="coerce")

non_feature = {id_col, time_col, "__line", target, "Breakdown"}
for c in df.columns:
    if c not in non_feature and df[c].dtype == "object":
        df[c] = pd.to_numeric(df[c], errors="coerce")

df = df.sort_values([id_col, time_col]).reset_index(drop=True)
df[[id_col, time_col, "__line", target]].head()

Unnamed: 0,machine_id,timestamp,__line,RUL
0,10,2025-01-01 00:00:00,10,17520.0
1,10,2025-01-01 00:01:00,10,17519.0
2,10,2025-01-01 00:02:00,10,17518.0
3,10,2025-01-01 00:03:00,10,17517.0
4,10,2025-01-01 00:04:00,10,17516.0


In [4]:
LAGS  = (1,)
WINS  = (3,)
BATCH = 32
COVERAGE_MIN = 0.85
MAX_FEATS     = 800

def _raw_sensor_list(df_line, id_col, time_col, target):
    skip = {id_col, time_col, "__line", target, "Breakdown"}
    cand = [c for c in df_line.select_dtypes(include="number").columns if c not in skip]
    raw = [c for c in cand if ("_lag" not in c and "_r" not in c)]
    return raw

def _prune_by_coverage(df_line, feats, threshold=COVERAGE_MIN, limit=MAX_FEATS):
    if not feats:
        return []
    cov = df_line[feats].notna().mean().sort_values(ascending=False)
    keep = cov[cov >= threshold].index.tolist()
    if not keep:
        keep = cov.index.tolist()
    if limit:
        keep = keep[:limit]
    return keep

def _chunked(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i+n]

def engineer_per_line_to_shards(df_line: pd.DataFrame, out_dir: Path):
    shard_paths = []

    feats_base_raw = _raw_sensor_list(df_line, id_col, time_col, target)
    feats_base = _prune_by_coverage(df_line, feats_base_raw, COVERAGE_MIN, MAX_FEATS)
    print(f"Raw sensors: {len(feats_base_raw)} | after pruning: {len(feats_base)}")

    for gid, g0 in df_line.groupby(id_col, sort=True):
        g = g0[[id_col, time_col, "__line", target]].copy()
        g[time_col] = pd.to_datetime(g[time_col], errors="coerce")

        for cols in _chunked(feats_base, BATCH):
            block = g0[cols].astype("float32").copy()
            to_concat = [g]

            for L in LAGS:
                lagged = block.shift(L)
                lagged.columns = [f"{c}_lag{L}" for c in cols]
                to_concat.append(lagged)

            for w in WINS:
                roll_mean = block.rolling(w, min_periods=w).mean()
                roll_mean.columns = [f"{c}_r{w}_mean" for c in cols]
                to_concat.append(roll_mean)

            g = pd.concat(to_concat, axis=1)
            del block, to_concat
            gc.collect()

        warmup = max(LAGS or (0,)) + max(WINS or (1,)) - 1
        if warmup > 0 and len(g) > warmup:
            g = g.iloc[warmup:].copy()

        feat_cols_all = [c for c in g.columns if c not in {id_col, time_col, "__line", target}]
        if feat_cols_all:
            g[feat_cols_all] = g[feat_cols_all].ffill().bfill()
            g = g.dropna(subset=feat_cols_all, how="all")

        if len(g) == 0:
            print(f"⚠️ Empty after warm-up: machine={gid} — skipping")
            continue

        out_path = out_dir / f"part_{id_col}_{gid}.parquet"
        g.to_parquet(out_path, index=False)
        shard_paths.append(out_path)
        print(f"✓ {out_path.name}: rows={len(g)}, feats={len(feat_cols_all)}")

        del g
        gc.collect()

    return shard_paths

In [5]:
has_lines = df["__line"].notna().any()
paths_L10, paths_L20 = [], []

if has_lines:
    if (df["__line"] == 10).any():
        paths_L10 = engineer_per_line_to_shards(df[df["__line"] == 10], SHARDS_L10)
    if (df["__line"] == 20).any():
        paths_L20 = engineer_per_line_to_shards(df[df["__line"] == 20], SHARDS_L20)
else:
    df["__line"] = 10
    paths_L10 = engineer_per_line_to_shards(df, SHARDS_L10)

print("Shards L10:", len(paths_L10))
print("Shards L20:", len(paths_L20))

Raw sensors: 211 | after pruning: 65
✓ part_machine_id_10.parquet: rows=333598, feats=130
Raw sensors: 211 | after pruning: 72
✓ part_machine_id_20.parquet: rows=323158, feats=144
Shards L10: 1
Shards L20: 1


In [6]:
ignore = {id_col, time_col, "__line", target}

def collect_feats(paths, k=5):
    feats = set()
    for p in sorted(paths)[:k]:
        g = pd.read_parquet(p)
        feats |= set([c for c in g.columns if c not in ignore])
    return feats

feat10 = collect_feats(paths_L10) if paths_L10 else set()
feat20 = collect_feats(paths_L20) if paths_L20 else set()
ALL_FEATS = sorted(feat10 | feat20)
print("Union features:", len(ALL_FEATS))

def align_union(g: pd.DataFrame, all_feats, target: str) -> pd.DataFrame:
    g = g.copy()
    for c in all_feats:
        if c not in g.columns:
            g[c] = pd.NA
    cols = [id_col, time_col, "__line"] + all_feats + [target]
    return g[cols]

Union features: 272


In [7]:
def cutoffs_failure_aware_rul(paths, time_col="timestamp", target="RUL",
                              frac_train=0.70, frac_val=0.85, min_pos=10):
    if not paths:
        return None, None, {"mode":"no_paths"}

    pos_ts = []
    all_min, all_max = None, None
    total_pos = 0

    for p in paths:
        g = pd.read_parquet(p, columns=[time_col, target])
        g[time_col] = pd.to_datetime(g[time_col], errors="coerce")
        g = g.dropna(subset=[time_col, target])

        if not g.empty:
            all_min = g[time_col].min() if all_min is None else min(all_min, g[time_col].min())
            all_max = g[time_col].max() if all_max is None else max(all_max, g[time_col].max())

        mask_fail = g[target] <= 1.0
        npos = int(mask_fail.sum())
        total_pos += npos
        if npos:
            pos_ts.append(g.loc[mask_fail, time_col])

    if total_pos < min_pos:
        meta = []
        for p in paths:
            gg = pd.read_parquet(p, columns=[time_col])
            gg[time_col] = pd.to_datetime(gg[time_col], errors="coerce")
            meta.append(gg)
        meta = pd.concat(meta, ignore_index=True).dropna(subset=[time_col]).sort_values(time_col)
        if meta.empty:
            return None, None, {"mode":"empty_time_meta"}
        q1 = meta[time_col].quantile(frac_train)
        q2 = meta[time_col].quantile(frac_val)
        return q1, q2, {"mode":"time_quantiles_fallback", "total_pos": total_pos}

    pos_ts = pd.concat(pos_ts, ignore_index=True).sort_values()
    cut_train = pos_ts.quantile(frac_train, interpolation="nearest")
    cut_val   = pos_ts.quantile(frac_val,   interpolation="nearest")

    cut_train = pd.to_datetime(cut_train)
    cut_val   = pd.to_datetime(cut_val)

    if cut_train >= cut_val:
        uniq = pos_ts.drop_duplicates().reset_index(drop=True)
        idx_train = int(uniq.searchsorted(cut_train, side="right") - 1)
        idx_val   = min(idx_train + 1, len(uniq) - 1)
        cut_val   = uniq.iloc[idx_val]

    if all_min is not None:
        cut_train = max(cut_train, all_min)
        cut_val   = max(cut_val,   all_min)
    if all_max is not None:
        cut_train = min(cut_train, all_max)
        cut_val   = min(cut_val, all_max)

    return cut_train, cut_val, {"mode":"failure_quantiles", "total_pos": total_pos}

cut10_tr, cut10_va, info10 = cutoffs_failure_aware_rul(paths_L10, time_col=time_col, target=target)
cut20_tr, cut20_va, info20 = cutoffs_failure_aware_rul(paths_L20, time_col=time_col, target=target)
print("L10 cutoffs:", cut10_tr, "|", cut10_va, "|", info10)
print("L20 cutoffs:", cut20_tr, "|", cut20_va, "|", info20)

L10 cutoffs: 2025-08-16 04:05:00 | 2025-08-18 06:03:00 | {'mode': 'failure_quantiles', 'total_pos': 10410}
L20 cutoffs: 2025-08-13 08:46:00 | 2025-08-13 09:23:00 | {'mode': 'failure_quantiles', 'total_pos': 248}


In [8]:
from pathlib import Path

def append_parquet(df, path: Path):
    if path.exists():
        old = pd.read_parquet(path)
        pd.concat([old, df], ignore_index=True).to_parquet(path, index=False)
    else:
        df.to_parquet(path, index=False)

In [9]:
Xtr_L10 = PROCESSED_DIR / "RUL_X_train_L10.parquet"
Xva_L10 = PROCESSED_DIR / "RUL_X_val_L10.parquet"
Xte_L10 = PROCESSED_DIR / "RUL_X_test_L10.parquet"
ytr_L10 = PROCESSED_DIR / "RUL_y_train_L10.parquet"
yva_L10 = PROCESSED_DIR / "RUL_y_val_L10.parquet"
yte_L10 = PROCESSED_DIR / "RUL_y_test_L10.parquet"

Xtr_L20 = PROCESSED_DIR / "RUL_X_train_L20.parquet"
Xva_L20 = PROCESSED_DIR / "RUL_X_val_L20.parquet"
Xte_L20 = PROCESSED_DIR / "RUL_X_test_L20.parquet"
ytr_L20 = PROCESSED_DIR / "RUL_y_train_L20.parquet"
yva_L20 = PROCESSED_DIR / "RUL_y_val_L20.parquet"
yte_L20 = PROCESSED_DIR / "RUL_y_test_L20.parquet"

for p in [Xtr_L10, Xva_L10, Xte_L10, ytr_L10, yva_L10, yte_L10,
          Xtr_L20, Xva_L20, Xte_L20, ytr_L20, yva_L20, yte_L20]:
    if p.exists():
        p.unlink()

def route_and_write(paths, cut_tr, cut_va, tag):
    if not paths or cut_tr is None or cut_va is None:
        print(f"Skipping line {tag}: missing paths or cutoffs")
        return

    for p in paths:
        g = pd.read_parquet(p)
        g = align_union(g, ALL_FEATS, target)

        mask_tr = g[time_col] <= cut_tr
        mask_va = (g[time_col] > cut_tr) & (g[time_col] <= cut_va)
        mask_te = g[time_col] > cut_va

        base_cols = [id_col, time_col, "__line"] + ALL_FEATS
        X_tr = g.loc[mask_tr, base_cols]
        X_va = g.loc[mask_va, base_cols]
        X_te = g.loc[mask_te, base_cols]

        y_tr = pd.DataFrame({target: g.loc[mask_tr, target].values})
        y_va = pd.DataFrame({target: g.loc[mask_va, target].values})
        y_te = pd.DataFrame({target: g.loc[mask_te, target].values})

        if tag == "L10":
            append_parquet(X_tr, Xtr_L10); append_parquet(X_va, Xva_L10); append_parquet(X_te, Xte_L10)
            append_parquet(y_tr, ytr_L10); append_parquet(y_va, yva_L10); append_parquet(y_te, yte_L10)
        else:
            append_parquet(X_tr, Xtr_L20); append_parquet(X_va, Xva_L20); append_parquet(X_te, Xte_L20)
            append_parquet(y_tr, ytr_L20); append_parquet(y_va, yva_L20); append_parquet(y_te, yte_L20)

route_and_write(paths_L10, cut10_tr, cut10_va, "L10")
route_and_write(paths_L20, cut20_tr, cut20_va, "L20")
print("Per-line RUL splits written.")

  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = pd.NA
  g[c] = p

Per-line RUL splits written.


In [10]:
def concat_splits(base_name):
    files = list(PROCESSED_DIR.glob(f"{base_name}_L*.parquet"))
    if not files:
        return None
    dfs = [pd.read_parquet(f) for f in files if f.exists()]
    if not dfs:
        return None
    out = pd.concat(dfs, ignore_index=True)
    out_path = PROCESSED_DIR / f"{base_name}.parquet"
    out.to_parquet(out_path, index=False)
    print(f"Merged {len(dfs)} → {out_path.name} ({len(out)} rows)")
    return out_path

for nm in ["RUL_X_train", "RUL_X_val", "RUL_X_test",
           "RUL_y_train", "RUL_y_val", "RUL_y_test"]:
    concat_splits(nm)

  out = pd.concat(dfs, ignore_index=True)


Merged 2 → RUL_X_train.parquet (650207 rows)


  out = pd.concat(dfs, ignore_index=True)


Merged 2 → RUL_X_val.parquet (3035 rows)


  out = pd.concat(dfs, ignore_index=True)


Merged 2 → RUL_X_test.parquet (3514 rows)
Merged 2 → RUL_y_train.parquet (650207 rows)
Merged 2 → RUL_y_val.parquet (3035 rows)
Merged 2 → RUL_y_test.parquet (3514 rows)


In [11]:
for nm in ["RUL_y_train", "RUL_y_val", "RUL_y_test"]:
    p = PROCESSED_DIR / f"{nm}.parquet"
    if p.exists():
        y = pd.read_parquet(p)
        print(nm, "rows:", len(y),
              "| RUL min:", float(y[target].min()),
              "| RUL max:", float(y[target].max()))

RUL_y_train rows: 650207 | RUL min: 0.0 | RUL max: 297119.0
RUL_y_val rows: 3035 | RUL min: 0.0 | RUL max: 959.0
RUL_y_test rows: 3514 | RUL min: 0.0 | RUL max: 959.0
