# 02 — Feature Engineering (CoF, Streaming)

This notebook streams per-machine to keep memory low.
- Input: `data/processed/cof_labeled.parquet`
- Shards: `data/processed/shards_CoF`
- Target: `CoF`

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
import joblib
import shutil

NB_PATH = Path.cwd()

def find_project_root(start: Path) -> Path:
    cur = start
    for _ in range(6):  # look up to 6 levels up
        if (cur / "requirements.txt").exists() or (cur / "configs").exists():
            return cur
        cur = cur.parent
    # Fallback: assume <repo>/notebooks/<task> structure
    try:
        i = [p.name.lower() for p in start.parents].index("notebooks")
        return start.parents[i+1]
    except ValueError:
        return start  # last resort

ROOT = find_project_root(NB_PATH)
DATA_DIR = ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed"
DATA_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

print("ROOT:", ROOT)
print("PROCESSED_DIR:", PROCESSED_DIR)

# --- If a mislabeled file exists under notebooks/<task>/data/processed, relocate it
wrong_proc = NB_PATH / "data" / "processed"
if wrong_proc.exists():
    for name in ["cof_labeled.parquet", "cof_labeled.csv", "rul_labeled.parquet", "rul_labeled.csv"]:
        src = wrong_proc / name
        if src.exists():
            dst = PROCESSED_DIR / name
            try:
                shutil.move(str(src), str(dst))
                print(f"Moved misplaced file → {dst}")
            except Exception as e:
                print("Relocation warning:", e)

ROOT: d:\Richard Files\WORK\pdm-project
PROCESSED_DIR: d:\Richard Files\WORK\pdm-project\data\processed


In [2]:
# --- Config
id_col   = "Mesin"
time_col = "Timestamp"
target   = "CoF"

LABELED   = PROCESSED_DIR / "cof_labeled.parquet"
CSV_FALLB = PROCESSED_DIR / "cof_labeled.csv"

SHARDS_DIR = PROCESSED_DIR / "shards_CoF"
SHARDS_DIR.mkdir(parents=True, exist_ok=True)

print("Using labeled file:", LABELED)


Using labeled file: d:\Richard Files\WORK\pdm-project\data\processed\cof_labeled.parquet


In [3]:
# --- Load labeled dataset (robust + fallback)
if not LABELED.exists() and not CSV_FALLB.exists():
    raise FileNotFoundError(
        f"Missing labeled file:\n  {LABELED}\n(or CSV fallback)\n"
        "→ Run notebooks/CoF/01_eda_data_prep_CoF.ipynb to generate it."
    )

try:
    if LABELED.exists():
        df = pd.read_parquet(LABELED)
        src = LABELED
    else:
        df = pd.read_csv(CSV_FALLB, parse_dates=[time_col])
        src = CSV_FALLB
except Exception as e:
    # Parquet engine missing? Fall back to CSV if available
    if "pyarrow" in str(e).lower() and CSV_FALLB.exists():
        df = pd.read_csv(CSV_FALLB, parse_dates=[time_col])
        src = CSV_FALLB
    else:
        raise

df = df.sort_values([id_col, time_col]).reset_index(drop=True)
print("Loaded:", df.shape, "| From:", src)

Loaded: (699840, 222) | From: d:\Richard Files\WORK\pdm-project\data\processed\cof_labeled.parquet


In [4]:
# --- Normalize schema and ensure numeric dtypes
df = df.rename(columns={"Mesin": "machine_id", "Timestamp": "timestamp"})  # unify naming
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

# Identify potential numeric sensor columns
non_feature = {"machine_id", "timestamp", "Breakdown", "CoF", "__line"}
num_cols = [c for c in df.columns if c not in non_feature]

# Convert any object-like columns to numeric where possible
for c in num_cols:
    if df[c].dtype == "object":
        df[c] = pd.to_numeric(df[c], errors="coerce")

print("Numeric feature columns detected:", len(num_cols))
print(num_cols[:10])
print(df.dtypes.head(15))


Numeric feature columns detected: 217
['Start_Time', 'End_Time', 'Level_1', 'Level_2', 'Detail', 'Energy_Sectors.Extruder_10_Target', 'EXT_10.Extruder_Load', 'EXT_10.Machine_Hour', 'EXT_10.Machine_Run', 'EXT_10.Melt_Press']
timestamp                            datetime64[ns]
Start_Time                                  float64
End_Time                                    float64
Level_1                                     float64
Level_2                                     float64
Detail                                      float64
machine_id                                    int64
__line                                        int64
Energy_Sectors.Extruder_10_Target           float64
EXT_10.Extruder_Load                        float64
EXT_10.Machine_Hour                         float64
EXT_10.Machine_Run                          float64
EXT_10.Melt_Press                           float64
EXT_10.Melt_Temp_1                          float64
EXT_10.Motor_Extruder_Run                   floa

In [5]:
# --- Column selection & downcast
exclude = {id_col, time_col, '__line', target}
exclude |= {c for c in df.columns if c.lower() in {'breakdown','failure','fail','is_failure'}}
num_cols = [c for c in df.select_dtypes(include='number').columns if c not in exclude]
print("Numeric sensors:", len(num_cols))

def downcast_numeric(g: pd.DataFrame) -> pd.DataFrame:
    for c in g.select_dtypes(include='float').columns:
        g[c] = pd.to_numeric(g[c], downcast='float')
    for c in g.select_dtypes(include='integer').columns:
        g[c] = pd.to_numeric(g[c], downcast='integer')
    return g

Numeric sensors: 218


In [6]:
# --- Lightweight feature engineering
def add_lags(g, cols, lags=(1,)):
    for L in lags:
        for c in cols:
            g[f'{c}_lag{L}'] = g[c].shift(L)
    return g

def add_roll_stats(g, cols, windows=(3,5)):
    for w in windows:
        roll = g[cols].rolling(w, min_periods=w)
        g[[f'{c}_roll{w}_mean' for c in cols]] = roll.mean().values
        g[[f'{c}_roll{w}_std'  for c in cols]] = roll.std().values
    return g

In [7]:
import gc

LAGS = (1,)          # keep light first; add (2,) later if OK
WINS = (3,)          # start with (3,) only; add (5,) later if OK
BATCH = 32           # number of columns per batch; tune to your RAM

id_col, time_col, target = "machine_id", "timestamp", "CoF"
shard_paths = []

# base feature candidates = numeric sensors only
non_feature = {id_col, time_col, target, "Breakdown", "__line"}
feats_base = [c for c in df.select_dtypes(include="number").columns if c not in non_feature]
print(f"Base numeric features: {len(feats_base)}")

def chunked(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i+n]

for gid, g0 in df.groupby(id_col, sort=True):
    # minimal frame we’ll keep growing
    g = g0[[id_col, time_col, target]].copy()
    g[time_col] = pd.to_datetime(g[time_col], errors="coerce")

    # === per-batch feature engineering to keep memory low ===
    for cols in chunked(feats_base, BATCH):
        block = g0[cols].astype("float32").copy()

        # lags
        for L in LAGS:
            lagged = block.shift(L)
            lagged.columns = [f"{c}_lag{L}" for c in cols]
            g = pd.concat([g, lagged], axis=1)
            del lagged
            gc.collect()

        # rolling MEAN only (add STD later if RAM allows)
        for w in WINS:
            roll_mean = block.rolling(w, min_periods=w).mean()
            roll_mean.columns = [f"{c}_r{w}_mean" for c in cols]
            g = pd.concat([g, roll_mean], axis=1)
            del roll_mean
            gc.collect()

        del block
        gc.collect()

    # warm-up trim (avoid first rows that are NaN due to lags/rolling)
    warmup = max(LAGS or (0,)) + max(WINS or (1,)) - 1
    if warmup > 0 and len(g) > warmup:
        g = g.iloc[warmup:].copy()

    # gentle fill so we don’t drop everything
    feat_cols_all = [c for c in g.columns if c not in {id_col, time_col, target}]
    g[feat_cols_all] = g[feat_cols_all].ffill().bfill()

    # drop rows only if ALL engineered features are NaN (rare now)
    g = g.dropna(subset=feat_cols_all, how="all")

    if len(g) == 0:
        print(f"⚠️ Empty after warm-up/ffill: machine={gid} — skipping")
        continue

    out_path = SHARDS_DIR / f"part_{id_col}_{gid}.parquet"
    g.to_parquet(out_path, index=False)
    shard_paths.append(out_path)
    print(f"✓ Shard {out_path.name}: {len(g)} rows, {len(feat_cols_all)} features")

print(f"Shards written: {len(shard_paths)} → {SHARDS_DIR}")


Base numeric features: 217
✓ Shard part_machine_id_10.parquet: 349917 rows, 434 features
✓ Shard part_machine_id_20.parquet: 349917 rows, 434 features
Shards written: 2 → d:\Richard Files\WORK\pdm-project\data\processed\shards_CoF


In [8]:
paths = sorted(SHARDS_DIR.glob("part_*.parquet"))
print("Shard files:", len(paths))
for p in paths[:5]:
    g = pd.read_parquet(p)
    print(p.name, "| rows:", len(g))

Shard files: 4
part_machine_id_10.parquet | rows: 349917
part_machine_id_20.parquet | rows: 349917
part_Mesin_10.parquet | rows: 0
part_Mesin_20.parquet | rows: 0


In [9]:
from sklearn.preprocessing import StandardScaler
import numpy as np, pandas as pd, joblib

scaler = StandardScaler()
sample_rows = 100_000
X_buf = None
rows_used = 0

for p in shard_paths:
    g = pd.read_parquet(p)
    feats = [c for c in g.columns if c not in [id_col, time_col, target]]
    if len(g) == 0 or not feats:
        print("Skip (empty/no feats):", p.name)
        continue
    # at least some data per row
    mask = g[feats].notna().any(axis=1)
    if not mask.any():
        print("Skip (all-NaN rows):", p.name)
        continue
    X_chunk = g.loc[mask, feats].fillna(0.0).values
    X_buf = X_chunk if X_buf is None else np.vstack([X_buf, X_chunk])
    rows_used += X_chunk.shape[0]
    if rows_used >= sample_rows:
        break

if X_buf is None or X_buf.shape[0] == 0:
    raise RuntimeError("No valid data found to fit StandardScaler — inspect shards again.")

joblib.dump(scaler.fit(X_buf), PROCESSED_DIR / "cof_scaler.joblib")
print("✅ Scaler fitted on", X_buf.shape[0], "rows")


✅ Scaler fitted on 349917 rows


In [10]:
# --- Build global time ordering to compute cutoffs for splits
meta_parts = []
for p in shard_paths:
    g = pd.read_parquet(p, columns=[time_col])
    meta_parts.append(g.assign(path=str(p)))
meta = pd.concat(meta_parts, ignore_index=True).sort_values(time_col).reset_index(drop=True)

tscv = TimeSeriesSplit(n_splits=3)
splits = list(tscv.split(meta))
train_idx, test_idx = splits[-1]
mid = (test_idx[0] + test_idx[-1]) // 2
val_idx = np.arange(test_idx[0], mid+1)
test_idx2 = np.arange(mid+1, test_idx[-1]+1)

cut_train_t = meta.iloc[train_idx][-1][time_col]
cut_val_t   = meta.iloc[val_idx][-1][time_col]
print('Cutoffs:', cut_train_t, '|', cut_val_t)

KeyError: -1

In [None]:
# --- Helper to append parquet
from pathlib import Path
def append_parquet(df, path: Path):
    if path.exists():
        old = pd.read_parquet(path)
        pd.concat([old, df], ignore_index=True).to_parquet(path, index=False)
    else:
        df.to_parquet(path, index=False)

In [None]:
# --- Stream shards → scale & route to split
Xtr_path = PROCESSED_DIR / 'CoF_X_train.parquet'
Xva_path = PROCESSED_DIR / 'CoF_X_val.parquet'
Xte_path = PROCESSED_DIR / 'CoF_X_test.parquet'
ytr_path = PROCESSED_DIR / 'CoF_y_train.parquet'
yva_path = PROCESSED_DIR / 'CoF_y_val.parquet'
yte_path = PROCESSED_DIR / 'CoF_y_test.parquet'

for p in [Xtr_path, Xva_path, Xte_path, ytr_path, yva_path, yte_path]:
    if p.exists():
        p.unlink()

for p in shard_paths:
    g = pd.read_parquet(p)
    feats = [c for c in g.columns if c not in [id_col, time_col, target]]
    keep  = [id_col, time_col]
    X_scaled = pd.DataFrame(scaler.transform(g[feats].values), columns=feats)
    X_scaled = pd.concat([g[keep].reset_index(drop=True), X_scaled.reset_index(drop=True)], axis=1)

    mask_tr = g[time_col] <= cut_train_t
    mask_va = (g[time_col] > cut_train_t) & (g[time_col] <= cut_val_t)
    mask_te = g[time_col] > cut_val_t

    append_parquet(X_scaled.loc[mask_tr], Xtr_path)
    append_parquet(X_scaled.loc[mask_va], Xva_path)
    append_parquet(X_scaled.loc[mask_te], Xte_path)

    append_parquet(pd.DataFrame({target: g.loc[mask_tr, target].values}), ytr_path)
    append_parquet(pd.DataFrame({target: g.loc[mask_va, target].values}), yva_path)
    append_parquet(pd.DataFrame({target: g.loc[mask_te, target].values}), yte_path)

print('Saved datasets:')
print('  ', Xtr_path.name, Xva_path.name, Xte_path.name)
print('  ', ytr_path.name, yva_path.name, yte_path.name)
