
# 02 — Feature Engineering (CoF)

- Load `data/processed/cof_labeled.parquet`
- Create time-series features (lags, rolling stats, diffs)
- Time-series split (train/val/test) and scaling
- Save `CoF_X_{train,val,test}.parquet`, `CoF_y_{train,val,test}.parquet`, and the fitted scaler (`cof_scaler.joblib`) to `data/processed/`


In [None]:

from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

# --- Resolve project directories relative to where the notebook is running
NB_PATH = Path.cwd()
if NB_PATH.name.lower() == 'cof':
    # Notebook launched from inside the 'cof' folder: the project root is two levels up.
    ROOT = NB_PATH.parents[1]
else:
    ROOT = NB_PATH
DATA_DIR = ROOT / 'data'
PROCESSED_DIR = DATA_DIR / 'processed'

# --- Column names (keep in sync with configs)
id_col = 'Mesin'
time_col = 'Timestamp'
target = 'CoF'
breakdown_col = 'Breakdown'  # if present; will be excluded from features

# --- Load the labeled table, parse timestamps, and order rows by machine then time
df = pd.read_parquet(PROCESSED_DIR / 'cof_labeled.parquet')
df[time_col] = pd.to_datetime(df[time_col], errors='coerce')  # unparseable -> NaT
df = df.dropna(subset=[time_col])                             # drop rows whose timestamp is NaT
df = df.sort_values([id_col, time_col])
df = df.reset_index(drop=True)

n_positive = int(df[target].sum())
print("Loaded:", df.shape, "| positives:", n_positive)


In [None]:

# --- Pick the numeric sensor columns that feature engineering will be applied to.
# Identifier, timestamp, target and '__line' are never sensors; neither is any
# column whose lower-cased name is one of the failure/flag names below.
_flag_names = ('breakdown', 'failure', 'fail', 'is_failure', 'rul')
exclude = {id_col, time_col, target, '__line'}
exclude.update(col for col in df.columns if col.lower() in _flag_names)

_numeric_columns = df.select_dtypes(include='number').columns
num_cols = [col for col in _numeric_columns if col not in exclude]
print("Numeric sensor columns:", len(num_cols))
num_cols[:10]


In [None]:

# --- Feature engineering helpers
def add_lags(g, cols, lags=(1,2,3)):
    """Return a copy of ``g`` extended with lagged versions of ``cols``.

    For every lag ``L`` in ``lags`` and every column ``c`` in ``cols`` a column
    named ``f'{c}_lag{L}'`` holding ``g[c].shift(L)`` is appended. New columns
    are ordered lag-major (all columns for the first lag, then the next lag).

    The input frame is not modified. All new columns are attached with a single
    ``pd.concat`` rather than one assignment per column, which avoids the
    "DataFrame is highly fragmented" PerformanceWarning on wide frames.

    Args:
        g: Rows of a single series, already in time order.
        cols: Names of the columns to lag.
        lags: Positive row offsets to shift by.

    Returns:
        A new DataFrame containing the columns of ``g`` plus the lag columns.
    """
    lagged = {}
    for L in lags:
        for c in cols:
            lagged[f'{c}_lag{L}'] = g[c].shift(L)
    if not lagged:
        return g.copy()

    # A column that already carries one of the new names is replaced, not duplicated.
    base = g.drop(columns=[name for name in lagged if name in g.columns])
    extra = pd.concat(list(lagged.values()), axis=1, keys=list(lagged))
    return pd.concat([base, extra], axis=1)

def add_roll_stats(g, cols, windows=(3,5,15)):
    """Return a copy of ``g`` extended with rolling statistics of ``cols``.

    For every window ``w`` in ``windows`` and every column ``c`` in ``cols``,
    four columns are appended: ``{c}_roll{w}_mean``, ``{c}_roll{w}_std``,
    ``{c}_roll{w}_min`` and ``{c}_roll{w}_max``. Each is computed with
    ``g[c].rolling(w)``, so the first ``w - 1`` rows of each are NaN.

    The input frame is not modified. The rolling window is built once per
    (window, column) pair and all new columns are attached with a single
    ``pd.concat``, which avoids pandas' fragmentation PerformanceWarning.

    Args:
        g: Rows of a single series, already in time order.
        cols: Names of the columns to summarise.
        windows: Window lengths in rows.

    Returns:
        A new DataFrame containing the columns of ``g`` plus the rolling columns.
    """
    stats = {}
    for w in windows:
        for c in cols:
            roller = g[c].rolling(w)  # reuse one window object for all four aggregates
            stats[f'{c}_roll{w}_mean'] = roller.mean()
            stats[f'{c}_roll{w}_std'] = roller.std()
            stats[f'{c}_roll{w}_min'] = roller.min()
            stats[f'{c}_roll{w}_max'] = roller.max()
    if not stats:
        return g.copy()

    # A column that already carries one of the new names is replaced, not duplicated.
    base = g.drop(columns=[name for name in stats if name in g.columns])
    extra = pd.concat(list(stats.values()), axis=1, keys=list(stats))
    return pd.concat([base, extra], axis=1)

def add_diffs(g, cols):
    """Return a copy of ``g`` extended with first differences of ``cols``.

    For every column ``c`` in ``cols`` a column ``f'{c}_diff1'`` holding
    ``g[c].diff(1)`` (current row minus previous row) is appended; the first
    row of each new column is NaN.

    The input frame is not modified, and all new columns are attached with a
    single ``pd.concat`` to avoid pandas' fragmentation PerformanceWarning.

    Args:
        g: Rows of a single series, already in time order.
        cols: Names of the columns to difference.

    Returns:
        A new DataFrame containing the columns of ``g`` plus the diff columns.
    """
    diffs = {f'{c}_diff1': g[c].diff(1) for c in cols}
    if not diffs:
        return g.copy()

    # A column that already carries one of the new names is replaced, not duplicated.
    base = g.drop(columns=[name for name in diffs if name in g.columns])
    extra = pd.concat(list(diffs.values()), axis=1, keys=list(diffs))
    return pd.concat([base, extra], axis=1)


In [None]:

# --- Build features one machine at a time so that lags, rolling windows and
# diffs never reach across the boundary between two machines' rows.
fe = []
for _machine, machine_rows in df.groupby(id_col):
    feats = add_lags(machine_rows.copy(), num_cols, lags=(1, 2, 3))
    feats = add_roll_stats(feats, num_cols, windows=(3, 5, 15))
    fe.append(add_diffs(feats, num_cols))

# Stack the per-machine frames, then drop every row that still contains a NaN
# (this includes the warm-up rows at the start of each machine's history).
stacked = pd.concat(fe, ignore_index=True)
df_fe = stacked.dropna().reset_index(drop=True)
print("After FE:", df_fe.shape)


In [None]:

# --- Split by time using TimeSeriesSplit, then scale with train-only statistics
import joblib
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler  # previously never imported -> NameError

# df_fe is assembled machine by machine, so its rows are in machine-major order.
# TimeSeriesSplit cuts purely by row position; order the rows chronologically
# first so the held-out tail really is the most recent data for all machines.
df_ts = df_fe.sort_values([time_col, id_col]).reset_index(drop=True)

X = df_ts.drop(columns=[target])
y = df_ts[target].values

# Identifier columns are carried through unscaled.
keep = [id_col, time_col, '__line'] if '__line' in X.columns else [id_col, time_col]

# Failure/flag columns (same name list the column-selection cell uses) are only
# skipped during feature engineering; without this drop they would remain in X
# as raw model inputs.
_flag_names = {'breakdown', 'failure', 'fail', 'is_failure', 'rul'}
flag_cols = [c for c in X.columns if c not in keep and str(c).lower() in _flag_names]
X_num = X.drop(columns=keep + flag_cols, errors='ignore')

# Take the last fold: everything before it is train, the fold itself is halved
# into validation (earlier half) and test (later half).
tscv = TimeSeriesSplit(n_splits=3)
splits = list(tscv.split(X_num))
(train_idx, test_idx) = splits[-1]
mid = (test_idx[0] + test_idx[-1]) // 2
val_idx = np.arange(test_idx[0], mid + 1)
test_idx2 = np.arange(mid + 1, test_idx[-1] + 1)

print("Index sizes -> train:", len(train_idx), "val:", len(val_idx), "test:", len(test_idx2))

# Fit the scaler on the training rows only, so validation/test statistics do
# not leak into the transform; then apply it to every row.
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(X_num.iloc[train_idx].values)
X_num_scaled = scaler.transform(X_num.values)
X_scaled = pd.DataFrame(X_num_scaled, columns=X_num.columns, index=X_num.index)
X_scaled = pd.concat([X[keep], X_scaled], axis=1)

joblib.dump(scaler, PROCESSED_DIR / 'cof_scaler.joblib')

# Save parquet datasets
for split_name, idx in (('train', train_idx), ('val', val_idx), ('test', test_idx2)):
    X_scaled.iloc[idx].to_parquet(PROCESSED_DIR / f'CoF_X_{split_name}.parquet', index=False)
    pd.DataFrame({target: y[idx]}).to_parquet(PROCESSED_DIR / f'CoF_y_{split_name}.parquet', index=False)

print("Saved processed train/val/test and scaler.")
