
# 01 — EDA & Data Prep (CoF)

Goals:
- Load Line 10/20 raw data (CSV as configured)
- Parse timestamps, sort, align by `machine_id`
- Basic EDA (missingness, ranges, sampling frequency)
- Build **Chance of Failure (CoF)** labels using the provided `breakdown` column (1 if breakdown event occurred)
- Additionally compute **forward-looking** labels: CoF = 1 if any breakdown occurs in a later row within the next `horizon_minutes` (default: 30 min); the current row's own breakdown flag is not counted
- Save a clean, labeled snapshot to `data/processed/cof_labeled.parquet`


In [1]:

from pathlib import Path
import sys, json, os
import pandas as pd
import numpy as np
import yaml
from datetime import timedelta

# Paths
# The working directory is the anchor; when it is a folder named 'cof'
# (any casing) the project root is taken two levels above it.
NB_PATH = Path.cwd()
if NB_PATH.name.lower() == 'cof':
    ROOT = NB_PATH.parents[1]
else:
    ROOT = NB_PATH
DATA_DIR = ROOT.joinpath('data')
RAW_DIR = DATA_DIR.joinpath('raw')
PROCESSED_DIR = DATA_DIR.joinpath('processed')
CONFIGS_DIR = ROOT.joinpath('configs')

# Create the folder layout up front so later reads/writes find it.
for folder in (DATA_DIR, RAW_DIR, PROCESSED_DIR, CONFIGS_DIR):
    folder.mkdir(parents=True, exist_ok=True)

print("ROOT:", ROOT)


ROOT: d:\Richard Files\WORK\pdm-project


In [2]:

# --- Load configs
def load_yaml(p):
    """Parse the YAML file at ``p`` and return the resulting Python object.

    Parameters
    ----------
    p : str or path-like
        Location of the YAML file; it is opened as UTF-8 text.

    Returns
    -------
    object
        Whatever ``yaml.safe_load`` produces for the file content.

    Raises
    ------
    OSError
        Propagated from ``open`` when the file cannot be read.
    """
    import yaml  # function-scope import keeps the helper usable on its own
    with open(p, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)

# Mandatory configs: these are loaded unconditionally.
base_cfg = load_yaml(CONFIGS_DIR / 'base.yaml')
task_cfg = load_yaml(CONFIGS_DIR / 'task_cof.yaml')

# Per-line configs are optional; None marks a file that is not on disk.
line10_cfg, line20_cfg = (
    load_yaml(cfg_file) if cfg_file.exists() else None
    for cfg_file in (CONFIGS_DIR / 'line10.yaml', CONFIGS_DIR / 'line20.yaml')
)

base_cfg, task_cfg


({'random_seed': 42,
  'time_index': 'timestamp',
  'id_col': 'machine_id',
  'target_rul': 'RUL',
  'target_cof': 'CoF',
  'freq': '1min'},
 {'task': 'CoF',
  'metrics': ['f1', 'recall', 'roc_auc'],
  'train_val_test_split': {'method': 'time_series_split', 'n_splits': 3}})

In [3]:

# --- Load raw data (CSV; adapt for SQL/Influx if needed)
def load_raw_from_cfg(line_cfg: dict) -> pd.DataFrame:
    """Load one production line's raw CSV as described by its config dict.

    Parameters
    ----------
    line_cfg : dict or None
        Keys read: ``source`` (defaults to ``'csv'``), ``path`` (CSV file
        location) and ``line`` (value stored in the ``__line`` column).
        ``None`` means "no config for this line".

    Returns
    -------
    pd.DataFrame
        The CSV content plus a ``__line`` column. An empty frame is returned
        when ``line_cfg`` is ``None`` (silently) or when the source is not a
        readable CSV file (after printing a warning).
    """
    if line_cfg is None:
        return pd.DataFrame()
    src = line_cfg.get('source', 'csv')
    path = line_cfg.get('path')
    if src == 'csv' and path:
        p = Path(path)
        # is_file() rather than exists(): a directory path should take the
        # warning branch instead of crashing inside read_csv.
        if p.is_file():
            # low_memory=False infers each column's dtype from the whole file,
            # avoiding the chunk-wise mixed-type DtypeWarning on wide files.
            df = pd.read_csv(p, low_memory=False)
            df['__line'] = line_cfg.get('line')
            return df
    print("WARN: Could not load", line_cfg)
    return pd.DataFrame()

# One frame per line config; either may come back empty.
df10, df20 = (load_raw_from_cfg(cfg) for cfg in (line10_cfg, line20_cfg))

# Stack the two lines row-wise unless both are empty, in which case keep an
# (empty) copy of the Line 10 frame.
if df10.empty and df20.empty:
    df = df10.copy()
else:
    df = pd.concat([df10, df20], ignore_index=True)

print("Shapes -> L10:", df10.shape, "| L20:", df20.shape, "| Combined:", df.shape)
display(df.head(3))


  df = pd.read_csv(p)
  df = pd.read_csv(p)


Shapes -> L10: (349920, 79) | L20: (349920, 153) | Combined: (699840, 221)


Unnamed: 0,Timestamp,Mesin,Energy_Sectors.Extruder_10_Target,EXT_10.Extruder_Load,EXT_10.Machine_Hour,EXT_10.Machine_Run,EXT_10.Melt_Press,EXT_10.Melt_Temp_1,EXT_10.Motor_Extruder_Run,EXT_10.Panel_Humidity,...,EXT_20_Cincinnati.Temp_Setpoint_Dies_16,EXT_20_Cincinnati.Vacuum_Press_Actual,EXT_20_Cincinnati.Vacuum_Press_Setpoint,EXT_20_Cincinnati.Vacuum_Pump_Run,PM_Extruder_20.A_avg,PM_Extruder_20.Frequency,PM_Extruder_20.P,PM_Extruder_20.Power_Factor,PM_Extruder_20.V_avg,PM_Extruder_20.W_tot
0,2025-01-01 00:00:00.000,10,,,,,,,,,...,,,,,,,,,,
1,2025-01-01 00:01:00.000,10,,,,,,,,,...,,,,,,,,,,
2,2025-01-01 00:02:00.000,10,,,,,,,,,...,,,,,,,,,,


In [4]:

# --- Standardize columns
# Start from the configured names; each lookup has a literal default.
time_col = base_cfg.get('time_index', 'Timestamp')
id_col = base_cfg.get('id_col', 'machine_id')
breakdown_col = base_cfg.get('breakdown_col', 'Breakdown')

# Timestamp: when the configured name is absent, use the first known alias
# present in the data (the configured name is kept if none matches).
if time_col not in df.columns:
    time_col = next(
        (alias for alias in ('Timestamp', 'time', 'datetime', 'DateTime', 'ts', 'date')
         if alias in df.columns),
        time_col,
    )

# Machine id: same fallback strategy with its own alias list.
if id_col not in df.columns:
    id_col = next(
        (alias for alias in ('machine_id', 'Mesin', 'asset_id', 'AssetID', 'line_id')
         if alias in df.columns),
        id_col,
    )

print("Using time_col:", time_col, "| id_col:", id_col, "| breakdown_col:", breakdown_col)


Using time_col: Timestamp | id_col: Mesin | breakdown_col: Breakdown


In [5]:
# --- Parse timestamp & sort
# Unparseable timestamps become NaT and their rows are dropped.
df[time_col] = pd.to_datetime(df[time_col], errors='coerce')
df = (
    df.dropna(subset=[time_col])
      # Cast the flag only after the bad-timestamp rows are gone, and treat a
      # missing flag as "no breakdown" (0): a bare astype(int) raises on NaN.
      # assign() works on a copy, so no chained-assignment warning is raised.
      .assign(**{breakdown_col: lambda frame: pd.to_numeric(frame[breakdown_col]).fillna(0).astype(int)})
      .sort_values([id_col, time_col])
      .reset_index(drop=True)
)

In [6]:



# Ensure breakdown column exists
# Keep the already-selected column name when the data has it; only fall back
# to the first alias match (case-insensitive) when it is absent.
if breakdown_col not in df.columns:
    breakdown_col = next(
        (c for c in df.columns
         if str(c).lower() in {'breakdown', 'failure', 'fail', 'is_failure'}),
        None,
    )

if breakdown_col is None:
    raise ValueError("No breakdown/failure column found. Please ensure your file has a column named 'breakdown' (0/1).")

# Normalize breakdown to 0/1 int.
# A value counts as positive when it is numerically equal to 1 (covers 1,
# 1.0, True and numeric strings) or when its text form is '1'/'true' in any
# casing; everything else, including missing values, becomes 0.
raw_flags = df[breakdown_col]
is_one = pd.to_numeric(raw_flags, errors='coerce').eq(1)
is_true_text = raw_flags.astype(str).str.strip().str.lower().isin(['1', 'true'])
df[breakdown_col] = (is_one | is_true_text).astype(int)

print("Breakdown column:", breakdown_col, "| Positive events:", int(df[breakdown_col].sum()))


Breakdown column: Breakdown | Positive events: 10639


In [7]:

# --- Optional resampling to a fixed frequency
freq = base_cfg.get('freq', None)  # e.g., '1min'
if freq:
    # Key columns are carried through by fill rather than averaged; '__line'
    # is only included when the frame actually has it.
    key_cols = [c for c in (id_col, '__line') if c in df.columns]
    non_num = [c for c in df.select_dtypes(exclude='number').columns if c != time_col]
    non_num = list(dict.fromkeys(non_num + key_cols))
    num = [c for c in df.select_dtypes(include='number').columns if c not in key_cols]

    out_frames = []
    for gid, g in df.groupby(id_col):
        g = g.set_index(time_col).sort_index()
        # Numeric signals: mean per interval. Other columns: forward fill,
        # then back fill for any leading gap.
        g_num = g[num].resample(freq).mean()
        g_non = g[non_num].resample(freq).ffill().bfill()
        # For breakdown: any event in the interval → 1.
        # An interval with no rows has no event, so it is set to 0 and the
        # flag stays an integer instead of turning into float/NaN.
        if breakdown_col in g:
            g_num[breakdown_col] = (
                g[breakdown_col].resample(freq).max().fillna(0).astype(int)
            )
        g_ = pd.concat([g_non, g_num], axis=1)
        g_[id_col] = gid
        out_frames.append(g_.reset_index())

    # pd.concat raises on an empty list; with no groups df is left untouched.
    if out_frames:
        df = pd.concat(out_frames, ignore_index=True).sort_values([id_col, time_col]).reset_index(drop=True)

print("After optional resample:", df.shape)


After optional resample: (699840, 221)


In [8]:

# --- Build CoF label (future breakdown within horizon)
# Look-ahead window in minutes; passed to label_future_breakdown and echoed
# in the summary print below.
horizon_minutes = 30

def label_future_breakdown(g, time_col, breakdown_col, horizon_minutes=30):
    """Flag rows that are followed by a breakdown within a time horizon.

    A row gets label 1 when at least one *later* row (in time order, the row
    itself excluded) has ``breakdown_col == 1`` and a timestamp no more than
    ``horizon_minutes`` after the row's own timestamp.

    Parameters
    ----------
    g : pd.DataFrame
        Rows of a single machine. Rows need not be pre-sorted; they are
        ordered by time internally (stable, so ties keep their row order).
    time_col : str
        Name of the datetime column.
    breakdown_col : str
        Name of the 0/1 breakdown flag column; missing values count as 0.
    horizon_minutes : int or float, default 30
        Length of the look-ahead window in minutes (fractions allowed).

    Returns
    -------
    np.ndarray
        Integer array of 0/1 labels, one per row of ``g``, in ``g``'s row order.
    """
    n = len(g)
    if n == 0:
        return np.zeros(0, dtype=int)

    times = g[time_col].to_numpy(dtype='datetime64[ns]')
    events = g[breakdown_col].fillna(0).to_numpy().astype(int) == 1
    horizon = np.timedelta64(pd.Timedelta(minutes=horizon_minutes).value, 'ns')

    # Work in time order, then scatter the labels back to the input order.
    order = np.argsort(times, kind='stable')
    t_sorted = times[order]

    # running[k] = number of events among the first k time-ordered rows.
    running = np.concatenate(([0], np.cumsum(events[order])))
    # Exclusive end of each row's window: first position past t + horizon.
    window_end = np.searchsorted(t_sorted, t_sorted + horizon, side='right')
    # The window starts just after the row itself.
    window_start = np.arange(1, n + 1)
    hits = running[window_end] - running[window_start]
    # A row without a valid timestamp has no defined window.
    hits[np.isnat(t_sorted)] = 0

    out = np.zeros(n, dtype=int)
    out[order] = (hits > 0).astype(int)
    return out

# Start from an all-zero label column aligned to df, then overwrite each
# machine's slice with the labels computed for that machine.
cof = pd.Series(0, index=df.index, name='CoF')
for _machine, machine_rows in df.groupby(id_col):
    cof.loc[machine_rows.index] = label_future_breakdown(
        machine_rows, time_col, breakdown_col, horizon_minutes
    )

df['CoF'] = cof.astype(int)

n_positive = int(df['CoF'].sum())
print(f"CoF positives (within next {horizon_minutes} min): {n_positive}")
preview_cols = [id_col, time_col, breakdown_col, 'CoF']
display(df[preview_cols].head(12))


CoF positives (within next 30 min): 11190


Unnamed: 0,Mesin,Timestamp,Breakdown,CoF
0,10,2025-01-01 00:00:00,0,0
1,10,2025-01-01 00:01:00,0,0
2,10,2025-01-01 00:02:00,0,0
3,10,2025-01-01 00:03:00,0,0
4,10,2025-01-01 00:04:00,0,0
5,10,2025-01-01 00:05:00,0,0
6,10,2025-01-01 00:06:00,0,0
7,10,2025-01-01 00:07:00,0,0
8,10,2025-01-01 00:08:00,0,0
9,10,2025-01-01 00:09:00,0,0


In [9]:

# --- Save snapshot
# Write the labeled frame as Parquet (row index not stored) and report where it went.
out_path = PROCESSED_DIR.joinpath('cof_labeled.parquet')
df.to_parquet(out_path, index=False)
print("Saved:", out_path, "| shape:", df.shape)


Saved: d:\Richard Files\WORK\pdm-project\data\processed\cof_labeled.parquet | shape: (699840, 222)
