
# 01 — EDA & Data Prep (RUL)

Goals:
- Load Line 10 and Line 20 raw data (CSV as configured; the loader must be adapted for SQL sources)
- Parse timestamps, sort, align by machine_id
- Basic EDA (missingness, ranges, sampling frequency)
- Build **RUL labels** per record
- Save a clean, labeled snapshot to `data/processed/`
    

In [None]:

from pathlib import Path
import sys, json, os
import pandas as pd
import numpy as np
import yaml
from datetime import timedelta

# Paths: everything is resolved relative to the directory the notebook runs in
NB_PATH = Path.cwd()
if NB_PATH.name.lower() == 'rul':
    # Running from inside a folder named "rul": the project root is two levels up
    ROOT = NB_PATH.parents[1]
else:
    ROOT = NB_PATH

DATA_DIR = ROOT / 'data'
RAW_DIR, PROCESSED_DIR = DATA_DIR / 'raw', DATA_DIR / 'processed'
CONFIGS_DIR = ROOT / 'configs'

# Create the folder layout up front (no-op for folders that already exist)
for folder in (DATA_DIR, RAW_DIR, PROCESSED_DIR, CONFIGS_DIR):
    folder.mkdir(parents=True, exist_ok=True)

print("ROOT:", ROOT)
    

In [None]:

# --- Load configs
def load_yaml(p):
    """Parse the YAML file at `p` and return the resulting Python object.

    Parameters
    ----------
    p : str or path-like
        Location of the YAML file; it is opened as UTF-8 text.

    Returns
    -------
    object
        Whatever `yaml.safe_load` produces for the document
        (presumably a dict for the config files used here).
    """
    # Local import keeps the helper usable even if this cell is run on its own.
    import yaml

    with open(p, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)

# Shared settings and the RUL task definition are always loaded
base_cfg = load_yaml(CONFIGS_DIR / 'base.yaml')
task_cfg = load_yaml(CONFIGS_DIR / 'task_rul.yaml')

# Per-line configs are optional: a missing file leaves the variable as None
line10_cfg = None
if (CONFIGS_DIR / 'line10.yaml').exists():
    line10_cfg = load_yaml(CONFIGS_DIR / 'line10.yaml')

line20_cfg = None
if (CONFIGS_DIR / 'line20.yaml').exists():
    line20_cfg = load_yaml(CONFIGS_DIR / 'line20.yaml')

base_cfg, task_cfg
    

In [None]:

# --- Load raw data (CSV; adapt for SQL/Influx if needed)
def load_raw_from_cfg(line_cfg: dict) -> pd.DataFrame:
    """Load the raw CSV described by one line's config mapping.

    The mapping is read for three keys: `source` (defaults to 'csv'),
    `path` (CSV location) and `line` (value stored in the added `__line`
    column).

    Returns the loaded frame with the extra `__line` column, or an empty
    DataFrame when `line_cfg` is None (silently) or when the source is not
    'csv', the path is missing/empty, or the file does not exist (after
    printing a warning).
    """
    # No config at all: nothing to load and nothing worth warning about
    if line_cfg is None:
        return pd.DataFrame()

    source_kind = line_cfg.get('source', 'csv')
    csv_path = line_cfg.get('path')

    loadable = source_kind == 'csv' and bool(csv_path) and Path(csv_path).exists()
    if not loadable:
        print("WARN: Could not load", line_cfg)
        return pd.DataFrame()

    frame = pd.read_csv(Path(csv_path))
    # Tag every row with the production line it came from
    frame['__line'] = line_cfg.get('line')
    return frame

# Load each line (an empty frame stands in for a line that could not be loaded)
df10, df20 = (load_raw_from_cfg(cfg) for cfg in (line10_cfg, line20_cfg))

# Stack both lines unless neither produced any rows
if df10.empty and df20.empty:
    df = df10.copy()
else:
    df = pd.concat([df10, df20], ignore_index=True)

print("Shapes -> L10:", df10.shape, "| L20:", df20.shape, "| Combined:", df.shape)
display(df.head(3))
    

In [None]:

# --- Standardize columns
time_col = base_cfg.get('time_index', 'timestamp')
id_col = base_cfg.get('id_col', 'machine_id')

# Configured timestamp column absent: fall back to the first well-known name
# that exists in the data (the configured name is kept if none matches)
if time_col not in df.columns:
    time_col = next(
        (name for name in ('timestamp', 'time', 'datetime', 'DateTime', 'ts', 'date')
         if name in df.columns),
        time_col,
    )

# Same fallback for the machine identifier column
if id_col not in df.columns:
    id_col = next(
        (name for name in ('machine_id', 'machine', 'asset_id', 'AssetID', 'line_id')
         if name in df.columns),
        id_col,
    )

print("Using time_col:", time_col, "| id_col:", id_col)
    

In [None]:

# --- Parse timestamp & sort
# Unparseable timestamps become NaT (errors='coerce'); everything is converted to UTC
parsed_times = pd.to_datetime(df[time_col], errors='coerce', utc=True)
df[time_col] = parsed_times

# Drop rows without a usable timestamp, then order chronologically within each machine
df = df.dropna(subset=[time_col])
df = df.sort_values([id_col, time_col])
df = df.reset_index(drop=True)

# Basic EDA: share of missing values per column, 20 worst columns first
missing_ratio_by_col = df.isna().mean()
missing = missing_ratio_by_col.sort_values(ascending=False).head(20)
print("Top missing columns:")
display(missing.to_frame('missing_ratio'))
    

In [None]:

# --- Resample/regularize per machine (optional)
# If you have sub-minute readings, configure base_cfg['freq'] accordingly.
freq = base_cfg.get('freq', None)
if freq:
    # Failure-flag columns (the same names the RUL labelling step searches for) must not
    # be averaged: the mean of a 0/1 flag over a bin is a fraction that no longer equals 1,
    # so the later failure check would miss the event. Numeric/bool flags are therefore
    # aggregated with `max` ("did any failure occur in this bin?").
    failure_names = {'failure', 'fail', 'is_failure', 'breakdown', 'fault'}
    flag_cols = [
        c for c in df.columns
        if str(c).lower() in failure_names
        and c not in (time_col, id_col, '__line')
        and pd.api.types.is_numeric_dtype(df[c])
    ]

    # numeric-only resample (mean); keep id, line, and other constant columns forward-filled
    non_num = df.select_dtypes(exclude='number').columns.tolist()
    non_num = [c for c in non_num if c != time_col and c not in flag_cols]
    # id/line are always treated as constants; '__line' is only added when it exists
    non_num = list(dict.fromkeys(non_num + [c for c in (id_col, '__line') if c in df.columns]))
    num = df.select_dtypes(include='number').columns.tolist()
    num = [c for c in num if c not in (id_col, '__line', time_col) and c not in flag_cols]

    out_frames = []
    for gid, g in df.groupby(id_col):
        g = g.set_index(time_col).sort_index()
        parts = [
            g[non_num].resample(freq).ffill().bfill(),
            g[num].resample(freq).mean(),
        ]
        if flag_cols:
            parts.append(g[flag_cols].resample(freq).max())
        g_ = pd.concat(parts, axis=1)
        # resampling can leave gaps in the id column; restore the group's id everywhere
        g_[id_col] = gid
        out_frames.append(g_.reset_index())

    # pd.concat raises on an empty list, so leave df untouched when there were no groups
    if out_frames:
        df = pd.concat(out_frames, ignore_index=True).sort_values([id_col, time_col]).reset_index(drop=True)

print("After optional resample:", df.shape)
    

In [None]:

# --- Build RUL Labels
# Strategy:
# 1) If a binary failure column exists (e.g., 'failure'==1), compute RUL as time to next failure (minutes).
# 2) Otherwise, allow threshold-based failure criteria (placeholder).
# 3) Records after the last failure per machine will have NaN RUL by default (can be trimmed or imputed).

# Column names (case-insensitive) that are treated as a binary failure flag
known_flag_names = {'failure', 'fail', 'is_failure', 'breakdown', 'fault'}
possible_failure_cols = list(filter(lambda name: name.lower() in known_flag_names, df.columns))

# Use the first match in column order; None means "no explicit failure flag"
failure_col = None
if possible_failure_cols:
    failure_col = possible_failure_cols[0]

def minutes_until_next_failure(g: pd.DataFrame, time_col: str, failure_col: str):
    """Return, for every row of `g`, the minutes until the next failure row.

    A row counts as a failure when its `failure_col` value equals 1, True or
    the string '1'. "Next" is positional: the nearest failure at or after the
    row in `g`'s current row order, so `g` should already be sorted by time.
    A failure row itself gets 0.0.

    Parameters
    ----------
    g : pd.DataFrame
        Records of one machine; `g[time_col]` must be datetime-like.
    time_col : str
        Name of the timestamp column.
    failure_col : str
        Name of the failure-flag column; a falsy value means "no flag".

    Returns
    -------
    np.ndarray
        Float array of length `len(g)`; NaN for rows with no later failure.

    Notes
    -----
    Time differences are taken directly on the datetime values, so the result
    does not depend on the column's storage resolution and keeps sub-second
    precision (the previous integer-seconds arithmetic truncated it).
    """
    if not failure_col or len(g) == 0:
        return np.full(len(g), np.nan)

    times = g[time_col]
    flags = g[failure_col]
    # Two comparisons instead of isin(): same matches as `value in [1, True, '1']`
    # without mixing types inside one lookup table.
    is_failure = (flags == 1) | (flags == '1')

    # Keep timestamps only on failure rows, then pull each one backwards so every
    # earlier row sees the time of the first failure at or after it.
    next_failure_time = times.where(is_failure).bfill()
    minutes = (next_failure_time - times).dt.total_seconds() / 60.0
    return minutes.to_numpy(dtype=float)

if failure_col:
    # Fill RUL machine by machine through positional row indices.
    # groupby.apply is deliberately avoided here: when every group's result shares the
    # same index (e.g. the data holds a single machine) pandas stacks the returned
    # Series into a wide DataFrame, which cannot be assigned to one column.
    rul_values = np.full(len(df), np.nan, dtype=float)
    for positions in df.groupby(id_col).indices.values():
        rul_values[positions] = minutes_until_next_failure(df.iloc[positions], time_col, failure_col)
    # Rows whose id is missing belong to no group and keep NaN
    df['RUL'] = rul_values
else:
    # Placeholder: no explicit failure flag. You can implement your own logic here.
    # Example: mark failures when some sensor exceeds a threshold, then compute RUL similarly.
    df['RUL'] = np.nan
    print("No failure column found. `RUL` set to NaN; please implement your criteria.")

display(df[[id_col, time_col, 'RUL']].head(10))
    

In [None]:

# --- Clean tails: optionally drop rows with NaN RUL (after last failure per machine)
# Toggle: set to False to keep rows that have no RUL label
drop_na_rul = True
if drop_na_rul:
    before = len(df)
    # Keep only labeled rows; copy so later edits do not touch a slice of the old frame
    has_label = df['RUL'].notna()
    df = df.loc[has_label].copy()
    after = len(df)
    print(f"Dropped {before-after} rows with NaN RUL (post-last-failure tail).")
    

In [None]:

# --- Save snapshot
out_path = PROCESSED_DIR / 'rul_labeled.parquet'
# The row index carries no information after the earlier resets, so it is not written
df.to_parquet(out_path, index=False)
print("Saved:", out_path, "| shape:", df.shape)

# quick sanity: per-machine RUL distribution (row count, min, max, median)
rul_by_machine = df.groupby(id_col)['RUL']
summary = rul_by_machine.agg(['count', 'min', 'max', 'median'])
summary = summary.reset_index()
display(summary.head(10))
    