
# 00 — Project Overview & Setup (PDM: RUL + CoF)

This notebook wires up the **Predictive Maintenance** project for two tasks:

- **RUL Regression** (CNN, LSTM, XGBoost)
- **Chance of Failure (CoF) Classification** (CNN, LSTM, XGBoost)

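It sets project paths, creates default configs (if missing) and loads the base config, verifies data availability, initializes a lightweight experiment registry, and builds a **data dictionary** preview for Line 10. Line 20 gets a config file here, but its data is not loaded or profiled in this notebook.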


In [1]:

# --- Imports & Path Setup
import shutil
import sys, os, json
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd
import yaml

# Project root = two levels up from this notebook if you keep the suggested structure
# Fallback: current working directory
NB_PATH = Path.cwd()
ROOT = NB_PATH.parent if NB_PATH.name.lower() in {'rul','cof','notebooks'} else NB_PATH
ROOT = ROOT if (ROOT / 'notebooks').exists() else NB_PATH

DATA_DIR = ROOT / 'data'
RAW_DIR = DATA_DIR / 'raw'
INTERIM_DIR = DATA_DIR / 'interim'
PROCESSED_DIR = DATA_DIR / 'processed'
CONFIGS_DIR = ROOT / 'configs'
EXPERIMENTS_DIR = ROOT / 'experiments'
SRC_DIR = ROOT / 'src'

for d in [DATA_DIR, RAW_DIR, INTERIM_DIR, PROCESSED_DIR, CONFIGS_DIR, EXPERIMENTS_DIR, SRC_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Allow 'src' imports even if not installed as a package yet
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

print(f"ROOT       : {ROOT}")
print(f"DATA_DIR   : {DATA_DIR}")
print(f"CONFIGS_DIR: {CONFIGS_DIR}")
print(f"EXPERIMENTS: {EXPERIMENTS_DIR}")
print(f"SRC        : {SRC_DIR}")


ROOT       : d:\Richard Files\WORK\pdm-project
DATA_DIR   : d:\Richard Files\WORK\pdm-project\data
CONFIGS_DIR: d:\Richard Files\WORK\pdm-project\configs
EXPERIMENTS: d:\Richard Files\WORK\pdm-project\experiments
SRC        : d:\Richard Files\WORK\pdm-project\src


In [2]:

# --- Config helpers & defaults
BASE_CFG = CONFIGS_DIR / 'base.yaml'
TASK_RUL = CONFIGS_DIR / 'task_rul.yaml'
TASK_COF = CONFIGS_DIR / 'task_cof.yaml'
ALGO_CNN = CONFIGS_DIR / 'algo_cnn.yaml'
ALGO_LSTM = CONFIGS_DIR / 'algo_lstm.yaml'
ALGO_XGB = CONFIGS_DIR / 'algo_xgb.yaml'
LINE10 = CONFIGS_DIR / 'line10.yaml'
LINE20 = CONFIGS_DIR / 'line20.yaml'

def ensure_yaml(path: Path, content: dict):
    if not path.exists():
        with path.open('w', encoding='utf-8') as f:
            yaml.safe_dump(content, f, sort_keys=False)
        print(f"Created default config: {path.name}")
    else:
        print(f"Found config: {path.name}")

# Reasonable defaults — adjust as needed later
ensure_yaml(BASE_CFG, {
    'random_seed': 42,
    'time_index': 'timestamp',
    'id_col': 'machine_id',
    'target_rul': 'RUL',
    'target_cof': 'CoF',
    'freq': '1min',  # resample frequency if needed
})

ensure_yaml(TASK_RUL, {
    'task': 'RUL',
    'metrics': ['rmse','mae','r2','nasa','silhouette'],
    'train_val_test_split': {'method': 'time_series_split', 'n_splits': 3}
})

ensure_yaml(TASK_COF, {
    'task': 'CoF',
    'metrics': ['f1','recall','roc_auc'],
    'train_val_test_split': {'method': 'time_series_split', 'n_splits': 3}
})

ensure_yaml(ALGO_CNN, {
    'algo': 'cnn',
    'epochs': 50,
    'batch_size': 128,
    'learning_rate': 1e-3,
})

ensure_yaml(ALGO_LSTM, {
    'algo': 'lstm',
    'epochs': 50,
    'batch_size': 128,
    'learning_rate': 1e-3,
    'hidden_size': 64,
    'num_layers': 2
})

ensure_yaml(ALGO_XGB, {
    'algo': 'xgboost',
    'n_estimators': 500,
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.9,
    'colsample_bytree': 0.9
})

ensure_yaml(LINE10, {'line': 10, 'source': 'csv', 'path': str(RAW_DIR / 'Line10' / 'DM_Machine_Learning_Line_10.csv')})
ensure_yaml(LINE20, {'line': 20, 'source': 'csv', 'path': str(RAW_DIR / 'Line20' / 'DM_Machine_Learning_Line_20.csv')})

def load_yaml(path: Path) -> dict:
    with path.open('r', encoding='utf-8') as f:
        return yaml.safe_load(f)

# Parse base.yaml into `base_cfg` and echo it as a quick sanity check.
base_cfg = load_yaml(BASE_CFG)
print("Base config loaded:", base_cfg)


Created default config: base.yaml
Created default config: task_rul.yaml
Created default config: task_cof.yaml
Created default config: algo_cnn.yaml
Created default config: algo_lstm.yaml
Created default config: algo_xgb.yaml
Created default config: line10.yaml
Created default config: line20.yaml
Base config loaded: {'random_seed': 42, 'time_index': 'timestamp', 'id_col': 'machine_id', 'target_rul': 'RUL', 'target_cof': 'CoF', 'freq': '1min'}


In [4]:

# --- Data discovery & preview
# If a dataset exists in /mnt/data (as in chat attachment), place it under data/raw/Line10 for consistency.
mnt_line10 = Path('/mnt/data/DM_Machine_Learning_Line_10.csv')
proj_line10 = RAW_DIR / 'Line10' / 'DM_Machine_Learning_Line_10.csv'
proj_line10.parent.mkdir(parents=True, exist_ok=True)

# Copy only if the attachment exists and the project copy doesn't exist yet.
if mnt_line10.exists() and not proj_line10.exists():
    # Stream into a temporary sibling and rename afterwards: the file is never held
    # in memory as a whole, and an interrupted copy cannot leave a truncated CSV
    # that a later run would mistake for the real dataset.
    partial_copy = proj_line10.with_name(proj_line10.name + '.part')
    try:
        shutil.copyfile(mnt_line10, partial_copy)
        partial_copy.replace(proj_line10)
        print(f"Copied Line 10 CSV into project: {proj_line10}")
    except OSError as e:
        if partial_copy.exists():
            partial_copy.unlink()
        print("Copy skipped, reason:", e)

# Attempt to load Line 10 (if present); df10 stays None when loading is not possible.
df10 = None
if proj_line10.exists():
    try:
        # low_memory=False makes pandas infer each column's dtype from the whole file
        # instead of chunk by chunk. The saved output of this cell shows a warning
        # pointing at this call (most likely the mixed-types DtypeWarning).
        df10 = pd.read_csv(proj_line10, low_memory=False)
        print(f"Loaded Line 10 shape: {df10.shape}")
        display(df10.head())
    except Exception as e:
        # Deliberately best-effort: the notebook continues without Line 10.
        print("Failed to read Line 10 CSV:", e)
else:
    print("Line 10 CSV not found yet — expected at:", proj_line10)


Loaded Line 10 shape: (86400, 78)


  df10 = pd.read_csv(proj_line10)


Unnamed: 0,Timestamp,Mesin,Energy_Sectors.Extruder_10_Target,EXT_10.Extruder_Load,EXT_10.Machine_Hour,EXT_10.Machine_Run,EXT_10.Melt_Press,EXT_10.Melt_Temp_1,EXT_10.Motor_Extruder_Run,EXT_10.Panel_Humidity,...,PM_Extruder_10.V_avg,PM_Extruder_10.W_tot,Start_Time,End_Time,Level_1,Level_2,Level_3,Detail,rn,Breakdown
0,2025-01-01 00:00:00.000,10,,,,,,,,,...,,,,,,,,,1,0
1,2025-01-01 00:01:00.000,10,,,,,,,,,...,,,,,,,,,1,0
2,2025-01-01 00:02:00.000,10,,,,,,,,,...,,,,,,,,,1,0
3,2025-01-01 00:03:00.000,10,,,,,,,,,...,,,,,,,,,1,0
4,2025-01-01 00:04:00.000,10,,,,,,,,,...,,,,,,,,,1,0


In [5]:

# --- Build a quick data dictionary for any loaded frame
def make_data_dictionary(df: pd.DataFrame, n_cat_top=10) -> pd.DataFrame:
    """Summarise every column of ``df`` in one row of a profiling table.

    Each row holds ``column``, ``dtype``, ``n_missing``, ``missing_pct``
    (rounded to 2 decimals) and ``n_unique`` (missing values excluded).
    Numeric and timedelta columns additionally get ``min``/``max``/``mean``/``std``;
    all other columns get ``top_values``, a ``"value:count; ..."`` string of the
    ``n_cat_top`` most frequent values.

    Args:
        df: Frame to profile. Duplicate column labels are profiled separately.
        n_cat_top: How many of the most frequent values to list for
            non-numeric columns.

    Returns:
        One row per column of ``df``. For a frame without columns, an empty
        frame that still carries the five base columns.
    """
    base_columns = ['column', 'dtype', 'n_missing', 'missing_pct', 'n_unique']
    n_rows = len(df)
    info = []

    # .items() yields one Series per column even when labels repeat, whereas
    # df[label] would return a DataFrame for a duplicated label.
    for col, s in df.items():
        n_missing = int(s.isna().sum())
        missing_pct = float(100 * n_missing / n_rows) if n_rows else 0.0

        entry = {
            'column': col,
            'dtype': str(s.dtype),
            'n_missing': n_missing,
            'missing_pct': round(missing_pct, 2),
            'n_unique': int(s.nunique(dropna=True)),
        }

        # pandas' dtype predicates also understand extension dtypes (Int64, category,
        # tz-aware datetimes, ...), on which np.issubdtype raises TypeError.
        # Booleans are kept out of the numeric branch and timedeltas kept in it,
        # matching what np.issubdtype(dtype, np.number) reports for plain NumPy dtypes.
        is_numeric = (
            pd.api.types.is_numeric_dtype(s.dtype)
            and not pd.api.types.is_bool_dtype(s.dtype)
        ) or pd.api.types.is_timedelta64_dtype(s.dtype)

        if is_numeric:
            entry.update({
                'min': s.min(skipna=True),
                'max': s.max(skipna=True),
                'mean': s.mean(skipna=True),
                'std': s.std(skipna=True),
            })
        else:
            # capture top categories for non-numeric columns
            vc = s.value_counts(dropna=True).head(n_cat_top)
            entry['top_values'] = "; ".join(f"{k}:{int(v)}" for k, v in vc.items())

        info.append(entry)

    if not info:
        return pd.DataFrame(columns=base_columns)
    return pd.DataFrame(info)

# Profile Line 10 only when the load cell actually produced rows.
if df10 is None or df10.empty:
    print("No Line 10 dataframe available to profile.")
else:
    dd10 = make_data_dictionary(df10)

    # Keep a copy on disk for reference.
    dd_path = PROCESSED_DIR / 'data_dictionary_line10.csv'
    dd10.to_csv(dd_path, index=False)
    print(f"Data dictionary saved to: {dd_path}")

    # Use the optional `caas_jupyter_tools` viewer when it can be imported and called;
    # on any failure fall back to the standard display of the first 20 rows.
    try:
        from caas_jupyter_tools import display_dataframe_to_user
        display_dataframe_to_user("Data Dictionary — Line 10", dd10)
    except Exception:
        display(dd10.head(20))


Data dictionary saved to: d:\Richard Files\WORK\pdm-project\data\processed\data_dictionary_line10.csv


Unnamed: 0,column,dtype,n_missing,missing_pct,n_unique,top_values,min,max,mean,std
0,Timestamp,object,0,0.0,86400,2025-03-01 23:20:00.000:1; 2025-03-01 23:21:00...,,,,
1,Mesin,int64,0,0.0,1,,10.0,10.0,10.0,0.0
2,Energy_Sectors.Extruder_10_Target,float64,1982,2.29,2,,0.185,0.193,0.1915941,0.003044759
3,EXT_10.Extruder_Load,float64,7801,9.03,3332,,0.0,78.41,63.01244,14.1375
4,EXT_10.Machine_Hour,float64,1982,2.29,1,,0.0,0.0,0.0,0.0
5,EXT_10.Machine_Run,float64,7802,9.03,2,,0.0,1.0,0.9593501,0.197479
6,EXT_10.Melt_Press,float64,7801,9.03,1643,,0.0,363.8,285.5382,62.78127
7,EXT_10.Melt_Temp_1,float64,7801,9.03,1421,,0.0,3276.7,216.8174,242.0476
8,EXT_10.Motor_Extruder_Run,float64,7802,9.03,2,,0.0,1.0,0.9593501,0.197479
9,EXT_10.Panel_Humidity,float64,2033,2.35,304,,40.7,72.1,52.90589,3.700098


In [6]:

# --- Experiment registry bootstrap
# Header shared by every task registry; the order here is the column order in runs.csv.
cols = [
    'timestamp','task','algo','line','seed','params_json',
    # RUL metrics
    'rmse','mae','r2','nasa','silhouette',
    # CoF metrics
    'f1','recall','roc_auc',
    # bookkeeping
    'dataset_hash','notes','artifact_dir'
]

for task in ('RUL', 'CoF'):
    task_dir = EXPERIMENTS_DIR / task
    artifacts = task_dir / 'artifacts'
    # parents=True creates the task folder together with its artifacts folder.
    artifacts.mkdir(parents=True, exist_ok=True)

    runs_csv = task_dir / 'runs.csv'
    if runs_csv.exists():
        print(f"Found experiment registry: {runs_csv}")
        continue

    # Header-only CSV: a registry with no runs yet.
    pd.DataFrame(columns=cols).to_csv(runs_csv, index=False)
    print(f"Created experiment registry: {runs_csv}")


Created experiment registry: d:\Richard Files\WORK\pdm-project\experiments\RUL\runs.csv
Created experiment registry: d:\Richard Files\WORK\pdm-project\experiments\CoF\runs.csv


In [7]:

# --- Utility to log experiment runs
# Registry schema used when runs.csv has to be created on the fly; keep it in sync
# with the "Experiment registry bootstrap" cell.
_RUN_COLUMNS = [
    'timestamp','task','algo','line','seed','params_json',
    # RUL metrics
    'rmse','mae','r2','nasa','silhouette',
    # CoF metrics
    'f1','recall','roc_auc',
    # bookkeeping
    'dataset_hash','notes','artifact_dir'
]


def log_run(task:str, algo:str, line:int, seed:int, params:dict,
            metrics:dict, dataset_hash:str='', notes:str='',
            experiments_dir=None) -> Path:
    """Append one run to ``<experiments>/<task>/runs.csv`` and create its artifact folder.

    Args:
        task: Task name; selects the sub-folder and registry (e.g. 'RUL', 'CoF').
        algo: Algorithm label stored in the row and used in the folder name.
        line: Production line number.
        seed: Random seed of the run.
        params: Hyper-parameters; stored as a JSON string in ``params_json``.
        metrics: Metric name -> value. Only names that are columns of the
            registry are stored; the others are reported and skipped.
        dataset_hash: Optional identifier of the data snapshot.
        notes: Optional free text.
        experiments_dir: Root folder of the registries. Defaults to the
            notebook-level ``EXPERIMENTS_DIR``.

    Returns:
        Path of the artifact folder created for this run.
    """
    base_dir = EXPERIMENTS_DIR if experiments_dir is None else Path(experiments_dir)
    task_dir = base_dir / task

    # Read the clock once so the folder name and the registry row describe the same
    # instant. datetime.utcnow() is deprecated; an aware UTC "now" replaces it.
    now_utc = datetime.now(timezone.utc)
    run_stamp = now_utc.strftime('%Y%m%dT%H%M%S')
    artifacts_dir = task_dir / 'artifacts' / f"{run_stamp}_{algo}_L{line}_s{seed}"
    artifacts_dir.mkdir(parents=True, exist_ok=True)

    runs_csv = task_dir / 'runs.csv'
    if not runs_csv.exists() or runs_csv.stat().st_size == 0:
        # Registry not bootstrapped yet (or empty): start it with the default header.
        pd.DataFrame(columns=_RUN_COLUMNS).to_csv(runs_csv, index=False)
    # Only the header is needed to align the new row; earlier runs are not loaded.
    schema = list(pd.read_csv(runs_csv, nrows=0).columns)

    row = {
        # Naive ISO string in UTC, same format as the rows written so far.
        'timestamp': now_utc.replace(tzinfo=None).isoformat(),
        'task': task, 'algo': algo, 'line': line, 'seed': seed,
        'params_json': json.dumps(params, ensure_ascii=False),
        'dataset_hash': dataset_hash, 'notes': notes,
        'artifact_dir': str(artifacts_dir)
    }
    # Merge metrics keys that exist in schema
    skipped = []
    for k, v in (metrics or {}).items():
        if k in schema:
            row[k] = v
        else:
            skipped.append(k)
    if skipped:
        print(f"Ignored metrics not in registry schema: {skipped}")

    # Append in the header's column order; existing rows stay untouched on disk.
    pd.DataFrame([row]).reindex(columns=schema).to_csv(
        runs_csv, mode='a', header=False, index=False)
    print(f"Logged run → {runs_csv.name}: {row['algo']} | {row['task']} | L{line} | seed={seed}")
    return artifacts_dir

print("`log_run` ready. Import this cell into training notebooks (or paste function there).")


`log_run` ready. Import this cell into training notebooks (or paste function there).



## Next steps (linked notebooks)

- `RUL/01_eda_data_prep_RUL.ipynb` — unify time index, handle missingness, align sensors vs. OEE, build/verify **RUL labels**.
- `RUL/02_feature_engineering_RUL.ipynb` — rolling stats, trend/lag, frequency features, normalization; snapshot to `data/processed`.
- `RUL/03_train_eval_RUL.ipynb` (+ `03a/b/c`) — parameterized training for **CNN/LSTM/XGBoost**, log metrics to `experiments/RUL/runs.csv`.
- `RUL/04_model_comparison_RUL.ipynb` — aggregate & visualize (RMSE/MAE/R²/NASA/Silhouette).

We will mirror the flow for **CoF** under `notebooks/CoF/`.
