# 02 - Data Preprocessing



## Objective
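Drop empty, unnamed, and timestamp-like columns; coerce values to numeric types; infer the unit and cycle indices from the RUL column; impute missing sensor values; and create the binary failure label. Persist the preprocessed dataset as a Parquet file.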

---

### Run order
Execute the notebooks in numeric order: 01 → 02 → 03 → 04 → 05. Each notebook mounts Google Drive and reads/writes its intermediate artifacts there, so a notebook can be re-run on its own once the artifacts it reads already exist.

---


In [None]:

import pandas as pd
import numpy as np
import os
from google.colab import drive

# Mount Google Drive only when the MyDrive folder is not visible yet
if os.path.exists('/content/drive/MyDrive'):
    print('Drive already mounted!')
else:
    drive.mount('/content/drive')

# Locations of the intermediate artifacts on Google Drive:
# RAW_IN is read by this notebook, PREP_OUT is written at the end of it.
RAW_IN = '/content/drive/MyDrive/dsp-poc/data/df_raw.parquet'
PREP_OUT = '/content/drive/MyDrive/dsp-poc/data/df_preprocessed.parquet'

# Read the raw snapshot and report its (rows, columns) shape
df = pd.read_parquet(RAW_IN)
raw_shape = df.shape
print('Loaded raw snapshot:', raw_shape)

# Drop columns that carry no information: all-NaN columns, columns whose name
# is blank after stripping whitespace, and columns whose name contains 'Unnamed'
df = df.dropna(axis=1, how='all')
df = df.loc[:, ~(df.columns.astype(str).str.strip() == '')]
df = df.drop(columns=[c for c in df.columns if 'Unnamed' in str(c)], errors='ignore')

# Drop timestamp-like columns (name contains 'timestamp', case-insensitive).
# The label is passed through str() like the checks above, so a non-string
# column label cannot raise AttributeError on .lower().
df = df.drop(columns=[c for c in df.columns if 'timestamp' in str(c).lower()], errors='ignore')

# The last remaining column is treated as the RUL target
RUL_COL = df.columns[-1]
print('Using last column as RUL:', RUL_COL)

# Coerce the target to numeric (unparseable values become NaN), keep only the
# rows that still have a target, renumber the index and name the column 'RUL'
df[RUL_COL] = pd.to_numeric(df[RUL_COL], errors='coerce')
has_target = df[RUL_COL].notna()
df = (
    df.loc[has_target]
    .reset_index(drop=True)
    .rename(columns={RUL_COL: 'RUL'})
)

# Infer unit boundaries from the RUL column: a row whose RUL is strictly
# greater than the previous row's RUL starts a new unit; an equal or lower
# value stays in the current unit. Units are numbered from 1.
# NOTE(review): this assumes rows are stored unit by unit with RUL never
# increasing inside a unit -- confirm against the raw data.
rul_vals = df['RUL'].to_numpy(dtype=float)

# Vectorised replacement for the per-row Python loop: flag every row that
# starts a new unit (the first row never does), then a running count of the
# flags gives the zero-based unit index.
starts_new_unit = np.zeros(len(rul_vals), dtype=bool)
starts_new_unit[1:] = rul_vals[1:] > rul_vals[:-1]

df['unit'] = np.cumsum(starts_new_unit) + 1

# Cycle index: 1-based position of each row inside its unit
rows_by_unit = df.groupby('unit')
df['cycle'] = rows_by_unit.cumcount().add(1)

n_units = df['unit'].nunique()
print('Detected units:', n_units)

# Every column that is neither the target nor a meta column counts as a sensor
exclude_cols = ['RUL', 'unit', 'cycle']
is_sensor = ~df.columns.isin(exclude_cols)
sensor_cols = df.columns[is_sensor].tolist()
print('Sensor columns count:', len(sensor_cols))

def _fill_within_unit(values):
    """Forward-fill, then backward-fill, the gaps of one unit's values."""
    return values.ffill().bfill()


# Coerce every sensor column to numeric; unparseable values become NaN
df[sensor_cols] = df[sensor_cols].apply(pd.to_numeric, errors='coerce')

# Impute inside each unit first (ffill, then bfill) ...
sensors_by_unit = df.groupby('unit')[sensor_cols]
df[sensor_cols] = sensors_by_unit.transform(_fill_within_unit)

# ... then fill whatever is still missing with the column's global median
global_medians = df[sensor_cols].median()
df[sensor_cols] = df[sensor_cols].fillna(global_medians)

# Binary label: 1 when the remaining useful life is at most HORIZON, else 0
HORIZON = 30
within_horizon = df['RUL'].le(HORIZON)
df['fail_in_H'] = within_horizon.astype(int)

# Write the preprocessed frame to PREP_OUT as Parquet and report the location
df.to_parquet(path=PREP_OUT)
print('Saved preprocessed dataset to:', PREP_OUT)


Mounted at /content/drive
Loaded raw snapshot: (166441, 53)
Using last column as RUL: rul
Detected units: 7
Sensor columns count: 50
Saved preprocessed dataset to: /content/drive/MyDrive/dsp-poc/data/df_preprocessed.parquet
