In [1]:
import pandas as pd
import numpy as np
import os

# ---------------------------
# 1. Load raw dataset
# ---------------------------
# The path is relative, so it resolves against the kernel's working directory.
raw_path = "../data/raw/industrial_sensor_data_final.csv"
df = pd.read_csv(filepath_or_buffer=raw_path)

# Sanity report: frame dimensions, a preview of the first rows, and the
# distribution of the maintenance label.
print("Raw dataset loaded:", df.shape)
print(df.head(), df["maintenance_flag"].value_counts(), sep="\n")

# ---------------------------
# 2. Sort + timestamp conversion
# ---------------------------
# Parse the timestamp column to datetime, then order rows by machine and
# time. The grouped rolling/diff features computed below work in row order
# within each machine, so this ordering has to happen first.
df = (
    df.assign(timestamp=pd.to_datetime(df["timestamp"]))
    .sort_values(by=["machine_id", "timestamp"])
)

# ---------------------------
# 3. Feature Engineering
# ---------------------------

# Rolling statistics, computed independently per machine over a trailing
# window of ROLL_WINDOW rows. min_periods=1 lets the window start emitting
# values from a machine's first row instead of waiting for a full window.
ROLL_WINDOW = 15

# Rolling mean of temperature.
temp_by_machine = df.groupby("machine_id")["temperature"]
temp_windows = temp_by_machine.rolling(ROLL_WINDOW, min_periods=1)
# The grouped result is indexed by (machine_id, original row label); drop
# the machine level so the values align back onto df's own index.
df["temp_roll_mean"] = temp_windows.mean().droplevel(0)

# Rolling standard deviation of vibration. A one-sample window has no
# sample std, so each machine's first row is NaN here.
vib_by_machine = df.groupby("machine_id")["vibration"]
vib_windows = vib_by_machine.rolling(ROLL_WINDOW, min_periods=1)
df["vib_roll_std"] = vib_windows.std().droplevel(0)

# Diffs
# Row-to-row change of each sensor within a machine. The first row of every
# machine has no predecessor, so its change is recorded as 0.
sensor_deltas = (
    df.groupby("machine_id")[["temperature", "pressure"]].diff().fillna(0)
)
df["temp_diff"] = sensor_deltas["temperature"]
df["pressure_diff"] = sensor_deltas["pressure"]

# Time since maintenance
# Seconds elapsed since the machine's most recent maintenance event, where a
# row with maintenance_flag == 1 is treated as an event. (The previous
# version differenced consecutive timestamps, which yields the sampling
# interval and never looks at maintenance_flag.)
#
# Relies on df already being sorted by machine_id and timestamp, and on
# timestamp being a datetime column, so the forward fill moves forward in time.

# Timestamp of each maintenance row, NaT elsewhere, carried forward within
# each machine so every row sees the latest maintenance at or before it.
last_maint_ts = (
    df["timestamp"]
    .where(df["maintenance_flag"] == 1)
    .groupby(df["machine_id"])
    .ffill()
)

# Rows before a machine's first recorded maintenance have no reference
# event. Measure those from the machine's first observation instead of
# writing 0, which would wrongly read as "maintenance just happened".
first_seen_ts = df.groupby("machine_id")["timestamp"].transform("min")

df["time_since_last_maint"] = (
    (df["timestamp"] - last_maint_ts.fillna(first_seen_ts))
    .dt.total_seconds()
    .fillna(0)  # only reachable when a timestamp itself is missing (NaT)
)

# Replace NaN
# The rolling std computed above is NaN on each machine's first row (a single
# sample has no sample standard deviation); those become 0 here.
# NOTE(review): this fills every column, so any missing raw sensor reading
# would also be written out as 0 -- confirm that is intended.
df = df.fillna(value=0)

# ---------------------------
# 4. Save processed dataset
# ---------------------------
processed_dir = "../data/processed"
processed_path = "../data/processed/industrial_sensor_cleaned.csv"

# Create the output folder if it is missing; a no-op when it already exists.
os.makedirs(processed_dir, exist_ok=True)

# index=False: the row index is not written to the CSV.
df.to_csv(path_or_buf=processed_path, index=False)

print("Processed dataset saved to:", processed_path)
print("Final shape:", df.shape)

Raw dataset loaded: (8640, 7)
             timestamp machine_id  temperature  vibration  pressure    speed  \
0  2025-11-01 00:00:00        M01       60.139     0.2986    24.948  1199.06   
1  2025-11-01 00:00:00        M02       63.658     0.3595    26.426  1299.46   
2  2025-11-01 00:00:00        M03       66.033     0.3842    28.007  1397.31   
3  2025-11-01 00:00:00        M04       69.235     0.4474    29.623  1503.78   
4  2025-11-01 00:00:00        M05       71.875     0.4892    31.083  1606.13   

   maintenance_flag  
0                 0  
1                 0  
2                 0  
3                 0  
4                 0  
maintenance_flag
0    8616
1      24
Name: count, dtype: int64
Processed dataset saved to: ../data/processed/industrial_sensor_cleaned.csv
Final shape: (8640, 12)
