# AI4I 2020 Dataset Preprocessing
This notebook loads the AI4I 2020 predictive maintenance dataset, cleans it, performs basic feature processing, and saves a clean version for model training. The processing includes label encoding of the failure type, removing constant columns, and an optional synthetic RUL (remaining useful life) approximation so that this dataset can be modeled consistently alongside other datasets.

In [1]:
import pandas as pd
import numpy as np
import os

# Read the raw AI4I 2020 CSV into `df` and preview the first rows
raw_csv_path = '../data/ai4i2020/ai4i2020.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [3]:
# Drop the UDI column if it is present.
# errors='ignore' makes this a no-op when the column is absent, and
# reassigning (rather than inplace=True) avoids mutating the loaded frame.
df = df.drop(columns=['UDI'], errors='ignore')

# Create a derived FailureType column for single-label classification
def collapse_failure_types(row):
    """Return the name of the first failure-mode flag equal to 1 in ``row``.

    The flags are checked in the fixed order TWF, HDF, PWF, OSF, RNF, so a
    row with several flags set is labelled with the earliest one in that
    order. Returns 'NoFailure' when none of the flags equals 1.

    ``row`` only needs to support ``row[name]`` lookups for those flag names.
    """
    failure_flags = ('TWF', 'HDF', 'PWF', 'OSF', 'RNF')
    # next() stops at the first match, so later flags are never looked up.
    return next((flag for flag in failure_flags if row[flag] == 1), 'NoFailure')

# Collapse the five failure-mode flags into a single label per row.
df['FailureType'] = df.apply(collapse_failure_types, axis=1)

# Pin the category list so the integer codes are stable across runs and data
# subsets, even when some failure mode does not occur. The alphabetical order
# matches what astype('category') infers when every label is present, so
# existing codes are unchanged (e.g. 'NoFailure' -> 1).
FAILURE_TYPE_CATEGORIES = ['HDF', 'NoFailure', 'OSF', 'PWF', 'RNF', 'TWF']
df['FailureType'] = pd.Categorical(df['FailureType'], categories=FAILURE_TYPE_CATEGORIES)
df['FailureTypeLabel'] = df['FailureType'].cat.codes

# Optional: add a synthetic RUL estimate (for compatibility with other datasets).
# RUL is the assumed tool life minus the accumulated tool wear, floored at 0.
# NOTE(review): 100 is example logic; every row with tool wear above it is
# clipped to RUL 0 -- confirm this is the intended horizon.
MAX_TOOL_LIFE_MIN = 100
df['RUL'] = (MAX_TOOL_LIFE_MIN - df['Tool wear [min]']).clip(lower=0)

# Save the cleaned frame (without the index) and preview the result.
os.makedirs('../data/processed', exist_ok=True)
df.to_csv('../data/processed/ai4i_cleaned.csv', index=False)
df.head()

Unnamed: 0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,FailureType,FailureTypeLabel,RUL
0,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0,NoFailure,1,100
1,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0,NoFailure,1,97
2,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0,NoFailure,1,95
3,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0,NoFailure,1,93
4,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0,NoFailure,1,91
