In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [42]:
# Load synthetic data 
data_path = "../data/synthetic_data.csv"
df = pd.read_csv(data_path)

# Add rolling statistics for degradation trends
window = 6  # 1-hour rolling window (since 6 samples = 1 hour)
df['temp_rolling_avg'] = df.groupby('machine_id')['temperature'].transform(
    lambda x: x.rolling(window=window, min_periods=1).mean()
)
df['vibration_rolling_max'] = df.groupby('machine_id')['vibration'].transform(
    lambda x: x.rolling(window=window, min_periods=1).max()
)

# Drop rows with NaN (from rolling stats)
df.dropna(inplace=True)

# Split into features (X) and target (y)
X = df[['temperature', 'pressure', 'vibration', 'temp_rolling_avg', 'vibration_rolling_max']]
y = df['failure']

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [43]:
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize models
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
xgb = XGBClassifier(n_estimators=100, random_state=42, scale_pos_weight=len(y_train[y_train==0])/len(y_train[y_train==1]))

# Train models
rf.fit(X_train_scaled, y_train)
xgb.fit(X_train_scaled, y_train)

# Predictions
y_pred_rf = rf.predict(X_test_scaled)
y_pred_xgb = xgb.predict(X_test_scaled)

def evaluate_model(y_true, y_pred, model_name):
    print(f"Evaluation for {model_name}:")
    print(classification_report(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

evaluate_model(y_test, y_pred_rf, "Random Forest")
evaluate_model(y_test, y_pred_xgb, "XGBoost")

Evaluation for Random Forest:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      2483
           1       0.82      0.38      0.52       109

    accuracy                           0.97      2592
   macro avg       0.90      0.69      0.75      2592
weighted avg       0.97      0.97      0.96      2592

Confusion Matrix:
 [[2474    9]
 [  68   41]]
Evaluation for XGBoost:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2483
           1       0.62      0.60      0.61       109

    accuracy                           0.97      2592
   macro avg       0.80      0.79      0.80      2592
weighted avg       0.97      0.97      0.97      2592

Confusion Matrix:
 [[2444   39]
 [  44   65]]


In [44]:
# A/B test simulation on sample with and without model

failures_in_test_set = y_test.sum()  # 109 failures
cost_control = failures_in_test_set * 10000  
cost_test = (0.60 * failures_in_test_set * 2000) + (0.15 * failures_in_test_set * 10000)  # $0.45M
savings = cost_control - cost_test  # $1.32M savings
print(f"Estimated savings: ${savings/1e6:.2f}M per 2000 samples")

Estimated savings: $0.80M per 2000 samples
