
# 🛠️ Equipment Failure Prediction (Predictive Maintenance)
**Project Goal:** Predict the *Remaining Useful Life (RUL)* of an oilfield pump using sensor data.

---
### Objectives
- Perform exploratory data analysis (EDA) on sensor readings
- Train regression models to predict RUL
- Use cross-validation for model evaluation
- Apply SHAP for feature importance and interpretability
- Visualize results with interactive plots


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import plotly.express as px

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [None]:

# Read the synthetic equipment-failure CSV (path is relative to the
# notebook's working directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='equipment_failure_data.csv')
df.head(5)


In [None]:

# Basic info and summary
display(df.describe())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None".
df.info()

# Correlation heatmap
# Only numeric/boolean columns are correlated: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (presumably 'timestamp' is read as
# text here, since read_csv was not asked to parse dates — TODO confirm).
plt.figure(figsize=(10, 6))
sns.heatmap(
    df.select_dtypes(include=["number", "bool"]).corr(),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
)
plt.title("Correlation Heatmap of Sensor Data")
plt.show()

# Example sensor trend: three sensor channels over the first 500 rows.
fig = px.line(
    df.head(500),
    x='timestamp',
    y=['pressure', 'temperature', 'vibration'],
    title='Sensor Trends (first 500 hours)',
)
fig.show()


In [None]:

# Columns fed to the models, and the column to be predicted.
features = [
    'pressure',
    'temperature',
    'vibration',
    'flow_rate',
    'current_draw',
    'run_time_hours',
    'maintenance_flag',
]
target = 'RUL'

X, y = df[features], df[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): train_test_split shuffles rows by default. If consecutive rows
# are readings from the same pump over time, a time-ordered split may give a
# more honest estimate — confirm against how the data was generated.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:

# Random forest: 200 trees, seeded for reproducibility; fitted on the
# training split and scored on the held-out rows.
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Gradient-boosted trees (XGBoost) trained and scored on the same split.
xgb = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Evaluate models
def evaluate(model_name, y_true, y_pred) -> None:
    """Print the RMSE and R² of a set of predictions on one line.

    Args:
        model_name: Label placed at the start of the printed line.
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        None. The metrics are only printed, not returned.
    """
    r2 = r2_score(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print(f"{model_name} — RMSE: {rmse:.2f}, R²: {r2:.3f}")

# Report hold-out metrics for each fitted model, in the same order as before.
for _label, _preds in (("Random Forest", y_pred_rf), ("XGBoost", y_pred_xgb)):
    evaluate(_label, y_test, _preds)


In [None]:

# 5-fold cross-validation of the random forest, scored with R².
# Folds are shuffled with a fixed seed so the scores are reproducible.
# Note that this runs over the full X / y, not just the training split.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
cv_scores = cross_val_score(rf, X, y, cv=kf, scoring='r2')

print("Cross-Validation R² Scores:", cv_scores)
print("Average R²:", cv_scores.mean())


In [None]:

# Explain the fitted random forest on the held-out rows with SHAP's
# tree-specific explainer.
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)

# Bar-style summary plot of the SHAP values per feature.
shap.summary_plot(
    shap_values,
    features=X_test,
    plot_type="bar",
)


In [None]:

# Scatter of random-forest predictions against the true RUL on the test split.
fig = px.scatter(
    x=y_test,
    y=y_pred_rf,
    labels={'x': 'Actual RUL', 'y': 'Predicted RUL'},
    title='Predicted vs Actual RUL (Random Forest)',
)
# Dotted y = x reference line from the origin up to the largest actual RUL;
# points on this line are exact predictions.
fig.add_shape(
    type='line',
    x0=0,
    y0=0,
    x1=max(y_test),
    y1=max(y_test),
    line={'dash': 'dot'},
)
fig.show()

# Random-forest feature importances, largest first, as a bar chart.
importances = pd.Series(rf.feature_importances_, index=features)
importances = importances.sort_values(ascending=False)
fig = px.bar(importances, title="Feature Importance (Random Forest)")
fig.show()
