In [None]:
# Basic Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Metrics
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve
)

# Imbalance Handling
from imblearn.over_sampling import SMOTE

# Model Saving
import joblib

In [None]:
# Colab-only step: `google.colab` is not importable outside a Colab runtime.
# `uploaded` holds whatever files.upload() returns; the visible cells never read
# it again -- presumably the upload just places the CSV where the next cell's
# relative read_csv path can find it (TODO confirm).
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the dataset from the current working directory and preview the first rows.
df = pd.read_csv("predictive_maintenance_dataset.csv")
df.head()

In [None]:
# Drop the same column set that load_data() (defined later in this notebook)
# labels as "leakage columns", so they cannot be used as features.
# NOTE: this reassigns `df`, so re-running this cell on its own raises KeyError
# because the columns are already gone -- re-run from the read_csv cell instead.
df = df.drop(columns=[
    'UDI',
    'Product ID',
    'TWF',
    'HDF',
    'PWF',
    'OSF',
    'RNF'
])

In [None]:
# Report the frame size and how many rows fall into each target class,
# then draw the same counts as a bar chart.
print("Shape:", df.shape)
print("\nClass Distribution:\n", df["Machine failure"].value_counts())

fig, ax = plt.subplots()
sns.countplot(x="Machine failure", data=df, ax=ax)
ax.set_title("Class Distribution")
plt.show()

In [None]:
# One-hot encode the categorical "Type" column; drop_first=True omits the first
# category's indicator so the remaining dummies are not redundant.
df = pd.get_dummies(df, columns=["Type"], drop_first=True)

In [None]:
# Separate features from the binary target column.
X = df.drop("Machine failure", axis=1)
y = df["Machine failure"]

# Hold out 20% of the rows for testing. stratify=y keeps the class ratio the
# same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [None]:
# Resample the training split only; X_test / y_test are deliberately left
# untouched so evaluation is done on real, un-resampled rows.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

In [None]:
# Baseline classifier: 300 trees, no depth limit, fixed seed for reproducibility.
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42
)

# Fit on the SMOTE-resampled training data produced in the previous cell.
model.fit(X_train_bal, y_train_bal)

In [None]:
# Hard labels plus the predicted probability of class 1 on the held-out test split.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

# The confusion matrix and report use the hard labels; ROC-AUC uses the probabilities.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_prob))

In [None]:
# ROC curve for the test-split probabilities, with the chance diagonal as a
# dashed reference line. The thresholds returned by roc_curve are not needed.
fpr, tpr, _ = roc_curve(y_test, y_prob)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0,1], [0,1], '--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
plt.show()

In [None]:
# Cross-validate on the ORIGINAL training split and let SMOTE run inside each
# fold. Scoring on X_train_bal / y_train_bal would leak information: synthetic
# points interpolated from rows of the validation fold end up in the training
# folds (and vice versa), which inflates the reported ROC-AUC.
# imblearn's Pipeline applies samplers during fit only, so every validation
# fold is scored on real, un-resampled rows.
from imblearn.pipeline import Pipeline as ImbPipeline  # aliased: sklearn's Pipeline is already imported

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# cross_val_score clones the estimator for every fold, so the already-fitted
# `model` object is only used as a hyper-parameter template and is not modified.
cv_pipeline = ImbPipeline(steps=[
    ("smote", SMOTE(random_state=42)),
    ("clf", model),
])

cv_scores = cross_val_score(
    cv_pipeline,
    X_train,
    y_train,
    cv=cv,
    scoring="roc_auc"
)

print("Cross Validation ROC-AUC:", cv_scores.mean())

In [None]:
# Impurity-based importances from the fitted forest. argsort is ascending, so the
# last ten positions are the ten largest and the biggest bar is drawn at the top.
importances = model.feature_importances_
indices = np.argsort(importances)[-10:]

fig, ax = plt.subplots(figsize=(8,5))
bar_positions = range(len(indices))
ax.barh(bar_positions, importances[indices])
ax.set_yticks(bar_positions)
ax.set_yticklabels(X.columns[indices])
ax.set_title("Top 10 Feature Importance")
plt.show()

In [None]:
!pip install shap

import shap

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

shap.summary_plot(shap_values[:, :, 1], X_test)

In [None]:
# Persist the fitted baseline forest as model.pkl in the current working directory.
joblib.dump(model, "model.pkl")

In [None]:
# Round-trip check: reload the file written by the previous cell.
# joblib.load unpickles its input, so only load files from a trusted source.
loaded_model = joblib.load("model.pkl")

In [None]:
# Re-score the test split with the reloaded model (overwrites y_pred / y_prob).
y_pred = loaded_model.predict(X_test)
y_prob = loaded_model.predict_proba(X_test)[:,1]

# Build a per-row audit table: features plus ground truth, hard label and
# class-1 probability. .values drops y_test's index so assignment is positional.
results = X_test.copy()
results["Actual"] = y_test.values
results["Predicted"] = y_pred
results["Failure_Probability"] = y_prob

results.head(100)

In [None]:
# First 10 test rows that the model labelled as failures (Predicted == 1).
results[results["Predicted"] == 1].head(10)

In [None]:
# Slice with 0:1 (not [0]) so `sample` stays a one-row DataFrame, which is the
# 2-D input shape predict / predict_proba expect.
sample = X_test.iloc[0:1]

prediction = loaded_model.predict(sample)
probability = loaded_model.predict_proba(sample)[:,1]

# Both results are length-1 arrays; index 0 unwraps the scalar for display.
print("Prediction:", prediction[0])
print("Failure Probability:", probability[0])

In [None]:
import pandas as pd

def load_data(path):
    """Read the dataset CSV at ``path`` and return a model-ready DataFrame.

    Steps, in order:
      1. read the CSV with pandas,
      2. drop the columns treated as leakage (``UDI``, ``Product ID``, ``TWF``,
         ``HDF``, ``PWF``, ``OSF``, ``RNF``),
      3. one-hot encode ``Type`` with the first category dropped.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the ``Type_*`` indicators.

    Raises
    ------
    KeyError
        If any of the dropped columns or ``Type`` is missing from the file.
    """
    leakage_columns = ['UDI', 'Product ID', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']

    return (
        pd.read_csv(path)
        .drop(columns=leakage_columns)
        .pipe(pd.get_dummies, columns=["Type"], drop_first=True)
    )

In [None]:
import joblib
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
import os

def train_model(data_path):
    """Tune, evaluate and save a random forest for the machine-failure target.

    Workflow:
      1. load and preprocess the CSV with ``load_data``;
      2. make a stratified 80/20 train/test split (seed 42);
      3. grid-search a ``SMOTE -> RandomForestClassifier`` pipeline with
         5-fold stratified CV scored by ROC-AUC;
      4. report ROC-AUC on the untouched test split;
      5. save the tuned forest to ``models/model.pkl``.

    SMOTE sits *inside* the cross-validated pipeline rather than being applied
    once up front. Resampling before the CV split lets synthetic points derived
    from a validation fold's rows leak into the training folds, which biases
    hyper-parameter selection towards over-fit settings.

    Parameters
    ----------
    data_path : str or path-like
        CSV file understood by ``load_data``.

    Returns
    -------
    sklearn.ensemble.RandomForestClassifier
        The best forest, refit on the SMOTE-resampled full training split.
        The same object is written to ``models/model.pkl``.
    """
    # Local import keeps this function self-contained; the alias avoids clashing
    # with sklearn's Pipeline, which the notebook imports elsewhere.
    from imblearn.pipeline import Pipeline as ImbPipeline

    df = load_data(data_path)

    X = df.drop("Machine failure", axis=1)
    y = df["Machine failure"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        stratify=y,
        random_state=42
    )

    # Handle imbalance inside the pipeline: imblearn applies samplers during
    # fit only, so each validation fold (and the test split) stays real data.
    pipeline = ImbPipeline(steps=[
        ("smote", SMOTE(random_state=42)),
        ("clf", RandomForestClassifier(random_state=42)),
    ])

    # Hyperparameter tuning ("clf__" routes each entry to the forest step)
    param_grid = {
        "clf__n_estimators": [200, 300],
        "clf__max_depth": [None, 10, 20],
        "clf__min_samples_split": [2, 5]
    }

    grid = GridSearchCV(
        pipeline,
        param_grid,
        # Shuffled, seeded folds -- consistent with the notebook's other CV cell.
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring="roc_auc",
        n_jobs=-1
    )

    grid.fit(X_train, y_train)

    # With the default refit=True the best pipeline is refit on the whole
    # training split (SMOTE, then forest). Extract the forest so the saved
    # artifact and the return value remain a plain RandomForestClassifier,
    # which downstream cells pass to shap.TreeExplainer and predict_proba.
    best_model = grid.best_estimator_.named_steps["clf"]

    y_prob = best_model.predict_proba(X_test)[:,1]
    roc_score = roc_auc_score(y_test, y_prob)

    # Strip the pipeline prefix so the printed names match the forest's own
    # parameter names.
    best_params = {
        name.split("__", 1)[1]: value
        for name, value in grid.best_params_.items()
    }
    print("Best Parameters:", best_params)
    print("Test ROC-AUC:", roc_score)

    # Create the 'models' directory if it doesn't exist
    os.makedirs("models", exist_ok=True)
    joblib.dump(best_model, "models/model.pkl")

    return best_model

In [None]:
import joblib
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

def evaluate_model(model_path, X_test, y_test):
    """Load a saved classifier and report its performance on a test set.

    Prints the confusion matrix, the classification report and the ROC-AUC,
    then shows the ROC curve with a dashed chance diagonal.

    Parameters
    ----------
    model_path : str or path-like
        File produced by ``joblib.dump``. It is unpickled on load, so only
        pass files from a trusted source.
    X_test : array-like
        Feature rows, in the layout the saved model was trained on.
    y_test : array-like
        True binary labels aligned with ``X_test``.

    Returns
    -------
    None
    """
    model = joblib.load(model_path)

    y_pred = model.predict(X_test)
    # Column 1 holds the probability of the positive class.
    y_prob = model.predict_proba(X_test)[:,1]

    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("ROC-AUC:", roc_auc_score(y_test, y_prob))

    fpr, tpr, _ = roc_curve(y_test, y_prob)

    plt.plot(fpr, tpr)
    plt.plot([0,1], [0,1], '--')
    # Axis labels so the figure stands alone, as in the notebook's inline ROC cell.
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.show()

In [None]:
# Entry point: run the full tune-and-save workflow when executed as the main
# module (in a notebook kernel __name__ is "__main__", so running the cell
# triggers training).
# NOTE(review): the dataset path is hard-coded under /content -- presumably the
# Colab working directory; adjust when running elsewhere.
if __name__ == "__main__":
    train_model("/content/predictive_maintenance_dataset.csv")

In [None]:
import shap
import joblib
import pandas as pd

# NOTE: this rebinds the notebook-level name `model` to the tuned forest saved
# by train_model(); later cells that read `model` see this object.
model = joblib.load("models/model.pkl")
raw_df = pd.read_csv("/content/predictive_maintenance_dataset.csv")  # unprocessed copy; not used below

# Apply the same preprocessing the model was trained with (load_data is defined
# in an earlier cell).
processed_df = load_data("/content/predictive_maintenance_dataset.csv")

# Separate features (X) from the target for SHAP explanation
X_shap = processed_df.drop("Machine failure", axis=1)

# The recorded output shows this cell was interrupted while explaining every
# row. A fixed-seed sample keeps the summary plot representative while letting
# the cell finish in reasonable time; raise SHAP_SAMPLE_SIZE for more detail.
SHAP_SAMPLE_SIZE = 500
X_shap_sample = X_shap.sample(
    n=min(SHAP_SAMPLE_SIZE, len(X_shap)),
    random_state=42
)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_shap_sample)

# The return shape of shap_values() depends on the installed SHAP version:
# older releases give a list with one (n_samples, n_features) array per class,
# newer ones a single (n_samples, n_features, n_classes) array. Select the
# positive class (index 1) in either layout.
if isinstance(shap_values, list):
    shap_values_pos = shap_values[1]
elif getattr(shap_values, "ndim", 0) == 3:
    shap_values_pos = shap_values[:, :, 1]
else:
    shap_values_pos = shap_values

shap.summary_plot(shap_values_pos, X_shap_sample)

KeyboardInterrupt: 

In [None]:
import joblib

# Reload "model.pkl" (the file written by the earlier joblib.dump cell, not the
# tuned models/model.pkl) and recompute hard labels on the test split.
# Relies on X_test from the earlier train/test-split cell still being in scope.
loaded_model = joblib.load("model.pkl")

y_pred = loaded_model.predict(X_test)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# NOTE(review): `model` here is whichever object was bound last. If the SHAP
# cell above has run, that is the tuned forest loaded from models/model.pkl,
# not the baseline fitted earlier -- confirm which one is intended.
# Column labels come from `X`, created in the earlier train/test-split cell.
importances = model.feature_importances_
# argsort is ascending, so the last ten positions are the ten largest.
indices = np.argsort(importances)[-10:]

plt.figure(figsize=(8,5))
plt.barh(range(len(indices)), importances[indices])
plt.yticks(range(len(indices)), X.columns[indices])
plt.title("Top 10 Important Features")
plt.show()

In [None]:
# Re-label the test rows with a custom decision threshold: a row is predicted as
# a failure when its class-1 probability exceeds 0.3.
# NOTE(review): `y_prob` is whatever an earlier cell last assigned (the reloaded
# baseline model's probabilities, in cell order), and `custom_pred` is not used
# by any visible cell -- confirm the intended follow-up.
custom_pred = (y_prob > 0.3).astype(int)

In [None]:
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def compare_models(model_path_1, model_path_2, X_test, y_test):
    """Score two saved classifiers on the same test set, side by side.

    Each model is loaded from the path given by the caller and evaluated with
    accuracy, precision, recall, F1 (from hard predictions) and ROC-AUC (from
    the positive-class probability).

    Parameters
    ----------
    model_path_1, model_path_2 : str or path-like
        Files produced by ``joblib.dump``. They are unpickled on load, so only
        pass files from a trusted source.
    X_test : array-like
        Feature rows, in the layout both models were trained on.
    y_test : array-like
        True binary labels aligned with ``X_test``.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``Model Path``, ``Accuracy``,
        ``Precision``, ``Recall``, ``F1 Score`` and ``ROC-AUC``, sorted by
        ROC-AUC in descending order.
    """
    results = []

    # Iterate over the paths themselves rather than a dict keyed by path, so
    # passing the same path twice still yields two rows.
    for path in (model_path_1, model_path_2):
        # Load from the caller-supplied path (previously the arguments were
        # ignored and fixed /content files were loaded instead).
        model = joblib.load(path)

        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

        results.append({
            "Model Path": path,
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1 Score": f1_score(y_test, y_pred),
            "ROC-AUC": roc_auc_score(y_test, y_prob),
        })

    results_df = pd.DataFrame(results, columns=[
        "Model Path", "Accuracy", "Precision",
        "Recall", "F1 Score", "ROC-AUC"
    ])

    return results_df.sort_values(by="ROC-AUC", ascending=False)