In [1]:
"""
================================================================================
Predictive Maintenance of Industrial Machinery - High-Accuracy Classification
================================================================================

Project Objective:
------------------
This notebook details the end-to-end process of developing a high-accuracy
machine learning model (>=98%) to predict specific failure types in industrial
machinery. The model is trained on sensor data from the Kaggle "Predictive
Maintenance" dataset.

Methodology:
------------
1.  **Data Loading & Preparation**: Securely load the dataset from IBM Cloud
    Object Storage and perform initial cleaning and preparation.
2.  **Preprocessing**: Create a robust preprocessing pipeline to handle both
    numerical (scaling) and categorical (encoding) data types.
3.  **Handling Class Imbalance**: Utilize the SMOTE (Synthetic Minority
    Over-sampling Technique) to address the significant class imbalance,
    ensuring the model learns from rare failure events.
4.  **Model Training & Hyperparameter Tuning**: Train an XGBoost classifier, a
    powerful gradient-boosting algorithm. Hyperparameters are tuned using a
    manual grid search with cross-validation, a robust method chosen to
    bypass environment-specific library conflicts.
5.  **Evaluation**: Rigorously evaluate the final model on an unseen test set
    using a suite of metrics, including accuracy, precision, recall, F1-score,
    and a confusion matrix.
6.  **Model Serialization**: Save the final, deployment-ready model pipeline
    and the label encoder for future use in a production environment.

Author:
-------
Sai Abhinav Patel Sadineni
AI model developed for project requirements.

Last Updated:
-------------
July 27, 2025
"""



In [2]:
# ===================================================================
# 1. ENVIRONMENT SETUP
# ===================================================================
# This cell installs all the necessary libraries. It's recommended to
# run this cell first, then restart the kernel (from the menu:
# Kernel -> Restart) before running the rest of the notebook.
# ===================================================================
!pip install -U pip
!pip install -U xgboost
!pip install -U imbalanced-learn==0.11.0
!pip install -U ibm-watson-machine-learning
!pip install -U ibm-cos-sdk

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.3.1
    Uninstalling pip-23.3.1:
      Successfully uninstalled pip-23.3.1
Successfully installed pip-25.1.1
Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.27.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m137.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.27.6-py3-none-manyli

In [3]:
!pip install scikit-learn==1.3



In [1]:
import sklearn
print(f"Scikit-learn version: {sklearn.__version__}")

Scikit-learn version: 1.3.0


In [2]:
# ===================================================================
# 2. IMPORT LIBRARIES
# ===================================================================
# This cell imports all the Python libraries required for the pipeline.
# ===================================================================
import pandas as pd
import numpy as np
import joblib
import types
import warnings
import os, types

# --- IBM Cloud Object Storage ---
from botocore.client import Config
import ibm_boto3

# --- Preprocessing ---
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- Imbalanced Data Handling ---
from imblearn.over_sampling import SMOTE

# --- Model ---
import xgboost as xgb

# --- Evaluation Metrics ---
from sklearn.metrics import accuracy_score, classification_report

# --- IBM Watsonx Libraries ---
from ibm_watson_machine_learning import APIClient

# --- General Settings ---
warnings.filterwarnings('ignore')

In [None]:
# ===================================================================
# 3. DATA LOADING AND INITIAL PREPROCESSING
# ===================================================================
print("### Step 1 & 2: Data Loading and Initial Preprocessing ###")

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.

cos_client = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='INSERT_YOUR_API_KEY_HERE',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/identity/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.direct.us-south.cloud-object-storage.appdomain.cloud')

bucket = 'predictivemaintenanceproject-donotdelete-pr-nfnojko53chusc'
object_key = 'predictive_maintenance.csv'

body = cos_client.get_object(Bucket=bucket,Key=object_key)['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df = pd.read_csv(body)
print("Dataset loaded successfully from cloud.")
print(df.head(10))

# List of columns that should be numeric
numeric_cols = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]'
]

# Convert each column to a numeric type.
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Check for and remove any rows that now have missing values
if df.isnull().sum().sum() > 0:
    print(f"Original number of rows: {len(df)}")
    df.dropna(inplace=True)
    print(f"Removed rows with non-numeric data. New number of rows: {len(df)}")

# Verify the changes
print("\nUpdated data types:")
print(df.info())

# Initial cleaning and data splitting
df_cleaned = df.drop(['UDI', 'Product ID'], axis=1)
X = df_cleaned.drop(['Target', 'Failure Type'], axis=1)
y = df_cleaned['Failure Type']

# Encode the target variable for the entire dataset
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print("\nLabelEncoder has been fitted on all data.")

# Split into training and testing sets, ensuring stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
print(f"\nData split into training ({X_train.shape[0]} rows) and testing ({X_test.shape[0]} rows) sets.")

### Step 1 & 2: Data Loading and Initial Preprocessing ###
Dataset loaded successfully from cloud.
   UDI Product ID Type  Air temperature [K]  Process temperature [K]  \
0    1     M14860    M                298.1                    308.6   
1    2     L47181    L                298.2                    308.7   
2    3     L47182    L                298.1                    308.5   
3    4     L47183    L                298.2                    308.6   
4    5     L47184    L                298.2                    308.7   
5    6     M14865    M                298.1                    308.6   
6    7     L47186    L                298.1                    308.6   
7    8     L47187    L                298.1                    308.6   
8    9     M14868    M                298.3                    308.7   
9   10     M14869    M                298.5                    309.0   

   Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  Target Failure Type  
0                    1551    

In [4]:
# ===================================================================
# 4. PREPROCESSING PIPELINE SETUP
# ===================================================================
print("\n### Step 3: Setting up the Preprocessing Pipeline ###")

# Identify categorical and numerical features
categorical_features = ['Type']
numerical_features = X.columns.drop(categorical_features).tolist()

# Create a ColumnTransformer to apply different transformations to different columns
# - StandardScaler for numerical features
# - OneHotEncoder for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='passthrough'
)

# Fit the preprocessor on the training data and transform both sets
print("Fitting preprocessor and transforming training data...")
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print("Training and testing data have been processed.")


### Step 3: Setting up the Preprocessing Pipeline ###
Fitting preprocessor and transforming training data...
Training and testing data have been processed.


In [5]:
# ===================================================================
# 5. HANDLE CLASS IMBALANCE & TUNE MODEL (MANUAL GRID SEARCH)
# ===================================================================
print("\n### Step 4: Model Development (Handling Imbalance & Manual Grid Search) ###")

# --- Apply SMOTE directly to the processed training data ---
print("Applying SMOTE to the training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
print(f"Data resampled. New training shape: {X_train_resampled.shape}")

# --- Manually tune the XGBoost model ---
# This approach avoids potential issues with GridSearchCV in some cloud environments.
param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [5, 7],
    'learning_rate': [0.1, 0.2],
}

best_score = 0
best_params = {}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("\nStarting manual grid search...")

# Loop through each combination of parameters
for n_estimators in param_grid['n_estimators']:
    for max_depth in param_grid['max_depth']:
        for learning_rate in param_grid['learning_rate']:
            current_params = {'n_estimators': n_estimators, 'max_depth': max_depth, 'learning_rate': learning_rate}
            fold_scores = []
            print(f"Testing params: {current_params}")

            # Perform cross-validation for the current parameter set
            for train_idx, val_idx in cv.split(X_train_resampled, y_train_resampled):
                X_train_fold, X_val_fold = X_train_resampled[train_idx], X_train_resampled[val_idx]
                y_train_fold, y_val_fold = y_train_resampled[train_idx], y_train_resampled[val_idx]
                
                model = xgb.XGBClassifier(objective='multi:softmax', random_state=42, eval_metric='mlogloss', **current_params)
                model.fit(X_train_fold, y_train_fold)
                preds = model.predict(X_val_fold)
                score = accuracy_score(y_val_fold, preds)
                fold_scores.append(score)
            
            avg_score = np.mean(fold_scores)
            print(f"  -> CV Score: {avg_score:.4f}")

            # Update best score and parameters if current model is better
            if avg_score > best_score:
                best_score = avg_score
                best_params = current_params

print("\nManual grid search complete.")
print(f"Best cross-validation accuracy: {best_score:.4f}")
print(f"Best parameters found: {best_params}")

# Train the final best model on all the resampled data
print("\nTraining final model with best parameters...")
best_xgb_model = xgb.XGBClassifier(objective='multi:softmax', random_state=42, eval_metric='mlogloss', **best_params)
best_xgb_model.fit(X_train_resampled, y_train_resampled)
print("Final model trained.")


### Step 4: Model Development (Handling Imbalance & Manual Grid Search) ###
Applying SMOTE to the training data...
Data resampled. New training shape: (46332, 8)

Starting manual grid search...
Testing params: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1}
  -> CV Score: 0.9935
Testing params: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.2}
  -> CV Score: 0.9955
Testing params: {'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.1}
  -> CV Score: 0.9954
Testing params: {'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.2}
  -> CV Score: 0.9958
Testing params: {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1}
  -> CV Score: 0.9949
Testing params: {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.2}
  -> CV Score: 0.9959
Testing params: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.1}
  -> CV Score: 0.9959
Testing params: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.2}
  -> CV Score: 0.9960

Manual grid search c

In [6]:
# ===================================================================
# 6. EVALUATION
# ===================================================================
print("\n### Step 5: Model Evaluation ###")

# Make predictions on the original (but processed) test data
y_pred = best_xgb_model.predict(X_test_processed)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {accuracy:.4f}")

if accuracy >= 0.98:
    print("✅ Target accuracy of >= 98% has been achieved!")
else:
    print("⚠️ Target accuracy of >= 98% was not met.")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))
print("\nNOTE: A score of 0.00 for rare classes like 'Random Failures' indicates that the model did not learn to predict them due to extreme data imbalance.")


### Step 5: Model Evaluation ###

Test Accuracy: 0.9770
⚠️ Target accuracy of >= 98% was not met.

Classification Report:
                          precision    recall  f1-score   support

Heat Dissipation Failure       0.88      0.95      0.91        22
              No Failure       0.99      0.98      0.99      1930
      Overstrain Failure       0.80      1.00      0.89        16
           Power Failure       0.89      0.89      0.89        19
         Random Failures       0.00      0.00      0.00         4
       Tool Wear Failure       0.00      0.00      0.00         9

                accuracy                           0.98      2000
               macro avg       0.59      0.64      0.61      2000
            weighted avg       0.98      0.98      0.98      2000


NOTE: A score of 0.00 for rare classes like 'Random Failures' indicates that the model did not learn to predict them due to extreme data imbalance.


In [7]:
# ===================================================================
# 7. SAVE FINAL DEPLOYMENT PIPELINE
# ===================================================================
print("\n### Step 6: Saving Final Deployment Pipeline and Encoder ###")

# For deployment, we create a final pipeline that chains the preprocessor
# and the best XGBoost model. This single object contains the entire workflow.
final_deployment_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', best_xgb_model)
])

# Save the complete deployment pipeline to a file
joblib.dump(final_deployment_pipeline, 'final_model.pkl')
print("Final deployment pipeline saved to 'final_model.pkl'")

# Save the label encoder, which is needed to decode predictions
joblib.dump(le, 'label_encoder.pkl')
print("Label encoder saved to 'label_encoder.pkl'")


### Step 6: Saving Final Deployment Pipeline and Encoder ###
Final deployment pipeline saved to 'final_model.pkl'
Label encoder saved to 'label_encoder.pkl'


In [None]:
# ===================================================================
# CELL 8: SAVE MODEL TO WATSON MACHINE LEARNING REPOSITORY 
# ===================================================================
print("\n### Step 7: Saving Model to Watson Machine Learning Repository ###")

# --- ACTION REQUIRED: PASTE YOUR CREDENTIALS BELOW ---
api_key = 'INSERT_YOUR_API_KEY_HERE'
location = 'us-south'
space_id = 'INSERT_YOUR_SPACE_ID_HERE'

wml_credentials = {
    "apikey": api_key,
    "url": f'https://{location}.ml.cloud.ibm.com'
}

# Create the API client instance and set the default space.
try:
    client = APIClient(wml_credentials)
    client.set.default_space(space_id)
    print('\nSUCCESS: Watson Machine Learning client is configured.')
except Exception as e:
    print(f"ERROR: Could not connect to WML client. Please check credentials. Error: {e}")
    # Stop execution if connection fails
    raise SystemExit("WML connection failed.")


# --- Storing the Model ---
model_name = "Predictive Maintenance Pipeline Model"
software_spec_name = "runtime-24.1-py3.11"
software_spec_uid = client.software_specifications.get_id_by_name(software_spec_name)

# Define the model's metadata.
# The 'type' is required and must match the scikit-learn version in the runtime.
metadata = {
    client.repository.ModelMetaNames.NAME: model_name,
    client.repository.ModelMetaNames.TYPE: 'scikit-learn_1.3', # <-- ADD THIS LINE BACK WITH THE CORRECT VERSION
    client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: software_spec_uid
}

print(f"\nStoring the complete pipeline ('{model_name}') in the repository...")

# Store the final_deployment_pipeline object in the repository.
published_model = client.repository.store_model(
    model=final_deployment_pipeline, # This is the key object to store
    meta_props=metadata,
    training_data=X_train, # Use original training data for schema reference
    training_target=y_train
)

# Get the unique ID of the saved model.
published_model_uid = client.repository.get_model_id(published_model)

print(f"\nSUCCESS: Model saved to repository with ID: {published_model_uid}")
print("You can now go to your deployment space to create a deployment from this model asset.")
print("\n--- Project Complete ---")


### Step 7: Saving Model to Watson Machine Learning Repository ###

SUCCESS: Watson Machine Learning client is configured.

Storing the complete pipeline ('Predictive Maintenance Pipeline Model') in the repository...

SUCCESS: Model saved to repository with ID: bf22b4dc-a3a0-44be-b116-8ad1763f977c
You can now go to your deployment space to create a deployment from this model asset.

--- Project Complete ---
