<a href="https://colab.research.google.com/github/sneha-4515/IoT-Predictive-Maintenance-Engine/blob/main/IoT_Predictive_Maintenance_Engine_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# üè≠ FactoryGuard AI ‚Äì Predictive Maintenance Engine
## Week 1 + Week 2

This notebook includes:

### ‚úÖ Week 1 ‚Äì Data Engineering
- Data Loading
- Dataset Merging
- Missing Value Handling
- Feature Engineering (Lag + Rolling)
- Target Creation (Failure in Next 24 Hours)
- Data Leakage Check

### ‚úÖ Week 2 ‚Äì Modeling & Hyperparameter Tuning
- Train Test Split
- Class Imbalance Handling (SMOTE)
- Logistic Regreesion
- Random Forest
- XGBoost
- RandomizedSearchCV
- Evaluation (F1 & Recall Focus)


Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, recall_score

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings("ignore")


Loading Dataset

In [2]:
!gdown 1EEbHCVVvjBRl0MNn7c7mOCoQ6LGINqR9
! gdown 1EhOAKgE39ZPfySWR5ZxokI1TxOFOwlcV
! gdown 13_u-MMHacev2MD2bFcxzJQ4CkNyd9ktA

Downloading...
From: https://drive.google.com/uc?id=1EEbHCVVvjBRl0MNn7c7mOCoQ6LGINqR9
To: /content/PdM_telemetry.csv
100% 80.1M/80.1M [00:00<00:00, 214MB/s]
Downloading...
From: https://drive.google.com/uc?id=1EhOAKgE39ZPfySWR5ZxokI1TxOFOwlcV
To: /content/PdM_failures.csv
100% 24.3k/24.3k [00:00<00:00, 4.00MB/s]
Downloading...
From: https://drive.google.com/uc?id=13_u-MMHacev2MD2bFcxzJQ4CkNyd9ktA
To: /content/PdM_machines.csv
100% 1.58k/1.58k [00:00<00:00, 6.85MB/s]


In [3]:
telemetry = pd.read_csv("PdM_telemetry.csv", parse_dates=["datetime"])
failures = pd.read_csv("PdM_failures.csv", parse_dates=["datetime"])
machines = pd.read_csv("PdM_machines.csv")

print("Telemetry Shape:", telemetry.shape)
print("Failures Shape:", failures.shape)
print("Failures Shape:", machines.shape)


Telemetry Shape: (876100, 6)
Failures Shape: (761, 3)
Failures Shape: (100, 3)


Merging Dataset

In [4]:
telemetry = telemetry.merge(machines, on="machineID", how="left")
telemetry = telemetry.sort_values(["machineID", "datetime"])

telemetry.head()


Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration,model,age
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686,model3,18
1,2015-01-01 07:00:00,1,162.879223,402.74749,95.460525,43.413973,model3,18
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847,model3,18
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144,model3,18
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511,model3,18


In [5]:
telemetry.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 876100 entries, 0 to 876099
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   datetime   876100 non-null  datetime64[ns]
 1   machineID  876100 non-null  int64         
 2   volt       876100 non-null  float64       
 3   rotate     876100 non-null  float64       
 4   pressure   876100 non-null  float64       
 5   vibration  876100 non-null  float64       
 6   model      876100 non-null  object        
 7   age        876100 non-null  int64         
dtypes: datetime64[ns](1), float64(4), int64(2), object(1)
memory usage: 53.5+ MB


In [6]:
telemetry['model'].value_counts()

Unnamed: 0_level_0,count
model,Unnamed: 1_level_1
model3,306635
model4,280352
model2,148937
model1,140176


In [7]:
telemetry = pd.get_dummies(telemetry, drop_first=True)

In [8]:
telemetry.head()

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration,age,model_model2,model_model3,model_model4
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686,18,False,True,False
1,2015-01-01 07:00:00,1,162.879223,402.74749,95.460525,43.413973,18,False,True,False
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847,18,False,True,False
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144,18,False,True,False
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511,18,False,True,False


In [9]:
telemetry.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 876100 entries, 0 to 876099
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   datetime      876100 non-null  datetime64[ns]
 1   machineID     876100 non-null  int64         
 2   volt          876100 non-null  float64       
 3   rotate        876100 non-null  float64       
 4   pressure      876100 non-null  float64       
 5   vibration     876100 non-null  float64       
 6   age           876100 non-null  int64         
 7   model_model2  876100 non-null  bool          
 8   model_model3  876100 non-null  bool          
 9   model_model4  876100 non-null  bool          
dtypes: bool(3), datetime64[ns](1), float64(4), int64(2)
memory usage: 49.3 MB


CREATING TARGET (Failure in Next 24 Hours)

In [10]:
failures['failure_flag'] = 1

telemetry = telemetry.merge(
    failures[['machineID','datetime','failure_flag']],
    on=['machineID','datetime'],
    how='left'
)

telemetry['failure_flag'] = telemetry['failure_flag'].fillna(0)

telemetry['failure_next_24h'] = (
    telemetry.groupby('machineID')['failure_flag']
    .transform(lambda x: x.shift(-1).rolling(24).max())
)

telemetry['failure_next_24h'] = telemetry['failure_next_24h'].fillna(0)


In [11]:
telemetry['failure_next_24h'].value_counts()

Unnamed: 0_level_0,count
failure_next_24h,Unnamed: 1_level_1
0.0,858916
1.0,17226


Feture Engineering

In [12]:
sensor_cols = ['volt','rotate','pressure','vibration']

for col in sensor_cols:
    telemetry[f'{col}_lag1'] = telemetry.groupby('machineID')[col].shift(1)
    telemetry[f'{col}_lag2'] = telemetry.groupby('machineID')[col].shift(2)

for col in sensor_cols:
    telemetry[f'{col}_roll4'] = (
        telemetry.groupby('machineID')[col]
        .rolling(4)
        .mean()
        .reset_index(0,drop=True)
    )

telemetry = telemetry.dropna()


In [13]:
telemetry.head()

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration,age,model_model2,model_model3,model_model4,...,rotate_lag1,rotate_lag2,pressure_lag1,pressure_lag2,vibration_lag1,vibration_lag2,volt_roll4,rotate_roll4,pressure_roll4,vibration_roll4
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144,18,False,True,False,...,527.349825,402.74749,75.237905,95.460525,34.178847,43.413973,168.137453,423.687682,98.256232,40.950662
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511,18,False,True,False,...,346.149335,527.349825,109.248561,75.237905,41.122144,34.178847,163.485495,427.905881,97.95841,36.176369
5,2015-01-01 11:00:00,1,172.504839,430.323362,95.927042,35.655017,18,False,True,False,...,435.376873,346.149335,111.886648,109.248561,25.990511,41.122144,165.891899,434.799849,98.075039,34.23663
6,2015-01-01 12:00:00,1,156.556031,499.071623,111.755684,42.75392,18,False,True,False,...,430.323362,435.376873,95.927042,111.886648,35.655017,25.990511,162.283431,427.730298,107.204484,36.380398
7,2015-01-01 13:00:00,1,172.522781,409.624717,101.001083,35.482009,18,False,True,False,...,499.071623,430.323362,111.755684,95.927042,42.75392,35.655017,164.798418,443.599144,105.142614,34.970364


In [14]:
telemetry.info()

<class 'pandas.core.frame.DataFrame'>
Index: 875842 entries, 3 to 876141
Data columns (total 24 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   datetime          875842 non-null  datetime64[ns]
 1   machineID         875842 non-null  int64         
 2   volt              875842 non-null  float64       
 3   rotate            875842 non-null  float64       
 4   pressure          875842 non-null  float64       
 5   vibration         875842 non-null  float64       
 6   age               875842 non-null  int64         
 7   model_model2      875842 non-null  bool          
 8   model_model3      875842 non-null  bool          
 9   model_model4      875842 non-null  bool          
 10  failure_flag      875842 non-null  float64       
 11  failure_next_24h  875842 non-null  float64       
 12  volt_lag1         875842 non-null  float64       
 13  volt_lag2         875842 non-null  float64       
 14  rotate_la

Train_Test Split

In [15]:
X = telemetry.drop(columns=['failure_next_24h','datetime','failure_flag'])
y = telemetry['failure_next_24h']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


Handling Class Imbalance(SMOTE)

In [16]:
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)


Logistic Regression Model(Base Model)

In [17]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression(max_iter=1000)

log_model.fit(X_train_sm, y_train_sm)

log_preds = log_model.predict(X_test)

print("Logistic Regression F1:", f1_score(y_test, log_preds))
print("Logistic Regression Recall:", recall_score(y_test, log_preds))
print(classification_report(y_test, log_preds))

Logistic Regression F1: 0.04366994793052335
Logistic Regression Recall: 0.3335268505079826
              precision    recall  f1-score   support

         0.0       0.98      0.72      0.83    171724
         1.0       0.02      0.33      0.04      3445

    accuracy                           0.71    175169
   macro avg       0.50      0.53      0.44    175169
weighted avg       0.96      0.71      0.82    175169



In [18]:
# ================================
# 8Ô∏è‚É£ RANDOM FOREST MODEL
# ================================

rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    n_jobs=-1,
    random_state=42
)
rf.fit(X_train_sm, y_train_sm)

rf_preds = rf.predict(X_test)

print("Random Forest F1:", f1_score(y_test, rf_preds))
print("Random Forest Recall:", recall_score(y_test, rf_preds))
print(classification_report(y_test, rf_preds))


KeyboardInterrupt: 

In [None]:

# ================================
# 9Ô∏è‚É£ XGBOOST MODEL
# ================================

xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss'
)

xgb.fit(X_train_sm, y_train_sm)

xgb_preds = xgb.predict(X_test)

print("XGBoost F1:", f1_score(y_test, xgb_preds))
print("XGBoost Recall:", recall_score(y_test, xgb_preds))
print(classification_report(y_test, xgb_preds))


In [None]:

# ================================
# üîü RANDOMIZED SEARCH CV
# ================================

param_dist = {
    'n_estimators': [200,300,400],
    'max_depth': [4,6,8],
    'learning_rate': [0.01,0.05,0.1],
    'subsample': [0.7,0.8,1.0],
    'colsample_bytree': [0.7,0.8,1.0]
}

random_search = RandomizedSearchCV(
    XGBClassifier(random_state=42, eval_metric='logloss'),
    param_distributions=param_dist,
    n_iter=10,
    scoring='f1',
    cv=3,
    verbose=1,
    random_state=42
)

random_search.fit(X_train_sm, y_train_sm)

best_model = random_search.best_estimator_

best_preds = best_model.predict(X_test)

print("Best XGBoost F1:", f1_score(y_test, best_preds))
print("Best XGBoost Recall:", recall_score(y_test, best_preds))
print(classification_report(y_test, best_preds))
