In [2]:
import pandas as pd
import numpy as np

## Read Data

In [79]:
df = pd.read_csv("maintenance_dataset.csv")

In [80]:
df.head()

Unnamed: 0,UID,Machine ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Power,Tool Wear Failure,Heat Dissipation Failure,Power Failure,Overstrain Failure,Random Failure,Machine Failure
0,1,M14860,Medium,298.1,308.6,1551,42.8,0,66382.8,No,No,No,No,No,No
1,2,L47181,Low,298.2,308.7,1408,46.3,3,65190.4,No,No,No,No,No,No
2,3,L47182,Low,298.1,308.5,1498,49.4,5,74001.2,No,No,No,No,No,No
3,4,L47183,Low,298.2,308.6,1433,39.5,7,56603.5,No,No,No,No,No,No
4,5,L47184,Low,298.2,308.7,1408,40.0,9,56320.0,No,No,No,No,No,No


In [81]:
df["Machine Failure"].value_counts()

No     9643
Yes     357
Name: Machine Failure, dtype: int64

In [82]:
df.isna().sum()

UID                         0
Machine ID                  0
Type                        0
Air temperature [K]         0
Process temperature [K]     0
Rotational speed [rpm]      0
Torque [Nm]                 0
Tool wear [min]             0
Power                       0
Tool Wear Failure           0
Heat Dissipation Failure    0
Power Failure               0
Overstrain Failure          0
Random Failure              0
Machine Failure             0
dtype: int64

In [83]:
df.dtypes

UID                           int64
Machine ID                   object
Type                         object
Air temperature [K]         float64
Process temperature [K]     float64
Rotational speed [rpm]        int64
Torque [Nm]                 float64
Tool wear [min]               int64
Power                       float64
Tool Wear Failure            object
Heat Dissipation Failure     object
Power Failure                object
Overstrain Failure           object
Random Failure               object
Machine Failure              object
dtype: object

## Label Encoding for Categorical values

In [87]:
## Drop unwanted Columns
df1 = df.drop(["UID", "Machine ID", "Tool Wear Failure", "Heat Dissipation Failure", "Power Failure", "Overstrain Failure",	"Random Failure"], axis = 1)
df1.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Power,Machine Failure
0,Medium,298.1,308.6,1551,42.8,0,66382.8,No
1,Low,298.2,308.7,1408,46.3,3,65190.4,No
2,Low,298.1,308.5,1498,49.4,5,74001.2,No
3,Low,298.2,308.6,1433,39.5,7,56603.5,No
4,Low,298.2,308.7,1408,40.0,9,56320.0,No


In [89]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df1['Type'] = le.fit_transform(df['Type'])
df1['Machine Failure'] = le.fit_transform(df['Machine Failure'])
df1.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Power,Machine Failure
0,2,298.1,308.6,1551,42.8,0,66382.8,0
1,1,298.2,308.7,1408,46.3,3,65190.4,0
2,1,298.1,308.5,1498,49.4,5,74001.2,0
3,1,298.2,308.6,1433,39.5,7,56603.5,0
4,1,298.2,308.7,1408,40.0,9,56320.0,0


## Splitting Dataset

In [90]:
X = df1.drop(["Machine Failure"], axis = 1)
y = df1["Machine Failure"]

In [91]:
from sklearn.model_selection import train_test_split

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

## Standard Scaling

In [94]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [95]:
scaling = ["Air temperature [K]", "Process temperature [K]", "Rotational speed [rpm]", "Torque [Nm]", "Tool wear [min]", "Power"]

In [96]:
X_train[scaling] = sc.fit_transform(X_train[scaling])
X_test[scaling] = sc.transform(X_test[scaling])

## Sampling Data

In [97]:
from imblearn.combine import SMOTEENN

In [99]:
# create the over-sampling and under-sampling object
smote_enn = SMOTEENN(sampling_strategy = 0.8, random_state = 42)

In [100]:
# resample the data
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

In [101]:
y_resampled.value_counts()

0    6634
1    5690
Name: Machine Failure, dtype: int64

## Train Model

In [106]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [103]:
rf = RandomForestClassifier(random_state = 42)
rf.fit(X_resampled, y_resampled)

RandomForestClassifier(random_state=42)

In [105]:
rf_pred = rf.predict(X_test)

In [107]:
print(classification_report(y_test,rf_pred))

              precision    recall  f1-score   support

           0       0.99      0.96      0.98      2422
           1       0.39      0.79      0.53        78

    accuracy                           0.96      2500
   macro avg       0.69      0.88      0.75      2500
weighted avg       0.97      0.96      0.96      2500

