In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [10]:

# Load the dataset
file_path = '/content/predictive_maintenance.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset and summary information
data_info = data.info()
data_head = data.head()
# Replace column names if they are different
data.columns = data.columns.str.strip()  # Remove any leading or trailing spaces


data_info, data_head


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Target                   10000 non-null  int64  
 9   Failure Type             10000 non-null  object 
dtypes: float64(3), int64(4), object(3)
memory usage: 781.4+ KB


(None,
    UDI Product ID Type  Air temperature [K]  Process temperature [K]  \
 0    1     M14860    M                298.1                    308.6   
 1    2     L47181    L                298.2                    308.7   
 2    3     L47182    L                298.1                    308.5   
 3    4     L47183    L                298.2                    308.6   
 4    5     L47184    L                298.2                    308.7   
 
    Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  Target Failure Type  
 0                    1551         42.8                0       0   No Failure  
 1                    1408         46.3                3       0   No Failure  
 2                    1498         49.4                5       0   No Failure  
 3                    1433         39.5                7       0   No Failure  
 4                    1408         40.0                9       0   No Failure  )

In [11]:
#datapreprocessing
# Handling missing values (if any)
data.dropna(inplace=True)  # Drop missing values or use imputation

# Encode categorical variables
# Encode categorical variables with the correct column names
data = pd.get_dummies(data, columns=['Product ID', 'Type', 'Failure Type'], drop_first=True)

# Define features (X) and target (y)
X = data.drop(['UDI', 'Target'], axis=1)  # Exclude columns not needed for prediction
y = data['Target']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [12]:
# Initialize the RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)


In [13]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))


Confusion Matrix:
 [[1939    0]
 [   2   59]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1939
           1       1.00      0.97      0.98        61

    accuracy                           1.00      2000
   macro avg       1.00      0.98      0.99      2000
weighted avg       1.00      1.00      1.00      2000

Accuracy Score: 0.999
