In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [26]:
data = pd.read_csv("/content/drive/MyDrive/DATASETS/MachineData02.csv")

In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Timestamp                           1000 non-null   object 
 1   Equipment ID                        1000 non-null   object 
 2   Equipment Type                      1000 non-null   object 
 3   Location                            1000 non-null   object 
 4   Operating Temperature (°C)          1000 non-null   int64  
 5   Operating Pressure (bar)            1000 non-null   int64  
 6   Flow Rate (m3/h)                    1000 non-null   float64
 7   Power Consumption (kW)              1000 non-null   int64  
 8   Speed (RPM)                         1000 non-null   int64  
 9   Vibration Level (mm/s)              1000 non-null   float64
 10  Health Status                       1000 non-null   object 
 11  Fault Type                          857 non-

In [28]:
data.describe()

Unnamed: 0,Operating Temperature (°C),Operating Pressure (bar),Flow Rate (m3/h),Power Consumption (kW),Speed (RPM),Vibration Level (mm/s),Wear & Tear (%),Maintenance Interval (days),Failure Probability (%),Time Since Last Maintenance (days),Ambient Temperature (°C),Humidity (%)
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,119.162,17.401,4.4875,64.362,2294.398,0.58,43.188,20.223,29.755,25.069,27.515,57.842
std,23.328694,4.570896,0.882889,31.280624,454.165481,0.244355,25.248765,11.76519,17.16746,8.804644,1.706073,7.269793
min,80.0,10.0,2.9,10.0,1500.0,0.15,2.0,1.0,1.0,10.0,25.0,45.0
25%,99.0,13.0,3.7,38.0,1895.25,0.38,20.0,10.0,15.0,17.0,26.0,52.0
50%,120.0,17.0,4.5,63.0,2303.5,0.58,44.0,19.5,28.0,25.0,27.0,58.0
75%,139.0,21.0,5.3,91.0,2681.25,0.79,66.0,30.25,45.0,33.0,29.0,64.0
max,160.0,25.0,6.0,120.0,3100.0,1.0,85.0,40.0,60.0,40.0,30.0,70.0


Preprocessing

In [29]:
# Handle missing values
# Filling missing categorical values with mode and numeric with median
data.dropna(inplace=True)


In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 857 entries, 1 to 999
Data columns (total 21 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Timestamp                           857 non-null    object 
 1   Equipment ID                        857 non-null    object 
 2   Equipment Type                      857 non-null    object 
 3   Location                            857 non-null    object 
 4   Operating Temperature (°C)          857 non-null    int64  
 5   Operating Pressure (bar)            857 non-null    int64  
 6   Flow Rate (m3/h)                    857 non-null    float64
 7   Power Consumption (kW)              857 non-null    int64  
 8   Speed (RPM)                         857 non-null    int64  
 9   Vibration Level (mm/s)              857 non-null    float64
 10  Health Status                       857 non-null    object 
 11  Fault Type                          857 non-null  

In [31]:
# Encoding categorical features
categorical_cols = ['Equipment Type', 'Location', 'Health Status', 'Fault Type',
                    'Failure Mode Prediction', 'Air Quality']
encoder = LabelEncoder()
for col in categorical_cols:
    data[col] = encoder.fit_transform(data[col])

In [32]:
# Droping non-relevant columns
data_cleaned = data.drop(columns=['Timestamp', 'Equipment ID', 'Predicted Failure Date'])


In [41]:
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 857 entries, 1 to 999
Data columns (total 18 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Equipment Type                      857 non-null    int64  
 1   Location                            857 non-null    int64  
 2   Operating Temperature (°C)          857 non-null    int64  
 3   Operating Pressure (bar)            857 non-null    int64  
 4   Flow Rate (m3/h)                    857 non-null    float64
 5   Power Consumption (kW)              857 non-null    int64  
 6   Speed (RPM)                         857 non-null    int64  
 7   Vibration Level (mm/s)              857 non-null    float64
 8   Health Status                       857 non-null    int64  
 9   Fault Type                          857 non-null    int64  
 10  Wear & Tear (%)                     857 non-null    int64  
 11  Maintenance Interval (days)         857 non-null  

Checking all unique values

In [42]:
for col in data_cleaned.columns:
    print(f"Unique values in column '{col}':")
    print(data[col].unique())
    print()

Unique values in column 'Equipment Type':
[0 1 3 2 4]

Unique values in column 'Location':
[3 2 1 0 4]

Unique values in column 'Operating Temperature (°C)':
[113 116 126 107 140 139 118 154 150 158 111 132  84 124  86  81 160 100
 147 144 153 134 159 130 115 120  92 135 131 101 103  99 121 108 129  80
 151 156 141  95  82 106 136  96  89 125  98  83 142 128 112  91 123 138
 104  88 105 114  97 137 127 119  93  85 110 149 109  87 122 148 145 133
 157 152  90 102 155  94 117 143 146]

Unique values in column 'Operating Pressure (bar)':
[15 17 21 19 23 10 22 18 12 20 11 14 25 16 24 13]

Unique values in column 'Flow Rate (m3/h)':
[3.5 5.  4.  4.8 4.3 5.2 4.7 3.3 5.5 5.7 3.6 5.9 4.9 3.8 4.2 5.1 5.6 4.1
 4.4 5.4 4.6 3.7 6.  4.5 3.9 3.4 3.  5.8 3.1 3.2 2.9 5.3]

Unique values in column 'Power Consumption (kW)':
[ 74  51  59  79  80  66  94 110  11  81  67  24  76  38  27  89  25  71
  78  57  88  93 111  65  43 116  17  34  61  58  70  87  69  96  99  95
  35 104 107  68 102 115  50  56 101

In [33]:
# Splitting data into features and target variable
X = data_cleaned.drop(columns=['Failure Probability (%)'])
y = data_cleaned['Failure Probability (%)']


In [34]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [35]:
# 1. Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)


In [36]:
print("Random Forest Regressor Results")
print("MSE:", rf_mse)
print("R2 Score:", rf_r2)

Random Forest Regressor Results
MSE: 326.6756569767442
R2 Score: -0.08953622607726519


In [37]:
# 2. XGBoost Regressor
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_y_pred = xgb_model.predict(X_test)
xgb_mse = mean_squared_error(y_test, xgb_y_pred)
xgb_r2 = r2_score(y_test, xgb_y_pred)

print("XGBoost Regressor Results")
print("MSE:", xgb_mse)
print("R2 Score:", xgb_r2)


XGBoost Regressor Results
MSE: 393.74786376953125
R2 Score: -0.31323695182800293


In [38]:
# Summary of results
print("Summary of Model Performances:")
print(f"Random Forest Regressor - MSE: {rf_mse}, R2 Score: {rf_r2}")
print(f"XGBoost Regressor - MSE: {xgb_mse}, R2 Score: {xgb_r2}")

Summary of Model Performances:
Random Forest Regressor - MSE: 326.6756569767442, R2 Score: -0.08953622607726519
XGBoost Regressor - MSE: 393.74786376953125, R2 Score: -0.31323695182800293


In [39]:
import joblib

# Save the trained XGBoost model to a file
model_filename = 'xgb_regressor_model.joblib'
joblib.dump(xgb_model, model_filename)

print(f"Model saved to {model_filename}")


Model saved to xgb_regressor_model.joblib


In [40]:
# Save the preprocessing objects
joblib.dump(encoder, 'label_encoder.joblib')
joblib.dump(scaler, 'scaler.joblib')


['scaler.joblib']

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Load the data
file_path = '/content/drive/MyDrive/DATASETS/MachineData02.csv'
data = pd.read_csv(file_path)

# Handle missing values
# Remove rows with missing values
data.dropna(inplace=True)

# Encode categorical features
categorical_cols = ['Equipment Type', 'Location', 'Health Status', 'Fault Type',
                    'Failure Mode Prediction', 'Air Quality']
encoder = LabelEncoder()
for col in categorical_cols:
    data[col] = encoder.fit_transform(data[col])

# Drop non-relevant columns
data_cleaned = data.drop(columns=['Timestamp', 'Equipment ID', 'Predicted Failure Date'])

# Splitting data into features and target variable
X = data_cleaned.drop(columns=['Failure Probability (%)'])
y = data_cleaned['Failure Probability (%)']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Neural Network Model
nn_model = Sequential()
nn_model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
nn_model.add(Dropout(0.2))
nn_model.add(Dense(64, activation='relu'))
nn_model.add(Dropout(0.2))
nn_model.add(Dense(1, activation='linear'))

nn_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

# Train the model
history = nn_model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32, verbose=1)

# Evaluate the model
nn_y_pred = nn_model.predict(X_test)
nn_mse = mean_squared_error(y_test, nn_y_pred)
nn_r2 = r2_score(y_test, nn_y_pred)

print("Neural Network Regressor Results")
print("MSE:", nn_mse)
print("R2 Score:", nn_r2)

# Summary of results
print("Summary of Model Performances:")
print(f"Neural Network Regressor - MSE: {nn_mse}, R2 Score: {nn_r2}")


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - loss: 1109.6636 - mse: 1109.6636 - val_loss: 1069.3274 - val_mse: 1069.3274
Epoch 2/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1003.1123 - mse: 1003.1123 - val_loss: 884.6292 - val_mse: 884.6292
Epoch 3/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 779.6188 - mse: 779.6188 - val_loss: 651.1387 - val_mse: 651.1387
Epoch 4/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 589.2731 - mse: 589.2731 - val_loss: 446.0966 - val_mse: 446.0966
Epoch 5/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 376.9231 - mse: 376.9231 - val_loss: 359.8805 - val_mse: 359.8805
Epoch 6/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 311.9138 - mse: 311.9138 - val_loss: 350.4783 - val_mse: 350.4783
Epoch 7/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [24]:
 # Saving the trained model
nn_model.save('fault_prediction_model.h5')
print("Model saved as 'fault_prediction_model.h5'")




Model saved as 'fault_prediction_model.h5'
