In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
df = pd.read_csv('ai4i2020.csv')

In [3]:
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [None]:
df.isna().sum()

In [None]:
df.duplicated().sum()

In [None]:
import seaborn as sns

# Plotting numerical features against target variable (Machine failure)
plt.figure(figsize=(15, 10))

# Air temperature vs Machine failure
plt.subplot(2, 3, 1)
sns.boxplot(x='Machine failure', y='Air temperature [K]', data=df)
plt.title('Air Temperature vs Machine Failure')

# Process temperature vs Machine failure
plt.subplot(2, 3, 2)
sns.boxplot(x='Machine failure', y='Process temperature [K]', data=df)
plt.title('Process Temperature vs Machine Failure')

# Rotational speed vs Machine failure
plt.subplot(2, 3, 3)
sns.boxplot(x='Machine failure', y='Rotational speed [rpm]', data=df)
plt.title('Rotational Speed vs Machine Failure')

# Torque vs Machine failure
plt.subplot(2, 3, 4)
sns.boxplot(x='Machine failure', y='Torque [Nm]', data=df)
plt.title('Torque vs Machine Failure')

# Tool wear vs Machine failure
plt.subplot(2, 3, 5)
sns.boxplot(x='Machine failure', y='Tool wear [min]', data=df)
plt.title('Tool Wear vs Machine Failure')

plt.tight_layout()
plt.show()


In [None]:
# Product Type vs Machine failure
plt.figure(figsize=(10, 5))
sns.countplot(x='Type', hue='Machine failure', data=df)
plt.title('Product Type vs Machine Failure')
plt.show()

# Product ID vs Machine failure (only a subset, as there are many unique IDs)
plt.figure(figsize=(10, 5))
sns.countplot(x='Product ID', hue='Machine failure', data=df.head(50))  # Sampling the first 50 rows
plt.title('Product ID vs Machine Failure (Sampled)')
plt.xticks(rotation=90)
plt.show()


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Prepare the data
X = df.drop(columns=['UDI', 'Product ID', 'Machine failure'])  # Dropping non-relevant columns
y = df['Machine failure']

# Encoding categorical variables
X = pd.get_dummies(X, drop_first=True)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the RandomForest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Feature importance
feature_importances = pd.DataFrame(rf_model.feature_importances_,
                                   index=X.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)

print(feature_importances)


                         importance
HDF                        0.286389
OSF                        0.206597
PWF                        0.182932
TWF                        0.111061
Torque [Nm]                0.076374
Rotational speed [rpm]     0.058614
Tool wear [min]            0.034146
Air temperature [K]        0.020493
Process temperature [K]    0.018387
Type_L                     0.002719
Type_M                     0.002036
RNF                        0.000251


In [9]:
from imblearn.over_sampling import SMOTE

# Applying SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


print(f"Original class distribution: {y_train.value_counts()}")
print(f"Resampled class distribution: {pd.Series(y_resampled).value_counts()}")


Original class distribution: Machine failure
0    7722
1     278
Name: count, dtype: int64
Resampled class distribution: Machine failure
0    7722
1    7722
Name: count, dtype: int64


In [10]:
rf_model_balanced = RandomForestClassifier(random_state=42)
rf_model_balanced.fit(X_resampled, y_resampled)

# Re-evaluate feature importance
feature_importances_balanced = pd.DataFrame(
    rf_model_balanced.feature_importances_,
    index=X.columns,
    columns=['importance']
).sort_values('importance', ascending=False)
print(feature_importances_balanced)


                         importance
Rotational speed [rpm]     0.224880
Torque [Nm]                0.213301
Tool wear [min]            0.142039
PWF                        0.077606
HDF                        0.063423
TWF                        0.057776
Air temperature [K]        0.054311
OSF                        0.051931
Type_L                     0.043159
Process temperature [K]    0.038449
Type_M                     0.032846
RNF                        0.000279


In [None]:
X_resampled

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

# Make predictions
y_pred = rf_model_balanced.predict(X_test)

# Performance metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1939
           1       0.67      0.97      0.79        61

    accuracy                           0.98      2000
   macro avg       0.83      0.98      0.89      2000
weighted avg       0.99      0.98      0.99      2000

Confusion Matrix:
[[1910   29]
 [   2   59]]


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_resampled, y_resampled)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

In [12]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=3,
                           scoring='f1',
                           verbose=2)
grid_search.fit(X_resampled, y_resampled)

print("Best Parameters:", grid_search.best_params_)
rf_best = grid_search.best_estimator_


Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   3.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   3.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   3.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   5.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   5.4s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   4.4s
[CV] END max_depth=10, min_sa

In [13]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(rf_best, X_resampled, y_resampled, cv=5, scoring='f1')
print("Cross-validation F1 Scores:", cv_scores)
print("Mean F1 Score:", cv_scores.mean())


Cross-validation F1 Scores: [0.99062399 0.9851229  0.98708844 0.99059968 0.98361709]
Mean F1 Score: 0.9874104194174805


In [14]:
import joblib

# Save the model
joblib.dump(rf_best, 'best_rf_model.pkl')
print("Model saved successfully!")


Model saved successfully!
