In [1]:

import numpy as np 
import pandas as pd 



import os
# List every file under the data directory so the CSV path used by the later
# cells can be confirmed by eye. os.walk() has to be pointed at a directory:
# when it is handed the CSV file itself it silently yields nothing, so the
# listing would always come out empty.
for dirname, _, filenames in os.walk('content'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/predictive-maintenance-dataset-ai4i-2020/ai4i2020.csv


In [2]:
# Read the ai4i2020 CSV into the working frame used by every later cell.
df = pd.read_csv('content/ai4i2020 (1).csv')

# Display settings: never hide columns, and allow wide rows before wrapping.
pd.options.display.max_columns = None
pd.options.display.width = 1000

In [3]:
# The five individual failure-mode flags that 'Machine failure' is reconciled against.
failure_mode_flags = df[['TWF', 'HDF', 'PWF', 'OSF', 'RNF']]

# Case 1: 'Machine failure' is 1 although every individual failure mode is 0.
# Those rows are relabelled as 0 (no machine failure).
no_mode_flagged = failure_mode_flags.eq(0).all(axis=1)
condition_1 = df['Machine failure'].eq(1) & no_mode_flagged
df.loc[condition_1, 'Machine failure'] = 0

# Case 2: 'Machine failure' is 0 although at least one individual failure mode is 1.
# Those rows are relabelled as 1 (machine failure). This is evaluated after the
# first correction, exactly as the two steps were ordered originally.
any_mode_flagged = failure_mode_flags.eq(1).any(axis=1)
condition_2 = df['Machine failure'].eq(0) & any_mode_flagged
df.loc[condition_2, 'Machine failure'] = 1

In [4]:
# Derive three engineered features from the raw sensor columns, then discard the
# identifier columns, the two raw temperatures and the individual failure flags.
engineered_features = {
    # Process temperature minus air temperature.
    'Temperature difference [k]': df['Process temperature [K]'] - df['Air temperature [K]'],
    # Torque x rotational speed; 0.104719755 is 2*pi/60, i.e. rpm -> rad/s.
    'Power [W]': df['Torque [Nm]'] * df['Rotational speed [rpm]'] * 0.104719755,
    # Torque x accumulated tool wear.
    'Strain [minNm]': df['Torque [Nm]'] * df['Tool wear [min]'],
}

columns_to_drop = [
    'UDI', 'Product ID',
    'Air temperature [K]', 'Process temperature [K]',
    'TWF', 'HDF', 'PWF', 'OSF', 'RNF',
]

df = df.assign(**engineered_features).drop(columns=columns_to_drop)

In [5]:
# 'Type' is treated as an ordinal quality category (L < M < H) and encoded as 0/1/2.
bin_map = {'L': 0, 'M': 1, 'H': 2}

# Encode only while the column still holds the raw letters. Mapping an already
# encoded (numeric) column through bin_map would match nothing and silently
# replace every value with NaN, so re-running this cell must be a no-op.
if not pd.api.types.is_numeric_dtype(df['Type']):
    df['Type'] = df['Type'].map(bin_map)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Type                        10000 non-null  int64  
 1   Rotational speed [rpm]      10000 non-null  int64  
 2   Torque [Nm]                 10000 non-null  float64
 3   Tool wear [min]             10000 non-null  int64  
 4   Machine failure             10000 non-null  int64  
 5   Temperature difference [k]  10000 non-null  float64
 6   Power [W]                   10000 non-null  float64
 7   Strain [minNm]              10000 non-null  float64
dtypes: float64(4), int64(4)
memory usage: 625.1 KB


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
from imblearn.over_sampling import SMOTE  # <-- Importing SMOTE

# Replace every character that is not a letter, digit or whitespace with '_'
# (e.g. 'Power [W]' -> 'Power _W_'); presumably needed because LightGBM rejects
# special characters such as '[' in feature names. The pattern is a raw string
# so that '\s' reaches the regex engine instead of being parsed as an (invalid)
# Python string escape.
df.columns = df.columns.str.replace(r'[^a-zA-Z0-9\s]', '_', regex=True)

# Features are every remaining column except the target.
X = df.drop(['Machine failure'], axis=1)
y = df['Machine failure']

# 80/20 hold-out split with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified although failures are rare
# (79 of 2000 test rows); consider stratify=y if the results are re-generated.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

# Fit a LightGBM classifier with default hyper-parameters and a fixed seed.
lgbm = LGBMClassifier(random_state=120)
lgbm.fit(X_train, y_train)

# Hard class predictions on the hold-out set.
y_pred = lgbm.predict(X_test)

# Evaluation metrics. precision / recall / f1 refer to the positive class
# (failure = 1); they are computed here but not printed by this cell.
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n {classification_rep}")
print(f"Confusion Matrix:\n {conf_matrix}")


Accuracy: 0.9905
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      1921
           1       0.94      0.81      0.87        79

    accuracy                           0.99      2000
   macro avg       0.97      0.90      0.93      2000
weighted avg       0.99      0.99      0.99      2000

Confusion Matrix:
 [[1917    4]
 [  15   64]]


In [7]:
#Creating a function that can take user input and find the probability of failure

def predict_failure_for_all_types(lgbm_model, threshold=0.5):
    """Prompt for machine readings and print the failure probability for each product type.

    The user is asked for rotational speed, torque, tool wear and temperature
    difference. Power and strain are derived from those values with the same
    formulas used to build the training features. The model is then queried
    once per encoded product type (0, 1, 2) and the result is printed.

    Parameters
    ----------
    lgbm_model
        Fitted classifier exposing ``predict_proba``; column 1 of its output is
        taken as the probability of failure.
    threshold : float, default 0.5
        Probability strictly above which a machine is reported as likely to fail.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If an entered value cannot be parsed by ``int`` / ``float``.
    """
    # Get user input for the raw readings
    rotational_speed = int(input("Enter Rotational speed [rpm]: "))
    torque = float(input("Enter Torque [Nm]: "))
    tool_wear = int(input("Enter Tool wear [min]: "))
    temp_difference = float(input("Enter Temperature difference [k]: "))

    # Derived features, computed exactly as in the feature-engineering cell
    power = torque * rotational_speed * 0.104719755  # 0.104719755 = 2*pi/60 (rpm -> rad/s)
    strain = torque * tool_wear

    # Loop through all encoded product types
    for type_ in [0, 1, 2]:
        # Column names and order must match the training frame. In particular the
        # power column is 'Power _W_' (with a space), which is what the column
        # sanitising step produces from 'Power [W]'.
        user_data = pd.DataFrame({
            'Type': [type_],
            'Rotational speed _rpm_': [rotational_speed],
            'Torque _Nm_': [torque],
            'Tool wear _min_': [tool_wear],
            'Temperature difference _k_': [temp_difference],
            'Power _W_': [power],
            'Strain _minNm_': [strain]
        })

        # Probability of the positive class (failure) for the single input row
        prob_failure = lgbm_model.predict_proba(user_data)[:, 1][0]

        print(f"\nProbability of failure for Type {type_}: {prob_failure:.4f}")
        if prob_failure > threshold:
            print(f"Machine of Type {type_} is likely to fail.")
        else:
            print(f"Machine of Type {type_} is unlikely to fail.")
        print("=========================================")
        
# Note: to use the function, run: predict_failure_for_all_types(lgbm)
#predict_failure_for_all_types(lgbm) 

In [8]:
# Score the hold-out set using only the columns the model was trained on.
# Selecting X_train.columns keeps this cell re-runnable: a later cell rebinds
# X_test to a frame with extra label/probability columns, and passing that to
# predict_proba would fail on the feature-count mismatch.
test_features = X_test[X_train.columns]

# Predict the probability of failure (positive class) for each test row
failure_probabilities_test = lgbm.predict_proba(test_features)[:, 1]

# Combine the features, the true label and the predicted probability
combined_df = test_features.copy()
combined_df['y_test'] = y_test  # assigned by row index
combined_df['Failure Probability'] = failure_probabilities_test

# Note: index=False means the original row index is not written to the file
combined_df.to_csv('X_test_probabilities.csv', index=False)

# Display the combined DataFrame
combined_df

Unnamed: 0,Type,Rotational speed _rpm_,Torque _Nm_,Tool wear _min_,Temperature difference _k_,Power _W_,Strain _minNm_,y_test,Failure Probability
3644,0,1548,32.4,81,9.3,5252.240256,2624.4,0,0.001107
1828,0,1351,56.3,174,9.4,7965.120701,9796.2,0,0.002437
6661,1,1501,39.8,187,9.1,6255.937220,7442.6,0,0.000069
9216,2,1469,43.7,96,11.1,6722.516088,4195.2,0,0.000040
5911,1,1288,64.0,3,9.6,8632.258844,192.0,0,0.000599
...,...,...,...,...,...,...,...,...,...
1790,0,1847,23.6,69,9.7,4564.650345,1628.4,0,0.000158
7512,1,1636,34.4,2,11.3,5893.460260,68.8,0,0.000211
3144,1,1478,36.3,95,9.4,5618.361463,3448.5,0,0.000210
6231,1,1495,40.8,148,9.7,6387.486176,6038.4,0,0.000077


In [9]:
# What-if analysis: score every test row three times, forcing 'Type' to each
# encoded quality level in turn, to see how the predicted failure probability
# changes with product type alone.

# Encoded type -> label used in the output column names
type_labels = {0: 'L', 1: 'M', 2: 'H'}

# Work from the model's feature columns only. X_test is rebound to the augmented
# frame at the end of this cell, so scoring X_test directly would break on a
# second run (the extra columns cause a feature-count mismatch).
test_features = X_test[X_train.columns]

# Start from the features plus the true label (assigned by row index)
type_comparison = test_features.copy()
type_comparison['y_test'] = y_test

# One probability column per product type
for type_code, type_label in type_labels.items():
    what_if_features = test_features.assign(Type=type_code)
    type_comparison[f'Failure Probability_Type {type_label}'] = (
        lgbm.predict_proba(what_if_features)[:, 1]
    )

# Saved before sorting, without the row index
type_comparison.to_csv('X_ordinal_probabilities.csv', index=False)

# X_test keeps its original meaning after this cell: the augmented frame,
# sorted by the Type L probability (highest first).
X_test = type_comparison.sort_values(by="Failure Probability_Type L", ascending=False)

X_test


Unnamed: 0,Type,Rotational speed _rpm_,Torque _Nm_,Tool wear _min_,Temperature difference _k_,Power _W_,Strain _minNm_,y_test,Failure Probability_Type L,Failure Probability_Type M,Failure Probability_Type H
2761,0,1299,65.1,212,9.5,8855.615610,13801.2,1,0.999835,0.996409,0.996595
9659,0,1287,61.9,216,10.9,8342.530698,13370.4,1,0.999721,0.994321,0.994321
8192,0,1229,65.2,209,11.3,8391.277744,13626.8,1,0.999711,0.994208,0.994208
4417,0,1365,66.8,80,7.8,9548.556700,5344.0,1,0.999585,0.999768,0.999768
5706,0,1290,70.0,139,9.5,9456.193876,9730.0,1,0.999371,0.999652,0.999670
...,...,...,...,...,...,...,...,...,...,...,...
4077,0,1635,39.8,99,8.5,6814.428617,3940.2,0,0.000011,0.000015,0.000018
7940,1,1532,40.0,0,11.0,6417.226586,0.0,0,0.000011,0.000006,0.000035
516,0,1732,29.9,32,11.8,5423.101008,956.8,0,0.000010,0.000023,0.000065
515,0,1715,28.0,30,11.7,5028.642635,840.0,0,0.000009,0.000007,0.000022


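**As shown in the table, the predicted probability of failure for type L is often the highest among the highest-risk rows. The ordering is not consistent across rows, however: for example, rows 4417 and 516 give type H a probability at least as high as type L, so type H is not always the lowest.**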