In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Location of the ai4i2020 CSV, relative to the notebook's working directory.
file_path = r"../data/ai4i2020.csv"
try:
    ai4i_df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Couldn't load data from {file_path}")
    # Re-raise so execution stops here. The previous bare `exit` was only a name
    # reference (never called), so the following cells would have run against a
    # missing or stale `ai4i_df`.
    raise
else:
    # Only reached when read_csv succeeded.
    print("The dataset has been successfully loaded.")

The dataset has been successfully loaded.


In [3]:
# Show the loaded frame as the cell's output (pandas elides the middle rows of long frames).
ai4i_df

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0


In [4]:
# Keep only the five sensor readings and the target; this discards the unnecessary
# columns (UDI, Product ID, Type, TWF, HDF, PWF, OSF, RNF).
# Selecting the wanted columns instead of dropping the unwanted ones makes this cell
# safe to re-run: the previous drop raised KeyError on a second execution because the
# columns had already been removed from `ai4i_df`.
FEATURE_COLUMNS = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]
TARGET_COLUMN = 'Machine failure'
ai4i_df = ai4i_df[FEATURE_COLUMNS + [TARGET_COLUMN]]

In [5]:
# Feature matrix: every column of ai4i_df except the 'Machine failure' label.
X = ai4i_df.drop('Machine failure', axis=1)
X

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
0,298.1,308.6,1551,42.8,0
1,298.2,308.7,1408,46.3,3
2,298.1,308.5,1498,49.4,5
3,298.2,308.6,1433,39.5,7
4,298.2,308.7,1408,40.0,9
...,...,...,...,...,...
9995,298.8,308.4,1604,29.5,14
9996,298.9,308.4,1632,31.8,17
9997,299.0,308.6,1645,33.4,22
9998,299.0,308.7,1408,48.5,25


In [6]:
# Target vector: the 'Machine failure' column of ai4i_df.
y = ai4i_df.loc[:, 'Machine failure']
y

0       0
1       0
2       0
3       0
4       0
       ..
9995    0
9996    0
9997    0
9998    0
9999    0
Name: Machine failure, Length: 10000, dtype: int64

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): if failures are rare in `y`, consider stratify=y so both splits keep
# the same class ratio — confirm with y.value_counts() before changing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Summarise the training features: row count, per-column non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 9254 to 7270
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Air temperature [K]      8000 non-null   float64
 1   Process temperature [K]  8000 non-null   float64
 2   Rotational speed [rpm]   8000 non-null   int64  
 3   Torque [Nm]              8000 non-null   float64
 4   Tool wear [min]          8000 non-null   int64  
dtypes: float64(3), int64(2)
memory usage: 375.0 KB


In [9]:
# Logistic regression with class weights inversely proportional to class frequency
# (class_weight='balanced').
lr_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    solver='lbfgs',
    class_weight='balanced',
)
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

# Overall accuracy on the held-out split.
print(accuracy_score(y_test, y_pred))

# Accuracy alone does not show how each class is handled, so also report per-class
# precision / recall / F1.
# NOTE(review): 'Machine failure' looks heavily skewed towards 0 (hence the balanced
# class weights) — confirm with y.value_counts(); if so, recall on class 1 is the
# number to watch rather than accuracy.
print(classification_report(y_test, y_pred, zero_division=0))

0.8175


In [10]:
# Persist the fitted model to disk. The parent directory is created first because
# joblib.dump opens the target file directly and fails if the directory is missing.
# (`joblib` and `Path` are imported in the notebook's top import cell.)
model_path = Path(r'../models/logistic_regression_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(lr_model, model_path)
print("Model saved successfully!")

Model saved successfully!
