# Model Training

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv("data/data_processed.csv")

In [5]:
df.head()

Unnamed: 0,Type,Machine failure,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c],type_of_failure
0,1.0,0,0.222934,0.535714,0.0,0.304348,0.358025,5
1,0.0,0,0.139697,0.583791,0.011858,0.315217,0.37037,5
2,0.0,0,0.192084,0.626374,0.019763,0.304348,0.345679,5
3,0.0,0,0.154249,0.490385,0.027668,0.315217,0.358025,5
4,0.0,0,0.139697,0.497253,0.035573,0.315217,0.37037,5


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57912 entries, 0 to 57911
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Type                     57912 non-null  float64
 1   Machine failure          57912 non-null  int64  
 2   Rotational speed [rpm]   57912 non-null  float64
 3   Torque [Nm]              57912 non-null  float64
 4   Tool wear [min]          57912 non-null  float64
 5   Air temperature [c]      57912 non-null  float64
 6   Process temperature [c]  57912 non-null  float64
 7   type_of_failure          57912 non-null  int64  
dtypes: float64(6), int64(2)
memory usage: 3.5 MB


In [7]:
df.isna().sum()

Type                       0
Machine failure            0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Air temperature [c]        0
Process temperature [c]    0
type_of_failure            0
dtype: int64

In [8]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Type,57912.0,0.40987,0.646335,0.0,0.0,0.0,1.0,2.0
Machine failure,57912.0,0.666822,0.471354,0.0,0.0,1.0,1.0,1.0
Rotational speed [rpm],57912.0,0.200935,0.182875,0.0,0.107406,0.142026,0.21362,1.0
Torque [Nm],57912.0,0.58334,0.199884,0.0,0.465659,0.607975,0.726511,1.0
Tool wear [min],57912.0,0.569651,0.273497,0.0,0.345905,0.642146,0.817961,1.0
Air temperature [c],57912.0,0.567976,0.20693,0.0,0.401045,0.586957,0.749264,1.0
Process temperature [c],57912.0,0.561527,0.16468,0.0,0.438575,0.583577,0.681561,1.0
type_of_failure,57912.0,2.5,1.70784,0.0,1.0,2.5,4.0,5.0


## Train test split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X = df.drop(["Machine failure", "type_of_failure"], axis=1)
y = df["Machine failure"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Predictting Machine Failure

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report

In [14]:
lr = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()


In [15]:
models = [lr, svc, dt, rf]
scores = []

In [16]:
for m in models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    acc = accuracy_score(y_test, y_pred)*100
    prec = precision_score(y_test, y_pred)*100
    rec = recall_score(y_test, y_pred)*100
    f1 = f1_score(y_test, y_pred)*100
    scores.append([acc,prec, rec, f1])

In [17]:
scores_df = pd.DataFrame(columns=['Model'], data=['Logistic Regression', 'SVC', 'Decision Tree', 'Random Forest'])
scores_df = pd.concat([scores_df, pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall', 'F1'])], axis=1)
scores_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,83.82975,86.504065,89.701686,88.073862
1,SVC,96.261763,95.193144,99.403372,97.252712
2,Decision Tree,99.007166,99.006323,99.507134,99.256097
3,Random Forest,99.222999,98.908858,99.935149,99.419355


In [18]:
best_model_idx = scores_df['F1'].idxmax()
best_model = scores_df.loc[best_model_idx, 'Model']
best_model

'Random Forest'

## Train Test Split

In [19]:
X = df.drop(['Machine failure', 'type_of_failure'], axis=1)
y = df['type_of_failure']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Predicting Type of Failure

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

lr = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

models = [lr, svc, dt, rf]
scores = []

for m in models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    acc = accuracy_score(y_test, y_pred) * 100
    prec = precision_score(y_test, y_pred, average='macro') * 100
    rec = recall_score(y_test, y_pred, average='macro') * 100
    f1 = f1_score(y_test, y_pred, average='macro') * 100
    scores.append([acc, prec, rec, f1])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
scores_df = pd.DataFrame(columns=['Model'], data=['Logistic Regression', 'SVC', 'Decision Tree', 'Random Forest'])
scores_df = pd.concat([scores_df, pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall', 'F1'])], axis=1)
scores_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,83.786584,83.060005,83.848813,83.320343
1,SVC,94.077527,94.313446,94.146141,93.873921
2,Decision Tree,98.635932,98.636193,98.6482,98.637891
3,Random Forest,99.214366,99.22455,99.225983,99.214943


Random Forest Classifier is the best performing model.