#### **Import Dependencies**

In [129]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

#### **Read Dataset**

In [153]:
df = pd.read_csv("Machine Downtime.csv")
df

Unnamed: 0,Date,Machine_ID,Assembly_Line_No,Hydraulic_Pressure(bar),Coolant_Pressure(bar),Air_System_Pressure(bar),Coolant_Temperature,Hydraulic_Oil_Temperature(?C),Spindle_Bearing_Temperature(?C),Spindle_Vibration(?m),Tool_Vibration(?m),Spindle_Speed(RPM),Voltage(volts),Torque(Nm),Cutting(kN),Downtime
0,31-12-2021,Makino-L1-Unit1-2013,Shopfloor-L1,71.040000,6.933725,6.284965,25.6,46.0,33.4,1.291,26.492,25892.0,335.0,24.055326,3.58,Machine_Failure
1,31-12-2021,Makino-L1-Unit1-2013,Shopfloor-L1,125.330000,4.936892,6.196733,35.3,47.4,34.6,1.382,25.274,19856.0,368.0,14.202890,2.68,Machine_Failure
2,31-12-2021,Makino-L3-Unit1-2015,Shopfloor-L3,71.120000,6.839413,6.655448,13.1,40.7,33.0,1.319,30.608,19851.0,325.0,24.049267,3.55,Machine_Failure
3,31-05-2022,Makino-L2-Unit1-2015,Shopfloor-L2,139.340000,4.574382,6.560394,24.4,44.2,40.6,0.618,30.791,18461.0,360.0,25.860029,3.55,Machine_Failure
4,31-03-2022,Makino-L1-Unit1-2013,Shopfloor-L1,60.510000,6.893182,6.141238,4.1,47.3,31.4,0.983,25.516,26526.0,354.0,25.515874,3.55,Machine_Failure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,01-02-2022,Makino-L1-Unit1-2013,Shopfloor-L1,112.715506,5.220885,6.196610,22.3,48.8,37.2,0.910,20.282,20974.0,282.0,22.761610,2.72,No_Machine_Failure
2496,01-02-2022,Makino-L1-Unit1-2013,Shopfloor-L1,103.086653,5.211886,7.074653,11.9,48.3,31.5,1.106,34.708,20951.0,319.0,22.786597,2.94,No_Machine_Failure
2497,01-02-2022,Makino-L2-Unit1-2015,Shopfloor-L2,118.643165,5.212991,6.530049,4.5,49.9,36.2,0.288,16.828,20958.0,335.0,22.778987,,No_Machine_Failure
2498,01-02-2022,Makino-L3-Unit1-2015,Shopfloor-L3,145.855859,5.207777,6.402655,12.2,44.5,32.1,0.995,26.498,20935.0,376.0,22.804012,2.79,No_Machine_Failure


#### **Features**

Features for prediction

'Machine_ID','Spindle_Bearing_Temperature(?C)','Run_Time'

In [154]:
# features
columns = ['Machine_ID','Spindle_Bearing_Temperature(?C)']
#,'Run_Time']
x_f = df[columns].copy()
y_ = df['Downtime']


#### **Feature Engineering**

**Missing Value Treatment**

In [132]:
# check missing values
print(f'Missing values in inputs features:\n{x_f.isnull().sum()}')
print(f'Missing values in output\n{y_.isnull().sum()}')


Missing values in inputs features:
Machine_ID                         0
Spindle_Bearing_Temperature(?C)    7
dtype: int64
Missing values in output
0


In [145]:
x_f

Unnamed: 0,Machine_ID,Spindle_Bearing_Temperature(?C)
0,Makino-L1-Unit1-2013,33.4
1,Makino-L1-Unit1-2013,34.6
2,Makino-L3-Unit1-2015,33.0
3,Makino-L2-Unit1-2015,40.6
4,Makino-L1-Unit1-2013,31.4
...,...,...
2495,Makino-L1-Unit1-2013,37.2
2496,Makino-L1-Unit1-2013,31.5
2497,Makino-L2-Unit1-2015,36.2
2498,Makino-L3-Unit1-2015,32.1


**Interpolate missing values**

Interpolate mising value using polynomical interpolation with the help of neighbouring values

In [133]:
# it searches both forward and baward direction
x_f.interpolate(method="polynomial", order = 2, limit_direction = "both",inplace=True) 

print(f'Current missing value count for input features\n{x_f.isna().sum()}')

Current missing value count for input features
Machine_ID                         0
Spindle_Bearing_Temperature(?C)    0
dtype: int64


**Label Encoding**

In [134]:
label_encoder = LabelEncoder()

x_ = x_f.copy()
x_['Machine_ID_encoded'] = label_encoder.fit_transform(x_f['Machine_ID'])
x_.drop('Machine_ID',axis=1,inplace=True)
print(x_)

# label encoding for y
y_ = label_encoder.fit_transform(y_)
y_

      Spindle_Bearing_Temperature(?C)  Machine_ID_encoded
0                                33.4                   0
1                                34.6                   0
2                                33.0                   2
3                                40.6                   1
4                                31.4                   0
...                               ...                 ...
2495                             37.2                   0
2496                             31.5                   0
2497                             36.2                   1
2498                             32.1                   2
2499                             36.2                   1

[2500 rows x 2 columns]


array([0, 0, 0, ..., 1, 1, 0])

**Normalize the data**

In [135]:
# convert to numpy array
x = x_.to_numpy()
y = y_
#y = y_.to_numpy()

# normalize the data using min-max scaler
# as temperature value cant be negative and we are not considering negative value which happens in case of z score normalize
scaler = MinMaxScaler()

x_scaled = scaler.fit_transform(x_)
x_scaled


array([[0.40148699, 0.        ],
       [0.44609665, 0.        ],
       [0.3866171 , 1.        ],
       ...,
       [0.50557621, 0.5       ],
       [0.35315985, 1.        ],
       [0.50557621, 0.5       ]])

#### **Dataset**

In [136]:
# train-test split
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.15, random_state=42, shuffle=True)

print(f'Train size - {x_train.shape, y_train.shape}')
print(f'Test size - {x_test.shape, y_test.shape}')

y_test

Train size - ((2125, 2), (2125,))
Test size - ((375, 2), (375,))


array([1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,

#### **Model**

Define the decisiontree classifier

In [137]:
# initialize the Decision tree classifier with hyperparameters
clf = DecisionTreeClassifier(
    criterion="gini",
    max_depth=12,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42
)

In [None]:
# training
clf.fit(x_train, y_train)

# predictions with test data
y_pred = clf.predict(x_test)

Define logistic regression

In [139]:
model = LogisticRegression(random_state=42)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

#### **Evaluation**


1. Performance for Decision Tree Classifier

In [105]:
# accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n{conf_matrix}')

# classification report
class_report = classification_report(y_test, y_pred)
print(f'Classification Report:\n{class_report}')

Accuracy: 0.472
Confusion Matrix:
[[102  73]
 [125  75]]
Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.58      0.51       175
           1       0.51      0.38      0.43       200

    accuracy                           0.47       375
   macro avg       0.48      0.48      0.47       375
weighted avg       0.48      0.47      0.47       375



2. Logistic regression prediction

In [142]:
# accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n{conf_matrix}')

# classification report
class_report = classification_report(y_test, y_pred)
print(f'Classification Report:\n{class_report}')

Accuracy: 0.4666666666666667
Confusion Matrix:
[[121  54]
 [146  54]]
Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.69      0.55       175
           1       0.50      0.27      0.35       200

    accuracy                           0.47       375
   macro avg       0.48      0.48      0.45       375
weighted avg       0.48      0.47      0.44       375



In [144]:
x[0]

array([33.4,  0. ])

## **Synthetic Data Geneeneration**

In [190]:
import numpy
import os

In [191]:
# load the dataframe
df = pd.read_csv("Machine Downtime.csv")
df.columns

Index(['Date', 'Machine_ID', 'Assembly_Line_No', 'Hydraulic_Pressure(bar)',
       'Coolant_Pressure(bar)', 'Air_System_Pressure(bar)',
       'Coolant_Temperature', 'Hydraulic_Oil_Temperature(?C)',
       'Spindle_Bearing_Temperature(?C)', 'Spindle_Vibration(?m)',
       'Tool_Vibration(?m)', 'Spindle_Speed(RPM)', 'Voltage(volts)',
       'Torque(Nm)', 'Cutting(kN)', 'Downtime'],
      dtype='object')

In [192]:
# rename columns
df.rename(columns=
                 {'Machine ID': 'Machine_ID', 
                  'Spindle_Bearing_Temperature(?C)': 'Temperature(C)', 
                  'Downtime': 'Downtime'
                  }, inplace=True)
df.columns

Index(['Date', 'Machine_ID', 'Assembly_Line_No', 'Hydraulic_Pressure(bar)',
       'Coolant_Pressure(bar)', 'Air_System_Pressure(bar)',
       'Coolant_Temperature', 'Hydraulic_Oil_Temperature(?C)',
       'Temperature(C)', 'Spindle_Vibration(?m)', 'Tool_Vibration(?m)',
       'Spindle_Speed(RPM)', 'Voltage(volts)', 'Torque(Nm)', 'Cutting(kN)',
       'Downtime'],
      dtype='object')

In [193]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Add synthetic Run_Time
def generate_run_time(row):
    # Base run time depending on downtime
    base_run_time = np.random.normal(400, 100) if row['Downtime'] == 'No' else np.random.normal(100, 5)
    
    # Adjust run time based on temperature
    temp_adjustment = (100 - row['Temperature(C)']) * 0.5  # Higher temp -> lower run time
    return max(base_run_time + temp_adjustment, 0)  # Ensure no negative run time

# apply the function to generate custom runtime command
df['Run_Time'] = df.apply(generate_run_time, axis=1)
df['Run_Time'].max()

150.7136574532736

In [194]:
# synthetic data
df.to_csv("Machine_Downtime_Synthetic.csv", index=False)

In [195]:
# check synthetic data
df = pd.read_csv("Machine_Downtime_Synthetic.csv")
df.head()

Unnamed: 0,Date,Machine_ID,Assembly_Line_No,Hydraulic_Pressure(bar),Coolant_Pressure(bar),Air_System_Pressure(bar),Coolant_Temperature,Hydraulic_Oil_Temperature(?C),Temperature(C),Spindle_Vibration(?m),Tool_Vibration(?m),Spindle_Speed(RPM),Voltage(volts),Torque(Nm),Cutting(kN),Downtime,Run_Time
0,31-12-2021,Makino-L1-Unit1-2013,Shopfloor-L1,71.04,6.933725,6.284965,25.6,46.0,33.4,1.291,26.492,25892.0,335.0,24.055326,3.58,Machine_Failure,135.783571
1,31-12-2021,Makino-L1-Unit1-2013,Shopfloor-L1,125.33,4.936892,6.196733,35.3,47.4,34.6,1.382,25.274,19856.0,368.0,14.20289,2.68,Machine_Failure,132.008678
2,31-12-2021,Makino-L3-Unit1-2015,Shopfloor-L3,71.12,6.839413,6.655448,13.1,40.7,33.0,1.319,30.608,19851.0,325.0,24.049267,3.55,Machine_Failure,136.738443
3,31-05-2022,Makino-L2-Unit1-2015,Shopfloor-L2,139.34,4.574382,6.560394,24.4,44.2,40.6,0.618,30.791,18461.0,360.0,25.860029,3.55,Machine_Failure,137.315149
4,31-03-2022,Makino-L1-Unit1-2013,Shopfloor-L1,60.51,6.893182,6.141238,4.1,47.3,31.4,0.983,25.516,26526.0,354.0,25.515874,3.55,Machine_Failure,133.129233
