In [1]:
import pandas as pd
import zipfile
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix,accuracy_score
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation
from keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline  

In [5]:
# Extract the contents of the dataset archive into the sensordata/ folder;
# the loading cells read their input files from that folder.
# (The original identifiers were corrupted: the zipfile API is
# ZipFile(...) / extractall(...).)
with zipfile.ZipFile('dataset/archive.zip') as archive:
    archive.extractall('sensordata/')

In [2]:
# Column names for the dataset: two identifier columns, three setting
# columns and twenty-one sensor columns (26 in total).
meta_cols = ['engine_id', 'cycle']
setting_cols = [f'setting_{n}' for n in range(1, 4)]
sensor_cols = [f'sensor_{n}' for n in range(1, 22)]
# Model inputs are the settings followed by the sensors.
feature_cols = [*setting_cols, *sensor_cols]
target_col = 'fail'
cols = [*meta_cols, *setting_cols, *sensor_cols]

In [3]:
# Training dataset: space-separated file without a header row.
# Columns 26 and 27 are discarded before the names are applied
# (presumably empty columns created by trailing separators - TODO confirm).
train_data = pd.read_csv('sensordata/PM_train.txt', sep=' ', header=None)
train_data = train_data.drop(columns=[26, 27])
train_data.columns = cols
print('shape of train_data:', train_data.shape)

shape of train_data: (20631, 26)


In [4]:
# Test dataset: same layout as the training file (space-separated, no header).
# Columns 26 and 27 are discarded before the names are applied
# (presumably empty columns created by trailing separators - TODO confirm).
test_data = pd.read_csv('sensordata/PM_test.txt', sep=' ', header=None)
test_data = test_data.drop(columns=[26, 27])
test_data.columns = cols
print('shape of test_data:', test_data.shape)

shape of test_data: (13096, 26)


In [5]:
# Truth values with remaining useful life (one row per engine).
truth_data = pd.read_csv('sensordata/PM_truth.txt', sep=' ', header=None)
truth_data = truth_data.drop(columns=[1])
truth_data.columns = ['cycle_RUL']
# The file carries no engine id; it is derived from the 0-based row position.
truth_data['engine_id'] = truth_data.index + 1
print('shape of truth_data:', truth_data.shape)
truth_data.tail()

shape of truth_data: (100, 2)


Unnamed: 0,cycle_RUL,engine_id
95,137,96
96,82,97
97,59,98
98,117,99
99,20,100


In [6]:
# Last recorded cycle of every engine in the test dataset
rul = (
    test_data
    .groupby('engine_id')['cycle']
    .max()
    .reset_index()
)
rul.columns = ['engine_id', 'cycle_max']
rul.tail()

Unnamed: 0,engine_id,cycle_max
95,96,97
96,97,134
97,98,121
98,99,97
99,100,198


In [7]:
# Calculate the run-to-failure (rtf) cycle count of each test engine:
# remaining useful life + last cycle observed in the test data.
# cycle_max is looked up by engine_id instead of relying on both frames
# sharing the same row index, so the sum stays correct even if either frame
# is filtered or reordered.
cycle_max_by_engine = rul.set_index('engine_id')['cycle_max']
truth_data['rtf'] = truth_data['cycle_RUL'] + truth_data['engine_id'].map(cycle_max_by_engine)
truth_data.tail()

Unnamed: 0,cycle_RUL,engine_id,rtf
95,137,96,234
96,82,97,216
97,59,98,180
98,117,99,214
99,20,100,218


In [8]:
# cycle_RUL has been folded into rtf. Rebinding the name (rather than
# inplace=True) together with errors='ignore' lets this cell be re-run
# without raising KeyError once the column is already gone.
truth_data = truth_data.drop(columns='cycle_RUL', errors='ignore')

In [9]:
# Time to failure (ttf) for every row of the test data:
# attach each engine's rtf, subtract the current cycle, then drop the helper column.
test_data = (
    test_data
    .merge(truth_data, on=['engine_id'], how='left')
    .assign(ttf=lambda merged: merged['rtf'] - merged['cycle'])
    .drop(columns=['rtf'])
)
test_data.tail()


Unnamed: 0,engine_id,cycle,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,ttf
13091,100,194,0.0049,0.0,100.0,518.67,643.24,1599.45,1415.79,14.62,...,2388.0,8213.28,8.4715,0.03,394,2388,100.0,38.65,23.1974,24
13092,100,195,-0.0011,-0.0001,100.0,518.67,643.22,1595.69,1422.05,14.62,...,2388.09,8210.85,8.4512,0.03,395,2388,100.0,38.57,23.2771,23
13093,100,196,-0.0006,-0.0003,100.0,518.67,643.44,1593.15,1406.82,14.62,...,2388.04,8217.24,8.4569,0.03,395,2388,100.0,38.62,23.2051,22
13094,100,197,-0.0038,0.0001,100.0,518.67,643.26,1594.99,1419.36,14.62,...,2388.08,8220.48,8.4711,0.03,395,2388,100.0,38.66,23.2699,21
13095,100,198,0.0013,0.0003,100.0,518.67,642.95,1601.62,1424.99,14.62,...,2388.05,8214.64,8.4903,0.03,396,2388,100.0,38.7,23.1855,20


In [10]:
# Calculate the time to failure for the training dataset: each engine's last
# recorded cycle minus the current cycle (0 on the engine's final row).
# The aggregation is passed by name ('max'); handing the Python builtin `max`
# to transform() is deprecated in recent pandas releases.
train_data['ttf'] = train_data.groupby('engine_id')['cycle'].transform('max') - train_data['cycle']
train_data.tail()

Unnamed: 0,engine_id,cycle,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,ttf
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,2388.26,8137.6,8.4956,0.03,397,2388,100.0,38.49,22.9735,4
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.5,1433.58,14.62,...,2388.22,8136.5,8.5139,0.03,395,2388,100.0,38.3,23.1594,3
20628,100,198,0.0004,0.0,100.0,518.67,643.42,1602.46,1428.18,14.62,...,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333,2
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.064,1
20630,100,200,-0.0032,-0.0005,100.0,518.67,643.85,1600.38,1432.14,14.62,...,2388.26,8137.33,8.5036,0.03,396,2388,100.0,38.37,23.0522,0


In [16]:
# Binary target for a 30-cycle horizon: fail = 1 when ttf <= period, else 0.
period = 30
df_train = train_data.copy()
df_test = test_data.copy()
df_train['fail'] = (df_train['ttf'] <= period).astype('int64')
df_test['fail'] = (df_test['ttf'] <= period).astype('int64')
df_train.tail()

Unnamed: 0,engine_id,cycle,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,ttf,fail
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,8137.6,8.4956,0.03,397,2388,100.0,38.49,22.9735,4,1
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.5,1433.58,14.62,...,8136.5,8.5139,0.03,395,2388,100.0,38.3,23.1594,3,1
20628,100,198,0.0004,0.0,100.0,518.67,643.42,1602.46,1428.18,14.62,...,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333,2,1
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.064,1,1
20630,100,200,-0.0032,-0.0005,100.0,518.67,643.85,1600.38,1432.14,14.62,...,8137.33,8.5036,0.03,396,2388,100.0,38.37,23.0522,0,1


In [45]:
# Binary target for a 20-cycle horizon: fail = 1 when ttf <= period, else 0.
# Each label is derived from the frame's own ttf column, so this cell no
# longer depends on df_train / df_test having been created first.
df_train_20 = train_data.copy()
df_test_20 = test_data.copy()
period = 20
df_train_20['fail'] = df_train_20['ttf'].apply(lambda x: 1 if x <= period else 0)
df_test_20['fail'] = df_test_20['ttf'].apply(lambda x: 1 if x <= period else 0)

In [46]:
# Binary target for a 10-cycle horizon: fail = 1 when ttf <= period, else 0.
# Each label is derived from the frame's own ttf column, so this cell no
# longer depends on df_train / df_test having been created first.
df_train_10 = train_data.copy()
df_test_10 = test_data.copy()
period = 10
df_train_10['fail'] = df_train_10['ttf'].apply(lambda x: 1 if x <= period else 0)
df_test_10['fail'] = df_test_10['ttf'].apply(lambda x: 1 if x <= period else 0)

In [48]:
# Perform feature scaling on the feature columns.
# The scaler is fitted exactly once, on the raw training features, and the raw
# test features are transformed with those same training statistics.
# Reading from train_data / test_data (never from the df_* frames that are
# being overwritten) avoids re-fitting on already-scaled values, keeps the
# cell idempotent on re-run, and leaves `sc` fitted on raw data.
sc = MinMaxScaler()
train_scaled = sc.fit_transform(train_data[feature_cols])
test_scaled = sc.transform(test_data[feature_cols])

# Every df_* frame is a row-for-row copy of train_data / test_data, so the
# scaled arrays can be assigned positionally.
for frame in (df_train, df_train_10, df_train_20):
    frame[feature_cols] = train_scaled
for frame in (df_test, df_test_10, df_test_20):
    frame[feature_cols] = test_scaled

In [49]:
# Write the three test frames (30-, 10- and 20-cycle labels) to CSV files
# in the Flask_PM project's data folder.
# NOTE(review): absolute, user-specific paths - consider a configurable directory.
for frame, csv_path in (
    (df_test, 'C://Users/rajas/PycharmProjects/Flask_PM/data/df_test.csv'),
    (df_test_10, 'C://Users/rajas/PycharmProjects/Flask_PM/data/df_test_10.csv'),
    (df_test_20, 'C://Users/rajas/PycharmProjects/Flask_PM/data/df_test_20.csv'),
):
    frame.to_csv(csv_path)

In [18]:
def gen_sequence(id_df, seq_length, seq_cols):
    """Slice one engine's rows into overlapping fixed-length windows.

    The frame is front-padded with ``seq_length - 1`` all-zero rows so that
    early rows still produce full-length windows.

    Parameters
    ----------
    id_df : DataFrame holding the rows to window (callers pass one engine).
    seq_length : number of rows in each window.
    seq_cols : columns to keep in each window.

    Returns
    -------
    np.ndarray stacking one ``(seq_length, len(seq_cols))`` window per start
    position. The loop stops one step before the end, so the window that
    would end on the final row is not produced: ``len(id_df) - 1`` windows
    in total (an empty array when there are fewer than two rows).
    """
    padding = pd.DataFrame(
        np.zeros((seq_length - 1, id_df.shape[1])),
        columns=id_df.columns,
    )
    padded = pd.concat([padding, id_df], ignore_index=True)
    values = padded[seq_cols].values
    total_rows = values.shape[0]
    windows = [
        values[begin:begin + seq_length, :]
        for begin in range(total_rows - seq_length)
    ]
    return np.array(windows)

# function to generate labels
def gen_label(id_df, seq_length, seq_cols,label):
    """Return the label paired with each window produced by ``gen_sequence``.

    The frame is front-padded with ``seq_length - 1`` all-zero rows exactly
    as in ``gen_sequence``. For the window covering padded rows
    ``[start, stop)`` the label is read at row ``stop``, i.e. the row
    immediately after the window.

    Parameters
    ----------
    id_df : DataFrame holding the rows to label (callers pass one engine).
    seq_length : window length used when generating the sequences.
    seq_cols : feature columns; only used to determine the padded row count.
    label : name of the label column.

    Returns
    -------
    1-D np.ndarray with ``len(id_df) - 1`` labels (empty when there are
    fewer than two rows).
    """
    padding = pd.DataFrame(
        np.zeros((seq_length - 1, id_df.shape[1])),
        columns=id_df.columns,
    )
    padded = pd.concat([padding, id_df], ignore_index=True)
    total_rows = padded[seq_cols].values.shape[0]
    label_column = padded[label]
    targets = [label_column[end] for end in range(seq_length, total_rows)]
    return np.array(targets)

In [19]:
# timestamp or window size: number of consecutive rows in each input sequence
seq_length=50
# columns placed in every window (the setting and sensor columns)
seq_cols=feature_cols

In [50]:
# Build the windowed training arrays engine by engine for each horizon.
# seq_length is passed to gen_label as well as gen_sequence (the label calls
# previously hard-coded 50), so X and y stay aligned if the window size is
# changed. The loop variable no longer shadows the builtin `id`.

# generate X_train
X_train = np.concatenate([
    gen_sequence(df_train[df_train['engine_id'] == engine], seq_length, seq_cols)
    for engine in df_train['engine_id'].unique()
])
print(X_train.shape)
# generate y_train
y_train = np.concatenate([
    gen_label(df_train[df_train['engine_id'] == engine], seq_length, seq_cols, 'fail')
    for engine in df_train['engine_id'].unique()
])
print(y_train.shape)

# For 10 day prediction
X_train_10 = np.concatenate([
    gen_sequence(df_train_10[df_train_10['engine_id'] == engine], seq_length, seq_cols)
    for engine in df_train_10['engine_id'].unique()
])
print(X_train_10.shape)
# generate y_train
y_train_10 = np.concatenate([
    gen_label(df_train_10[df_train_10['engine_id'] == engine], seq_length, seq_cols, 'fail')
    for engine in df_train_10['engine_id'].unique()
])
print(y_train_10.shape)

# For 20 day prediction
X_train_20 = np.concatenate([
    gen_sequence(df_train_20[df_train_20['engine_id'] == engine], seq_length, seq_cols)
    for engine in df_train_20['engine_id'].unique()
])
print(X_train_20.shape)
# generate y_train
y_train_20 = np.concatenate([
    gen_label(df_train_20[df_train_20['engine_id'] == engine], seq_length, seq_cols, 'fail')
    for engine in df_train_20['engine_id'].unique()
])
print(y_train_20.shape)

(20531, 50, 24)
(20531,)
(20531, 50, 24)
(20531,)
(20531, 50, 24)
(20531,)


In [51]:
# Build the windowed test arrays engine by engine for each horizon.
# Fixes: y_test_10 is now read from df_test_10 (it was sliced out of df_test,
# which carries the 30-cycle labels), and seq_length replaces the hard-coded
# 50 in the label calls so X and y always use the same window size.

# generate X_test
X_test = np.concatenate([
    gen_sequence(df_test[df_test['engine_id'] == engine], seq_length, seq_cols)
    for engine in df_test['engine_id'].unique()
])
print(X_test.shape)
# generate y_test
y_test = np.concatenate([
    gen_label(df_test[df_test['engine_id'] == engine], seq_length, seq_cols, 'fail')
    for engine in df_test['engine_id'].unique()
])
print(y_test.shape)

# For 10 day prediction
X_test_10 = np.concatenate([
    gen_sequence(df_test_10[df_test_10['engine_id'] == engine], seq_length, seq_cols)
    for engine in df_test_10['engine_id'].unique()
])
print(X_test_10.shape)
# generate y_test
y_test_10 = np.concatenate([
    gen_label(df_test_10[df_test_10['engine_id'] == engine], seq_length, seq_cols, 'fail')
    for engine in df_test_10['engine_id'].unique()
])
print(y_test_10.shape)

# For 20 day prediction
X_test_20 = np.concatenate([
    gen_sequence(df_test_20[df_test_20['engine_id'] == engine], seq_length, seq_cols)
    for engine in df_test_20['engine_id'].unique()
])
print(X_test_20.shape)
# generate y_test
y_test_20 = np.concatenate([
    gen_label(df_test_20[df_test_20['engine_id'] == engine], seq_length, seq_cols, 'fail')
    for engine in df_test_20['engine_id'].unique()
])
print(y_test_20.shape)

(12996, 50, 24)
(12996,)
(12996, 50, 24)
(12996,)
(12996, 50, 24)
(12996,)


In [64]:
# Build the LSTM network: two stacked LSTM layers with dropout, followed by a
# single sigmoid unit. Input dimensions are read from X_train_10.
# NOTE(review): this cell was labelled "10 day prediction", but the fit cell
# trains this model on X_train / y_train - confirm the intended horizon.
nb_features = X_train_10.shape[2]
timestamp = seq_length
model = Sequential([
    LSTM(units=50, activation='relu', input_shape=X_train_10.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(units=100, activation='relu', return_sequences=False),
    Dropout(0.2),
    Dense(units=1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_12 (LSTM)              (None, 50, 50)            15000     
                                                                 
 dropout_15 (Dropout)        (None, 50, 50)            0         
                                                                 
 lstm_13 (LSTM)              (None, 100)               60400     
                                                                 
 dropout_16 (Dropout)        (None, 100)               0         
                                                                 
 dense_10 (Dense)            (None, 1)                 101       
                                                                 
Total params: 75,501
Trainable params: 75,501
Non-trainable params: 0
_________________________________________________________________


In [63]:
# Fit the network on X_train / y_train with 5% of the samples held out for validation.
# (A disabled earlier variant fitted X_train_10 / y_train_10 with an
# EarlyStopping callback monitoring val_loss, patience=0.)
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=200,
    validation_split=0.05,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x20622eda320>

In [26]:
# test metrics: this evaluates on X_test / y_test, not on the training data
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f'Accurracy: {accuracy}')

Accurracy: 0.9819944500923157


In [65]:
# Raw outputs of `model` for every test window (its last layer is a sigmoid unit)
y_pred = model.predict(X_test)



In [42]:
# Decision threshold in percent. prob_failure() reports percentages and is
# compared against this value directly.
threshold = 50


def lambda_func(x):
    """Return 1 when probability ``x`` (0-1 scale) exceeds ``threshold`` percent, else 0."""
    return 1 if x > threshold / 100 else 0


# y_pred holds sigmoid outputs in [0, 1]; comparing them with 50 directly
# would label every sample 0, so the threshold is converted to the
# probability scale inside lambda_func.
result = np.vectorize(lambda_func)(y_pred)

NameError: name 'y_pred' is not defined

In [71]:
# Accuracy and confusion matrix of the thresholded predictions in `result`
lstm_accuracy = accuracy_score(y_test, result)
print('Accuracy of model on test data: ', lstm_accuracy)
lstm_confusion = confusion_matrix(y_test, result)
print('Confusion Matrix: \n', lstm_confusion)

Accuracy of model on test data:  0.9869959987688519
Confusion Matrix: 
 [[12656     8]
 [  161   171]]


In [68]:
# Stack of Dense/Dropout layers on the (timestamp, nb_features) input,
# flattened before a single sigmoid unit.
# NOTE(review): despite the name, this model contains no convolution layers.
model_cnn = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(timestamp, nb_features)),
    layers.Dropout(0.3),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.1),
    layers.Flatten(),
    layers.Dense(1, activation='sigmoid'),
])
opt = keras.optimizers.Adam(learning_rate=0.001)
model_cnn.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [70]:
# for 10 days: same Dense/Dropout architecture as model_cnn, built as a
# separate model instance with its own optimizer.
model_cnn_10 = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(timestamp, nb_features)),
    layers.Dropout(0.3),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.1),
    layers.Flatten(),
    layers.Dense(1, activation='sigmoid'),
])
opt = keras.optimizers.Adam(learning_rate=0.001)
model_cnn_10.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [71]:
# for 20 days: same Dense/Dropout architecture as model_cnn, built as a
# separate model instance with its own optimizer.
model_cnn_20 = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(timestamp, nb_features)),
    layers.Dropout(0.3),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.1),
    layers.Flatten(),
    layers.Dense(1, activation='sigmoid'),
])
opt = keras.optimizers.Adam(learning_rate=0.001)
model_cnn_20.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [28]:
# Fit the dense model on X_train / y_train, with validation_split=0.05
model_cnn.fit(X_train, y_train, epochs=10, validation_split=0.05, batch_size=200)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x205b76de560>

In [72]:
# Fit the 10- and 20-horizon dense models without progress output (verbose=0).
# The value displayed by the cell is the History returned by the second fit.
model_cnn_10.fit(X_train_10, y_train_10, epochs=10, validation_split=0.05, batch_size=200, verbose=0)
model_cnn_20.fit(X_train_20, y_train_20, epochs=10, validation_split=0.05, batch_size=200, verbose=0)

<keras.callbacks.History at 0x2063005b160>

In [29]:
# training metrics: each dense model is evaluated on the arrays it was fitted on;
# index 1 of the returned list is the accuracy metric given at compile time.
scores = model_cnn.evaluate(
    X_train, y_train, verbose=1, batch_size=200,
)
print(f'Accurracy: {scores[1]}')
scores_10 = model_cnn_10.evaluate(
    X_train_10, y_train_10, verbose=1, batch_size=200,
)
print(f'Accurracy: {scores_10[1]}')
scores_20 = model_cnn_20.evaluate(
    X_train_20, y_train_20, verbose=1, batch_size=200,
)
print(f'Accurracy: {scores_20[1]}')

Accurracy: 0.9786177277565002


In [73]:
# save each dense model to its own folder in the Flask_PM project
# NOTE(review): absolute, user-specific paths - consider a configurable directory.
for trained_model, target_dir in (
    (model_cnn, 'C://Users/rajas/PycharmProjects/Flask_PM/models/model_pm_30'),
    (model_cnn_10, 'C://Users/rajas/PycharmProjects/Flask_PM/models/model_pm_10'),
    (model_cnn_20, 'C://Users/rajas/PycharmProjects/Flask_PM/models/model_pm_20'),
):
    trained_model.save(target_dir)



INFO:tensorflow:Assets written to: C://Users/rajas/PycharmProjects/Flask_PM/models/model_pm_10/assets


INFO:tensorflow:Assets written to: C://Users/rajas/PycharmProjects/Flask_PM/models/model_pm_10/assets


INFO:tensorflow:Assets written to: C://Users/rajas/PycharmProjects/Flask_PM/models/model_pm_20/assets


INFO:tensorflow:Assets written to: C://Users/rajas/PycharmProjects/Flask_PM/models/model_pm_20/assets


In [72]:
# Evaluate the dense model on the test windows.
# Its sigmoid outputs lie in [0, 1], so the percent-scale `threshold` is
# converted to a probability before comparing; comparing the raw outputs with
# 50 would predict class 0 for every sample.
y2_pred = model_cnn.predict(X_test)
result2 = (y2_pred > threshold / 100).astype(int)
print('Accuracy of model on test data: ',accuracy_score(y_test,result2))
print('Confusion Matrix: \n',confusion_matrix(y_test,result2))

Accuracy of model on test data:  0.9925361649738381
Confusion Matrix: 
 [[12615    49]
 [   48   284]]


In [36]:
def prob_failure(machine_id):
    """Return the predicted failure probability, in percent, for one test engine.

    Uses the notebook globals ``df_test``, ``seq_length``, ``seq_cols`` and
    ``model``: the engine's rows are windowed with ``gen_sequence`` and the
    prediction for the last window is scaled by 100.

    Parameters
    ----------
    machine_id : value matched against ``df_test.engine_id``.
    """
    engine_rows = df_test[df_test.engine_id == machine_id]
    windows = gen_sequence(engine_rows, seq_length, seq_cols)
    predictions = model.predict(windows, verbose=0)
    # Only the most recent window is reported.
    latest_pct = predictions[-1] * 100
    return list(latest_pct)[0]

In [41]:
# Spot-check a single test engine; prob_failure returns a percentage
machine_id=25
print('Probability percentage that machine will fail within 30 days: ',prob_failure(machine_id))

Probability percentage that machine will fail within 30 days:  9.48165e-05


In [43]:
# Engine ids 1-100 whose predicted failure probability (percent) exceeds `threshold`
maintain_engine = [
    engine
    for engine in range(1, 101)
    if prob_failure(engine) > threshold
]
print(maintain_engine)

[20, 31, 34, 35, 36, 40, 49, 56, 66, 68, 76, 81, 82, 91, 92, 93]
