In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

import keras
from pykalman import KalmanFilter
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation

# Setting seed for reproducability
np.random.seed(1234)  
PYTHONHASHSEED = 0
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# read training data 
train_df = pd.read_csv('../PM_train_01.txt', sep=" ", header=None)
train_df.drop(train_df.columns[[26, 27]], axis=1, inplace=True)
train_df.columns = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3',
                     's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14',
                     's15', 's16', 's17', 's18', 's19', 's20', 's21']

In [3]:
for cols in train_df.columns:
    if cols == 'id':
        continue;
    if cols == 'cycle':
        continue;
    else:
        print(cols)
        kf = KalmanFilter(transition_matrices = [1],
                      observation_matrices = [1],
                      initial_state_mean = train_df[cols].values[0],
                      initial_state_covariance = 1,
                      observation_covariance=1,
                      transition_covariance=.01)
        state_means,_ = kf.filter(train_df[cols].values)
        train_df[cols] = state_means.flatten()

setting1
setting2
setting3
s1
s2
s3
s4
s5
s6
s7
s8
s9
s10
s11
s12
s13
s14
s15
s16
s17
s18
s19
s20
s21


In [4]:
# read test data
test_df = pd.read_csv('../PM_test_01.txt', sep=" ", header=None)
test_df.drop(test_df.columns[[26, 27]], axis=1, inplace=True)
test_df.columns = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3',
                     's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14',
                     's15', 's16', 's17', 's18', 's19', 's20', 's21']

In [5]:
for cols in test_df.columns:
    if cols == 'id':
        continue;
    if cols == 'cycle':
        continue;
    else:
        print(cols)
        kf = KalmanFilter(transition_matrices = [1],
                      observation_matrices = [1],
                      initial_state_mean = test_df[cols].values[0],
                      initial_state_covariance = 1,
                      observation_covariance=1,
                      transition_covariance=.01)
        state_means,_ = kf.filter(test_df[cols].values)
        test_df[cols] = state_means.flatten()

setting1
setting2
setting3
s1
s2
s3
s4
s5
s6
s7
s8
s9
s10
s11
s12
s13
s14
s15
s16
s17
s18
s19
s20
s21


In [6]:
# read ground truth data
truth_df = pd.read_csv('../PM_truth_01.txt', sep=" ", header=None)
truth_df.drop(truth_df.columns[[1]], axis=1, inplace=True)

In [7]:
train_df = train_df.sort_values(['id','cycle'])
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392.0,2388.0,100.0,39.06,23.419
1,1,2,0.000178,-0.000366,100.0,518.67,641.931457,1590.416026,1401.457881,14.62,...,521.869404,2388.036887,8136.211854,8.423654,0.03,392.0,2388.0,100.0,39.039735,23.420554
2,1,3,-0.000977,-0.000194,100.0,518.67,642.03945,1589.790059,1402.165407,14.62,...,522.01147,2388.03511,8135.442471,8.422144,0.03,391.483957,2388.0,100.0,39.016581,23.400853
3,1,4,-0.000623,-0.000153,100.0,518.67,642.105091,1588.310457,1402.102967,14.62,...,522.190824,2388.044599,8135.101643,8.410742,0.03,391.593033,2388.0,100.0,38.987712,23.395156
4,1,5,-0.000854,-0.000162,100.0,518.67,642.153105,1587.320764,1402.849168,14.62,...,522.190674,2388.043765,8134.865724,8.414123,0.03,391.848042,2388.0,100.0,38.971815,23.396831


In [8]:
# Data Labeling - generate column RUL
rul = pd.DataFrame(train_df.groupby('id')['cycle'].max()).reset_index()
rul.columns = ['id', 'max']
train_df = train_df.merge(rul, on=['id'], how='left')
train_df['RUL'] = train_df['max'] - train_df['cycle']
train_df.drop('max', axis=1, inplace=True)
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392.0,2388.0,100.0,39.06,23.419,191
1,1,2,0.000178,-0.000366,100.0,518.67,641.931457,1590.416026,1401.457881,14.62,...,2388.036887,8136.211854,8.423654,0.03,392.0,2388.0,100.0,39.039735,23.420554,190
2,1,3,-0.000977,-0.000194,100.0,518.67,642.03945,1589.790059,1402.165407,14.62,...,2388.03511,8135.442471,8.422144,0.03,391.483957,2388.0,100.0,39.016581,23.400853,189
3,1,4,-0.000623,-0.000153,100.0,518.67,642.105091,1588.310457,1402.102967,14.62,...,2388.044599,8135.101643,8.410742,0.03,391.593033,2388.0,100.0,38.987712,23.395156,188
4,1,5,-0.000854,-0.000162,100.0,518.67,642.153105,1587.320764,1402.849168,14.62,...,2388.043765,8134.865724,8.414123,0.03,391.848042,2388.0,100.0,38.971815,23.396831,187


In [9]:
# generate label columns for training data
w1 = 30
w0 = 15
train_df['label1'] = np.where(train_df['RUL'] <= w1, 1, 0 )
train_df['label2'] = train_df['label1']
train_df.loc[train_df['RUL'] <= w0, 'label2'] = 2
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s15,s16,s17,s18,s19,s20,s21,RUL,label1,label2
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,8.4195,0.03,392.0,2388.0,100.0,39.06,23.419,191,0,0
1,1,2,0.000178,-0.000366,100.0,518.67,641.931457,1590.416026,1401.457881,14.62,...,8.423654,0.03,392.0,2388.0,100.0,39.039735,23.420554,190,0,0
2,1,3,-0.000977,-0.000194,100.0,518.67,642.03945,1589.790059,1402.165407,14.62,...,8.422144,0.03,391.483957,2388.0,100.0,39.016581,23.400853,189,0,0
3,1,4,-0.000623,-0.000153,100.0,518.67,642.105091,1588.310457,1402.102967,14.62,...,8.410742,0.03,391.593033,2388.0,100.0,38.987712,23.395156,188,0,0
4,1,5,-0.000854,-0.000162,100.0,518.67,642.153105,1587.320764,1402.849168,14.62,...,8.414123,0.03,391.848042,2388.0,100.0,38.971815,23.396831,187,0,0


In [10]:
# generate column max for test data
rul = pd.DataFrame(test_df.groupby('id')['cycle'].max()).reset_index()
rul.columns = ['id', 'max']
truth_df.columns = ['more']
truth_df['id'] = truth_df.index + 1
truth_df['max'] = rul['max'] + truth_df['more']
truth_df.drop('more', axis=1, inplace=True)

In [11]:
# generate RUL for test data
test_df = test_df.merge(truth_df, on=['id'], how='left')
test_df['RUL'] = test_df['max'] - test_df['cycle']
test_df.drop('max', axis=1, inplace=True)
test_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,RUL
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,2388.03,8125.55,8.4052,0.03,392.0,2388.0,100.0,38.86,23.3735,142
1,1,2,0.000611,9.7e-05,100.0,518.67,642.57755,1586.357285,1397.267682,14.62,...,2388.040132,8130.302119,8.39679,0.03,392.337748,2388.0,100.0,38.91404,23.379613,141
2,1,3,0.000531,9.8e-05,100.0,518.67,642.547219,1586.507638,1398.318429,14.62,...,2388.037518,8130.249968,8.408997,0.03,392.508624,2388.0,100.0,38.956861,23.389157,140
3,1,4,0.001306,7.7e-05,100.0,518.67,642.524556,1586.002963,1400.030858,14.62,...,2388.040156,8130.810105,8.405341,0.03,392.189746,2388.0,100.0,38.965979,23.38589,139
4,1,5,0.001323,6.3e-05,100.0,518.67,642.521918,1586.21811,1400.373259,14.62,...,2388.038316,8130.579902,8.404935,0.03,391.79286,2388.0,100.0,38.970333,23.390803,138


In [12]:
# generate label columns w0 and w1 for test data
test_df['label1'] = np.where(test_df['RUL'] <= w1, 1, 0 )
test_df['label2'] = test_df['label1']
test_df.loc[test_df['RUL'] <= w0, 'label2'] = 2
test_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s15,s16,s17,s18,s19,s20,s21,RUL,label1,label2
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,8.4052,0.03,392.0,2388.0,100.0,38.86,23.3735,142,0,0
1,1,2,0.000611,9.7e-05,100.0,518.67,642.57755,1586.357285,1397.267682,14.62,...,8.39679,0.03,392.337748,2388.0,100.0,38.91404,23.379613,141,0,0
2,1,3,0.000531,9.8e-05,100.0,518.67,642.547219,1586.507638,1398.318429,14.62,...,8.408997,0.03,392.508624,2388.0,100.0,38.956861,23.389157,140,0,0
3,1,4,0.001306,7.7e-05,100.0,518.67,642.524556,1586.002963,1400.030858,14.62,...,8.405341,0.03,392.189746,2388.0,100.0,38.965979,23.38589,139,0,0
4,1,5,0.001323,6.3e-05,100.0,518.67,642.521918,1586.21811,1400.373259,14.62,...,8.404935,0.03,391.79286,2388.0,100.0,38.970333,23.390803,138,0,0


In [13]:
# pick the feature columns 
sensor_cols = ['s' + str(i) for i in range(1,22)]
cols = ['setting1', 'setting2', 'setting3']
cols.extend(sensor_cols)

In [14]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import lightgbm as lgb

X_train, X_val, Y_train, Y_val = train_test_split(train_df[cols], train_df['label1'], test_size=0.05, 
                                                  shuffle=False, random_state=42)

print ("Train_shape: " + str(X_train.shape))
print ("Val_shape: " + str(X_val.shape))
print ("No of positives in train: " + str(Y_train.sum()))
print ("No of positives in val: " + str(Y_val.sum()))

Train_shape: (19599, 24)
Val_shape: (1032, 24)
No of positives in train: 2945
No of positives in val: 155


In [15]:
lgb_train = lgb.Dataset(X_train, Y_train)
lgb_eval = lgb.Dataset(X_val, Y_val)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 11,
    'learning_rate': 0.01,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.6,
    'bagging_freq': 5,
}

print('Start training...')

gbm = lgb.train(params, lgb_train, num_boost_round=1000, valid_sets=[lgb_train, lgb_eval], 
                early_stopping_rounds=25, verbose_eval=25)

Start training...
Training until validation scores don't improve for 25 rounds.
[25]	training's auc: 0.974311	valid_1's auc: 0.96349
[50]	training's auc: 0.97637	valid_1's auc: 0.971038
[75]	training's auc: 0.976813	valid_1's auc: 0.971531
[100]	training's auc: 0.978056	valid_1's auc: 0.971729
[125]	training's auc: 0.978738	valid_1's auc: 0.97206
[150]	training's auc: 0.979727	valid_1's auc: 0.972811
[175]	training's auc: 0.980336	valid_1's auc: 0.973222
[200]	training's auc: 0.98094	valid_1's auc: 0.973807
[225]	training's auc: 0.981603	valid_1's auc: 0.973991
[250]	training's auc: 0.98209	valid_1's auc: 0.9743
[275]	training's auc: 0.982718	valid_1's auc: 0.974591
[300]	training's auc: 0.983358	valid_1's auc: 0.974966
[325]	training's auc: 0.983859	valid_1's auc: 0.975091
[350]	training's auc: 0.984394	valid_1's auc: 0.975297
[375]	training's auc: 0.984924	valid_1's auc: 0.975746
[400]	training's auc: 0.985412	valid_1's auc: 0.975989
[425]	training's auc: 0.985829	valid_1's auc: 0.97

In [16]:
from sklearn.metrics import accuracy_score
# training metrics

pred_train = gbm.predict(train_df[cols], num_iteration=gbm.best_iteration)
pred_train = np.where(pred_train > 0.5, 1, 0)
print('Accurracy: {}'.format(accuracy_score(train_df['label1'], pred_train)))

Accurracy: 0.9559885609034947


In [17]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score

print('Confusion matrix\n- x-axis is true labels.\n- y-axis is predicted labels')
cm = confusion_matrix(train_df['label1'], pred_train)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[17025,   506],
       [  402,  2698]])

In [18]:
pred_test = gbm.predict(test_df[cols], num_iteration=gbm.best_iteration)
pred_test = np.where(pred_test > 0.5, 1, 0)
print('Accurracy: {}'.format(accuracy_score(test_df['label1'], pred_test)))

Accurracy: 0.986484422724496


In [19]:
print('Confusion matrix\n- x-axis is true labels.\n- y-axis is predicted labels')
cm = confusion_matrix(test_df['label1'], pred_test)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[12724,    40],
       [  137,   195]])

In [20]:
# compute precision and recall
precision_test = precision_score(test_df['label1'], pred_test)
recall_test = recall_score(test_df['label1'], pred_test)
f1_test = 2 * (precision_test * recall_test) / (precision_test + recall_test)
print( 'Precision: ', precision_test, '\n', 'Recall: ', recall_test,'\n', 'F1-score:', f1_test )

Precision:  0.8297872340425532 
 Recall:  0.5873493975903614 
 F1-score: 0.6878306878306878


In [21]:
label_array_test_last = test_df.groupby('id')['label1'].nth(-1).values
label_array_test_last = label_array_test_last.reshape(label_array_test_last.shape[0],1).astype(np.float32)
label_array_test_last.shape

(100, 1)

In [22]:
seq_array_test_last = [test_df[test_df['id']==id][cols].values[-1] for id in test_df['id'].unique()]

seq_array_test_last = np.asarray(seq_array_test_last).astype(np.float32)
seq_array_test_last.shape

(100, 24)

In [23]:
pred_test_last = gbm.predict(seq_array_test_last, num_iteration=gbm.best_iteration)
pred_test_last = np.where(pred_test_last > 0.5, 1, 0)
acc = accuracy_score(label_array_test_last, pred_test_last)
print('Accurracy: {}'.format(acc))

Accurracy: 0.91


In [24]:
# make predictions and compute confusion matrix
print('Confusion matrix\n- x-axis is true labels.\n- y-axis is predicted labels')
cm = confusion_matrix(label_array_test_last, pred_test_last)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[73,  2],
       [ 7, 18]])

In [25]:
# compute precision and recall
precision_test = precision_score(label_array_test_last, pred_test_last)
recall_test = recall_score(label_array_test_last, pred_test_last)
f1_test = 2 * (precision_test * recall_test) / (precision_test + recall_test)
print( 'Precision: ', precision_test, '\n', 'Recall: ', recall_test,'\n', 'F1-score:', f1_test )

Precision:  0.9 
 Recall:  0.72 
 F1-score: 0.7999999999999999


In [26]:
results_df = pd.DataFrame([[acc,precision_test,recall_test,f1_test],
                          ],
                         columns = ['Accuracy', 'Precision', 'Recall', 'F1-score'],
                         index = ['LightGBM'])
results_df

Unnamed: 0,Accuracy,Precision,Recall,F1-score
LightGBM,0.91,0.9,0.72,0.8
