In [1]:
import pandas as pd
import zipfile

In [5]:
# Unpack the archived train and test data into the sensordata folder
with zipfile.ZipFile('dataset/archive.zip', mode='r') as archive:
    archive.extractall(path='sensordata/')

In [2]:
# Column labels for the dataset: 2 identifier columns, 3 settings, 21 sensors
meta_cols = ['engine_id', 'cycle']
setting_cols = ['setting_{}'.format(n) for n in range(1, 4)]
sensor_cols = ['sensor_{}'.format(n) for n in range(1, 22)]
cols = [*meta_cols, *setting_cols, *sensor_cols]

In [3]:
# Training dataset: space-separated file without a header row.
# Columns 26 and 27 are dropped before the named columns are applied.
train_data = (
    pd.read_csv('sensordata/PM_train.txt', header=None, sep=' ')
    .drop(columns=[26, 27])
)
train_data.columns = cols
print('shape of train_data:', train_data.shape)

shape of train_data: (20631, 26)


In [4]:
# Test dataset: space-separated file without a header row.
# Columns 26 and 27 are dropped before the named columns are applied.
test_data = (
    pd.read_csv('sensordata/PM_test.txt', header=None, sep=' ')
    .drop(columns=[26, 27])
)
test_data.columns = cols
print('shape of test_data:', test_data.shape)

shape of test_data: (13096, 26)


In [5]:
# Truth values: one remaining-useful-life figure per row.
# Column 1 is dropped, leaving the single value column.
truth_data = (
    pd.read_csv('sensordata/PM_truth.txt', header=None, sep=' ')
    .drop(columns=[1])
)
truth_data.columns = ['cycle_RUL']
# Engine ids are assigned from the row position, starting at 1
truth_data['engine_id'] = truth_data.index + 1
print('shape of truth_data:',truth_data.shape)
truth_data.tail()

shape of truth_data: (100, 2)


Unnamed: 0,cycle_RUL,engine_id
95,137,96
96,82,97
97,59,98
98,117,99
99,20,100


In [6]:
# Highest cycle number recorded for every engine in the test dataset
rul = test_data.groupby('engine_id', as_index=False)['cycle'].max()
rul.columns = ['engine_id', 'cycle_max']
rul.tail()

Unnamed: 0,engine_id,cycle_max
95,96,97
96,97,134
97,98,121
98,99,97
99,100,198


In [7]:
# Calculate the run-to-failure (rtf) cycle count of each test engine:
# highest observed cycle (cycle_max) + remaining useful life (cycle_RUL).
# cycle_max is looked up by engine_id rather than added by row position, so the
# result stays correct even if truth_data and rul differ in row order or index;
# an engine missing from rul yields NaN instead of a silently misaligned value.
cycle_max_by_engine = rul.set_index('engine_id')['cycle_max']
truth_data['rtf'] = truth_data['cycle_RUL'] + truth_data['engine_id'].map(cycle_max_by_engine)
truth_data.tail()

Unnamed: 0,cycle_RUL,engine_id,rtf
95,137,96,234
96,82,97,216
97,59,98,180
98,117,99,214
99,20,100,218


In [8]:
# cycle_RUL has been folded into rtf and is no longer needed. Rebinding the
# result (instead of inplace=True) with errors='ignore' keeps this cell safe
# to re-run: a second execution no longer raises KeyError.
truth_data = truth_data.drop(columns='cycle_RUL', errors='ignore')

In [9]:
# Attach each engine's run-to-failure count (rtf) to its test rows, derive the
# per-row time to failure (ttf = rtf - cycle), then discard the helper column
merged = test_data.merge(truth_data, how='left', on='engine_id')
merged['ttf'] = merged['rtf'] - merged['cycle']
test_data = merged.drop(columns='rtf')
test_data.tail()


Unnamed: 0,engine_id,cycle,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,ttf
13091,100,194,0.0049,0.0,100.0,518.67,643.24,1599.45,1415.79,14.62,...,2388.0,8213.28,8.4715,0.03,394,2388,100.0,38.65,23.1974,24
13092,100,195,-0.0011,-0.0001,100.0,518.67,643.22,1595.69,1422.05,14.62,...,2388.09,8210.85,8.4512,0.03,395,2388,100.0,38.57,23.2771,23
13093,100,196,-0.0006,-0.0003,100.0,518.67,643.44,1593.15,1406.82,14.62,...,2388.04,8217.24,8.4569,0.03,395,2388,100.0,38.62,23.2051,22
13094,100,197,-0.0038,0.0001,100.0,518.67,643.26,1594.99,1419.36,14.62,...,2388.08,8220.48,8.4711,0.03,395,2388,100.0,38.66,23.2699,21
13095,100,198,0.0013,0.0003,100.0,518.67,642.95,1601.62,1424.99,14.62,...,2388.05,8214.64,8.4903,0.03,396,2388,100.0,38.7,23.1855,20


In [10]:
# Time to failure (ttf) for the training dataset: each engine's highest
# recorded cycle minus the row's own cycle.
# The 'max' string alias replaces the Python builtin `max`, which pandas
# deprecates as a groupby callable (FutureWarning in pandas 2.x).
train_data['ttf'] = train_data.groupby('engine_id')['cycle'].transform('max') - train_data['cycle']
train_data.tail()

Unnamed: 0,engine_id,cycle,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,ttf
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,2388.26,8137.6,8.4956,0.03,397,2388,100.0,38.49,22.9735,4
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.5,1433.58,14.62,...,2388.22,8136.5,8.5139,0.03,395,2388,100.0,38.3,23.1594,3
20628,100,198,0.0004,0.0,100.0,518.67,643.42,1602.46,1428.18,14.62,...,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333,2
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.064,1
20630,100,200,-0.0032,-0.0005,100.0,518.67,643.85,1600.38,1432.14,14.62,...,2388.26,8137.33,8.5036,0.03,396,2388,100.0,38.37,23.0522,0


In [11]:
# Work on copies so the labelled frames stay separate from train_data/test_data
df_train = train_data.copy()
df_test = test_data.copy()

# Label window in cycles: rows with ttf <= period get fail = 1, all others 0.
# A vectorized comparison replaces the row-wise apply(lambda ...) and is applied
# to both frames in one loop so the rule cannot drift between train and test.
period = 30
for frame in (df_train, df_test):
    frame['fail'] = (frame['ttf'] <= period).astype('int64')
df_train.tail()

Unnamed: 0,engine_id,cycle,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,ttf,fail
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,8137.6,8.4956,0.03,397,2388,100.0,38.49,22.9735,4,1
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.5,1433.58,14.62,...,8136.5,8.5139,0.03,395,2388,100.0,38.3,23.1594,3,1
20628,100,198,0.0004,0.0,100.0,518.67,643.42,1602.46,1428.18,14.62,...,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333,2,1
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.064,1,1
20630,100,200,-0.0032,-0.0005,100.0,518.67,643.85,1600.38,1432.14,14.62,...,8137.33,8.5036,0.03,396,2388,100.0,38.37,23.0522,0,1
