In [2]:
import pandas as pd

train_data = pd.read_csv('../data/raw/ALLtrainMescla5D.csv')
test_data = pd.read_csv('../data/raw/ALLtestMescla5D.csv')


In [3]:
# Check for missing values
print(train_data.isnull().sum())
print(test_data.isnull().sum())

machineID            0
datetime             0
time_in_cycles       0
voltmean_24h         0
rotatemean_24h       0
pressuremean_24h     0
vibrationmean_24h    0
voltsd_24h           0
rotatesd_24h         0
pressuresd_24h       0
vibrationsd_24h      0
voltmean_5d          0
rotatemean_5d        0
pressuremean_5d      0
vibrationmean_5d     0
voltsd_5d            0
rotatesd_5d          0
pressuresd_5d        0
vibrationsd_5d       0
error1               0
error2               0
error3               0
error4               0
error5               0
comp1                0
comp2                0
comp3                0
comp4                0
model                0
age                  0
DI                   0
RULWeek              0
failure              0
failed               0
RUL                  0
RUL_I                0
dtype: int64
machineID            0
datetime             0
time_in_cycles       0
voltmean_24h         0
rotatemean_24h       0
pressuremean_24h     0
vibrationmean_24h    

In [4]:
# Parse datetime field as datetime and convert to time elapsed

train_data['datetime'] = pd.to_datetime(train_data['datetime'])
test_data['datetime'] = pd.to_datetime(test_data['datetime'])

train_data['elapsed_time'] = ( train_data.groupby('machineID')['datetime'].transform(lambda x: ((x - x.min()).dt.total_seconds() / 3600)))
test_data['elapsed_time'] = ( test_data.groupby('machineID')['datetime'].transform(lambda x: ((x - x.min()).dt.total_seconds() / 3600)))

train_data.drop('datetime', axis=1, inplace=True)
test_data.drop('datetime', axis=1, inplace=True)

In [5]:
# Convert model field to numeric using one hot encoding since there are only 4 values possible for model
train_data = pd.get_dummies(train_data, columns=['model'], prefix='mdl', dtype=int)
test_data = pd.get_dummies(test_data, columns=['model'], prefix='mdl', dtype=int)
print(train_data)


       machineID  time_in_cycles  voltmean_24h  rotatemean_24h  \
0              1               2    171.025033      454.614348   
1              1               3    174.139410      444.337772   
2              1               4    172.135931      446.126781   
3              1               5    169.350052      466.884090   
4              1               6    171.006246      455.695551   
...          ...             ...           ...             ...   
20862        100             335    166.637041      443.320475   
20863        100             336    170.826845      463.537218   
20864        100             337    169.602066      376.676980   
20865        100             338    171.742111      361.745122   
20866        100             339    167.626031      437.589364   

       pressuremean_24h  vibrationmean_24h  voltsd_24h  rotatesd_24h  \
0            102.377665          41.506930   17.623759     43.648283   
1             96.674842          41.702771   10.611608     39.4

In [6]:
# Remove the failure column as it is redundant with failed column
train_data.drop('failure', axis=1, inplace=True)
test_data.drop('failure', axis=1, inplace=True)

In [None]:
import seaborn as sns

# Analyse correlation with predicted variable - RUL, and take the top 10 features

corr_rul = train_data.corr()['RUL'].drop(['RUL', 'RUL_I', 'RULWeek']).abs().sort_values(ascending=False)
top_features = corr_rul.head(10)
print(top_features)

features_model_a = [
    "DI",
    "time_in_cycles",
    "elapsed_time",
    "age",
    "mdl_model1",
    "mdl_model2",
    "mdl_model4",
    "comp1",
    "comp3",
    "comp4",
    "error1",
    "error2",
    "error3",
    "error4",
    "error5",
]
features_model_b = [
    "DI",
    "time_in_cycles",
    "elapsed_time",
    "age",
    "mdl_model1",
    "mdl_model2",
    "mdl_model4",
    "comp1",
    "comp3",
    "comp4",
]

X_model_a = train_data[features_model_a].values
y_model_a = train_data["RUL"].values.reshape(-1, 1)
X_model_b = train_data[features_model_b].values
y_model_b = train_data["RUL"].values.reshape(-1, 1)


DI                0.650551
mdl_model4        0.223924
time_in_cycles    0.222056
elapsed_time      0.217722
age               0.210046
mdl_model1        0.156523
comp1             0.130099
mdl_model2        0.124997
comp4             0.114718
comp3             0.108671
Name: RUL, dtype: float64


In [None]:
from sklearn.preprocessing import StandardScaler

cont_idx = [0, 1, 2, 3, 6, 8, 9]
scaler = StandardScaler()
X_cont = scaler.fit_transform(X[:, cont_idx])