In [74]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.decomposition import PCA

def getTFDataset(path = "dataset", set = 1, return_rul = False):
    """
    Function for loading the NASA Turbofan Dataset.

    Reads the header-less, whitespace-delimited files
    ``{path}/train_FD00{set}.txt`` and ``{path}/test_FD00{set}.txt`` and names
    their columns ``id``, ``dt``, ``set1``..``set3``, ``s1``..``s21``.

    Parameters
    ----------
    path : str
        Directory that contains the text files.
    set : int
        Number inserted into the file names (``FD00{set}``). The name shadows
        the builtin ``set`` but is kept so keyword callers keep working.
    return_rul : bool
        If True, ``{path}/RUL_FD00{set}.txt`` is read as well and returned as a
        third dataframe with the single column ``rul``.

    Returns
    -------
    pandas dataframes train, test(, rul)

    Raises
    ------
    ValueError
        If a file does not have as many columns as there are column names.
    """

    colnames = ['id', 'dt',  # id_col and time_col
                'set1', 'set2', 'set3']
    colnames += [f's{i}' for i in range(1, 22)]

    def _read(prefix, names):
        # sep=r"\s+" is the supported spelling of the deprecated
        # delim_whitespace=True option.
        frame = pd.read_csv(f"{path}/{prefix}_FD00{set}.txt", header=None, sep=r"\s+")
        # Assign after reading so a column-count mismatch raises instead of
        # being silently padded or truncated.
        frame.columns = names
        return frame

    train = _read("train", colnames)
    test = _read("test", colnames)

    if return_rul:
        # The column labels must be a list; a bare string is rejected by pandas.
        rul = _read("RUL", ["rul"])
        return train, test, rul

    return train, test

In [166]:
# Load the train and test frames with the loader's defaults (path="dataset", set=1 -> *_FD001.txt).
train, test = getTFDataset()

In [185]:
class Labeling:
    """
    Stateless helpers that add label columns to a dataframe in place.

    Every method takes the dataframe plus the names of its id and time columns
    and writes one new column ('linear', 'frac', 'piecewise' or 'mono').
    Methods that need the 'linear' column create it temporarily and remove it
    again when they are done, unless it was already present.
    """

    def __init__(self) -> None:
        pass

    def linear(self, data, time_col, id_col, start = 1) -> None:
        '''
        linear rul label, i.e. reverse time column

        Within each ``id_col`` group the values of ``time_col`` are reversed
        by position and ``start`` is subtracted; the result is stored in
        ``data['linear']``.
        '''
        # Return a NumPy array from the callback: a reversed pandas object
        # would be re-aligned to the group's index (pandas >= 2.0 does this for
        # DataFrame results), which silently undoes the reversal.
        reversed_time = data.groupby(id_col)[time_col].transform(lambda c: c.to_numpy()[::-1])
        data['linear'] = reversed_time - start

    def frac(self, data, time_col, id_col) -> None:
        '''
        fractional rul label, i.e. 'linear' divided by its per-id maximum

        The result is stored in ``data['frac']``.
        '''
        remove_linear = 'linear' not in data.columns
        if remove_linear:
            self.linear(data = data, time_col = time_col, id_col = id_col)
        # Group by the caller's id column, not a hard-coded 'id'.
        data['frac'] = data['linear'] / data.groupby(id_col)['linear'].transform('max')
        if remove_linear:
            del data['linear']

    def piecewise(self, data, time_col, id_col, max_rul = 125) -> None:
        '''
        piecewise linear rul label, i.e. 'linear' capped at ``max_rul``

        The result is stored in ``data['piecewise']``.
        '''
        remove_linear = 'linear' not in data.columns
        if remove_linear:
            self.linear(data = data, time_col = time_col, id_col = id_col)
        data['piecewise'] = data['linear'].clip(upper = max_rul)
        if remove_linear:
            del data['linear']

    def monotone(self, data, sensor_cols, time_col, id_col, min_mon = 0.66) -> None:
        '''
        health index based on spearman rank correlation and pca

        Steps:
          a. keep the sensors whose variance exceeds 0.001 in magnitude,
          b. keep those whose absolute spearman correlation with 'linear'
             is greater than ``min_mon``,
          c. project them onto the first principal component, negate it and
             min-max scale it per id into ``data['mono']``.

        If no sensor passes step b a message is printed and no 'mono' column
        is written.
        '''
        remove_linear = 'linear' not in data.columns
        if remove_linear:
            self.linear(data = data, time_col = time_col, id_col = id_col)
        try:
            # a. drop sensors with zero variance
            variances = data[sensor_cols].var() #TODO: variance id-wise
            sensor_cols = variances[variances.abs() > 0.001].index.to_list()
            # b. spearman rank correlation
            # corrwith yields one value per sensor, so a single surviving
            # sensor works too (no indexing into a scalar result).
            if sensor_cols:
                roh = data[sensor_cols].corrwith(data['linear'], method = 'spearman') #TODO: id-wise
                roh = roh.abs().sort_values()
                most_mon = roh[roh > min_mon].index.to_list()
            else:
                most_mon = []
            if len(most_mon) != 0:
                # c. pca
                x = np.asarray(data[most_mon])
                pca = PCA(n_components = 1)
                # fit_transform returns shape (n, 1); flatten it for the column.
                data['mono'] = (-1) * pca.fit_transform(x).ravel() #TODO: id-wise
                get_health_index = lambda x: (x - x.min()) / (x.max() - x.min())
                data['mono'] = data.groupby(id_col)['mono'].transform(get_health_index)
            else:
                print('no monotone sensor trajecories')
        finally:
            # Always drop the temporary column, including on the
            # "no monotone sensor" path and on errors.
            if remove_linear:
                del data['linear']

In [129]:
# Keep the sensors whose variance over the whole training frame exceeds 0.001 in magnitude.
sensor_cols = [name for name, variance in train[sensors].var().items() if abs(variance) > 0.001]

In [191]:
class Labeling:
    """
    Adds label columns to a dataframe in place.

    The dataframe and the names of its sensor, id and time columns are stored
    on the instance (by reference, no copy is made), so the methods only take
    label-specific options.

    NOTE(review): this redefines the ``Labeling`` class from an earlier cell
    with a different constructor and method signatures.
    """

    def __init__(self, data, sensors, id_col, time_col) -> None:
        self.data = data
        self.sensors = sensors
        self.id_col = id_col
        self.time_col = time_col

    def linear(self, start = 1) -> None:
        '''
        linear rul label, i.e. reverse time column

        Within each id the time column is reversed by position and ``start``
        is subtracted; the result is stored in ``self.data['linear']``.
        '''
        # Return a NumPy array from the callback: a reversed pandas object
        # would be re-aligned to the group's index (pandas >= 2.0 does this for
        # DataFrame results), which silently undoes the reversal.
        grouped_time = self.data.groupby(self.id_col)[self.time_col]
        self.data['linear'] = grouped_time.transform(lambda c: c.to_numpy()[::-1]) - start

    def frac(self, start = 1) -> None:
        '''
        fractional linear rul label, i.e. linear label divided by its per-id maximum

        The result is stored in ``self.data['frac']``. ``start`` is only used
        when the 'linear' column has to be created first; in that case the
        temporary column is removed again afterwards.
        '''
        remove_linear = 'linear' not in self.data.columns
        if remove_linear:
            # Bound-method call: passing self explicitly as well would collide
            # with the ``start`` keyword and raise TypeError.
            self.linear(start = start)
        per_id_max = self.data.groupby(self.id_col)['linear'].transform('max')
        self.data['frac'] = self.data['linear'] / per_id_max
        if remove_linear:
            del self.data['linear']

In [165]:
# NOTE(review): `l` is not created in any cell shown here. The keyword arguments match the
# stateless `Labeling.linear(data, time_col, id_col)` signature, so presumably `l = Labeling()`
# from that version of the class - confirm and add the missing cell.
l.linear(data = train, time_col = 'dt', id_col = 'id')

In [192]:
# NOTE(review): `sensors` is assigned in a cell that appears below this one; unless it is also
# defined earlier, a fresh top-to-bottom run raises NameError here.
l2 = Labeling(data = train, sensors = sensors, time_col = 'dt', id_col = 'id')

In [162]:
# Sensor column names 's1' through 's21'.
sensors = [f's{idx}' for idx in range(1, 22)]

In [193]:
# Writes the 'frac' column into l2.data, which is the same DataFrame object as `train`.
l2.frac()

In [149]:
# NOTE(review): `a` is not referenced in any cell shown here; presumably a hand-picked sensor
# subset - confirm its purpose and give it a descriptive name, or remove it.
a = ['s15', 's7', 's12', 's4', 's11']

In [195]:
# Display the training frame; the saved output includes the 'linear' and 'frac' columns that the labeling calls wrote into it.
train

Unnamed: 0,id,dt,set1,set2,set3,s1,s2,s3,s4,s5,...,s14,s15,s16,s17,s18,s19,s20,s21,linear,frac
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190,191,1.000000
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236,190,0.994764
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189,0.989529
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188,0.984293
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044,187,0.979058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735,4,0.020101
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594,3,0.015075
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333,2,0.010050
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640,1,0.005025


In [194]:
# l2.data is the DataFrame passed to the constructor (stored by reference), i.e. the same object as `train`.
l2.data

Unnamed: 0,id,dt,set1,set2,set3,s1,s2,s3,s4,s5,...,s14,s15,s16,s17,s18,s19,s20,s21,linear,frac
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190,191,1.000000
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236,190,0.994764
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189,0.989529
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188,0.984293
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044,187,0.979058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735,4,0.020101
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594,3,0.015075
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333,2,0.010050
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640,1,0.005025
