In [136]:
import turbo as tb
import rul_dataframe as rd
import plots as pl
import labeling as lb
import preprocessing as pre
import classification as cls
import variational_autoencoder as vae_class

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import mean_squared_error, r2_score



In [2]:
from sklearn import linear_model, svm
from sklearn import ensemble

In [3]:
from gtda.time_series import PearsonDissimilarity
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import Amplitude

In [4]:
preproc = pre.Preprocessing()
classy = cls.Classification()

In [5]:
def s_score(Y, Yhat, a1 = 10, a2 = 13):
    s = 0.0
    for i in range(len(Y)):
        d = Yhat[i] - Y[i]
        if d < 0:
            s = s + np.exp(-d/a1) - 1
        else:
            s = s + np.exp(d/a2) - 1
    return s

def l_score(Y, Yhat):
    s = 0.0
    for i in range(len(Y)):
        d = (Yhat[i] - Y[i])**2
        if Y[i] < 50:
            s = s + 2 * d
        else:
            s = s + 0.5 * d
    return s/len(Y)

def evaluate(Y, Yhat, label = 'Test', verbose = False):
    if label == 'Test':
        rsme = np.sqrt(mean_squared_error(Y, Yhat))
        r2 = r2_score(Y, Yhat)
        s = s_score(Y, Yhat)
        l = l_score(Y, Yhat)
        if type(s) == np.ndarray:
            s = s[0]
        if type(l) == np.ndarray:
            l = l[0]
        if verbose:
            print(f'{label} set RSME = {rsme}, S = {s}, L= {l}, R2 = {r2}')
        return rsme, s, l, r2
    else:
        rsme = np.sqrt(mean_squared_error(Y, Yhat))
        r2 = r2_score(Y, Yhat)
        if verbose:
            print(f'{label} set RSME = {rsme}, R2 = {r2}')
        return rsme, r2

In [407]:
# prepare data functions
# 1. training-validation split
def training_validation_split(X, Y, groupshufflesplit, groups, verbose = False):
    for itrain, ival in groupshufflesplit.split(X, Y, groups = groups):
        Xtrain = X.iloc[itrain].copy()
        Ytrain = Y.iloc[itrain].copy()
        Xval = X.iloc[ival].copy()
        Yval = Y.iloc[ival].copy()
        if verbose:
            print(f'Training data ids {groups[itrain].unique()}')
            print(f'Validation data ids {groups[ival].unique()}')
    return Xtrain, Ytrain, groups[itrain], Xval, Yval, groups[ival]

# 4. make xrain data
def make_data_to_sequence_one_id(df, sequence_length, stride, y, test):
    n = df.shape[0]
    if test:
        rest = n - stride * int(n/stride)
        if rest == 0:
            rest = stride
    else:
        rest = int(sequence_length/4) + 1
    for start, stop in zip(range(rest-sequence_length, n - sequence_length + 1, stride), range(rest, n + 1, stride)):
        if not y and start < 0:
            index = sequence_length - stop
            Xpadded = np.full(shape = (sequence_length, df.shape[1]), fill_value = 0).astype(np.float32)
            Xpadded[index:, :] = df.values[0:stop,:]
            yield Xpadded
        elif y:
            if test:
                yield(df.values[stop-1])
            else:
                yield(df.values[0:stop,:][-1])
        else:
            yield(df.values[start:stop,:])
def make_data_to_sequence(df, sequence_length, groups, stride, y = False, test = False):
    Xtemp = (list(make_data_to_sequence_one_id(df[groups==i], sequence_length, stride, y, test)) for i in groups.unique())
    return np.concatenate(list(Xtemp)).astype(np.float32)

In [404]:
from gtda.time_series import PearsonDissimilarity
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import Amplitude

from scipy import stats
import dcor

In [644]:
dataset = 4

sequence_length = 50
stride = 10

dist = 'distance' # 'spearman', 'distance'
m = 'landscape' # 'bottleneck' | 'wasserstein' | 'betti' | 'landscape' | 'silhouette' | 'heat' | 'persistence_image'

In [434]:
# if it doesn't work, check 
(te_df.df.groupby('id')['dt'].max()).min()

21

In [432]:
(tr_df.df.groupby('id')['dt'].max()).min()

128

In [645]:
tr, te, rl = tb.getTFDataset(set = dataset)
tb.addTFlinear(tr, te, rl)
tr_df = rd.RUL_DataFrame(df = tr, label_cols = ['linear'])
te_df = rd.RUL_DataFrame(df = te, label_cols = ['linear'])
if dataset in [1,3]:
    preproc.drop_zero_variance(rul_df = tr_df, rul_df_test = te_df)
    preproc.scale(rul_df = tr_df, rul_df_test = te_df, scale = 'std')
    preproc.drop_zero_variance(rul_df = tr_df, rul_df_test = te_df)
elif dataset in [2, 4]:
    classy.fit_kmeans(rul_df = tr_df, rul_df_test = te_df, colname = 'kmeans')
    preproc.one_hot_encode(rul_df = tr_df, rul_df_test = te_df, c_col = 'kmeans')
    preproc.c_drop_zero_variance(rul_df = tr_df, rul_df_test = te_df , c_cols = tr_df.categ_cols)
    preproc.c_scale(rul_df = tr_df, rul_df_test = te_df , c_cols = tr_df.categ_cols, scale = 'std')
    preproc.drop_zero_variance(rul_df = tr_df, rul_df_test = te_df)

#preproc.exponential_smooth(tr_df,0.1)
#preproc.exponential_smooth(te_df,0.1)

# adjust to label size
preproc.delete_first_n(tr_df, 9)

labels = pd.read_csv(f'labels/FD00{dataset}')
label_cols = ['linear', 'piecewise_optimized', 'piecewies_125', 'descriptive', 'spearman', 'rec_error']
if not labels.shape[0] == tr_df.df.shape[0]:
    print(f'Labels and Data do not have same length. Dataset FD00{dataset}')
temp = pd.concat([tr_df.df, labels], axis=1)
training_data_with_labels = rd.RUL_DataFrame(df = temp, data_cols = tr_df.data_cols, categ_cols = tr_df.categ_cols, label_cols = ['linear', 'piecewise_optimized', 'piecewise_125', 'descriptive', 'spearman',
                                                            'rec_error'])

Xtrain = training_data_with_labels.df[[training_data_with_labels.id_col] + training_data_with_labels.data_cols]
Xtest = te_df.df[[te_df.id_col] + te_df.data_cols]
Ytrain = training_data_with_labels.df[training_data_with_labels.label_cols]
Ytest = te_df.df['linear']

n = len(training_data_with_labels.data_cols)
print(n)

groupshufflesplit = GroupShuffleSplit(n_splits = 1, train_size = 0.8 , random_state = 42)
Xtrain_split, Ytrain_split, groupstrain, Xval_split, Yval_split, groupsval = training_validation_split(Xtrain, Ytrain, groupshufflesplit, 
                                                                                        groups = training_data_with_labels.df[training_data_with_labels.id_col])

Xtrain_split_sequences_id = make_data_to_sequence(Xtrain_split, sequence_length, groupstrain, stride)
Xtrain_split_sequences = Xtrain_split_sequences_id[:,:,1:(n+1)]
Xtrain_split_sequences_id = Xtrain_split_sequences_id[:,:,0][:,-1]
Ytrain_split_sequences = make_data_to_sequence(Ytrain_split, sequence_length, groupstrain, stride,  y = True)

Xval_split_sequences_id = make_data_to_sequence(Xval_split, sequence_length, groupsval, stride)
Xval_split_sequences = Xval_split_sequences_id[:,:,1:(n+1)]
Xval_split_sequences_id = Xval_split_sequences_id[:,:,0][:,-1]
Yval_split_sequences = make_data_to_sequence(Yval_split, sequence_length, groupsval, stride,  y = True)

Xtest_sequences_id = make_data_to_sequence(Xtest, sequence_length, te_df.df[te_df.id_col], stride, test = True)
Xtest_sequences = Xtest_sequences_id[:,:,1:(n+1)]
Xtest_sequences_id = Xtest_sequences_id[:,:,0][:,-1]
Ytest_sequences = make_data_to_sequence(Ytest, sequence_length, te_df.df[te_df.id_col], stride, y =True, test = True)

path = f'topo_features/{sequence_length}_{stride}_{m}_{dist}_FD00{dataset}'
results = pd.DataFrame(columns=['dataset', 'label', 'method', 'RSME_tr','R2_tr', 'RSME_v','R2_v','RSME_te', 'R2_te'])

if dist == 'pearson':
    PD = PearsonDissimilarity()
    X_dm_train = PD.fit_transform(Xtrain_split_sequences)
    X_dm_val = PD.transform(Xval_split_sequences)
    X_dm_test = PD.transform(Xtest_sequences)
elif dist == 'spearman':
    roh, pval = stats.spearmanr(Xtrain_split_sequences[0])
    X_dm_train = np.asarray([roh])
    for i in range(1, Xtrain_split_sequences.shape[0]):
        roh, pval = stats.spearmanr(Xtrain_split_sequences[i])
        X_dm_train = np.concatenate((X_dm_train,[roh]))
    
    roh, pval = stats.spearmanr(Xval_split_sequences[0])
    X_dm_val = np.asarray([roh])
    for i in range(1, Xval_split_sequences.shape[0]):
        roh, pval = stats.spearmanr(Xval_split_sequences[i])
        X_dm_val = np.concatenate((X_dm_val,[roh]))
        
    roh, pval = stats.spearmanr(Xtest_sequences[0])
    X_dm_test = np.asarray([roh])
    for i in range(1, Xtest_sequences.shape[0]):
        roh, pval = stats.spearmanr(Xtest_sequences[i])
        X_dm_test = np.concatenate((X_dm_test,[roh]))
elif dist == 'distance': #time dimension!
    roh = dcor.distances.pairwise_distances(Xtrain_split_sequences[0])
    X_dm_train = np.asarray([roh])
    for i in range(1, Xtrain_split_sequences.shape[0]):
        roh = dcor.distances.pairwise_distances(Xtrain_split_sequences[i])
        X_dm_train = np.concatenate((X_dm_train,[roh]))
    
    roh = dcor.distances.pairwise_distances(Xval_split_sequences[0])
    X_dm_val = np.asarray([roh])
    for i in range(1, Xval_split_sequences.shape[0]):
        roh = dcor.distances.pairwise_distances(Xval_split_sequences[i])
        X_dm_val = np.concatenate((X_dm_val,[roh]))
        
    roh = dcor.distances.pairwise_distances(Xtest_sequences[0])
    X_dm_test = np.asarray([roh])
    for i in range(1, Xtest_sequences.shape[0]):
        roh = dcor.distances.pairwise_distances(Xtest_sequences[i])
        X_dm_test = np.concatenate((X_dm_test,[roh]))

VR = VietorisRipsPersistence(metric="precomputed")
X_vr_train = VR.fit_transform(X_dm_train)
X_vr_val = VR.fit_transform(X_dm_val)
X_vr_test = VR.fit_transform(X_dm_test)
    
Ampl = Amplitude(metric = m)
X_a_train = Ampl.fit_transform(X_vr_train)
X_a_val = Ampl.fit_transform(X_vr_val)
X_a_test = Ampl.fit_transform(X_vr_test)

Xtrain_topo = pd.concat([pd.DataFrame(X_a_train, columns = ['H0', 'H1']),
            pd.DataFrame(Ytrain_split_sequences, columns = ['linear', 'piecewise_optimized', 'piecewies_125', 'descriptive', 'spearman', 'rec_error']),
            pd.DataFrame(Xtrain_split_sequences_id, columns = ['id'])], axis=1)
Xtrain_topo.to_csv(path+'train')

Xval_topo = pd.concat([pd.DataFrame(X_a_val, columns = ['H0', 'H1']),
            pd.DataFrame(Yval_split_sequences, columns = ['linear', 'piecewise_optimized', 'piecewies_125', 'descriptive', 'spearman', 'rec_error']),
            pd.DataFrame(Xval_split_sequences_id, columns = ['id'])], axis=1)
Xval_topo.to_csv(path+'val')

Xtest_topo = pd.concat([pd.DataFrame(X_a_test, columns = ['H0', 'H1']),
            pd.DataFrame(Ytest_sequences, columns = ['linear']),
            pd.DataFrame(Xtest_sequences_id, columns = ['id'])], axis=1)
Xtest_topo.to_csv(path+'test')

17


# Regression

In [573]:
reg = 'lm' # 'svr', 'lm'
add = True

dataset = 3

sequence_length = 50
stride = 3

dist = 'distance' # 'spearman', 'distance'
m = 'landscape' # 'bottleneck' | 'wasserstein' | 'betti' | 'landscape' | 'silhouette' | 'heat' | 'persistence_image'

In [589]:
path = f'scores/shallow_{reg}_{sequence_length}_{m}_{dist}_FD00{dataset}'
results = pd.DataFrame(columns=['dataset', 'label', 'method', 'RSME_tr','R2_tr', 'RSME_v','R2_v','RSME_te', 'R2_te'])

Xtrain_topo = pd.read_csv(f'topo_features/{sequence_length}_{stride}_{m}_{dist}_FD00{dataset}' + 'train')
Xval_topo = pd.read_csv(f'topo_features/{sequence_length}_{stride}_{m}_{dist}_FD00{dataset}' + 'val')
Xtest_topo = pd.read_csv(f'topo_features/{sequence_length}_{stride}_{m}_{dist}_FD00{dataset}' + 'test')

label_cols = ['linear', 'piecewise_optimized', 'piecewies_125', 'descriptive', 'spearman', 'rec_error']

X_a_train = np.asarray(Xtrain_topo[['H0', 'H1']])
X_a_val = np.asarray(Xval_topo[['H0', 'H1']])

Ytrain_split_sequences = np.asarray(Xtrain_topo[label_cols])
Yval_split_sequences = np.asarray(Xval_topo[label_cols])

# For 'shallow' regression testing (last elements for testing)
X_a_test = np.asarray(Xtest_topo.groupby('id')[['H0', 'H1']].tail(1))

if add:
    X_a_train = np.concatenate((Xtrain_split_sequences[:,-1,:],X_a_train), axis = 1)
    X_a_val =np.concatenate((Xval_split_sequences[:,-1,:],X_a_val), axis = 1)
    Xtemp = []
    current_id = 1
    for i in range(Xtest_topo.shape[0]):
        next_id = Xtest_topo.loc[i,'id']
        if current_id != next_id:
            Xtemp.append(Xtest_sequences[i-1][-1])
            current_id = next_id
    Xtemp.append(Xtest_sequences[i][-1])
    X_a_test =np.concatenate((np.asarray(Xtemp),X_a_test), axis = 1)

Ytest_sequences = np.asarray(Xtest_topo.groupby('id')['linear'].tail(1))

for l in range(len(label_cols)):
    print(label_cols[l])
    if reg == 'lm':
        rm = linear_model.LinearRegression()
        rm.fit(X_a_train, Ytrain_split_sequences[:,l])

    elif reg == 'svm':
        rm = svm.SVR(kernel='linear')
        rm.fit(X_a_train, Ytrain_split_sequences[:,l])

    elif reg == 'rfr':
        rm = ensemble.RandomForestRegressor(random_state=42)
        rm.fit(X_a_train, Ytrain_split_sequences[:,l])

    Ypred_train_rm = rm.predict(X_a_train)
    rsmesplit, r2split = evaluate(Ytrain_split_sequences[:,l], Ypred_train_rm, 'Training')

    Ypred_val_rm = rm.predict(X_a_val)
    rsmeval, r2val = evaluate(Yval_split_sequences[:,l], Ypred_val_rm, 'Validation')

    Ypred_test_rm = rm.predict(X_a_test)
    rsmetest, s, ls, r2test = evaluate(Ytest_sequences, Ypred_test_rm, 'Test')

    df = pd.DataFrame([[dataset, label_cols[l], reg, rsmesplit, r2split, rsmeval, r2val, rsmetest, s, ls, r2test]], columns=['dataset', 'label', 'method', 'RSME_tr','R2_tr', 'RSME_v','R2_v','RSME_te', 'S', 'L', 'R2_te'])
    results = pd.concat((results, df))

results.to_csv(path, index = False)

linear
piecewise_optimized
piecewies_125
descriptive
spearman
rec_error


  s = s + np.exp(-d/a1) - 1
  s = s + np.exp(-d/a1) - 1
  s = s + np.exp(-d/a1) - 1
  s = s + np.exp(-d/a1) - 1
  s = s + np.exp(-d/a1) - 1
  s = s + np.exp(-d/a1) - 1


In [590]:
from tabulate import tabulate
print(tabulate(results[['label', 'method', 'RSME_tr','RSME_v', 'RSME_te', 'S', 'L']], headers=['Label', 'Regression','RSME Training','RSME Validation', 'RSME Testing', 'S', 'L'], tablefmt='latex', showindex=False))

\begin{tabular}{llrrrrr}
\hline
 Label               & Regression   &   RSME Training &   RSME Validation &   RSME Testing &   S &           L \\
\hline
 linear              & lm           &         42.2236 &           38.7597 &         565699 & inf & 3.49053e+11 \\
 piecewise\_optimized & lm           &         34.4248 &           32.7508 &         399944 & inf & 1.6849e+11  \\
 piecewies\_125       & lm           &         21.8925 &           21.5751 &         333048 & inf & 1.1888e+11  \\
 descriptive         & lm           &         37.9822 &           36.0682 &         448150 & inf & 2.1004e+11  \\
 spearman            & lm           &         34.7538 &           33.1444 &         471361 & inf & 2.32673e+11 \\
 rec\_error           & lm           &         36.7468 &           37.3663 &         467208 & inf & 2.24867e+11 \\
\hline
\end{tabular}


# Combination with LSTM

In [151]:
import tensorflow as tf
from keras import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, TimeDistributed, Input

In [325]:
def make_model(input_shape, lstm_dims = [128, 64], dropout = True, d_rate = 0.2, dense_dims = [16, 1]):
    model = Sequential()
    model.add(Input(shape = [None, input_shape], ragged = True))
    for d in lstm_dims:
        model.add(LSTM(units = d, return_sequences = True))
        if dropout:
            model.add(Dropout(rate = d_rate))
    for d in dense_dims:
        model.add(Dense(units = d))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [491]:
reg = 'lm' # 'svr', 'lm'
add = True

dataset = 3

sequence_length = 50
stride = 3

dist = 'distance' # 'spearman', 'distance'
m = 'landscape' # 'bottleneck' | 'wasserstein' | 'betti' | 'landscape' | 'silhouette' | 'heat' | 'persistence_image'

In [576]:
tr, te, rl = tb.getTFDataset(set = dataset)
tb.addTFlinear(tr, te, rl)
tr_df = rd.RUL_DataFrame(df = tr, label_cols = ['linear'])
te_df = rd.RUL_DataFrame(df = te, label_cols = ['linear'])
if dataset in [1,3]:
    preproc.drop_zero_variance(rul_df = tr_df, rul_df_test = te_df)
    preproc.scale(rul_df = tr_df, rul_df_test = te_df, scale = 'std')
elif dataset in [2, 4]:
    classy.fit_kmeans(rul_df = tr_df, rul_df_test = te_df, colname = 'kmeans')
    preproc.one_hot_encode(rul_df = tr_df, rul_df_test = te_df, c_col = 'kmeans')
    preproc.c_drop_zero_variance(rul_df = tr_df, rul_df_test = te_df , c_cols = tr_df.categ_cols)
    preproc.c_scale(rul_df = tr_df, rul_df_test = te_df , c_cols = tr_df.categ_cols, scale = 'std')
    preproc.drop_zero_variance(rul_df = tr_df, rul_df_test = te_df)
    
# adjust to label size
preproc.delete_first_n(tr_df, 9)

labels = pd.read_csv(f'labels/FD00{dataset}')
label_cols = ['linear', 'piecewise_optimized', 'piecewies_125', 'descriptive', 'spearman', 'rec_error']
if not labels.shape[0] == tr_df.df.shape[0]:
    print(f'Labels and Data do not have same length. Dataset FD00{dataset}')
temp = pd.concat([tr_df.df, labels], axis=1)
training_data_with_labels = rd.RUL_DataFrame(df = temp, data_cols = tr_df.data_cols, categ_cols = tr_df.categ_cols, label_cols = ['linear', 'piecewise_optimized', 'piecewise_125', 'descriptive', 'spearman',
                                                            'rec_error'])

Xtrain = training_data_with_labels.df[[training_data_with_labels.id_col] + training_data_with_labels.data_cols]
Xtest = te_df.df[[te_df.id_col] + te_df.data_cols]
Ytrain = training_data_with_labels.df[training_data_with_labels.label_cols]
Ytest = te_df.df['linear']

n = len(training_data_with_labels.data_cols)
print(n)

groupshufflesplit = GroupShuffleSplit(n_splits = 1, train_size = 0.8 , random_state = 42)
Xtrain_split, Ytrain_split, groupstrain, Xval_split, Yval_split, groupsval = training_validation_split(Xtrain, Ytrain, groupshufflesplit, 
                                                                                        groups = training_data_with_labels.df[training_data_with_labels.id_col])

Xtrain_split_sequences_id = make_data_to_sequence(Xtrain_split, sequence_length, groupstrain, stride)
Xtrain_split_sequences = Xtrain_split_sequences_id[:,:,1:(n+1)]
Xtrain_split_sequences_id = Xtrain_split_sequences_id[:,:,0][:,-1]
Ytrain_split_sequences = make_data_to_sequence(Ytrain_split, sequence_length, groupstrain, stride,  y = True)

Xval_split_sequences_id = make_data_to_sequence(Xval_split, sequence_length, groupsval, stride)
Xval_split_sequences = Xval_split_sequences_id[:,:,1:(n+1)]
Xval_split_sequences_id = Xval_split_sequences_id[:,:,0][:,-1]
Yval_split_sequences = make_data_to_sequence(Yval_split, sequence_length, groupsval, stride,  y = True)

Xtest_sequences_id = make_data_to_sequence(Xtest, sequence_length, te_df.df[te_df.id_col], stride, test = True)
Xtest_sequences = Xtest_sequences_id[:,:,1:(n+1)]
Xtest_sequences_id = Xtest_sequences_id[:,:,0][:,-1]
Ytest_sequences = make_data_to_sequence(Ytest, sequence_length, te_df.df[te_df.id_col], stride, y =True, test = True)

16


In [577]:
Xtrain_topo = pd.read_csv(f'topo_features/{sequence_length}_{stride}_{m}_{dist}_FD00{dataset}' + 'train')
Xval_topo = pd.read_csv(f'topo_features/{sequence_length}_{stride}_{m}_{dist}_FD00{dataset}' + 'val')
Xtest_topo = pd.read_csv(f'topo_features/{sequence_length}_{stride}_{m}_{dist}_FD00{dataset}' + 'test')

label_cols = ['linear', 'piecewise_optimized', 'piecewies_125', 'descriptive', 'spearman', 'rec_error']

X_a_train = np.asarray(Xtrain_topo[['H0', 'H1']])
X_a_val = np.asarray(Xval_topo[['H0', 'H1']])

# choose label
l = 2
Ytrain_split_sequences = np.asarray(Xtrain_topo[label_cols[l]])
Yval_split_sequences = np.asarray(Xval_topo[label_cols[l]])

# For 'shallow' regression testing (last elements for testing)
X_a_test = np.asarray(Xtest_topo[['H0', 'H1']])
Ytest_sequences = np.asarray(Xtest_topo['linear'])

if add:
    X_a_train = np.concatenate((Xtrain_split_sequences[:,-1,:],X_a_train), axis = 1)
    X_a_val =np.concatenate((Xval_split_sequences[:,-1,:],X_a_val), axis = 1)
    X_a_test = np.concatenate((Xtest_sequences[:,-1,:] ,X_a_test), axis = 1)

In [578]:

counter = 0
for i in Xtrain_topo['id'].unique():
        Xtrain_topo.loc[Xtrain_topo['id'] == i, 'id'] = counter
        counter += 1

counter = 0
for i in Xval_topo['id'].unique():
        Xval_topo.loc[Xval_topo['id'] == i, 'id'] = counter
        counter += 1

counter = 0
for i in Xtest_topo['id'].unique():
        Xtest_topo.loc[Xtest_topo['id'] == i, 'id'] = counter
        counter += 1

In [579]:
Xtrain_ragged = tf.RaggedTensor.from_value_rowids(
                    values = X_a_train,
                    value_rowids = Xtrain_topo['id'])
Ytrain_ragged = tf.RaggedTensor.from_value_rowids(
                    values = Ytrain_split_sequences,
                    value_rowids = Xtrain_topo['id'])

Xval_ragged = tf.RaggedTensor.from_value_rowids(
                    values = X_a_val,
                    value_rowids = Xval_topo['id'])
Yval_ragged = tf.RaggedTensor.from_value_rowids(
                    values = Yval_split_sequences,
                    value_rowids = Xval_topo['id'])

Xtest_ragged = tf.RaggedTensor.from_value_rowids(
                    values = X_a_test,
                    value_rowids = Xtest_topo['id']) 
Ytest_sequences = np.asarray(Xtest_topo.groupby('id')['linear'].tail(1))

In [580]:
model = make_model(input_shape = Xtrain_ragged.shape[2])
        
model.fit(Xtrain_ragged, Ytrain_ragged,
                validation_data = (Xval_ragged, Yval_ragged),
                epochs = 120,
                batch_size = 16)

Epoch 1/120




Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78/120
Epoch 7

<keras.callbacks.History at 0x218198aef10>

In [348]:
model.fit(Xtrain_ragged, Ytrain_ragged,
                validation_data = (Xval_ragged, Yval_ragged),
                epochs = 50,
                batch_size = 32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x21818d5cd00>

In [581]:
Yhat_test_ragged = model.predict(Xtest_ragged)

In [582]:
Yhat_test_ragged = model.predict(Xtest_ragged)

Yhat = []
for i in range(Yhat_test_ragged.shape[0]):
    temp = Yhat_test_ragged[i][-1]
    Yhat.append(temp[-1].numpy())

evaluate(np.asarray(rl), np.asarray(Yhat))

(16.73415981058162, 376.4453252812496, 173.68277916587076, 0.8365823431556597)

In [517]:
evaluate(np.asarray(rl), np.asarray(Yhat))

(13.926154756932464,
 314.80625257653946,
 119.58097240581542,
 0.8876940179420423)

In [514]:
Yhat_train_ragged = model.predict(Xtrain_ragged)
evaluate(Ytrain_ragged.to_list()[0], Yhat_train_ragged.to_list()[0], 'train')

(7.890807469685202, 0.9606607343349207)

In [513]:
Yhat_val_ragged = model.predict(Xval_ragged)
evaluate(Yval_ragged.to_list()[0], Yhat_val_ragged.to_list()[0], 'train')

(10.70468471290443, 0.9344349596587355)

In [472]:
model.save(f'saved_models/FD001_ragged_topo_70_3_lanscape_pearson')



INFO:tensorflow:Assets written to: saved_models/FD001_ragged_topo_70_3_lanscape_pearson\assets


INFO:tensorflow:Assets written to: saved_models/FD001_ragged_topo_70_3_lanscape_pearson\assets


In [646]:
# Put in a loop to check all labels
add = True

dataset = 4

sequence_length = 50
stride = 10

dist = 'distance' # 'spearman', 'distance'
m = 'landscape' # 'bottleneck' | 'wasserstein' | 'betti' | 'landscape' | 'silhouette' | 'heat' | 'persistence_image'

tr, te, rl = tb.getTFDataset(set = dataset)
tb.addTFlinear(tr, te, rl)
tr_df = rd.RUL_DataFrame(df = tr, label_cols = ['linear'])
te_df = rd.RUL_DataFrame(df = te, label_cols = ['linear'])
if dataset in [1,3]:
    preproc.drop_zero_variance(rul_df = tr_df, rul_df_test = te_df)
    preproc.scale(rul_df = tr_df, rul_df_test = te_df, scale = 'std')
elif dataset in [2, 4]:
    classy.fit_kmeans(rul_df = tr_df, rul_df_test = te_df, colname = 'kmeans')
    preproc.one_hot_encode(rul_df = tr_df, rul_df_test = te_df, c_col = 'kmeans')
    preproc.c_drop_zero_variance(rul_df = tr_df, rul_df_test = te_df , c_cols = tr_df.categ_cols)
    preproc.c_scale(rul_df = tr_df, rul_df_test = te_df , c_cols = tr_df.categ_cols, scale = 'std')
    preproc.drop_zero_variance(rul_df = tr_df, rul_df_test = te_df)
    
# adjust to label size
preproc.delete_first_n(tr_df, 9)

labels = pd.read_csv(f'labels/FD00{dataset}')
label_cols = ['linear', 'piecewise_optimized', 'piecewies_125', 'descriptive', 'spearman', 'rec_error']
if not labels.shape[0] == tr_df.df.shape[0]:
    print(f'Labels and Data do not have same length. Dataset FD00{dataset}')
temp = pd.concat([tr_df.df, labels], axis=1)
training_data_with_labels = rd.RUL_DataFrame(df = temp, data_cols = tr_df.data_cols, categ_cols = tr_df.categ_cols, label_cols = ['linear', 'piecewise_optimized', 'piecewise_125', 'descriptive', 'spearman',
                                                            'rec_error'])

Xtrain = training_data_with_labels.df[[training_data_with_labels.id_col] + training_data_with_labels.data_cols + training_data_with_labels.categ_cols]
Xtest = te_df.df[[te_df.id_col] + te_df.data_cols + te_df.categ_cols]
Ytrain = training_data_with_labels.df[training_data_with_labels.label_cols]
Ytest = te_df.df['linear']

n = Xtrain.shape[1]
print(n)

groupshufflesplit = GroupShuffleSplit(n_splits = 1, train_size = 0.8 , random_state = 42)
Xtrain_split, Ytrain_split, groupstrain, Xval_split, Yval_split, groupsval = training_validation_split(Xtrain, Ytrain, groupshufflesplit, 
                                                                                        groups = training_data_with_labels.df[training_data_with_labels.id_col])

Xtrain_split_sequences_id = make_data_to_sequence(Xtrain_split, sequence_length, groupstrain, stride)
Xtrain_split_sequences = Xtrain_split_sequences_id[:,:,1:(n+1)]
Xtrain_split_sequences_id = Xtrain_split_sequences_id[:,:,0][:,-1]
Ytrain_split_sequences = make_data_to_sequence(Ytrain_split, sequence_length, groupstrain, stride,  y = True)

Xval_split_sequences_id = make_data_to_sequence(Xval_split, sequence_length, groupsval, stride)
Xval_split_sequences = Xval_split_sequences_id[:,:,1:(n+1)]
Xval_split_sequences_id = Xval_split_sequences_id[:,:,0][:,-1]
Yval_split_sequences = make_data_to_sequence(Yval_split, sequence_length, groupsval, stride,  y = True)

Xtest_sequences_id = make_data_to_sequence(Xtest, sequence_length, te_df.df[te_df.id_col], stride, test = True)
Xtest_sequences = Xtest_sequences_id[:,:,1:(n+1)]
Xtest_sequences_id = Xtest_sequences_id[:,:,0][:,-1]
Ytest_sequences = make_data_to_sequence(Ytest, sequence_length, te_df.df[te_df.id_col], stride, y =True, test = True)

Xtrain_topo = pd.read_csv(f'topo_features/{sequence_length}_{stride}_{m}_{dist}_FD00{dataset}' + 'train')
Xval_topo = pd.read_csv(f'topo_features/{sequence_length}_{stride}_{m}_{dist}_FD00{dataset}' + 'val')
Xtest_topo = pd.read_csv(f'topo_features/{sequence_length}_{stride}_{m}_{dist}_FD00{dataset}' + 'test')

label_cols = ['linear', 'piecewise_optimized', 'piecewies_125', 'descriptive', 'spearman', 'rec_error']

X_a_train = np.asarray(Xtrain_topo[['H0', 'H1']])
X_a_val = np.asarray(Xval_topo[['H0', 'H1']])

X_a_test = np.asarray(Xtest_topo[['H0', 'H1']])

if add:
    X_a_train = np.concatenate((Xtrain_split_sequences[:,-1,:],X_a_train), axis = 1)
    X_a_val =np.concatenate((Xval_split_sequences[:,-1,:],X_a_val), axis = 1)
    X_a_test = np.concatenate((Xtest_sequences[:,-1,:] ,X_a_test), axis = 1)

# adjust ids for ragged tensor
counter = 0
for i in Xtrain_topo['id'].unique():
    Xtrain_topo.loc[Xtrain_topo['id'] == i, 'id'] = counter
    counter += 1

counter = 0
for i in Xval_topo['id'].unique():
    Xval_topo.loc[Xval_topo['id'] == i, 'id'] = counter
    counter += 1

counter = 0
for i in Xtest_topo['id'].unique():
    Xtest_topo.loc[Xtest_topo['id'] == i, 'id'] = counter
    counter += 1

Xtest_ragged = tf.RaggedTensor.from_value_rowids(
                        values = X_a_test,
                        value_rowids = Xtest_topo['id']) 
Ytest_sequences = np.asarray(Xtest_topo.groupby('id')['linear'].tail(1))


Xtrain_ragged = tf.RaggedTensor.from_value_rowids(
                        values = X_a_train,
                        value_rowids = Xtrain_topo['id'])



Xval_ragged = tf.RaggedTensor.from_value_rowids(
                        values = X_a_val,
                        value_rowids = Xval_topo['id'])


# choose label
path = f'scores/topo_ragged_LSTM_FD00{dataset}_{sequence_length}_{stride}_{m}_{dist}'
results = pd.DataFrame(columns=['dataset', 'label','RSME_tr','R2_tr', 'RSME_v','R2_v','RSME_te','S Score', 'L Score', 'R2_te'])
for l in [2]:
    Ytrain_split_sequences = np.asarray(Xtrain_topo[label_cols[l]])
    Yval_split_sequences = np.asarray(Xval_topo[label_cols[l]])

    Ytrain_ragged = tf.RaggedTensor.from_value_rowids(
                        values = Ytrain_split_sequences,
                        value_rowids = Xtrain_topo['id'])

    Yval_ragged = tf.RaggedTensor.from_value_rowids(
                        values = Yval_split_sequences,
                        value_rowids = Xval_topo['id'])

    model = make_model(input_shape = Xtrain_ragged.shape[2], lstm_dims = [256, 128], dropout = True, d_rate = 0.2, dense_dims = [32, 1])
            
    model.fit(Xtrain_ragged, Ytrain_ragged,
                    validation_data = (Xval_ragged, Yval_ragged),
                    epochs = 50,
                    batch_size = 16)
    
    model.save(f'saved_models/topo_ragged_lstm_model_FD00{dataset}_label{l}_{sequence_length}_{stride}_{m}_{dist}')

    Yhat_train_ragged = model.predict(Xtrain_ragged)
    rsmesplit, r2split, = evaluate(Ytrain_ragged.to_list()[0], Yhat_train_ragged.to_list()[0], 'train')

    Yhat_val_ragged = model.predict(Xval_ragged)
    rsmeval, r2val = evaluate(Yval_ragged.to_list()[0], Yhat_val_ragged.to_list()[0], 'train')

    Yhat_test_ragged = model.predict(Xtest_ragged)

    Yhat = []
    for i in range(Yhat_test_ragged.shape[0]):
        temp = Yhat_test_ragged[i][-1]
        Yhat.append(temp[-1].numpy())

    rsmetest, s, ls, r2test = evaluate(np.asarray(rl), np.asarray(Yhat))

    df = pd.DataFrame([[dataset, label_cols[l], rsmesplit, r2split, rsmeval, r2val, rsmetest, s, ls, r2test]], columns=['dataset', 'label','RSME_tr','R2_tr', 'RSME_v','R2_v','RSME_te','S Score', 'L Score', 'R2_te'])
    results = pd.concat((results, df))
results.to_csv(path, index = False)

24
Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50




INFO:tensorflow:Assets written to: saved_models/topo_ragged_lstm_model_FD004_label2_50_10_landscape_distance\assets


INFO:tensorflow:Assets written to: saved_models/topo_ragged_lstm_model_FD004_label2_50_10_landscape_distance\assets


In [642]:
model.fit(Xtrain_ragged, Ytrain_ragged,
                    validation_data = (Xval_ragged, Yval_ragged),
                    epochs = 10,
                    batch_size = 16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21897bbb9d0>

In [647]:
from tabulate import tabulate
print(tabulate(results[['label', 'RSME_tr','RSME_v', 'RSME_te', 'S Score', 'L Score']], headers=['Label','RSME Training','RSME Validation', 'RSME Testing', 'S', 'L'], tablefmt='latex', showindex=False))

\begin{tabular}{lrrrrr}
\hline
 Label         &   RSME Training &   RSME Validation &   RSME Testing &      S &       L \\
\hline
 piecewies\_125 &         39.9594 &           41.5309 &        54.6124 & 352781 & 3630.82 \\
\hline
\end{tabular}


In [550]:
model.summary()

Model: "sequential_26"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_43 (LSTM)              (None, None, 128)         74240     
                                                                 
 dropout_40 (Dropout)        (None, None, 128)         0         
                                                                 
 lstm_44 (LSTM)              (None, None, 64)          49408     
                                                                 
 dropout_41 (Dropout)        (None, None, 64)          0         
                                                                 
 dense_48 (Dense)            (None, None, 16)          1040      
                                                                 
 dense_49 (Dense)            (None, None, 1)           17        
                                                                 
Total params: 124,705
Trainable params: 124,705
Non-t

In [524]:
Xtest_ragged.shape

TensorShape([100, None, 30])

In [210]:
PD = PearsonDissimilarity()
X_pd = PD.fit_transform(Xtrain_split_sequences)
VR = VietorisRipsPersistence(metric="precomputed")
X_vr = VR.fit_transform(X_pd)

In [246]:
i = 7
print(f'RUL: {Ytrain_split_sequences[i,0]}')
VR.plot(X_vr, sample=i)

RUL: 78.0


In [238]:
i = 10
print(f'RUL: {Ytrain_split_sequences[i,0]}')
VR.plot(X_vr, sample=i)

RUL: 48.0


In [243]:
i = 14
print(f'RUL: {Ytrain_split_sequences[i,0]}')
VR.plot(X_vr, sample=i)

RUL: 8.0


In [65]:
Ampl = Amplitude()
X_a = Ampl.fit_transform(X_vr)

In [58]:
from sklearn import linear_model, svm
lm = linear_model.LinearRegression()
lm.fit(X_a, Ytrain_split_sequences[:,0])

LinearRegression()

In [66]:
Yhat_train_split = lm.predict(X_a)

In [67]:
evaluate(Ytest_sequences, Yhat_train_split)

Test set RSME = 38.23183343296603, S = 6080.7100343828415, L= 1588.7851080791245, R2 = 0.15357065813808612


(38.23183343296603,
 6080.7100343828415,
 1588.7851080791245,
 0.15357065813808612)

In [49]:
Xtrain_split_sequences.shape

(835, 60, 14)

In [None]:
from sklearn import linear_model, svm


In [8]:
Xval_split_sequences.shape

(204, 60, 14)

In [15]:
from scipy import stats
import dcor

In [248]:
abc = dcor.distances.pairwise_distances(Xval_split_sequences[0].T)

In [249]:
abc

array([[ 0.        ,  7.03311611,  5.10450104, 13.23756775,  5.98696515,
         4.85222962,  5.88041811, 13.06542432,  5.48192318,  4.71828018,
         5.90634202,  5.57957565, 12.5841503 , 13.49188227],
       [ 7.03311611,  0.        ,  6.5153678 , 13.03635467,  6.52933216,
         5.53258477,  6.53224007, 12.87428555,  6.57483787,  5.32469813,
         6.7608991 ,  6.41541718, 12.01388098, 12.89701603],
       [ 5.10450104,  6.5153678 ,  0.        , 14.09345422,  5.27154817,
         3.43565998,  4.02868003, 14.07350329,  4.39559011,  3.5043827 ,
         4.87191415,  4.79385378, 13.50662552, 14.29260089],
       [13.23756775, 13.03635467, 14.09345422,  0.        , 11.23641984,
        12.24958444, 14.15416382,  4.87591072, 12.1214822 , 11.81143591,
        13.01356023, 13.90148852,  4.84826899,  4.6906171 ],
       [ 5.98696515,  6.52933216,  5.27154817, 11.23641984,  0.        ,
         3.68607369,  5.18722722, 11.16914399,  4.39701132,  3.76723251,
         5.41077332,  6.09

In [270]:
Xval_split_sequences.shape

(213, 60, 14)

In [289]:
Xval_split_sequences.shape[0]

213

In [272]:
type(X_pd)

numpy.ndarray

In [306]:
roh, pval = stats.spearmanr(Xval_split_sequences[0])
X_sp = np.asarray([roh])
for i in range(1, Xval_split_sequences.shape[0]):
    roh, pval = stats.spearmanr(Xval_split_sequences[i])
    X_sp = np.concatenate((X_sp,[roh]))

In [304]:
np.concatenate((X_sp, X_sp), axis = 0).shape

AxisError: axis 2 is out of bounds for array of dimension 2

In [9]:
PD = PearsonDissimilarity()
X_pd = PD.fit_transform(Xval_split_sequences)

In [10]:
X_pd.shape

(204, 14, 14)

In [11]:
VR = VietorisRipsPersistence(metric="precomputed")
X_vr = VR.fit_transform(X_pd)

In [13]:
VR.plot(X_vr, sample=50)