<a href="https://colab.research.google.com/github/stellagerantoni/LatentCfMultivariate/blob/main/FingerMovements_simple.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! git clone https://github.com/stellagerantoni/LatentCfMultivariate

In [None]:
!pip install -q wildboar
!pip install -q scikit-learn
!pip install -q stumpy
!pip install -q fastdtw
!pip install aeon

In [None]:
import logging
import os
import warnings
from argparse import ArgumentParser
from aeon.datasets import load_classification

from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.spatial import distance_matrix
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree, KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from wildboar.datasets import load_dataset
from wildboar.ensemble import ShapeletForestClassifier
from wildboar.explain.counterfactual import counterfactuals
%cd '/content/LatentCfMultivariate'
from _guided import ModifiedLatentCF
from help_functions import *
from keras_models import *

In [61]:
os.environ['TF_DETERMINISTIC_OPS'] = '1'
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)
RANDOM_STATE = 39

## **FUNCTIONS**

In [62]:
def load_dataset(dataset):
  X, y = load_classification(dataset)
  if dataset == 'FingerMovements':
    pos = 'left'
    neg = 'right'


  print(" Shape of X = ", X.shape)
  print(" Shape of y = ", y.shape)
  #print(" Meta data = ", meta_data)
  # Convert positive and negative labels to 1 and 0
  pos_label, neg_label = 1, 0
  if pos != pos_label:
      y[y==pos] = pos_label # convert/normalize positive label to 1
  if neg != neg_label:
      y[y==neg] = neg_label # convert negative label to 0

  y = y.astype(int)
  print(f"\n X[:1] = \n{X[:1]}")
  return X,y,pos_label, neg_label

# **ACTUALL CODE**
datasets available : 'Heartbeat', 'SelfRegulationSCP1'

In [63]:
RANDOM_STATE = 39
X,y,pos_label,neg_label = load_dataset('FingerMovements')
X = X.transpose(0,2,1)
print(f'shape of X = {X.shape}')
print(f'shape of y = {y.shape}')
#print(f'data imformation = {data_information}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)
print(f'shape of X train = {X_train.shape}')
print(f'shape of y train = {y_train.shape}')

 Shape of X =  (416, 28, 50)
 Shape of y =  (416,)

 X[:1] = 
[[[41.8 44.8 47.1 ... 69.8 72.6 76.1]
  [55.2 53.8 59.9 ... 17.5 28.  12.1]
  [-8.6 -3.6 14.4 ... 23.3 35.9 23.2]
  ...
  [16.9 24.5 24.5 ... 51.9 59.6 57.3]
  [42.2 35.  41.7 ... 51.5 58.5 46.9]
  [13.  26.6 52.5 ... -3.5 -3.2 -2.6]]]
shape of X = (416, 50, 28)
shape of y = (416,)
shape of X train = (332, 50, 28)
shape of y train = (332,)


In [64]:
#Upsample the minority class

unique_classes, class_counts = np.unique(y_train, return_counts=True)
print(f'before: {class_counts}')
X_train,y_train = upsample_minority_multivariate(X_train,y_train)
X,y = upsample_minority_multivariate(X, y)
unique_classes, class_counts = np.unique(y_train, return_counts=True)
print(f'after: {class_counts}')

before: [166 166]
after: [166 166]


In [65]:
#Processing and Padding all our data
#Padding needed for autoencoder

n_training,n_timesteps, n_features= X_train.shape

X, trained_scaler =  normalize_multivariate(data=X, n_timesteps=n_timesteps, n_features = n_features)
X_train_processed, trained_scaler =  normalize_multivariate(data=X_train, n_timesteps=n_timesteps, n_features = n_features)
X_test_processed, _ =  normalize_multivariate(data=X_test, n_timesteps=n_timesteps, scaler=trained_scaler, n_features = n_features)

X, padding_size = conditional_pad_multivariate(X)
X_train_processed_padded, padding_size = conditional_pad_multivariate(X_train_processed) # add extra padding zeros if n_timesteps cannot be divided by 4, required for 1dCNN autoencoder structure
X_test_processed_padded, _ = conditional_pad_multivariate(X_test_processed)

n_timesteps_padded = X_train_processed_padded.shape[1]
print(f"Data pre-processed, original #timesteps={n_timesteps}, padded #timesteps={n_timesteps_padded}.")

#check the processing (0,1) min should be min 0 and max should be max 1
print(f"\nmin value = {np.min(X_train)}, max value = {np.max(X_train)}")
print(f"min value normalized = {np.min(X_train_processed)}, max value normalized= {np.max(X_train_processed)}")

#check that padding paddes the right dimention
print(f"\nX_train.shape = {X_train.shape}" )
print(f"X_train_processed_padded.shape = {X_train_processed_padded.shape}")


Data pre-processed, original #timesteps=50, padded #timesteps=52.

min value = -132.1, max value = 205.1
min value normalized = 0.0, max value normalized= 1.0

X_train.shape = (332, 50, 28)
X_train_processed_padded.shape = (332, 52, 28)


In [66]:
#splitting the dataset

from sklearn.model_selection import train_test_split
X_train,X_validation, y_train, y_validation = train_test_split(X_train_processed_padded, y_train, test_size=0.2, random_state=RANDOM_STATE, stratify=y_train)

In [67]:
#Getting the two forms of labels needed
#-the y_classes (1,0,1,0,...)
#-the y (one hot encoded)

print(f'X_train = {X_train.shape}')
print(f'X_validation = {X_validation.shape}')
print(f'X_test = {X_test.shape}')

y_classes = y
y_train_classes = y_train
y_validation_classes = y_validation
y_test_classes = y_test

from tensorflow.keras.utils import to_categorical
y = to_categorical(y, len(np.unique(y)))
y_train = to_categorical(y_train, len(np.unique(y_train)))
y_validation = to_categorical(y_validation, len(np.unique(y_validation)))
y_test = to_categorical(y_test, len(np.unique(y_test)))

print(f'\ny_train_classes = {y_train_classes.shape}, y_validation_classes = {y_validation_classes.shape}, y_test_classes = {y_test_classes.shape}')
print(f'y_train = {y_train.shape}, y_validation = {y_validation.shape}, y_test= {y_test.shape}')

X_train = (265, 52, 28)
X_validation = (67, 52, 28)
X_test = (84, 50, 28)

y_train_classes = (265,), y_validation_classes = (67,), y_test_classes = (84,)
y_train = (265, 2), y_validation = (67, 2), y_test= (84, 2)


In [68]:


# ## LatentCF++ models
# reset seeds for numpy, tensorflow, python random package and python environment seed
reset_seeds()
###############################################
# ### 1dCNN classifier

cnnClassifier = Classifier(
    n_timesteps_padded, n_features, n_output=2, add_dense_layer = False
)

optimizer = keras.optimizers.Adam(lr=0.001)
cnnClassifier.compile(
    optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"]
)

# Define the early stopping criteria
early_stopping_accuracy = keras.callbacks.EarlyStopping(
    monitor="val_accuracy", patience=15, restore_best_weights=True
)
# Train the model
reset_seeds()
print("Training log for LSTM-FCN classifier:")
classifier_history = cnnClassifier.fit(
    X_train,
    y_train,
    epochs=150,
    batch_size=12,
    shuffle=True,
    verbose=True,
    validation_data=(X_validation, y_validation),
    callbacks=[early_stopping_accuracy],
)

y_pred = cnnClassifier.predict(X_test_processed_padded)
y_pred_classes = np.argmax(y_pred, axis=1)
acc = balanced_accuracy_score(y_true=y_test_classes, y_pred=y_pred_classes)
print(f"LSTM-FCN classifier trained, with validation accuracy {acc}.")

confusion_matrix_df = pd.DataFrame(
    confusion_matrix(y_true=y_test_classes, y_pred=y_pred_classes, labels=[1, 0]),
    index=["True:1", "True:0"],
    columns=["Pred:1", "Pred:0"],
)
print(confusion_matrix_df)




Training log for LSTM-FCN classifier:
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
LSTM-FCN classifier trained, with validation accuracy 0.44047619047619047.
        Pred:1  Pred:0
True:1      18      24
True:0      23      19


In [69]:
reset_seeds()

# ### 1dCNN autoencoder
autoencoder = Autoencoder( n_timesteps_padded,n_features,32)
optimizer = keras.optimizers.Adam(lr=0.0005)
autoencoder.compile(optimizer=optimizer, loss="mse")

# Define the early stopping criteria
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=5, restore_best_weights=True)
# Train the model
reset_seeds()
print("Training log for 1dCNN autoencoder:")
autoencoder_history = autoencoder.fit(
    X_train,
    X_train,
    epochs=50,
    batch_size=12,
    shuffle=True,
    verbose=2,
    validation_data=(X_validation, X_validation),
    callbacks=[early_stopping])

ae_val_loss = np.min(autoencoder_history.history['val_loss'])
print(f"1dCNN autoencoder trained, with validation loss: {ae_val_loss}.")


(None, 52, 28)
(None, 52, 32)
(None, 26, 32)
(None, 26, 16)
(None, 13, 16)
(None, 13, 16)
(None, 26, 16)
(None, 26, 32)
(None, 52, 32)




(None, 52, 28)
Training log for 1dCNN autoencoder:
Epoch 1/50
23/23 - 10s - loss: 0.1518 - val_loss: 0.0500 - 10s/epoch - 431ms/step
Epoch 2/50
23/23 - 0s - loss: 0.0250 - val_loss: 0.0140 - 271ms/epoch - 12ms/step
Epoch 3/50
23/23 - 0s - loss: 0.0131 - val_loss: 0.0112 - 178ms/epoch - 8ms/step
Epoch 4/50
23/23 - 0s - loss: 0.0112 - val_loss: 0.0099 - 186ms/epoch - 8ms/step
Epoch 5/50
23/23 - 0s - loss: 0.0101 - val_loss: 0.0092 - 272ms/epoch - 12ms/step
Epoch 6/50
23/23 - 0s - loss: 0.0094 - val_loss: 0.0083 - 284ms/epoch - 12ms/step
Epoch 7/50
23/23 - 0s - loss: 0.0087 - val_loss: 0.0077 - 393ms/epoch - 17ms/step
Epoch 8/50
23/23 - 0s - loss: 0.0080 - val_loss: 0.0071 - 407ms/epoch - 18ms/step
Epoch 9/50
23/23 - 0s - loss: 0.0074 - val_loss: 0.0066 - 473ms/epoch - 21ms/step
Epoch 10/50
23/23 - 0s - loss: 0.0068 - val_loss: 0.0060 - 494ms/epoch - 21ms/step
Epoch 11/50
23/23 - 0s - loss: 0.0061 - val_loss: 0.0055 - 451ms/epoch - 20ms/step
Epoch 12/50
23/23 - 0s - loss: 0.0057 - val_los

In [None]:
#Gettting the Global weights, needed for counterfactuals

from _guided import get_global_weights
from help_functions import evaluate
pos_label = 1
neg_label = 0

step_weights = get_global_weights(
        X,
        y_classes,
        cnnClassifier,
        n_timesteps= n_timesteps,
        n_features=n_features,
        random_state=RANDOM_STATE,
)


In [71]:
step_weights

array([[[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]])

In [72]:
reset_seeds()
cf_model = ModifiedLatentCF(
    probability=0.5,tolerance=1e-6, max_iter=500, optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001),autoencoder = autoencoder,
    pred_margin_weight=0.7, step_weights = step_weights, random_state= RANDOM_STATE)
cf_model.fit(cnnClassifier)

y_neg = y_classes[y_classes == 0]
X_neg = X[y_classes == 0]


with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning) # ignore warning of matrix multiplication: https://stackoverflow.com/questions/29688168/mean-nanmean-and-warning-mean-of-empty-slice
    cf_embeddings, losses, weights = cf_model.transform(X_neg, y_neg) #self, x, pred_label
cf_pred_labels = cnnClassifier.predict(cf_embeddings)[:,1]# predicted probabilities of CFs
for idx in range(cf_pred_labels.shape[0]):
  if cf_pred_labels[idx] > 0.5:
    cf_pred_labels[idx] = 1
  else:
    cf_pred_labels[idx] = 0

print(f'Transformation_finished with validity_score = {validity_score(y_neg,cf_pred_labels)}')

1 samples been transformed.
26 samples been transformed.
51 samples been transformed.
76 samples been transformed.
101 samples been transformed.
126 samples been transformed.
151 samples been transformed.
176 samples been transformed.
201 samples been transformed.
208 samples been transformed, in total.
Transformation_finished with validity_score = 1.0


In [None]:
#Calculating proximity
from tensorflow.keras.losses import MeanSquaredError
total = 0
probability = 0.5
for idx in range(cf_embeddings.shape[0]):
    counterfactual = cf_embeddings[idx,np.newaxis]
    prediction = cnnClassifier.predict(counterfactual)[:, 1]
    dist = (prediction - probability)
    total +=dist
mean_mse = total /cf_embeddings.shape[0]


In [74]:
print(f"The Mean MSE of the data is: {mean_mse} ")

The Mean MSE of the data is: [0.00051351] 


In [None]:
#Calculating proximity
from tensorflow.keras.losses import MeanSquaredError
total = 0
probability = 0.5
for idx in range(cf_embeddings.shape[0]):
    counterfactual = cf_embeddings[idx,np.newaxis]
    prediction = cnnClassifier.predict(counterfactual)[:, 1]
    dist = abs(prediction - probability)
    total +=dist
mean_mse = total /cf_embeddings.shape[0]


In [76]:
print(f"The Absolute Mean MSE of the data is: {mean_mse} ")

The Absolute Mean MSE of the data is: [0.00051351] 


In [77]:
def remove_paddings(cf_samples, padding_size):
    if padding_size != 0:
        # use np.squeeze() to cut the last time-series dimension, for evaluation
        cf_samples = np.squeeze(cf_samples[:, :-padding_size, :])
    else:
        cf_samples = np.squeeze(cf_samples)
    return cf_samples

In [78]:
X = remove_paddings(X, padding_size)
cf_embeddings = remove_paddings(cf_embeddings, padding_size)


In [91]:
#Proximity
def euclidean_distance(X, cf_samples):
    paired_distances = np.linalg.norm(X - cf_samples, axis=1)
    return np.mean(paired_distances)
euclidean_distance(X_neg, cf_embeddings)

0.4412798141890419

In [80]:
def remove_paddings(cf_samples, padding_size):
    if padding_size != 0:
        # use np.squeeze() to cut the last time-series dimension, for evaluation
        cf_samples = np.squeeze(cf_samples[:, :-padding_size, :])
    else:
        cf_samples = np.squeeze(cf_samples)
    return cf_samples

In [81]:
# Remove paddings because KDE does not work with paddings.

X_unpadded = remove_paddings(X, padding_size)
cf_embeddings_unpadded = remove_paddings(cf_embeddings, padding_size)

In [82]:
from scipy.stats import gaussian_kde
diffrences_from_abnormal = []
diffrences_from_normal = []
for dimention in range(cf_embeddings.shape[2]):


  abnormal_data = X[y_classes == 1][:,:,dimention]
  normal_data = X[y_classes == 0][:,:,dimention]
  counterf_data = cf_embeddings[:,:,dimention]

  #get the kernel for every dimention of the trained
  kernel = gaussian_kde(abnormal_data.T,bw_method=None)

  #get all the log likelihoods
  log_likelihood_abnormal = np.mean(kernel.logpdf(abnormal_data.T))
  log_likelihood_normal = np.mean(kernel.logpdf(normal_data.T))
  log_likelihood_counterfactual = np.mean(kernel.logpdf(counterf_data.T))

  #get the diffrences from the counterfactuals
  diff_from_abnormal = abs(log_likelihood_counterfactual-log_likelihood_abnormal)
  diffrences_from_abnormal.append(diff_from_abnormal)

  diff_from_normal = abs(log_likelihood_counterfactual-log_likelihood_normal)
  diffrences_from_normal.append(diff_from_normal)



In [83]:
print(diffrences_from_normal)

[10.14190637323604, 59.36053145276334, 16.676596903506663, 16.009800043353337, 15.914253094148407, 3.3084957436863363, 14.987479240605609, 11.431553806729582, 12.850826221229624, 21.995216594113145, 10.085604616895267, 9.83738700850374, 17.424174230334017, 33.9245609690335, 59.762894464983084, 7.54800387978527, 13.018639012459815, 0.5721938614201036, 40.39350238012719, 32.39096305950366, 1.3753953888049892, 1.9422568245467886, 6.165461277482876, 6.1785447351806795, 5.474288819501908, 17.77340190528811, 15.690615477537278, 13.785216282805678]


In [84]:
print(diffrences_from_abnormal)

[44.212911050150126, 112.92762526917502, 73.65000590042959, 73.24704171599039, 41.43466186241061, 53.01934523248295, 72.4284511407488, 69.03285774268709, 43.31021800002591, 31.608219964543963, 44.13069128215305, 44.29242341510391, 36.63523117176335, 87.78181016291421, 113.50509568689554, 47.139258887473574, 42.10099856438231, 58.83248072223172, 101.41369303537027, 94.73161550225579, 57.502877782225994, 49.84957174575267, 47.66249700027157, 45.251459517188835, 46.59700932339416, 34.833210622336864, 36.26195917458506, 39.60060776059336]


In [85]:
print(np.mean(diffrences_from_normal))

17.000705845270215


In [86]:
print(np.mean(diffrences_from_abnormal))

58.678351044126316
