In [1]:
library(TDA)
library(TDAstats)

In [2]:
# Load the dataset. The CSV file must be present in the current working
# directory; uncomment and adjust the line below to point R at it.
# setwd('target working directory')
data <- read.csv("breast_cancer_dataset.csv", header = TRUE)
# Keep every column except the one named "label".
data <- data[, names(data) != "label", drop = FALSE]

In [3]:
# Configuration ----
# Number of random subsamples whose persistence diagrams are compared.
n_samples <- 5
# Exponent handed to wasserstein() as `p`.
wasserstein_exp <- 1
# Homology dimension handed to wasserstein() as `dimension`.
wasserstein_dim <- 0

In [4]:
#' Draw random subsamples and compute the persistent homology of each
#'
#' Repeats `n_samples` times: pick `size` distinct rows of `points` at random
#' (sampling without replacement) and pass that subsample to
#' `calculate_homology()`.
#'
#' @param n_samples Number of subsamples to draw; a single non-negative
#'   number. Zero yields an empty list.
#' @param size Number of rows in each subsample. Defaults to the global
#'   `sample_size`, which earlier versions of this function read implicitly.
#' @param points Data frame or matrix with one observation per row. Defaults
#'   to the global `data`, which earlier versions read implicitly.
#' @return A list of length `n_samples`; element `i` is the value returned by
#'   `calculate_homology()` for the i-th subsample.
compute_homology <- function(n_samples, size = sample_size, points = data) {
  stopifnot(
    is.numeric(n_samples),
    length(n_samples) == 1L,
    !is.na(n_samples),
    n_samples >= 0
  )
  # seq_len(0) is empty, whereas 1:0 would run the body for i = 1 and i = 0.
  lapply(seq_len(n_samples), function(i) {
    picked <- sample(nrow(points), size, replace = FALSE)
    # drop = FALSE keeps a one-column input two-dimensional.
    calculate_homology(points[picked, , drop = FALSE])
  })
}

In [5]:
#' Pairwise Wasserstein distances between subsample persistence diagrams
#'
#' Calls `compute_homology(n_samples)` to obtain one persistence diagram per
#' subsample, then evaluates `wasserstein()` for every pair `(i, j)` with
#' `i <= j`.
#'
#' @param n_samples Number of subsamples to draw and compare.
#' @param p Exponent passed to `wasserstein()`. Defaults to the global
#'   `wasserstein_exp`, which earlier versions read implicitly.
#' @param dimension Homology dimension passed to `wasserstein()`. Defaults to
#'   the global `wasserstein_dim`, which earlier versions read implicitly.
#' @return A square numeric matrix with `n_samples` rows. Cell `[i, j]` with
#'   `i <= j` holds the distance between diagrams `i` and `j`; cells below the
#'   diagonal are left as `NA`.
compute_distance <- function(n_samples, p = wasserstein_exp,
                             dimension = wasserstein_dim) {
  diagrams <- compute_homology(n_samples)
  distances <- matrix(NA_real_, nrow = n_samples, ncol = n_samples)
  n <- nrow(distances)
  # seq_len() keeps the loops empty for a 0 x 0 matrix; 1:0 would index
  # diagrams that do not exist.
  for (i in seq_len(n)) {
    # Only the upper triangle (including the diagonal) is filled.
    for (j in i:n) {
      distances[i, j] <- wasserstein(
        diagrams[[i]], diagrams[[j]],
        p = p,
        dimension = dimension
      )
    }
  }
  distances
}

In [6]:
# Compute and display the Wasserstein distances as an upper-triangular matrix.
# Each subsample keeps 550 data points, i.e. roughly 20 points are removed to
# simulate a "small" perturbation of the dataset.
sample_size <- 550
distance_matrix <- compute_distance(n_samples)
# Cell (i, j) is the Wasserstein distance between sample i and sample j.
distance_matrix

0,1,2,3,4
0.0,236.8261,244.2818,433.2164,309.9561
,0.0,305.5811,584.1658,319.1143
,,0.0,553.9717,424.4586
,,,0.0,493.7156
,,,,0.0
