#### Import packages

In [1]:
## Import packages from ripser and persim (scikit-tda packages) for computing 
## average persistence landscapes

from ripser import ripser
from persim.landscapes import (
PersLandscapeApprox,
average_approx,
snap_pl,
plot_landscape,
plot_landscape_simple
)
import gudhi.representations
import gudhi as gd
import numpy as np
from numpy import random
import math
from ediblepickle import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import time
import csv
import os
from persim.persistent_entropy import *
from scipy import stats

sns.set_style('darkgrid')

#### Get the filenames and locations to load the persistence diagram datasets

In [2]:
directory_in_str = '../data/annotated_PH_2020_500/'

directory = os.fsencode(directory_in_str)
folders=[]

for folder in os.listdir(directory):
    foldername = os.fsdecode(folder)
    if foldername.startswith('PH_640_'):
        folders.append(foldername)

#### Get persistence data stored in files within the folder whose location is passed through the keyword argument path

In [4]:
number_of_edges_threshold = 0

"""
Returns persistence data
Inputs -- path: local_path
       -- starts_with: the string that the filenames start with if needed
       -- ends_with: the string that the filenames end with if needed
       -- threshold: number of edges threshold of the underlying vascular networks
"""

def get_data(path=None,starts_with=None,ends_with=None,threshold=None):
    files = [os.fsdecode(file) for file in os.listdir(path)]
    persistence_data={}
    for filename in files:
        if starts_with and ends_with:
            if filename.startswith(starts_with) and filename.endswith(ends_with):
                data = []
                with open(path+filename) as f:
                    reader = csv.reader(f)
                    for row in reader:
                        data.append([float(x) for x in row])
                if threshold == 0:
                    persistence_data[filename]=[item for item in data[1:] if item[1]!=np.inf]
                else:
                    if data[0][1] >= threshold:
                        persistence_data[filename]=[item for item in data[1:] if item[1]!=np.inf]
        elif starts_with:
            if filename.startswith(starts_with):
                data = []
                with open(path+filename) as f:
                    reader = csv.reader(f)
                    for row in reader:
                        data.append([float(x) for x in row])
                if threshold == 0:
                    persistence_data[filename]=[item for item in data[1:] if item[1]!=np.inf]
                else:
                    if data[0][1] >= threshold:
                        persistence_data[filename]=[item for item in data[1:] if item[1]!=np.inf]
        elif ends_with:
            if filename.endswith(ends_with):
                data = []
                with open(path+filename) as f:
                    reader = csv.reader(f)
                    for row in reader:
                        data.append([float(x) for x in row])
                if threshold == 0:
                    persistence_data[filename]=[item for item in data[1:] if item[1]!=np.inf]
                else:
                    if data[0][1] >= threshold:
                        persistence_data[filename]=[item for item in data[1:] if item[1]!=np.inf]
                
    return persistence_data

In [5]:
og_PHs={}
for folder in folders:
    og_PHs[folder] = get_data('../data/annotated_PH_2020_500/'+folder+'/',starts_with=('ph_'),threshold=number_of_edges_threshold)

shuffled_PHs={}
for folder in folders:
    shuffled_PHs[folder] = get_data('../data/annotated_PH_2020_500/'+folder+'/',starts_with=('shuffled_ph_'),threshold=number_of_edges_threshold)

#### Use the following helper function to compute average of approximate persistence landscapes from the persistence diagram data, which is stored within a dictionary of dictinoaries.

In [6]:
def get_average_pl(persistence_data):
    pl_data = []
    for key,value in persistence_data.items():
        if value:
            for subkey,subvalue in value.items():
                pl_data.append(PersLandscapeApprox(dgms=[np.array([[item[0],item[1]] for item in subvalue])],\
                                                   hom_deg=0))
    return average_approx(pl_data)

#### Compute distance between a pair of persistence landscapes using norms

In [7]:
def get_norm(pl1,pl2,norm=None):
    [pl1_snapped, pl2_snapped] = snap_pl([pl1, pl2])
    diff_pl = pl1_snapped - pl2_snapped
    if norm == 'sup':
        return diff_pl.sup_norm()
    elif norm == 'l1':
        return diff_pl.p_norm(1)
    else:
        return dff_pl.p_norm(2)

#### Use the following helper function to compute approximate persistence landscapes from the persistence diagram data, which is stored within a dictionary of dictinoaries.

In [8]:
def get_approximate_pl(persistence_data):
    pl_data = []
    for key,value in persistence_data.items():
        if value:
            for subkey,subvalue in value.items():
                pl_data.append(PersLandscapeApprox(dgms=[np.array([[item[0],item[1]] for item in subvalue])],\
                                                   hom_deg=0))
    return pl_data

#### Permutation test:
Refer to the following link for another example: https://persim.scikit-tda.org/en/latest/notebooks/Differentiation%20with%20Persistence%20Landscapes.html

In [10]:
from numpy import random
comb_pl = get_approximate_pl(og_PHs) + get_approximate_pl(shuffled_PHs)
significance = get_norm(get_average_pl(og_PHs),get_average_pl(shuffled_PHs),norm='sup')
print(significance)
n=len(comb_pl)
sig_count = 0
num_perms=100
sigs=[]

for shuffle in range(num_perms):
    A_indices = random.choice(range(n),n//2)
    B_indices = [_ for _ in range(n) if _ not in A_indices]

    A_pl = [comb_pl[i] for i in A_indices]
    B_pl = [comb_pl[j] for j in B_indices]

    A_avg = average_approx(A_pl)
    B_avg = average_approx(B_pl)
    [A_avg_sn, B_avg_sn] = snap_pl([A_avg,B_avg])

    AB_diff = A_avg_sn - B_avg_sn
    sig = AB_diff.sup_norm()
    if (sig >= significance): sig_count += 1
    sigs.append(sig)
    print(sig)

pval = sig_count/num_perms

print(f'There were {sig_count} shuffles out of {num_perms} that',
     'were more significant than the true labelling. Thus, the',
     f'p-value is {pval}.')

0.10118723629465924
0.016072358355857463
0.01888657275803584
0.011823338950517681
0.0075804865898486395
0.00960125550896114
0.0061699603908349915
0.005201690881080592
0.004034773803596806
0.010179120239791319
0.009394765134095304
0.012768985527570997
0.022765238109996727
0.004790025324519739
0.006596851210469962
0.019367296264292885
0.011952375920400375
0.007083639487198548
0.005979880579895097
0.0100478982986642
0.013954316916238108
0.008945763547656832
0.008525394830944333
0.012825598171914442
0.006427398355047201
0.010107095237787758
0.006653191562382282
0.010484665767278259
0.015739436952981738
0.008065674963981431
0.010490382724245885
0.012399757442288675
0.00587418638559037
0.007838534822981691
0.028395745518597228
0.010227228019223975
0.010811146698822237
0.014262141874806303
0.005223416739041843
0.012936145243961208
0.010835436698976287
0.006785129365749293
0.011502920160712554
0.01987717953634624
0.010278950208927731
0.010125290329591474
0.012720056790550813
0.0114646312385192