In [1]:
import sys
import os
# Make the PathOGiST sources importable. pathogist_dir is the grandparent of the
# current working directory, so the result depends on where the kernel was started.
pathogist_dir = os.path.dirname(os.path.dirname(os.getcwd()))
pathogist_pkg_dir = os.path.join(pathogist_dir, 'PathOGiST')
# Guard the append so re-running this cell does not pile duplicate entries onto sys.path.
if pathogist_pkg_dir not in sys.path:
    sys.path.append(pathogist_pkg_dir)

In [2]:
import time
import itertools
import statistics

In [3]:
import pathogist
import pathogist.io
import pathogist.cluster
import pathogist.distance

## Reading distance files

In [4]:
# Directory holding the distance files. Set PATHOGIST_DISTANCE_DIR to run this
# notebook on another machine; the default is the location originally hard-coded here.
distance_dir = os.environ.get('PATHOGIST_DISTANCE_DIR', '/home/mkatebi/PathOGiSTPrivate/distance_files')
ecoli_mlst_dist = pathogist.io.open_distance_file(os.path.join(distance_dir, 'ecoli_mlst_dist.tsv'))

In [5]:
# Directory holding the distance files. Set PATHOGIST_DISTANCE_DIR to run this
# notebook on another machine; the default is the location originally hard-coded here.
distance_dir = os.environ.get('PATHOGIST_DISTANCE_DIR', '/home/mkatebi/PathOGiSTPrivate/distance_files')
tb_mlst_dist = pathogist.io.open_distance_file(os.path.join(distance_dir, 'tb_mlst_dist.tsv'))
# First 100 entries of the index; used further down to benchmark on a smaller
# sub-matrix of the TB distances.
tb_sample_subset = tb_mlst_dist.index.values[:100]

In [6]:
# Directory holding the distance files. Set PATHOGIST_DISTANCE_DIR to run this
# notebook on another machine; the default is the location originally hard-coded here.
distance_dir = os.environ.get('PATHOGIST_DISTANCE_DIR', '/home/mkatebi/PathOGiSTPrivate/distance_files')
yersinia_mlst_dist = pathogist.io.open_distance_file(os.path.join(distance_dir, 'yersinia_mlst_dist.tsv'))

In [7]:
# Yersinia SNV distances; the path is relative to the current working directory.
yersinia_snv_dist_path = 'package/yersinia_snv_dist.tsv'
yersinia_snv_dist = pathogist.io.open_distance_file(yersinia_snv_dist_path)

### E. coli MLST

#### Time with c4 parallel correlation clustering algorithm

In [17]:
# Time one c4 run on the E. coli MLST distances (threshold=1500).
# perf_counter() is a monotonic, high-resolution clock, so a sub-second reading is
# not skewed by wall-clock adjustments the way time.time() can be.
s_t = time.perf_counter()
ecoli_mlst_clustering_c4 = pathogist.cluster.c4_correlation(ecoli_mlst_dist, threshold=1500)
e_t = time.perf_counter()
print(e_t - s_t)  # elapsed seconds

0.019458532333374023


#### Time with current correlation clustering algorithm

In [18]:
# Time one run of the current correlation clustering on the E. coli MLST
# distances, with the same threshold (1500) as the c4 run.
# perf_counter() is a monotonic clock intended for measuring elapsed time.
s_t = time.perf_counter()
ecoli_mlst_clustering = pathogist.cluster.correlation(ecoli_mlst_dist, threshold=1500)
e_t = time.perf_counter()
print(e_t - s_t)  # elapsed seconds

11.148344993591309


#### Adjusted Rand index between output of c4 and current correlation clustering

In [19]:
# Compare the c4 result with the current correlation clustering result.
# Left as the cell's final expression so the score is shown as the cell output.
pathogist.cluster.adjusted_rand_index(
    ecoli_mlst_clustering_c4,
    ecoli_mlst_clustering,
)

0.9929475426271949

### Yersinia MLST

#### Time with c4 parallel correlation clustering algorithm

In [34]:
# Time one c4 run on the Yersinia MLST distances (threshold=400).
# perf_counter() is a monotonic, high-resolution clock, so a sub-second reading is
# not skewed by wall-clock adjustments the way time.time() can be.
s_t = time.perf_counter()
yersinia_mlst_clustering_c4 = pathogist.cluster.c4_correlation(yersinia_mlst_dist, threshold=400)
e_t = time.perf_counter()
print(e_t - s_t)  # elapsed seconds

0.035915374755859375


#### Time with current correlation clustering algorithm

In [35]:
# Time one run of the current correlation clustering on the Yersinia MLST
# distances, with the same threshold (400) as the c4 run.
# perf_counter() is a monotonic clock intended for measuring elapsed time.
s_t = time.perf_counter()
yersinia_mlst_clustering = pathogist.cluster.correlation(yersinia_mlst_dist, threshold=400)
e_t = time.perf_counter()
print(e_t - s_t)  # elapsed seconds

113.84787106513977


#### Adjusted Rand index between output of c4 and current correlation clustering

In [36]:
# Compare the c4 result with the current correlation clustering result.
# Left as the cell's final expression so the score is shown as the cell output.
pathogist.cluster.adjusted_rand_index(
    yersinia_mlst_clustering_c4,
    yersinia_mlst_clustering,
)

1.0

### Yersinia SNV

#### Time with c4 parallel correlation clustering algorithm

In [12]:
# Time one c4 run on the Yersinia SNV distances (threshold=2500).
# perf_counter() is a monotonic, high-resolution clock, so a sub-second reading is
# not skewed by wall-clock adjustments the way time.time() can be.
s_t = time.perf_counter()
yersinia_snv_clustering_c4 = pathogist.cluster.c4_correlation(yersinia_snv_dist, threshold=2500)
e_t = time.perf_counter()
print(e_t - s_t)  # elapsed seconds

0.037908315658569336


#### Time with current correlation clustering algorithm

In [10]:
# Time one run of the current correlation clustering on the Yersinia SNV
# distances, with the same threshold (2500) as the c4 run.
# perf_counter() is a monotonic clock intended for measuring elapsed time.
s_t = time.perf_counter()
yersinia_snv_clustering = pathogist.cluster.correlation(yersinia_snv_dist, threshold=2500)
e_t = time.perf_counter()
print(e_t - s_t)  # elapsed seconds

113.8979766368866


#### Adjusted Rand index between output of c4 and current correlation clustering

In [13]:
# Compare the c4 result with the current correlation clustering result.
# Left as the cell's final expression so the score is shown as the cell output.
pathogist.cluster.adjusted_rand_index(
    yersinia_snv_clustering_c4,
    yersinia_snv_clustering,
)

1.0

### M. tuberculosis MLST

#### Time with c4 parallel correlation clustering algorithm (first 100 samples)

In [30]:
# Slice out the sub-matrix for the first 100 samples before starting the clock, so
# the timing covers only the clustering call and not the .loc indexing.
tb_mlst_subset_dist = tb_mlst_dist.loc[tb_sample_subset, tb_sample_subset]
# perf_counter() is a monotonic, high-resolution clock, so a sub-second reading is
# not skewed by wall-clock adjustments the way time.time() can be.
s_t = time.perf_counter()
tb_mlst_clustering_c4 = pathogist.cluster.c4_correlation(tb_mlst_subset_dist, threshold=700)
e_t = time.perf_counter()
print(e_t - s_t)  # elapsed seconds

0.045311689376831055


#### Time with current correlation clustering algorithm (first 100 samples)

In [31]:
# Slice out the sub-matrix for the first 100 samples before starting the clock, so
# the timing covers only the clustering call and not the .loc indexing.
tb_mlst_subset_dist = tb_mlst_dist.loc[tb_sample_subset, tb_sample_subset]
# perf_counter() is a monotonic clock intended for measuring elapsed time.
s_t = time.perf_counter()
tb_mlst_clustering = pathogist.cluster.correlation(tb_mlst_subset_dist, threshold=700)
e_t = time.perf_counter()
print(e_t - s_t)  # elapsed seconds

29.851481914520264


#### Adjusted Rand index between output of c4 and current correlation clustering

In [32]:
# Compare the c4 result with the current correlation clustering result.
# Left as the cell's final expression so the score is shown as the cell output.
pathogist.cluster.adjusted_rand_index(
    tb_mlst_clustering_c4,
    tb_mlst_clustering,
)

0.9670712474220253

#### Time of c4 algorithm on all the samples

In [21]:
# Time one c4 run on the complete TB MLST distances (threshold=700).
# The result gets its own name: rebinding tb_mlst_clustering_c4 here would replace
# the 100-sample clustering that the adjusted Rand index cell above reads.
# perf_counter() is a monotonic, high-resolution clock intended for elapsed time.
s_t = time.perf_counter()
tb_mlst_clustering_c4_full = pathogist.cluster.c4_correlation(tb_mlst_dist, threshold=700)
e_t = time.perf_counter()
print(e_t - s_t)  # elapsed seconds

0.1685783863067627


## Comparing outputs of different runs of c4

In [25]:
# Run c4 on the complete TB MLST distances 50 times to see how much its output
# varies from run to run.
# NOTE(review): no seed is set anywhere in this notebook, so these runs (and the
# statistics derived from them) are presumably not reproducible — confirm how
# c4_correlation draws its randomness.
c4_output_clusterings = [
    pathogist.cluster.c4_correlation(tb_mlst_dist, threshold=700)
    for _ in range(50)
]

In [26]:
# Adjusted Rand index for every unordered pair of c4 runs.
ARIs = [
    pathogist.cluster.adjusted_rand_index(first_run, second_run)
    for first_run, second_run in itertools.combinations(c4_output_clusterings, 2)
]

In [27]:
# Summary statistics of the pairwise ARIs, one "label value" line each.
# Each statistic is computed immediately before it is printed, in the listed order.
ari_summaries = (
    ('Max:', max),
    ('Min:', min),
    ('Mean:', statistics.mean),
    ('Standard deviation:', statistics.stdev),
)
for label, summarize in ari_summaries:
    print(label, summarize(ARIs))

Max: 1.0
Min: 0.7139533113071445
Mean: 0.98379136920366
Standard deviation: 0.054372729943674696
