In [3]:
import sys
import os
# The checkout root is taken to be two directory levels above the working
# directory the notebook is launched from.
notebook_cwd = os.getcwd()
pathogist_dir = os.path.dirname(os.path.dirname(notebook_cwd))
# Make the 'PathOGiST' directory under that root importable.
pathogist_pkg_dir = os.path.join(pathogist_dir, 'PathOGiST')
sys.path.append(pathogist_pkg_dir)

In [20]:
import time
import itertools
import statistics

In [5]:
import pathogist
import pathogist.io
import pathogist.cluster
import pathogist.distance

## Reading distance files

In [6]:
# Directory holding the distance files. The default is the location used when
# this notebook was first run; set PATHOGIST_DISTANCE_DIR to point somewhere
# else when running on another machine.
distance_dir = os.environ.get('PATHOGIST_DISTANCE_DIR',
                              '/home/mkatebi/PathOGiSTPrivate/distance_files')
# E. coli MLST distances.
ecoli_mlst_dist = pathogist.io.open_distance_file(os.path.join(distance_dir, 'ecoli_mlst_dist.tsv'))

In [7]:
# Directory holding the distance files. The default is the location used when
# this notebook was first run; set PATHOGIST_DISTANCE_DIR to point somewhere
# else when running on another machine.
distance_dir = os.environ.get('PATHOGIST_DISTANCE_DIR',
                              '/home/mkatebi/PathOGiSTPrivate/distance_files')
# M. tuberculosis MLST distances.
tb_mlst_dist = pathogist.io.open_distance_file(os.path.join(distance_dir, 'tb_mlst_dist.tsv'))
# Only the first 100 index labels are used in the TB benchmark cells.
# NOTE(review): presumably the index holds sample names - confirm.
tb_sample_subset = tb_mlst_dist.index.values[0:100]

In [8]:
# Directory holding the distance files. The default is the location used when
# this notebook was first run; set PATHOGIST_DISTANCE_DIR to point somewhere
# else when running on another machine.
distance_dir = os.environ.get('PATHOGIST_DISTANCE_DIR',
                              '/home/mkatebi/PathOGiSTPrivate/distance_files')
# Yersinia MLST distances.
yersinia_mlst_dist = pathogist.io.open_distance_file(os.path.join(distance_dir, 'yersinia_mlst_dist.tsv'))

In [9]:
# Yersinia SNV distances; the path is resolved relative to the working directory.
yersinia_snv_path = 'package/yersinia_snv_dist.tsv'
yersinia_snv_dist = pathogist.io.open_distance_file(yersinia_snv_path)

### E. Coli MLST

#### Time with c4 parallel correlation clustering algorithm

In [12]:
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the wall clock and can jump when
# the system time is adjusted.
s_t = time.perf_counter()
ecoli_mlst_clustering_c4 = pathogist.cluster.c4_correlation(ecoli_mlst_dist, threshold=1500)
e_t = time.perf_counter()
# Elapsed seconds for a single c4 run on the E. coli MLST distances.
print(e_t - s_t)

0.019078969955444336


#### Time with current correlation clustering algorithm

In [13]:
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the wall clock and can jump when
# the system time is adjusted.
s_t = time.perf_counter()
ecoli_mlst_clustering = pathogist.cluster.correlation(ecoli_mlst_dist, threshold=1500)
e_t = time.perf_counter()
# Elapsed seconds for the current correlation clustering on the same input.
print(e_t - s_t)

13.50654673576355


#### Adjusted rand index between output of c4 and current correlation clustering

In [14]:
# Agreement between the c4 clustering and the current algorithm's clustering
# of the E. coli MLST distances.
ecoli_mlst_ari = pathogist.cluster.adjusted_rand_index(
    ecoli_mlst_clustering_c4,
    ecoli_mlst_clustering,
)
ecoli_mlst_ari

1.0

### Yersinia MLST

#### Time with c4 parallel correlation clustering algorithm

In [15]:
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the wall clock and can jump when
# the system time is adjusted.
s_t = time.perf_counter()
yersinia_mlst_clustering_c4 = pathogist.cluster.c4_correlation(yersinia_mlst_dist, threshold=400)
e_t = time.perf_counter()
# Elapsed seconds for a single c4 run on the Yersinia MLST distances.
print(e_t - s_t)

0.03434181213378906


#### Time with current correlation clustering algorithm

In [16]:
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the wall clock and can jump when
# the system time is adjusted.
s_t = time.perf_counter()
yersinia_mlst_clustering = pathogist.cluster.correlation(yersinia_mlst_dist, threshold=400)
e_t = time.perf_counter()
# Elapsed seconds for the current correlation clustering on the same input.
print(e_t - s_t)

130.7845356464386


#### Adjusted rand index between output of c4 and current correlation clustering

In [None]:
# Agreement between the c4 clustering and the current algorithm's clustering
# of the Yersinia MLST distances.
yersinia_mlst_ari = pathogist.cluster.adjusted_rand_index(
    yersinia_mlst_clustering_c4,
    yersinia_mlst_clustering,
)
yersinia_mlst_ari

### Yersinia SNP

#### Time with c4 parallel correlation clustering algorithm

In [12]:
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the wall clock and can jump when
# the system time is adjusted.
s_t = time.perf_counter()
yersinia_snv_clustering_c4 = pathogist.cluster.c4_correlation(yersinia_snv_dist, threshold=2500)
e_t = time.perf_counter()
# Elapsed seconds for a single c4 run on the Yersinia SNV distances.
print(e_t - s_t)

0.037908315658569336


#### Time with current correlation clustering algorithm

In [10]:
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the wall clock and can jump when
# the system time is adjusted.
s_t = time.perf_counter()
yersinia_snv_clustering = pathogist.cluster.correlation(yersinia_snv_dist, threshold=2500)
e_t = time.perf_counter()
# Elapsed seconds for the current correlation clustering on the same input.
print(e_t - s_t)

113.8979766368866


#### Adjusted rand index between output of c4 and current correlation clustering

In [13]:
# Agreement between the c4 clustering and the current algorithm's clustering
# of the Yersinia SNV distances.
yersinia_snv_ari = pathogist.cluster.adjusted_rand_index(
    yersinia_snv_clustering_c4,
    yersinia_snv_clustering,
)
yersinia_snv_ari

1.0

### M. Tuberculosis MLST

#### Time with c4 parallel correlation clustering algorithm

In [10]:
# Build the subset outside the timed region so that only the clustering call
# is measured, not the .loc selection.
tb_mlst_subset_dist = tb_mlst_dist.loc[tb_sample_subset, tb_sample_subset]
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the wall clock and can jump when
# the system time is adjusted.
s_t = time.perf_counter()
tb_mlst_clustering_c4 = pathogist.cluster.c4_correlation(tb_mlst_subset_dist, threshold=700)
e_t = time.perf_counter()
# Elapsed seconds for a single c4 run on the TB MLST subset.
print(e_t - s_t)

0.4765915870666504


#### Time with current correlation clustering algorithm

In [11]:
# Build the subset outside the timed region so that only the clustering call
# is measured, not the .loc selection.
tb_mlst_subset_dist = tb_mlst_dist.loc[tb_sample_subset, tb_sample_subset]
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the wall clock and can jump when
# the system time is adjusted.
s_t = time.perf_counter()
tb_mlst_clustering = pathogist.cluster.correlation(tb_mlst_subset_dist, threshold=700)
e_t = time.perf_counter()
# Elapsed seconds for the current correlation clustering on the same subset.
print(e_t - s_t)

31.19114398956299


#### Adjusted rand index between output of c4 and current correlation clustering

# Agreement between the c4 clustering and the current algorithm's clustering
# of the TB MLST subset.
tb_mlst_ari = pathogist.cluster.adjusted_rand_index(
    tb_mlst_clustering_c4,
    tb_mlst_clustering,
)
tb_mlst_ari

## Comparing outputs of different runs of c4

In [14]:
# Number of repeated c4 runs on the same input; the runs are compared
# pairwise afterwards to see how much the output varies between runs.
N_C4_RUNS = 20
# One clustering per run, all with the same distances and threshold.
c4_output_clusterings = [
    pathogist.cluster.c4_correlation(ecoli_mlst_dist, threshold=1500)
    for _ in range(N_C4_RUNS)
]

In [18]:
# Adjusted Rand index for every unordered pair of c4 runs.
ARIs = [
    pathogist.cluster.adjusted_rand_index(c1, c2)
    for c1, c2 in itertools.combinations(c4_output_clusterings, 2)
]

In [28]:
# Summary statistics of the pairwise ARIs, printed one per line. Each
# statistic is computed just before it is printed, in the order listed.
ari_summary_rows = (
    ('Max:', max),
    ('Min:', min),
    ('Mean:', statistics.mean),
    ('Standard deviation:', statistics.stdev),
)
for row_label, summarise in ari_summary_rows:
    print(row_label, summarise(ARIs))

Max: 1.0
Min: 0.8854203855765799
Mean: 0.9812659935475349
Standard deviation: 0.026893225864310304
