# Genome cluster membership

Explore how genome cluster membership changes at each ANI clustering threshold.

In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
import os
from collections import defaultdict
from itertools import chain, combinations, product
from pathlib import Path

import pandas
import plotly.graph_objects as go

In [2]:
# Load the genome clusters for every ANI threshold.
#
# Every entry in basedir is read as one cluster file. The ANI threshold is
# the text after the last underscore of the file name (extension removed),
# parsed as a float and truncated to an int. Each non-blank line must hold
# exactly two whitespace-separated fields: an integer cluster id and a
# comma-separated list of genome names.
#
# Result: clusters[ani][cluster_id] -> set of genome names.
clusters = {}
basedir = '../ani_clusters/'
for clusterfile in os.listdir(basedir):
    ani = int(float(Path(clusterfile).stem.rsplit('_', 1)[-1]))
    members = {}
    # The context manager closes each file as soon as it has been read,
    # rather than leaving one open handle per cluster file.
    with open(os.path.join(basedir, clusterfile)) as handle:
        for line in handle:
            if not line.strip():
                # Tolerate blank lines (e.g. trailing newlines at the end).
                continue
            cid, genomes = line.split()
            members[int(cid)] = set(genomes.split(','))
    clusters[ani] = members

In [3]:
# Keep only the clusters that have a directory under ../clusters, laid out
# as ../clusters/<ani>/<cluster_id>/ (per the original note: the clusters
# with useable numbers of gene trees).
#
# Result: eclusters[ani][cluster_id] -> the same genome set as in clusters.

eclusters = defaultdict(dict)
basedir = '../clusters'
for anidir in os.listdir(basedir):
    anipath = os.path.join(basedir, anidir)
    if not os.path.isdir(anipath):
        # Plain files next to the per-threshold directories are ignored.
        continue
    for ciddir in os.listdir(anipath):
        ani, cid = int(float(anidir)), int(ciddir)
        eclusters[ani][cid] = clusters[ani][cid]

In [5]:
# Sankey diagram of how genomes move between clusters from one ANI
# threshold to the next.

# ANI thresholds to plot, in ascending order.
cluster_ids = sorted(eclusters.keys())
#cluster_ids = [77, 80, 85]   # uncomment to plot only a few thresholds

# One node per (ANI threshold, cluster id) pair, labelled "<ani>_<cid>".
labels = ['_'.join([str(ani), str(cid)])
          for ani in cluster_ids
          for cid in eclusters[ani].keys()]

# Label -> node index, built once. Labels are unique (one per threshold and
# cluster id), so this gives the same indices as labels.index() without a
# linear scan of the label list for every link.
node_index = {label: position for position, label in enumerate(labels)}

source = []
target = []
value = []

# A link is emitted for every pair of clusters at adjacent thresholds,
# including pairs with no shared genomes (value 0). The link value is the
# number of genomes the two clusters have in common.
for aniA, aniB in zip(cluster_ids[:-1], cluster_ids[1:]):
    for cidA, cidB in product(eclusters[aniA].keys(), eclusters[aniB].keys()):
        source.append(node_index['_'.join([str(aniA), str(cidA)])])
        target.append(node_index['_'.join([str(aniB), str(cidB)])])
        value.append(len(eclusters[aniA][cidA] & eclusters[aniB][cidB]))

fig = go.Figure(data=[go.Sankey(
    arrangement='snap',
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = labels,
      color = "blue"
    ),
    link = dict(
      source = source,
      target = target,
      value = value
  ))])

fig.update_layout(title_text='Genome cluster membership by ANI', font_size=10)
config = { 'toImageButtonOptions' : { 'scale' : 2 } }
fig.show( config=config )

In [3]:
# Invert the mapping for every ANI threshold:
#   { cluster_id : genomes } -> { genome : cluster_id }
#
# Built in a single pass over each threshold's clusters instead of
# rescanning every cluster for every genome. setdefault keeps the first
# cluster (in dict order) that contains a genome, should a genome ever
# appear in more than one cluster. A threshold with no clusters yields an
# empty mapping.
clusters_inv = {}
for ani, members_by_cid in clusters.items():
    genome_to_cid = {}
    for cid, genomes in members_by_cid.items():
        for genome in genomes:
            genome_to_cid.setdefault(genome, cid)
    clusters_inv[ani] = genome_to_cid

In [4]:
# Table of cluster ids: one row per genome, one column per ANI threshold
# (columns in ascending threshold order).
#
# Because we've clustered the same genomes at different ANI thresholds,
# the same genomes are expected in every ANI bucket, so the genome list is
# taken from the first bucket only. A genome missing from any other bucket
# raises KeyError.

ani_columns = sorted(clusters_inv.keys())

# With no thresholds loaded there are no genomes and the frame is empty.
first_ani = next(iter(clusters_inv), None)
genomes = clusters_inv[first_ani].keys() if first_ani is not None else ()

df = pandas.DataFrame([{ani: clusters_inv[ani][genome] for ani in ani_columns}
                       for genome in genomes])
df

Unnamed: 0,75,76,77,78,79,80,81,82,83,84,...,90,91,92,93,94,95,96,97,98,99
0,112,164,222,284,334,384,419,450,473,492,...,580,598,624,640,652,665,695,730,770,779
1,112,164,216,275,323,371,404,434,456,474,...,562,580,606,622,634,647,676,710,750,759
2,120,257,351,459,536,608,658,709,741,774,...,908,932,968,990,1011,1029,1061,1097,1140,1150
3,78,61,90,114,78,87,166,52,37,162,...,70,79,197,113,252,35,124,290,344,203
4,112,155,193,238,277,323,351,380,401,417,...,499,513,539,553,564,576,603,634,671,677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1189,112,127,151,182,208,245,270,296,315,330,...,409,421,441,454,464,477,502,530,561,567
1190,41,49,68,95,84,177,181,73,183,101,...,200,23,154,170,243,79,145,132,289,87
1191,112,236,330,438,515,587,637,688,720,753,...,887,911,947,969,990,1008,1040,1076,1119,1129
1192,112,164,212,265,307,354,384,414,436,454,...,540,559,585,601,613,626,654,689,728,736
