# 028 Tree silhouette and monophyly scores
* depends on results from 026.concat_resolution (specially the trees)

In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

from Bio import Seq, SeqIO, Align, AlignIO, Phylo, Alphabet, pairwise2 
from Bio.SeqRecord import SeqRecord
from Bio.Align import AlignInfo, Applications
from Bio.Phylo import draw, TreeConstruction # TreeConstruction.DistanceTreeConstructor 
# https://bioinformatics.stackexchange.com/questions/4337/ \
#biopython-phylogenetic-tree-replace-branch-tip-labels-by-sequence-logos

import numpy as np
import seaborn as sns
from sklearn import manifold, metrics, cluster, neighbors, decomposition, preprocessing
import skbio, parasail, dendropy, pandas
import sys, gzip, re, glob, pickle, collections, subprocess, os, errno, random, itertools

def print_redblack(textr, textbb="", textbl=""):
    print ('\x1b[0;1;31;1m'+ str(textr) + '\x1b[0;1;30;1m'+ str(textbb) + '\x1b[0;0;30;0m'+ str(textbl) + '\x1b[0m')

## read taxonomic information

In [2]:
gbff_dir = "/media/deolivl/QIB_deolivl/bigdata/"   ## directory (or directories, in our case) with refseq genomes
#gbff_dir = "./bigdata/"

file_lines = [line.strip() for line in open("./bigdata/gtdb_list.csv", 'r')]
print_redblack("header: <file>  ", file_lines[0].split(',')[1:]) ## first column is data.frame index from R

table_filenames_species = [line.split(',')[1:] for line in file_lines[1:]] ## skip first line, with headers
print ("first element of list: ", table_filenames_species[0], "\n")

[0;1;31;1mheader: <file>  [0;1;30;1m['accession', 'gtdb_taxonomy', 'lsu_silva_23s_taxonomy', 'ncbi_organism_name', 'ncbi_taxonomy', 'ssu_silva_taxonomy'][0;0;30;0m[0m
first element of list:  ['GCF_000465235.1', 'Campylobacter_D coli', 'Campylobacter coli CVM N29710', 'Campylobacter coli CVM N29710', 'Campylobacter coli', 'Campylobacter jejuni 30318'] 



## labels of genes and merged sets must follow order
- for printing and visualising only
- we assume that trees already exist (otherwise 026 notebook must be run, or trees created by hand)

In [4]:
outdir = "./026_results/"
ordered_types = ["5S","16Sv1v2","16Sv3v4","23S","16S","16S5S","23S5S","16S23S","16S23S5S"]
outfile_list = [outdir + str(keys) for keys in ordered_types] # for this data set we know order :D 
print (outfile_list)
## if you need to read alignments
#concat_aligned = {k:AlignIO.read(outdir + k + "_long.fasta", "fasta") for k in ordered_types}
#consen_aligned = {k:AlignIO.read(outdir + k + "_consensus.fasta", "fasta") for k in ordered_types}

['./026_results/5S', './026_results/16Sv1v2', './026_results/16Sv3v4', './026_results/23S', './026_results/16S', './026_results/16S5S', './026_results/23S5S', './026_results/16S23S', './026_results/16S23S5S']


## Tree silhouette score
- calculating the patristic distance between leaves in a tree is the first component of the tree silhouette score, and is calculated with the dendropy package 
- the second component is the silhouette score, implemented in the scikit-learn library 
- the function below calculates both, and also "shuffles" the tree, i.e. maintain the leaf names but randomise branches, generating a "null model"

In [5]:
def patristic_distances_from_treefile (filename, have_paralogs = False, shuffle = False):
    tree = dendropy.Tree.get(path=filename, schema="newick", preserve_underscores=True)
    if shuffle:
        tree = dendropy.simulate.treesim.pure_kingman_tree(taxon_namespace=tree.taxon_namespace, pop_size = 1000)
    species = ['.'.join(t.label.split('.')[:2]) for t in tree.taxon_namespace] ## follow order of taxon_namespace
    ntaxa = len(tree.taxon_namespace)
    distmat = np.zeros((ntaxa,ntaxa)) # diagonals are zero
    nodemat = np.zeros((ntaxa,ntaxa))
    ## STEP 1: pairwise distances along the tree
    pdm = tree.phylogenetic_distance_matrix() # initialises class
    for i,j in itertools.combinations(range(ntaxa),2):
        distmat[i,j] = distmat[j,i] = pdm.distance(tree.taxon_namespace[i], tree.taxon_namespace[j])
        nodemat[i,j] = nodemat[j,i] = pdm.path_edge_count(tree.taxon_namespace[i], tree.taxon_namespace[j])
    ## STEP 2: silhouette score using pairwise distances and taxonomic information
    mdist = metrics.silhouette_samples(distmat, species, metric="precomputed")
    mnode = metrics.silhouette_samples(nodemat, species, metric="precomputed")
    if have_paralogs:
        sample = ['.'.join(t.label.split('.')[:3]) for t in tree.taxon_namespace]
        m3 = metrics.silhouette_samples(distmat, sample, metric="precomputed")
        m4 = metrics.silhouette_samples(nodemat, sample, metric="precomputed")
        return mdist, mnode, m3, m4
    return mdist, mnode

def silhouette_str(distrib):
    return '{:7.3f}'.format(np.percentile(distrib, 5)) + \
    '{:7.3f}'.format(np.percentile(distrib, 25)) + \
    '{:7.3f}'.format(np.percentile(distrib, 50)) + \
    '{:7.3f}  '.format(sum(distrib>0)/float(len(distrib))) 

In [8]:
mdis, mnod = patristic_distances_from_treefile (outfile_list[0]+ "_consensus.fasta.treefile", shuffle = True)
# bar plot uses path difference
bplot_label = ["random"] ; bplot_hue = ["proportion of positive silhouette scores"]; 
bplot_value = [sum(mnod>0)/float(len(mnod))]
# violin plots use weighted distances
vio_silho_x = ["random"] * mdis.shape[0];  vio_silho_y = list(mdis)

print ('     {:12s}'.format("random"), "\t", silhouette_str(mdis),  silhouette_str(mnod))

suffix1 = ".fasta.treefile"
title1  = "ML    "
#for suffix2, title2 in zip (["_long", "_consensus"], ["long ", "cons "]):
for suffix2, title2 in zip (["_consensus"], ["cons "]):
    suffix = suffix2 + suffix1; title = title2 + title1;
    for fname, rname in zip(outfile_list, [i.split('/')[-1] for i in outfile_list]):
        mdis, mnod = patristic_distances_from_treefile (fname + suffix)
        print (title + '{:6s}'.format(rname), "\t", silhouette_str(mdis),  silhouette_str(mnod))
        if "long" in title:
            bplot_label.append(rname) ; bplot_hue.append("proportion of positive silhouette scores");
            bplot_value.append(sum(mnod>0)/float(len(mnod)))
            vio_silho_x.extend([str(rname)] * mdis.shape[0]); vio_silho_y.extend(list(mdis))
        

     random       	  -0.499 -0.440 -0.378  0.006    -0.258 -0.200 -0.161  0.009  
cons ML    5S     	  -0.992 -0.045  0.993  0.709    -0.495 -0.130  0.287  0.680  
cons ML    16Sv1v2 	  -0.644  0.227  0.952  0.799    -0.463 -0.070  0.259  0.703  
cons ML    16Sv3v4 	  -0.989  0.321  0.996  0.796    -0.698 -0.155  0.246  0.617  
cons ML    23S    	   0.103  0.670  0.959  0.953    -0.185  0.239  0.390  0.912  
cons ML    16S    	  -0.551  0.716  0.954  0.865    -0.480  0.011  0.319  0.760  
cons ML    16S5S  	  -0.337  0.621  0.953  0.888    -0.446  0.064  0.307  0.780  
cons ML    23S5S  	  -0.013  0.641  0.951  0.948    -0.219  0.232  0.379  0.897  
cons ML    16S23S 	   0.206  0.684  0.955  0.961    -0.259  0.165  0.388  0.890  
cons ML    16S23S5S 	   0.154  0.726  0.954  0.957    -0.294  0.162  0.421  0.859  


In [14]:
def sp_list_from_labels (labels):
    return ['.'.join(t.split('.')[:2]) for t in labels]

def genera_set_from_namespace (txnamespace):
    x = [t.label.split('.')[0] for t in txnamespace]
    return set(x)

def taxa_list_from_strain (leaves, strain): # strain is list of species or genera
    return [x for x in leaves if strain in x]

def average_lengths_lca (samples, tre, pdm, lca): ## assumes names are unique
    taxon_label_list = [tre.find_node_with_taxon_label(x) for x in samples]
    avge = {} 
    for t in taxon_label_list:
        if t.label not in avge: ## not calculated yet
            is_mono = True
            mono_taxa = None
            while is_mono and t is not lca:
                t = t.parent_node
                taxa_below = [x for x in tre.taxon_namespace.bitmask_taxa_list(t.leafset_bitmask)]
                is_mono = all([i.label in samples for i in taxa_below])
                if (is_mono):
                    mono_taxa = [x for x in taxa_below]
            if mono_taxa:
                pairdist = np.mean([pdm.distance(i, j) for i,j in itertools.combinations(mono_taxa,2)])
                for i in mono_taxa:
                    avge[i.label] = pairdist
            else:
                avge[t.label] = 0.
    return max([v for k,v in avge.items()])

def monophyly_score_from_treefile (filename, shuffle = False):
    tree = dendropy.Tree.get(path=filename, schema="newick", preserve_underscores=True)
    if shuffle:
        x = tree.length()
        tree = dendropy.simulate.treesim.pure_kingman_tree(taxon_namespace=tree.taxon_namespace, pop_size = 5000)
        for edge in tree.preorder_edge_iter():
            edge.length = edge.length/x
    else: 
        tree.reroot_at_midpoint(update_bipartitions=True)
    tree.encode_bipartitions()
    pdm = tree.phylogenetic_distance_matrix() # initialises patristic distance class
    leaf_list = [x.label for x in tree.taxon_namespace]
    freq_species = []
    monoscore = []
    max_avge = []
    for sp in set(sp_list_from_labels(leaf_list)):
        samples_from_sp = taxa_list_from_strain (leaf_list, sp)
        mrca = tree.mrca(taxon_labels=samples_from_sp)
        samples_below_mrca = [x.label for x in tree.taxon_namespace.bitmask_taxa_list(mrca.leafset_bitmask)]
        samples_above_mrca = [x for x in leaf_list if x not in samples_below_mrca]
        try:
            m1 = len(samples_from_sp)/len(samples_below_mrca)
        except:
            m1 = len(samples_from_sp)/len(samples_above_mrca)
        monoscore.append(m1)
        try:
            m1 = 1./len(set(sp_list_from_labels(samples_below_mrca)))
        except:
            m1 = 1./len(set(sp_list_from_labels(samples_above_mrca)))
        freq_species.append(m1)
        max_avge.append(average_lengths_lca (samples_from_sp, tree, pdm, mrca))
        #print (samples_above_mrca)
    return monoscore, freq_species, max_avge

def mono_str(mono_, freq_, mav_): ## was 25 and 50, now 5 and 25
    return '{:7.3f}'.format(np.percentile(mono_, 5)) + \
    '{:7.3f}\t'.format(np.percentile(mono_, 25)) + \
    '{:7.3f}\t\t'.format(sum(np.array(freq_) > 0.999)/float(len(freq_))) + \
    '{:.12f}\t'.format(np.percentile(mav_, 50)) + \
    '{:.12f}\t'.format(np.percentile(mav_, 90))

#mono, fspec = monophyly_score_from_treefile (outfile_list[8]+ "_long.fasta.treefile", shuffle = False)

In [15]:
mono, fspec, mav = monophyly_score_from_treefile (outfile_list[0]+ "_consensus.fasta.treefile", shuffle = True)
print ('{:17s}'.format("random"), "\t", mono_str(mono, fspec, mav))
bplot_label.append("random") ; bplot_hue.append("proportion of monophyletic groups")
bplot_value.append(sum(np.array(fspec) > 0.999)/float(len(fspec)))
vio_mono_x = ["random"] * len(mono)
vio_mono_y = mono; vio_mav_y = mav

suffix1 = ".fasta.treefile"
title1  = "ML    "
#for suffix2, title2 in zip (["_long", "_consensus"], ["long ", "cons "]):
for suffix2, title2 in zip (["_consensus"], ["cons "]):
    suffix = suffix2 + suffix1; title = title2 + title1;
    for fname, rname in zip(outfile_list, [i.split('/')[-1] for i in outfile_list]):
        mono, fspec, mav = monophyly_score_from_treefile (fname + suffix)
        print (title + '{:7s}'.format(rname), "\t", mono_str(mono, fspec, mav))
        if "long" in title:
            bplot_label.append(rname) ; bplot_hue.append("proportion of monophyletic groups")
            bplot_value.append(sum(np.array(fspec) > 0.999)/float(len(fspec)))
            vio_mono_x.extend([rname] * len(mono))
            vio_mono_y.extend(mono); vio_mav_y.extend(mav);

random            	   0.007  0.009	  0.000		0.000000000000	0.000000000000	
cons ML    5S      	   0.125  0.227	  0.538		0.000001440686	0.000012224360	
cons ML    16Sv1v2 	   0.048  0.311	  0.564		0.000007657644	0.003216267316	
cons ML    16Sv3v4 	   0.184  0.396	  0.641		0.000003805867	0.000370485385	
cons ML    23S     	   0.242  1.000	  0.769		0.000739745250	0.005384325574	
cons ML    16S     	   0.220  1.000	  0.744		0.000426328932	0.005402358935	
cons ML    16S5S   	   0.157  1.000	  0.744		0.000198986366	0.004976368835	
cons ML    23S5S   	   0.187  1.000	  0.769		0.000772437650	0.005846017972	
cons ML    16S23S  	   0.187  1.000	  0.795		0.000706283875	0.005578137678	
cons ML    16S23S5S 	   0.276  1.000	  0.795		0.000938054069	0.005353173882	


In [None]:
fig, axes = plt.subplots(3,1) ; fig.set_size_inches(16, 24); 
matplotlib.rcParams['figure.dpi'] = 200
matplotlib.rc('font', weight='bold')
fig.subplots_adjust(top=.91, bottom=.01, left=.02, right=.99, wspace=.2, hspace=.2)

sns.set(); 
sns.set_palette("cubehelix", 12);
sns.boxenplot(x=vio_silho_x, y=vio_silho_y, ax = axes[0], outlier_prop=0.00001)
axes[0].set_ylabel("tree silhouette scores", fontsize=22, weight="bold") # "silhouette scores from patristic distances"
axes[0].set_xticklabels(axes[0].get_xticklabels(),rotation=40)
axes[0].tick_params(labelsize=16)

sns.set(); sns.set_palette("cubehelix", 12)
sns.pointplot(x=vio_mono_x, y=vio_mono_y, ax = axes[1], linestyles="--", errwidth=0.5, color="gray", scale=0.4)
sns.swarmplot(x=vio_mono_x, y=vio_mono_y, ax = axes[1], size=10, linewidth=1)
axes[1].set_ylabel("monophyly scores", fontsize=22, weight="bold")
axes[1].set_xticklabels(axes[1].get_xticklabels(),rotation=40)
axes[1].tick_params(labelsize=16)

sns.set(); sns.set_palette("cubehelix", 12) # "Average distance between samples from best monophyletic subtree"
sns.boxplot(x=vio_mono_x, y=vio_mav_y, ax = axes[2]) 
axes[2].set_ylabel("Average clade distance", fontsize=22, weight="bold")
axes[2].set_xticklabels(axes[2].get_xticklabels(),rotation=40)
axes[2].set_ylim(-0.0001,0.045)
axes[2].tick_params(labelsize=16)
#sns.boxplot(x=rnatypelist, y=monolist_sp, ax = axes[1])  # inner="sticks",
#sns.pointplot(x=rnatypelist, y=monolist_sp, ax = axes[1])  

fig, axes = plt.subplots(1) ; fig.set_size_inches(12, 6); 
fig.subplots_adjust(top=.99, bottom=.01, left=.01, right=.98, wspace=.2, hspace=.2)
sns.set(); sns.set_palette("cubehelix", 12)
sns.barplot(hue=bplot_label, y=bplot_value, x=bplot_hue, ax=axes)
axes.set_ylabel("proportion", fontsize=18, fontweight="bold")
axes.tick_params(labelsize=14)
#axes.set_title("proportion of sample pairs with good silhouettes and proportion of monophyletic groups")