# Effect of paralog choice on rRNA full operons
- Pseudomonas (also can try on Staph)

-> iqtree -s o.fas -wbtl -bb  --> o.fas.uboot will have ultrafast bootstrap trees with branch lengths

In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

from Bio import Seq, SeqIO, Align, AlignIO, Phylo, Alphabet, pairwise2 
from Bio.SeqRecord import SeqRecord
from Bio.Align import AlignInfo, Applications
from Bio.Phylo import draw, TreeConstruction  #TreeConstruction.DistanceCalculator, TreeConstruction.DistanceTreeConstructor 
# https://bioinformatics.stackexchange.com/questions/4337/biopython-phylogenetic-tree-replace-branch-tip-labels-by-sequence-logos

import numpy as np
import seaborn as sns
from sklearn import manifold, metrics, cluster, neighbors, decomposition, preprocessing
import skbio, parasail, dendropy
import sys, gzip, re, glob, pickle, collections, subprocess, os, errno, random, itertools

def print_redblack(textr, textb=""):
    """Print ``textr`` and ``textb`` on one line, each preceded by its own ANSI colour escape.

    ``textr`` follows the ``31`` (red) escape and ``textb`` the ``30`` (black) escape;
    both are converted with ``str()`` and the line ends with the ANSI reset sequence.
    """
    red_on = '\x1b[0;1;31;1m'
    black_on = '\x1b[0;1;30;1m'
    reset = '\x1b[0m'
    print ("".join([red_on, str(textr), black_on, str(textb), reset]))

### defining main variables 

In [3]:
## NOTE: "Mycobacterium" is not truly Gram-positive or Gram-negative
gram_positive = [
    "Clostridium", "Enterococcus", "Listeria",
    "Mycobacterium", "Staphylococcus", "Streptococcus",
]
gram_negative = [
    "Campylobacter", "Escherichia", "Helicobacter", "Leptospira",
    "Neisseria", "Pseudomonas", "Salmonella",
]
#outdir = "/usr/users/QIB_fr005/deolivl/Academic/Quadram/005.metagenomics_70S/notebooks/20190215.013_results/"
bacteria_genus = "Staphylococcus"  # alternative: "Pseudomonas"
# 'outdir' is a path *prefix* that already ends with the genus name; file suffixes are appended to it
outdir = "./014_results/{}".format(bacteria_genus)
rrna_types = ["16S", "23S", "5S"]

### read refseq genomes 

In [4]:
## read species names translation tables and keep only genus+species 
file_name = glob.glob("/media/deolivl/QIB_deolivl/bigdata/" + bacteria_genus + "*/names.txt")[0]
with open(file_name, 'r') as names_handle:  # context manager: the handle is closed even if reading fails
    file_lines = [line.strip() for line in names_handle]
# one list of tab-separated fields per row; the first line of the file is skipped (presumably a header)
thistbl = [line.split('\t') for line in file_lines[1:]]
fnames = glob.glob("/media/deolivl/QIB_deolivl/bigdata/" + bacteria_genus + "*/GCF_*.gbff.gz")
# For every table row, find the first genome file whose path contains the row's first field (the GCF id).
# Rows are paired with their own match, so a row with no file (dropped) or with several files
# (first one used) can no longer shift the pairing of the following rows.
first_match = [next((i for i, fname in enumerate(fnames) if j[0] in fname), None) for j in thistbl]
idx = [i for i in first_match if i is not None]
table_filenames_species = [[fnames[i]] + j for i, j in zip(first_match, thistbl) if i is not None]
# append "Genus_species": first two words of the full name joined by "_", cut at the first "."
table_filenames_species = [line + ["_".join(line[2].split()[:2]).split('.')[0]] for line in table_filenames_species]
# append the genus: first word of the full name
table_filenames_species = [line + [line[2].split()[0]] for line in table_filenames_species]
print_redblack ("table_filenames_species[] has:"," filename,  GCF_id, full species_subsp name, species name (genus+sp), genus\n")

print (table_filenames_species[0], "\n", set([x[4] for x  in table_filenames_species]))

# count genomes per species, ignoring entries whose species name ends in "_sp"
species_counter = collections.Counter([x[3] for x in table_filenames_species if x[3][-3:] !="_sp"])
print_redblack ("distinct species: ", species_counter)

def return_species_name_from_table (table_fs):
    """Build a sequence-name prefix of the form ``<species>_<GCF digits>`` from one table row.

    Parameters:
        table_fs: one row of the genome table; ``table_fs[1]`` must contain an accession
            such as ``GCF_000009005.1`` and ``table_fs[3]`` is the species name.

    Returns:
        ``str(table_fs[3])`` + ``"_"`` + the digits found between ``GCF_`` and the next dot.

    Raises:
        ValueError: if ``table_fs[1]`` contains no ``GCF_<digits>.`` pattern.
    """
    # raw string: '\d' and '\.' are invalid escape sequences in a normal string literal
    match = re.search(r'GCF_(\d+)\.', table_fs[1])
    if match is None:
        raise ValueError("no 'GCF_<digits>.' accession found in " + repr(table_fs[1]))
    return str(table_fs[3]) + "_" + str(match.group(1))

# keep only genomes whose species has more than two counted genomes, then randomise their order
tbl_fl_sp = list(filter(lambda row: species_counter[ row[3] ] > 2, table_filenames_species))
print_redblack({row[3] for row in tbl_fl_sp})
random.shuffle(tbl_fl_sp)

[0;1;31;1mtable_filenames_species[] has:[0;1;30;1m filename,  GCF_id, full species_subsp name, species name (genus+sp), genus
[0m
['/media/deolivl/QIB_deolivl/bigdata/Staphylococcus/GCF_000009005.1_ASM900v1_genomic.gbff.gz', 'GCF_000009005.1', 'Staphylococcus aureus RF122', 'Staphylococcus_aureus', 'Staphylococcus'] 
 {'Staphylococcus'}
[0;1;31;1mdistinct species: [0;1;30;1mCounter({'Staphylococcus_aureus': 362, 'Staphylococcus_epidermidis': 16, 'Staphylococcus_lugdunensis': 10, 'Staphylococcus_simulans': 7, 'Staphylococcus_saprophyticus': 7, 'Staphylococcus_haemolyticus': 6, 'Staphylococcus_pseudintermedius': 6, 'Staphylococcus_warneri': 5, 'Staphylococcus_schleiferi': 5, 'Staphylococcus_xylosus': 4, 'Staphylococcus_argenteus': 4, 'Staphylococcus_nepalensis': 3, 'Staphylococcus_capitis': 3, 'Staphylococcus_equorum': 3, 'Staphylococcus_condimenti': 3, 'Staphylococcus_hyicus': 3, 'Staphylococcus_caprae': 3, 'Staphylococcus_muscae': 2, 'Staphylococcus_cohnii': 2, 'Staphylococcus_hom

### main functions
* find all operons, align sequences

In [5]:
def align_seqs (sequences=None, maxiters=12, infile=None, outfile=None, mafft = True):
    """Align sequences with the external ``mafft`` (default) or ``muscle`` program.

    Parameters:
        sequences: records written with ``SeqIO.write`` as FASTA before aligning. If both
            ``sequences`` and ``infile`` are given, ``infile`` is overwritten with them.
        maxiters: value passed to muscle's ``-maxiters`` (unused by mafft).
        infile: path of the unaligned FASTA file; ``/tmp/in.fas`` is used (and removed
            afterwards) when None. When ``sequences`` is None this existing file is aligned.
        outfile: path of the aligned FASTA file; ``/tmp/out.fas`` is used (and removed
            afterwards) when None.
        mafft: muscle is run only when this is exactly ``False``; otherwise mafft is run.

    Returns:
        The alignment read back with ``AlignIO.read``, or ``[]`` if neither ``sequences``
        nor ``infile`` was given.

    Raises:
        subprocess.CalledProcessError: if the aligner exits with a non-zero status.
    """
    print ("started aligning...", flush=True, end=" ")
    if (sequences is None) and (infile is None):
        print ("ERROR: You must give me an alignment object or file")
        return [] ## OTOH if both are present then infile is overwritten with contents of sequences[]
    ifl = "/tmp/in.fas" if infile is None else infile
    ofl = "/tmp/out.fas" if outfile is None else outfile
    try:
        # only write when there is something to write, so an existing infile can be aligned as is
        if sequences is not None:
            SeqIO.write(sequences, ifl, "fasta")
        # argument lists (no shell) keep paths with spaces or shell metacharacters intact
        if (mafft is False):
            subprocess.check_output(["muscle", "-in", ifl, "-diags", "-maxiters", str(maxiters), "-out", ofl],
                                    universal_newlines=True)
        else: # "--parttree --6merpair" and 0.123 is to avoid very long alignments
            # mafft writes the alignment to stdout, which is sent straight to the output file
            with open(ofl, "w") as out_handle:
                subprocess.check_call(["mafft", "--ep", "0.3", "--op", "3.0", "--auto", ifl], stdout=out_handle)
        aligned = AlignIO.read(ofl, "fasta")
    finally:
        # remove only the files this function created under its default names, even on failure
        for path, is_temporary in ((ifl, infile is None), (ofl, outfile is None)):
            if is_temporary and os.path.exists(path):
                os.remove(path)
    print ("Finished",flush=True)
    return aligned

def return_contig_locations (featurelist, max_gap=1000): # featurelist has list of [ [location1, location2], rnaname, strand ]
    """Group neighbouring features into blocks.

    Features are sorted by their first coordinate. A feature joins the current block when
    the absolute difference between the last coordinate stored in the block and the
    feature's first coordinate is below ``max_gap``; otherwise it starts a new block.

    Parameters:
        featurelist: list of ``[[location1, location2], rnaname, strand]`` entries.
        max_gap: distance threshold for joining a block (default 1000).

    Returns:
        Three parallel lists with one entry per block: the flat list of coordinates, the
        list of names and the list of strands. All three are empty for an empty input.
    """
    rnablocks = []
    rnabnames = []
    rnasignal = []
    for feature in sorted(featurelist, key = lambda x: int(x[0][0])):
        if rnablocks and abs(rnablocks[-1][-1] - feature[0][0]) < max_gap:
            # close enough to the end of the current block: merge into it
            rnablocks[-1].extend(feature[0])
            rnabnames[-1].append(feature[1])
            rnasignal[-1].append(feature[2])
        else:
            # first feature, or too far from the previous one: open a new block
            rnablocks.append(list(feature[0]))
            rnabnames.append([feature[1]])
            rnasignal.append([feature[2]])
    return rnablocks, rnabnames, rnasignal
    
def get_rrna_from_genbank_to_list_and_dict (table_fs):
    """Extract the rRNA blocks (candidate operons) of one genome as sequence records.

    Only the first record of the gzipped GenBank file ``table_fs[0]`` is read. Its ``rRNA``
    features whose product mentions one of ``rrna_types`` are grouped into blocks by
    ``return_contig_locations``; each block is cut from the genome (reverse-complemented
    when the sum of its strands is negative) and kept if its length is between 1000 and
    8500 exclusive.

    Parameters:
        table_fs: one row of the genome table; ``table_fs[0]`` is the file path and the row
            is also passed to ``return_species_name_from_table`` to name the sequences.

    Returns:
        A list of ``SeqRecord`` (possibly empty) named ``<species>_<GCF digits>_<block index>``,
        or None when the file has no record or no usable rRNA annotation.
    """
    # the handle is closed as soon as the first record is parsed
    with gzip.open(table_fs[0], "rt") as handle:
        genome = next(SeqIO.parse(handle, "genbank"), None) # no point in iterating over gbank, only this has whole information
    if genome is None:
        print ("no info from ", table_fs[0])
        return None
    list_of_features = []
    for feature in genome.features:
        if feature.type != "rRNA":
            continue
        product = feature.qualifiers.get('product', [""])[0]
        # first matching type wins; features of an unknown rRNA type are skipped instead of
        # silently inheriting the type of the previous feature
        this_product = next((rna for rna in rrna_types if rna in product), None)
        if this_product is None:
            continue
        this_location = [int(i) for i in re.findall(r'\d+',str(feature.location))]
        # locations containing coordinate 0 are discarded
        if 0 not in this_location:
            list_of_features.append([this_location[:2], this_product, feature.strand])
    # now list_of_features is complete  for this genome record (gbank[0])
    if not list_of_features:
        print ("no info from ", table_fs[0])
        return None # some genomes have no relevant annotation
    location, gnames, strand = return_contig_locations (list_of_features)
    name_prefix = return_species_name_from_table(table_fs)
    list_of_seqs = []
    for i, (l,s) in enumerate(zip(location, strand)):
        contig = genome.seq[min(l):max(l)]
        if sum(s) < 0: ## hopefully they all have same signal
            contig = contig.reverse_complement()
        # the index counts every block, so numbering has gaps where a block is filtered out below
        seqname = name_prefix + '_{:02d}'.format(i)
        if (len(contig) > 1000) and (len(contig) < 8500): # some contigs have only 5S; some genomes have only this 5S annotated
            list_of_seqs.append(SeqRecord(contig, id = seqname, description = seqname))
    return list_of_seqs

### sample genomes and create unaligned list

In [6]:
good_fl_sp = []
rna_operon = []  # includes all paralogs
# only the first 100 (already shuffled) genomes are sampled
for file_counter, tfs in enumerate(tbl_fl_sp[:100]):
    if not (file_counter+1)%20:
        print (str(file_counter+1), end=" ", flush=True)  # progress mark every 20 genomes
    rna_list = get_rrna_from_genbank_to_list_and_dict (tfs)
    # None (no usable annotation) and [] (every block too long/too short) are both skipped
    if rna_list:
        good_fl_sp.append(tfs)
        rna_operon.extend(rna_list)

# save table, remembering that 'outdir' has the genus name already
# NOTE: the table is pickled wrapped in a one-element list
with gzip.open(outdir+"_fl_sp.pickle.gz", "w") as fl:  # closed even if pickling fails
    pickle.dump([good_fl_sp],fl,2)

x = [len(i.seq) for i in rna_operon]
if rna_operon:
    print ("# first sequence of operon alignment:\n", rna_operon[0])
    print ("# percentiles:", np.percentile(x, 5), np.percentile(x, 99), len(x))
else:
    # nothing to summarise: avoid the IndexError / empty-percentile failure
    print ("# no rRNA operons were extracted from the sampled genomes")

20 40 60 80 100 # first sequence of operon alignment:
 ID: Staphylococcus_aureus_900324415_01
Name: <unknown name>
Description: Staphylococcus_aureus_900324415_01
Number of features: 0
Seq('TTTATGGAGAGTTTGATCCTGGCTCAGGATGAACGCTGGCGGCGTGCCTAATAC...GGC', IUPACAmbiguousDNA())
# percentiles: 4924.0 6082.52 442


In [7]:
# align all extracted operon sequences (every paralog of every sampled genome) in a single alignment;
# because 'outfile' is given, the aligned FASTA is also kept on disk as <outdir>.fasta
operon_aligned = align_seqs(sequences=rna_operon, outfile=outdir + ".fasta")

started aligning... Finished


In [8]:
def create_species_genus_labels (seqlist, have_paralogs = False): 
    """Derive label lists from the ``id`` of each sequence (``_``-separated fields).

    Returns ``(species, genus)`` where species is the first two fields joined by ``_`` and
    genus is the first field. With ``have_paralogs`` a third list, ``sample`` (first three
    fields, i.e. 'genus_species_code' of 'genus_species_code_number' names), is also returned.
    """
    def id_prefix(n_fields):
        # first n_fields underscore-separated fields of every sequence id
        return ['_'.join(record.id.split('_')[:n_fields]) for record in seqlist]

    species = id_prefix(2)
    genus = id_prefix(1)
    if not have_paralogs:
        return species, genus
    sample = id_prefix(3)
    return species, genus, sample

### if resuming from previous analysis, run code below to read genome names table and alignments

In [None]:
# read table: the save cell pickles it as a one-element list ([good_fl_sp]),
# so the first element is taken to get the table itself back
with gzip.open(outdir+"_fl_sp.pickle.gz", "r") as fl:
    good_fl_sp = pickle.load(fl)[0]
operon_aligned = AlignIO.read(outdir + ".fasta", "fasta")

In [9]:
# build a tree from the alignment with muscle's -maketree; it is written to <outdir>_upgma.treefile
outcheck = subprocess.check_output("muscle -maketree -in " + outdir + ".fasta -out " + outdir + "_upgma.treefile", shell=True,universal_newlines=True)
# run iqtree-omp on the same alignment (HKY+G model, -nt 8); later cells read its tree from <outdir>.fasta.treefile
outcheck = subprocess.check_output("iqtree-omp -s " + outdir + ".fasta -nt 8 -m HKY+G",shell=True,universal_newlines=True)
# only the iqtree output is shown: the muscle output held in 'outcheck' was overwritten above
print (outcheck)

IQ-TREE multicore version 1.3.11.1 for Linux 64-bit built Feb  1 2016
Copyright (c) 2011-2015 Nguyen Lam Tung, Olga Chernomor, Arndt von Haeseler and Bui Quang Minh.

Host:    n114312.nbi.ac.uk (AVX2, FMA3, 15 GB RAM)
Command: iqtree-omp -s ./014_results/Staphylococcus.fasta -nt 8 -m HKY+G
Seed:    882015 (Using SPRNG - Scalable Parallel Random Number Generator)
Time:    Fri Feb 22 15:48:14 2019
Kernel:  AVX - 8 threads (8 CPU cores detected)

Reading alignment file ./014_results/Staphylococcus.fasta ... Fasta format detected
Alignment most likely contains DNA/RNA sequences
Alignment has 442 sequences with 7296 columns and 1456 patterns (986 informative sites)
                                          Gap/Ambiguity  Composition  p-value
   1  Staphylococcus_aureus_900324415_01         31.29%    passed     99.85%
   2  Staphylococcus_aureus_900324415_02         31.30%    passed     99.87%
   3  Staphylococcus_aureus_900324415_03         29.40%    passed     90.34%
   4  Staphylococcus_a

In [10]:
def patristic_distances_from_treefile (filename, have_paralogs = False, shuffle = False):
    """Compute pairwise leaf-to-leaf distances from a newick tree file.

    With ``shuffle`` the tree read from file is replaced by a ``pure_kingman_tree``
    simulated over the same taxon namespace (``pop_size = 1000``).

    Returns ``(species, genus, distmat, nodemat)``, plus ``sample`` as a fifth element when
    ``have_paralogs`` is set. The label lists follow the order of the taxon namespace and
    are the first two / one / three ``_``-separated fields of each taxon label. ``distmat``
    holds ``pdm.distance`` and ``nodemat`` holds ``pdm.path_edge_count`` for every pair of
    taxa; both are symmetric with a zero diagonal.
    """
    tree = dendropy.Tree.get(path=filename, schema="newick", preserve_underscores=True)
    if shuffle:
        tree = dendropy.simulate.treesim.pure_kingman_tree(taxon_namespace=tree.taxon_namespace, pop_size = 1000)
    taxa = tree.taxon_namespace
    labels = [taxon.label for taxon in taxa] ## follow order of taxon_namespace
    species = ['_'.join(label.split('_')[:2]) for label in labels]
    genus = [label.split('_')[0] for label in labels]
    ntaxa = len(taxa)
    distmat = np.zeros((ntaxa,ntaxa)) # diagonals are zero
    nodemat = np.zeros((ntaxa,ntaxa))
    pdm = tree.phylogenetic_distance_matrix() # initialises class
    for i in range(ntaxa):
        for j in range(i + 1, ntaxa):
            # fill both triangles at once
            branch_length_dist = pdm.distance(taxa[i], taxa[j])
            distmat[i,j] = branch_length_dist
            distmat[j,i] = branch_length_dist
            edge_count = pdm.path_edge_count(taxa[i], taxa[j])
            nodemat[i,j] = edge_count
            nodemat[j,i] = edge_count
    if not have_paralogs:
        return species, genus, distmat, nodemat
    sample = ['_'.join(label.split('_')[:3]) for label in labels]
    return species, genus, distmat, nodemat, sample

In [11]:
def silhouette_str(dist, labels):
    """Summarise per-sample silhouette scores as a fixed-width string.

    ``dist`` is passed to ``metrics.silhouette_samples`` as a precomputed distance matrix.
    The string holds the 5th percentile, the 25th percentile and the fraction of positive
    scores, each formatted as ``{:7.3f}``, followed by two spaces.
    """
    distrib = metrics.silhouette_samples(dist, labels, metric="precomputed")
    pct05 = np.percentile(distrib, 5)
    pct25 = np.percentile(distrib, 25)
    frac_positive = sum(distrib>0)/float(len(distrib))
    return '{:7.3f}{:7.3f}{:7.3f}  '.format(pct05, pct25, frac_positive)

Each triplet from `silhouette_str()` has the $5\%$ and $25\%$ percentiles of the silhouette scores, followed by the proportion of positive silhouettes. We have two triplets on the left for _species_ (S) and two on the right for _strain_ (T); within each pair the first is from weighted (W) and the second from unweighted (U) patristic distances. Therefore the header is: SW SU TW TU

By the way, having low silhouette scores is not necessarily bad, since
1. clusters of size one will have score of _zero_ **by definition** (should be _one_ given the silhouette function but this would lead to ever-increasing clusterings)
2. clusters with within-distance zero are perfect, while for phylogenies we want non-zero diversification
3. they can be negative in a case like `((A:a,A:a,A:a),B:b)` with $a>b$, since $d(A,A)=2a > a+b = d(A,B)$ --- however the taxonomic classification is fine for A and B

In [13]:
# one row per tree: first a shuffled (simulated) tree over the taxa of the ML tree as baseline,
# then the UPGMA and ML trees themselves
## operon doesn't have _long or _consensus
for suffix1, title1, shuffle1 in [(".fasta.treefile", "random", True),
                                  ("_upgma.treefile", "upgma ", False),
                                  (".fasta.treefile", "ML    ", False)]:
    sp, ge, mat, mat2, strain = patristic_distances_from_treefile (outdir + suffix1, have_paralogs = True, shuffle = shuffle1)
    # left of the bar: species labels; right of the bar: strain labels
    species_cols = (silhouette_str(mat, sp), silhouette_str(mat2, sp))
    strain_cols = (silhouette_str(mat, strain), silhouette_str(mat2, strain))
    print (title1, "\t", species_cols[0], species_cols[1], " | ", strain_cols[0], strain_cols[1])

random 	  -0.445 -0.398  0.007    -0.302 -0.223  0.005    |   -0.936 -0.904  0.023    -0.524 -0.438  0.020  
upgma  	  -0.001  0.106  0.948    -0.542 -0.284  0.267    |   -0.756 -0.533  0.090    -0.713 -0.588  0.102  
ML     	  -0.291 -0.206  0.731    -0.558 -0.367  0.183    |   -0.872 -0.782  0.118    -0.615 -0.474  0.143  


In [14]:
def sp_count_from_list (labels):
    """Return how many distinct species (first two ``_``-separated fields) occur in ``labels``."""
    distinct_species = {'_'.join(label.split('_')[:2]) for label in labels}
    return len(distinct_species)

def strain_count_from_list (labels):
    """Return how many distinct strains (first three ``_``-separated fields) occur in ``labels``."""
    distinct_strains = {'_'.join(label.split('_')[:3]) for label in labels}
    return len(distinct_strains)

def bipartitions_from_treefile (filename, shuffle = False):
    """Count species and strains among the leaves of each bipartition of a newick tree.

    With ``shuffle`` the tree read from file is replaced by a ``pure_kingman_tree``
    simulated over the same taxon namespace (``pop_size = 1000``).

    Returns ``(sp_count, strain_count)``: two parallel lists with one entry per encoded
    bipartition whose leaf set has more than one label.
    """
    tree = dendropy.Tree.get(path=filename, schema="newick", preserve_underscores=True)
    if shuffle:
        tree = dendropy.simulate.treesim.pure_kingman_tree(taxon_namespace=tree.taxon_namespace, pop_size = 1000)
    tree.encode_bipartitions()
    namespace = tree.taxon_namespace
    sp_count = []
    strain_count = []
    for bipartition in tree.bipartition_encoding:
        # labels of the taxa selected by this bipartition's leafset bitmask
        leaf_labels = [taxon.label for taxon in namespace.bitmask_taxa_list(bipartition.leafset_bitmask)]
        if len(leaf_labels) > 1:
            sp_count.append(sp_count_from_list(leaf_labels))
            strain_count.append(strain_count_from_list(leaf_labels))
    return sp_count, strain_count

def bipart_str(counts):
    """Summarise a list of counts as a fixed-width string.

    The string holds the mean, the 90th percentile and the fraction of entries equal to 1,
    each formatted as ``{:7.3f}``, followed by a tab.
    """
    distrib = np.array(counts)
    average = np.mean(distrib)
    pct90 = np.percentile(distrib, 90)
    frac_single = sum(distrib == 1)/float(len(distrib))
    return '{:7.3f}{:7.3f}{:7.3f}\t'.format(average, pct90, frac_single)

# NOTE: bipartitions_from_treefile() returns (species counts, strain counts), so the names
# 'ge0' and 'ge' below hold per-bipartition strain counts, not genus counts.
# Baseline row: a shuffled (simulated) tree over the taxa of the ML tree.
sp0, ge0 = bipartitions_from_treefile (outdir + ".fasta.treefile", shuffle = True)
print ('{:7s}'.format("random"), "\t", bipart_str(sp0), bipart_str(ge0))

# Same summary (mean, 90th percentile, fraction equal to 1) for the UPGMA and ML trees.
for suffix1, title1 in zip(["_upgma.treefile",".fasta.treefile"], ["upgma ", "ML    "]):
    sp, ge = bipartitions_from_treefile (outdir + suffix1)    
    print ('{:7s}'.format(title1), "\t", bipart_str(sp), bipart_str(ge))

random  	   2.264  4.000  0.368	   8.275 17.100  0.000	
upgma   	   1.218  2.000  0.843	  14.423 45.000  0.073	
ML      	   1.718  2.000  0.893	  12.759 28.000  0.091	
