# Effect of paralog choice on rRNA full operons

- The analysis below runs on Pseudomonas and Staphylococcus data sets; you can try different ones as long as you download them beforehand

In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

from Bio import Seq, SeqIO, Align, AlignIO, Phylo, Alphabet, pairwise2 
from Bio.SeqRecord import SeqRecord
from Bio.Align import AlignInfo, Applications
from Bio.Phylo import draw, TreeConstruction

import numpy as np
import seaborn as sns
from sklearn import manifold, metrics, cluster, neighbors, decomposition, preprocessing
import skbio, parasail, dendropy, pandas
import sys, gzip, re, glob, pickle, collections, subprocess, os, errno, random, itertools

def print_redblack(textr, textb=""):
    """Print `textr` in bold red followed by `textb` in bold black.

    Both arguments are converted with ``str()``, and the terminal style is
    reset at the end of the printed line.
    """
    red = '\x1b[0;1;31;1m'
    black = '\x1b[0;1;30;1m'
    reset = '\x1b[0m'
    print ("".join([red, str(textr), black, str(textb), reset]))

### defining main variables
- we also assume that all RefSeq genomes are in directory `bigdata/`, in `gbff.gz` format.

In [2]:
##  "Mycobacterium" is not truly positive or negative
gram_positive = ["Clostridium", "Enterococcus", "Listeria", "Mycobacterium", "Staphylococcus", "Streptococcus"]
gram_negative = ["Campylobacter", "Escherichia", "Helicobacter", "Leptospira", 
                 "Neisseria", "Pseudomonas", "Salmonella"]

# genera analysed by the cells below (each one is looped over)
bacteria_genera = ["Pseudomonas", "Staphylococcus"]
# output prefix; it is concatenated directly with the genus name, so keep the trailing slash
outdir = "./014_results/"
# labels searched for inside the `product` qualifier of rRNA features
rrna_types = ["16S", "23S", "5S"];

### read refseq genomes 

In [3]:
def return_species_name_from_table (table_fs):
    """Build the sample label ``<species>_<assembly digits>`` for one table row.

    Parameters
    ----------
    table_fs : sequence
        Row of the genome table. Element 1 holds the assembly accession
        (e.g. ``GCF_000006765.1``) and element 3 the species name.

    Returns
    -------
    str
        Species name joined by ``_`` to the digits found between ``GCF_``
        and the following dot, e.g. ``Pseudomonas_aeruginosa_000006765``.

    Raises
    ------
    ValueError
        If element 1 contains no ``GCF_<digits>.`` pattern.
    """
    # raw string: "\d" and "\." are invalid escapes in a normal string literal
    match = re.search(r'GCF_(\d+)\.', table_fs[1])
    if match is None:
        raise ValueError("cannot find a GCF_<digits>. accession in " + repr(table_fs[1]))
    return str(table_fs[3]) + "_" + str(match.group(1))

def subsample_accept (sp_name, collection_counter):
    """Decide whether one genome of species `sp_name` is kept in the sample.

    Parameters
    ----------
    sp_name : str
        Species label, e.g. ``Pseudomonas_aeruginosa``.
    collection_counter : mapping
        Number of genomes available per species label (indexed with
        `sp_name`).

    Returns
    -------
    bool
        ``False`` for species with fewer than 3 genomes; a random draw for
        the two over-represented species (kept with probability 0.5 and 0.2
        respectively); ``True`` otherwise.
    """
    if collection_counter[sp_name] < 3:
        return False
    # Exact species match. The previous `sp_name in "<literal>"` was a
    # substring test, so e.g. "Pseudomonas" was thinned as well.
    keep_probability = {
        "Pseudomonas_aeruginosa": 0.5,  # 171 against 42 of second more common
        "Staphylococcus_aureus": 0.2,   # 362 against 16 of S. epidermidis
    }
    if sp_name in keep_probability:
        return np.random.random() < keep_probability[sp_name]
    return True

# tbl_fl_sp[genus] is a shuffled list of rows
# [filename, GCF_id, full species_subsp name, species name (genus+sp), genus]
tbl_fl_sp = {}
print_redblack ("table_fl_sp: "," filename,  GCF_id, full species_subsp name, species name (genus+sp), genus\n")
for genus in bacteria_genera:
    file_name = glob.glob("./bigdata/" + genus + "*/names.txt")[0]
    # names.txt is tab-separated with one header line; close it once read
    with open(file_name, 'r') as names_file:
        file_lines = [line.strip() for line in names_file]
    thistbl = [line.split('\t') for line in file_lines[1:] if line]  # blank lines carry no accession
    fnames = glob.glob("./bigdata/" + genus + "*/GCF_*.gbff.gz")
    # Pair every row with the genome file whose name contains its first column.
    # Rows are matched one by one so that a row without a file (or with several
    # candidate files) cannot shift the pairing of the rows that follow it.
    table_filenames_species = []
    for row in thistbl:
        matches = [fname for fname in fnames if row[0] in fname]
        if not matches:
            print ("no genome file found for ", row[0])
            continue
        table_filenames_species.append([matches[0]] + row)
    # species label: first two words of the full name, cut at the first dot
    table_filenames_species = [line + ["_".join(line[2].split()[:2]).split('.')[0]] for line in table_filenames_species]
    # genus label: first word of the full name
    table_filenames_species = [line + [line[2].split()[0]] for line in table_filenames_species]

    print (table_filenames_species[0], "\n", {x[4] for x in table_filenames_species})
    # labels ending in "_sp" are left out of the counter, so subsample_accept() sees a count of zero for them
    species_counter = collections.Counter(x[3] for x in table_filenames_species if x[3][-3:] != "_sp")
    print_redblack (genus + " distinct species: ", species_counter)

    tbl_fl_sp[genus] = [x for x in table_filenames_species if subsample_accept (x[3], species_counter)]
    print_redblack({x[3] for x in tbl_fl_sp[genus]}, " length :: " + str(len(tbl_fl_sp[genus])))
    random.shuffle(tbl_fl_sp[genus])

[0;1;31;1mtable_fl_sp: [0;1;30;1m filename,  GCF_id, full species_subsp name, species name (genus+sp), genus
[0m
['/media/deolivl/QIB_deolivl/bigdata/Pseudomonas/GCF_000006765.1_ASM676v1_genomic.gbff.gz', 'GCF_000006765.1', 'Pseudomonas aeruginosa PAO1', 'Pseudomonas_aeruginosa', 'Pseudomonas'] 
 {'Pseudomonas'}
[0;1;31;1mPseudomonas distinct species: [0;1;30;1mCounter({'Pseudomonas_aeruginosa': 171, 'Pseudomonas_chlororaphis': 42, 'Pseudomonas_syringae': 29, 'Pseudomonas_putida': 28, 'Pseudomonas_fluorescens': 21, 'Pseudomonas_stutzeri': 17, 'Pseudomonas_mendocina': 6, 'Pseudomonas_protegens': 6, 'Pseudomonas_synxantha': 6, 'Pseudomonas_monteilii': 5, 'Pseudomonas_orientalis': 5, 'Pseudomonas_brassicacearum': 4, 'Pseudomonas_frederiksbergensis': 3, 'Pseudomonas_entomophila': 3, 'Pseudomonas_koreensis': 3, 'Pseudomonas_parafulva': 3, 'Pseudomonas_plecoglossicida': 2, 'Pseudomonas_pseudoalcaligenes': 2, 'Pseudomonas_citronellolis': 2, 'Pseudomonas_fragi': 2, 'Pseudomonas_azotoforma

### main functions
* find all operons, align sequences

In [4]:
def align_seqs (sequences=None, maxiters=12, infile=None, outfile=None, mafft = True):
    """Align sequences with MAFFT (default) or MUSCLE and return the alignment.

    Parameters
    ----------
    sequences : iterable of sequence records, optional
        Unaligned records. When given they are written in FASTA format to
        `infile` (overwriting it) or, if `infile` is None, to a temporary file.
    maxiters : int
        Value passed to MUSCLE's ``-maxiters``; not used by MAFFT.
    infile : str, optional
        FASTA file to align. It is read as is when `sequences` is None.
    outfile : str, optional
        Path of the aligned FASTA file. When None a temporary file is used
        and removed before returning.
    mafft : bool
        ``False`` selects MUSCLE; any other value selects MAFFT.

    Returns
    -------
    The object returned by ``AlignIO.read`` for the aligned FASTA file, or an
    empty list when neither `sequences` nor `infile` is given.

    Raises
    ------
    subprocess.CalledProcessError
        If the external aligner exits with a non-zero status.
    """
    import tempfile  # local import: only needed for the scratch files below

    print ("started aligning...", flush=True, end=" ")
    if (sequences is None) and (infile is None):
        print ("ERROR: You must give me an alignment object or file")
        return [] ## OTOH if both are present then infile is overwritten with contents of sequences[]

    scratch = []  # temporary files created here, always removed on exit
    try:
        if infile is None:
            # unique names instead of a fixed /tmp/in.fas, so concurrent runs cannot clash
            handle, ifl = tempfile.mkstemp(suffix=".fas")
            os.close(handle)
            scratch.append(ifl)
        else:
            ifl = infile
        if outfile is None:
            handle, ofl = tempfile.mkstemp(suffix=".fas")
            os.close(handle)
            scratch.append(ofl)
        else:
            ofl = outfile

        # only write when there is something to write; with `infile` alone the
        # existing file is aligned as is
        if sequences is not None:
            SeqIO.write(sequences, ifl, "fasta")

        # argument lists (no shell) so that paths with spaces or shell
        # metacharacters are passed through unchanged
        if (mafft is False):
            subprocess.check_output(["muscle", "-in", ifl, "-diags", "-maxiters", str(maxiters), "-out", ofl],
                                    universal_newlines=True)
        else: # "--parttree --6merpair" and 0.123 is to avoid very long alignments
            with open(ofl, "w") as aligned_file:  # MAFFT writes the alignment to stdout
                subprocess.check_call(["mafft", "--ep", "0.3", "--op", "3.0", "--auto", ifl],
                                      stdout=aligned_file)

        aligned = AlignIO.read(ofl, "fasta")
    finally:
        for path in scratch:
            try:
                os.remove(path)
            except OSError:
                pass
    print ("Finished",flush=True)
    return aligned

def return_contig_locations (featurelist): # featurelist has list of [ [location1, location2], rnaname, strand ]
    """Group neighbouring features into blocks (candidate operons).

    Features are sorted by their first coordinate. A feature joins the current
    block when its first coordinate lies less than 1000 positions away from
    the last coordinate stored for that block; otherwise it opens a new block.

    Parameters
    ----------
    featurelist : list
        Items of the form ``[[start, end], name, strand]``.

    Returns
    -------
    tuple of three lists, with one entry per block
        ``rnablocks`` (flat coordinate list ``[start1, end1, start2, ...]``),
        ``rnabnames`` (feature names) and ``rnasignal`` (strands).
        All three are empty when `featurelist` is empty.
    """
    rnablocks, rnabnames, rnasignal = [], [], []
    if not featurelist:  # nothing to group; previously raised IndexError
        return rnablocks, rnabnames, rnasignal
    for feature in sorted(featurelist, key = lambda x: int(x[0][0])):
        coords = list(feature[0])
        # rnablocks[-1][-1] is the last coordinate of the most recent feature in the open block
        if rnablocks and abs(rnablocks[-1][-1] - feature[0][0]) < 1000:
            rnablocks[-1].extend(coords)
            rnabnames[-1].append(feature[1])
            rnasignal[-1].append(feature[2])
        else:
            rnablocks.append(coords)
            rnabnames.append([feature[1]])
            rnasignal.append([feature[2]])
    return rnablocks, rnabnames, rnasignal
    
def get_rrna_from_genbank_to_list_and_dict (table_fs):
    """Extract the rRNA operon sequences of one genome.

    Only the first record of the gzipped GenBank file ``table_fs[0]`` is read.
    Its ``rRNA`` features are grouped by ``return_contig_locations`` and each
    group is cut out of the genome sequence (reverse-complemented when the
    strands of the group sum to a negative value).

    Parameters
    ----------
    table_fs : sequence
        Row of the genome table; element 0 is the path of the ``gbff.gz``
        file, and the row is also passed to ``return_species_name_from_table``
        to build the sequence names.

    Returns
    -------
    list of SeqRecord or None
        Records named ``<species>_<assembly digits>_<NN>`` (``NN`` is the index
        of the group along the genome) for groups longer than 3000 and shorter
        than 8500 bases; the list may be empty. ``None`` when the file has no
        record or no usable rRNA feature.
    """
    # the context manager closes the gzip handle once the record is in memory
    with gzip.open(table_fs[0], "rt") as handle:
        # no point in iterating over all records, only the first has whole information
        genome = next(SeqIO.parse(handle, "genbank"), None)
    if genome is None:
        print ("no info from ", table_fs[0])
        return None

    list_of_features = []
    for feature in genome.features:
        if feature.type != "rRNA":
            continue
        product = feature.qualifiers.get('product', [""])[0]
        # Label with the first matching rRNA type. An unrecognised product used
        # to reuse the label of the previous feature (or fail with NameError on
        # the first one); it now gets an explicit placeholder.
        this_product = next((rna for rna in rrna_types if rna in product), "unknown")
        this_location = [int(i) for i in re.findall(r'\d+', str(feature.location))]
        if 0 not in this_location:
            list_of_features.append([this_location[:2], this_product, feature.strand])
    # now list_of_features is complete for this genome record
    if not list_of_features:
        print ("no info from ", table_fs[0])
        return None # some genomes have no relevant annotation

    location, _gnames, strand = return_contig_locations (list_of_features)
    sample_name = return_species_name_from_table(table_fs)
    list_of_seqs = []
    for i, (l, s) in enumerate(zip(location, strand)):
        contig = genome.seq[min(l):max(l)]
        if sum(s) < 0: ## hopefully they all have same signal
            contig = contig.reverse_complement()
        # some contigs have only 5S; some genomes have only this 5S annotated
        if 3000 < len(contig) < 8500:
            seqname = sample_name + '_{:02d}'.format(i)
            list_of_seqs.append(SeqRecord(contig, id = seqname, description = seqname))
    return list_of_seqs

### sample genomes and create unaligned list
- also saves variables (intermediate results) to `xxx_fl_sp.pickle.gz` where xxx is the genus name ;) 

In [6]:
# good_fl_sp[genus]: table rows of the genomes that yielded at least one operon,
#   extended with [number of operons, mean, min, max operon length]
# rna_operon[genus]: all operon sequences of those genomes
good_fl_sp = {}
rna_operon = {}  # includes all paralogs
os.makedirs(outdir, exist_ok=True)  # the pickle below cannot be written into a missing directory
for genus in bacteria_genera:
    good_fl_sp[genus] = []
    rna_operon[genus] = []
    for file_counter, tfs in enumerate(tbl_fl_sp[genus]):
        if (len(good_fl_sp[genus]) == 200):  # at most 200 genomes per genus
            break
        if not (file_counter+1)%20:  # progress indicator every 20 files
            print (str(file_counter+1), end=" ", flush=True)
        rna_list = get_rrna_from_genbank_to_list_and_dict (tfs)
        # (some list will have no usable features or b/c too long/too short; some dict has no usable features
        if rna_list:
            x = [len(i.seq) for i in rna_list]
            good_fl_sp[genus].append(tfs + [len(rna_list), np.mean(x), np.min(x), np.max(x)])
            rna_operon[genus].extend(rna_list)
    outfile = outdir + genus
    # save table; note that it is stored wrapped in a one-element list
    with gzip.open(outfile + "_fl_sp.pickle.gz", "w") as fl:
        pickle.dump([good_fl_sp[genus]], fl, 2)
    print_redblack(genus)
    if not rna_operon[genus]:
        # nothing to summarise; indexing [0] and the statistics below would fail
        print ("no operons found for ", genus)
        continue
    print ("first sequence of operon alignment:\n", rna_operon[genus][0])
    x = [len(i.seq) for i in rna_operon[genus]]
    # mean, standard deviation, count | min, 5th percentile, 95th percentile, max
    print ("percentiles:", np.mean(x), np.std(x), len(x), " | ", 
           np.min(x), np.percentile(x, 5), np.percentile(x, 95), np.max(x))

20 40 60 80 100 120 140 160 180 200 [0;1;31;1mPseudomonas[0;1;30;1m[0m
first sequence of operon alignment:
 ID: Pseudomonas_putida_000495455_00
Name: <unknown name>
Description: Pseudomonas_putida_000495455_00
Number of features: 0
Seq('GAACTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACAC...AGA', IUPACAmbiguousDNA())
percentiles: 5183.97395833 205.576786012 960  |  3146 4993.95 5432.05 6952
20 40 60 80 100 120 140 160 [0;1;31;1mStaphylococcus[0;1;30;1m[0m
first sequence of operon alignment:
 ID: Staphylococcus_saprophyticus_001558275_01
Name: <unknown name>
Description: Staphylococcus_saprophyticus_001558275_01
Number of features: 0
Seq('TTTATGGAGAGTTTGATCCTGGCTCAGGATGAACGCTGGCGGCGTGCCTAATAC...GGC', IUPACAmbiguousDNA())
percentiles: 5133.05492958 287.110541743 710  |  3105 4906.0 5974.0 6186


### create spreadsheet for supplementary table

In [8]:
def create_dataframe_from_table_filename_species (table_fs):
    """Convert the per-genome table into a pandas DataFrame.

    Each row of `table_fs` is read by position: 0 file path (only the part
    after the last ``/`` is kept), 1 accession number, 2 organism, 3 species
    label (underscores become spaces), and 5, 6, 7, 8 the number of operons and
    the average, minimum and maximum operon size. Element 4 is not used.
    """
    rows = list(table_fs)
    # (column title, how to obtain the value from one row)
    columns = [
        ("Organism", lambda row: row[2]),
        ("Accession Number", lambda row: row[1]),
        ("File Name", lambda row: row[0].split("/")[-1]),
        ("Species", lambda row: " ".join(row[3].split("_"))),
        ("Number of Operons", lambda row: row[5]),
        ("Average Operon Size", lambda row: row[6]),
        ("Min Operon Size", lambda row: row[7]),
        ("Max Operon Size", lambda row: row[8]),
    ]
    dfd = {title: [extract(row) for row in rows] for title, extract in columns}
    return pandas.DataFrame(data=dfd)

# Write one supplementary table per genus as `<outdir><genus>.csv`.
for genus in bacteria_genera:
    df = create_dataframe_from_table_filename_species (good_fl_sp[genus])
    df.to_csv(outdir + genus + ".csv")

In [10]:
# align each of the dictionary values (lists of unaligned seqs)
# operon_aligned maps genus -> alignment returned by align_seqs(); because an
# `outfile` is given, the aligned FASTA is also kept as `<outdir><genus>.fasta`.
operon_aligned = {}
for genus in bacteria_genera:
    operon_aligned[genus] = align_seqs(sequences=rna_operon[genus], outfile=outdir + genus + ".fasta")

started aligning... Finished
started aligning... Finished


In [11]:
def create_species_genus_labels (seqlist, have_paralogs = False): 
    """Derive taxonomic labels from the ``id`` of every sequence in `seqlist`.

    Ids are split on ``_``: the first field is the genus label and the first
    two fields the species label. With `have_paralogs` the first three fields
    are returned as a third list of sample labels (this assumes
    'genus_species_code_number' names).

    Returns ``(species, genus)`` or ``(species, genus, sample)``.
    """
    def leading_fields(how_many):
        # join the first `how_many` underscore-separated fields of each id
        return ['_'.join(record.id.split('_')[:how_many]) for record in seqlist]

    species = leading_fields(2)
    genus = leading_fields(1)
    if not have_paralogs:
        return species, genus
    sample = leading_fields(3)
    return species, genus, sample

### if resuming from previous analysis, run code below to read genome names table and alignments

In [None]:
# read table
# Recreate both containers so that this cell also works in a fresh session,
# where the cells that normally define them have not been run.
good_fl_sp = {}
operon_aligned = {}
for genus in bacteria_genera:
    # NOTE: pickle must only be used on files written by this notebook.
    # The table was dumped wrapped in a one-element list, hence the [0].
    with gzip.open(outdir + genus + "_fl_sp.pickle.gz", "r") as fl:
        good_fl_sp[genus] = pickle.load(fl)[0]
    # store per genus; assigning to `operon_aligned` itself would replace the
    # whole dictionary with the alignment of the last genus
    operon_aligned[genus] = AlignIO.read(outdir + genus + ".fasta", "fasta")

### estimates trees with UPGMA (muscle) and Likelihood (iqtree)

In [12]:
# For every genus, build two trees from `<outdir><genus>.fasta`:
#   - `<outdir><genus>_upgma.treefile`, written by `muscle -maketree`
#   - the iqtree-omp result under model HKY+G with 8 threads (the cells below
#     read it from `<outdir><genus>.fasta.treefile`)
# Commands are given as argument lists, so no shell is involved and paths
# containing spaces or shell metacharacters are passed through unchanged.
for genus in bacteria_genera:
    fname = outdir + genus
    outcheck = subprocess.check_output(
        ["muscle", "-maketree", "-in", fname + ".fasta", "-out", fname + "_upgma.treefile"],
        universal_newlines=True)
    outcheck = subprocess.check_output(
        ["iqtree-omp", "-s", fname + ".fasta", "-nt", "8", "-m", "HKY+G"],
        universal_newlines=True)

### calculates leaf-to-leaf distances from estimated trees

In [13]:
def patristic_distances_from_treefile (filename, have_paralogs = False, shuffle = False):
    """Read a newick tree and return leaf labels plus leaf-to-leaf distance matrices.

    Parameters
    ----------
    filename : str
        Newick tree file; underscores in the leaf labels are preserved.
    have_paralogs : bool
        When true, also return the sample label (first three ``_``-separated
        fields) of every leaf.
    shuffle : bool
        When true, the tree that was read is replaced by a tree simulated with
        ``pure_kingman_tree`` over the same taxon namespace (``pop_size=1000``).

    Returns
    -------
    ``(species, genus, distmat, nodemat)`` or, with `have_paralogs`,
    ``(species, genus, distmat, nodemat, sample)``. All lists and matrix
    rows/columns follow the order of the tree's taxon namespace. ``distmat``
    holds ``pdm.distance`` and ``nodemat`` holds ``pdm.path_edge_count`` for
    every pair of leaves; both are symmetric with a zero diagonal.
    """
    tree = dendropy.Tree.get(path=filename, schema="newick", preserve_underscores=True)
    if shuffle:
        tree = dendropy.simulate.treesim.pure_kingman_tree(taxon_namespace=tree.taxon_namespace, pop_size = 1000)
    taxa = tree.taxon_namespace
    label_fields = [t.label.split('_') for t in taxa] ## follow order of taxon_namespace
    species = ['_'.join(fields[:2]) for fields in label_fields]
    genus = [fields[0] for fields in label_fields]
    ntaxa = len(taxa)
    distmat = np.zeros((ntaxa, ntaxa)) # diagonals are zero
    nodemat = np.zeros((ntaxa, ntaxa))
    pdm = tree.phylogenetic_distance_matrix() # initialises class
    for i in range(ntaxa):
        for j in range(i + 1, ntaxa):
            first, second = taxa[i], taxa[j]
            weighted = pdm.distance(first, second)
            distmat[i, j] = weighted
            distmat[j, i] = weighted
            edges = pdm.path_edge_count(first, second)
            nodemat[i, j] = edges
            nodemat[j, i] = edges
    if not have_paralogs:
        return species, genus, distmat, nodemat
    sample = ['_'.join(fields[:3]) for fields in label_fields]
    return species, genus, distmat, nodemat, sample

In [14]:
def silhouette_str(dist, labels):
    """Summarise per-sample silhouette scores as a fixed-width string.

    `dist` is passed to ``metrics.silhouette_samples`` as a precomputed
    distance matrix together with `labels`. The returned string holds three
    numbers formatted as ``{:7.3f}``: the 5th percentile, the 25th percentile
    and the fraction of samples with a positive score, followed by two spaces.
    """
    scores = metrics.silhouette_samples(dist, labels, metric="precomputed")
    fifth = np.percentile(scores, 5)
    quartile = np.percentile(scores, 25)
    positive_fraction = sum(scores>0)/float(len(scores))
    pieces = ['{:7.3f}'.format(fifth),
              '{:7.3f}'.format(quartile),
              '{:7.3f}  '.format(positive_fraction)]
    return "".join(pieces)

Each triplet from `silhouette_str()` has $5\%$, $25\%$ and proportion of positive silhouettes. And we have two triplets on the left for _species_ (S), and two on the right for _strain_ (T), one being from weighted (W) and one from unweighted (U) patristic distances. Therefore the header is: SW SU TW TU

By the way, having low silhouette scores is not necessarily bad, since
1. clusters of size one will have score of _zero_ **by definition** (should be _one_ given the silhouette function but this would lead to ever-increasing clusterings)
2. clusters with within-distance zero are perfect, while for phylogenies we want non-zero diversification
3. they can be negative in a case like `((A:a,A:a,A:a),B:b)` with $a>b$, since $d(A,A)=2a > a+b = d(A,B)$ --- however the taxonomic classification is fine for A and B

In [15]:
# For every genus print one row of silhouette summaries per tree: a randomly
# simulated tree first, then the UPGMA and the ML tree. Each row has the
# species-level summaries (weighted, unweighted) left of the bar and the
# strain-level ones right of it.
for genus in bacteria_genera:
    fname = outdir + genus
    print_redblack(genus)
    ## operon doesn't have _long or _consensus
    # (row title, tree file suffix, replace the tree by a simulated one?)
    runs = [("random", ".fasta.treefile", True),
            ("upgma ", "_upgma.treefile", False),
            ("ML    ", ".fasta.treefile", False)]
    for title1, suffix1, shuffled in runs:
        sp, ge, mat, mat2, strain = patristic_distances_from_treefile (fname + suffix1, have_paralogs = True,
                                                                       shuffle = shuffled)
        by_species = [silhouette_str(mat, sp), silhouette_str(mat2, sp)]
        by_strain = [silhouette_str(mat, strain), silhouette_str(mat2, strain)]
        print (title1, "\t", by_species[0], by_species[1], " | ", by_strain[0], by_strain[1])

[0;1;31;1mPseudomonas[0;1;30;1m[0m
random 	  -0.324 -0.233  0.015    -0.152 -0.119  0.011    |   -0.978 -0.906  0.009    -0.603 -0.427  0.013  
upgma  	  -0.450  0.340  0.848    -0.468 -0.057  0.707    |   -0.821 -0.250  0.551    -0.618 -0.221  0.616  
ML     	  -0.644  0.313  0.827    -0.442 -0.058  0.656    |   -0.851 -0.356  0.553    -0.535 -0.222  0.595  
[0;1;31;1mStaphylococcus[0;1;30;1m[0m
random 	  -0.615 -0.457  0.037    -0.177 -0.118  0.028    |   -0.959 -0.830  0.011    -0.631 -0.410  0.007  
upgma  	   0.007  0.234  0.968    -0.482 -0.248  0.475    |   -0.850 -0.430  0.087    -0.765 -0.529  0.118  
ML     	  -0.300  0.444  0.783    -0.294  0.003  0.761    |   -0.883 -0.795  0.083    -0.596 -0.449  0.118  


### the code below is obsolete, it just counts how many species are found below each node
- please see notebook `021` for an up-to-date version, with proper monophyly scores

In [None]:
def sp_count_from_list (labels):
    """Return how many distinct species occur in `labels`.

    The species of a label is given by its first two ``_``-separated fields.
    """
    return len({'_'.join(label.split('_')[:2]) for label in labels})

def strain_count_from_list (labels):
    """Return how many distinct strains occur in `labels`.

    The strain of a label is given by its first three ``_``-separated fields.
    """
    strains = set()
    for label in labels:
        strains.add('_'.join(label.split('_')[:3]))
    return len(strains)

def bipartitions_from_treefile (filename, shuffle = False):
    """Count species and strains among the leaves of every bipartition of a tree.

    The newick tree in `filename` is read (or, with `shuffle`, replaced by a
    tree simulated with ``pure_kingman_tree`` over the same taxon namespace).
    For every encoded bipartition whose leaf set holds more than one label,
    the number of distinct species and of distinct strains is recorded.

    Returns two parallel lists: ``sp_count`` and ``strain_count``.
    """
    tree = dendropy.Tree.get(path=filename, schema="newick", preserve_underscores=True)
    if shuffle:
        tree = dendropy.simulate.treesim.pure_kingman_tree(taxon_namespace=tree.taxon_namespace, pop_size = 1000)
    tree.encode_bipartitions()
    sp_count = []
    strain_count = []
    for bipartition in tree.bipartition_encoding:
        # labels of the leaves associated to this bipartition (edge)
        members = [x.label for x in tree.taxon_namespace.bitmask_taxa_list(bipartition.leafset_bitmask)]
        if len(members) <= 1:
            continue
        sp_count.append(sp_count_from_list(members))
        strain_count.append(strain_count_from_list(members))
    return sp_count, strain_count

def bipart_str(counts):
    """Summarise a list of counts as a fixed-width string.

    The string holds three numbers formatted as ``{:7.3f}``: the mean, the
    90th percentile and the fraction of entries equal to one, followed by a
    tab character.
    """
    values = np.array(counts)
    average = np.mean(values)
    ninetieth = np.percentile(values, 90)
    fraction_of_ones = sum(values == 1)/float(len(values))
    pieces = ['{:7.3f}'.format(average),
              '{:7.3f}'.format(ninetieth),
              '{:7.3f}\t'.format(fraction_of_ones)]
    return "".join(pieces)

# For every genus print the bipartition summaries (species counts, then strain
# counts) for a randomly simulated tree, the UPGMA tree and the ML tree.
# The tree files are named `<outdir><genus><suffix>`; the genus was missing
# from the paths before, so they pointed to files that no other cell creates.
for genus in bacteria_genera:
    fname = outdir + genus
    print_redblack(genus)
    sp0, st0 = bipartitions_from_treefile (fname + ".fasta.treefile", shuffle = True)
    print ('{:7s}'.format("random"), "\t", bipart_str(sp0), bipart_str(st0))

    for suffix1, title1 in zip(["_upgma.treefile",".fasta.treefile"], ["upgma ", "ML    "]):
        sp, st = bipartitions_from_treefile (fname + suffix1)
        print ('{:7s}'.format(title1), "\t", bipart_str(sp), bipart_str(st))