# Effect of paralog choice on rRNA full operons

- The analysis below runs on Pseudomonas and Staphylococcus data sets; you can try different ones, using the files downloaded as described in 026.concat_resolution.ipynb
- BTW, please see 026.concat_resolution.ipynb for more details in case anything is not clear here.

In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

from Bio import Seq, SeqIO, Align, AlignIO, Phylo, Alphabet, pairwise2 
from Bio.SeqRecord import SeqRecord
from Bio.Align import AlignInfo, Applications
from Bio.Phylo import draw, TreeConstruction

import numpy as np
import seaborn as sns
from sklearn import manifold, metrics, cluster, neighbors, decomposition, preprocessing
import skbio, parasail, dendropy, pandas
import sys, gzip, re, glob, pickle, collections, subprocess, os, errno, random, itertools

def print_redblack(textr, textbb="", textbl=""):
    """Print up to three fragments on one line, each preceded by its own ANSI escape code.

    textr  -- printed after the escape code '\\x1b[0;1;31;1m' (bold, red foreground)
    textbb -- printed after the escape code '\\x1b[0;1;30;1m' (bold, black foreground)
    textbl -- printed after the escape code '\\x1b[0;0;30;0m' (black foreground, not bold)

    Every argument goes through str(), so non-string objects are accepted.
    The line ends with the reset code '\\x1b[0m'.
    """
    pieces = [
        '\x1b[0;1;31;1m', str(textr),
        '\x1b[0;1;30;1m', str(textbb),
        '\x1b[0;0;30;0m', str(textbl),
        '\x1b[0m',
    ]
    print ("".join(pieces))

### defining main variables
- we also assume that all RefSeq genomes are in directory `bigdata/`, in `gbff.gz` format.
- Here we apply a somewhat 'aggressive' subsampling (typically 1 or 2 samples per species) since we are not interested in quantitative analysis. We just want to show a case where different copies from the same strain cluster in distinct subtrees. 

In [2]:
# Full set of genera previously downloaded from RefSeq.
# Kept for reference only: this variable is not used in practice.
all_genera = [
    "Campylobacter", "Enterococcus", "Klebsiella", "Listeria", "Neisseria",
    "Staphylococcus", "Vibrio", "Clostridium", "Escherichia", "Helicobacter",
    "Leptospira", "Mycobacterium", "Pseudomonas", "Salmonella", "Streptococcus",
]
# Genera for which we will in fact collect all paralogs (rRNA copies)
bacteria_genera = ["Pseudomonas", "Staphylococcus"]
# Prefix (directory, with trailing slash) for every file written by this notebook
outdir = "./024_results/"
# rRNA gene names searched for in the 'product' qualifier of rRNA features
rrna_types = ["16S", "23S", "5S"]

In [6]:
def species_list_from_binomial (binomial_str):
    """Return the first two whitespace-separated words of `binomial_str` as a list.

    Only two words are kept because the name may be followed by strain or
    serovar information. Shorter inputs yield a shorter list.
    """
    words = binomial_str.split()
    return words[0:2]

## this function will generate leaf names on trees (default, below, is GTDB _ accession_number)
def species_name_from_binomial (binomial_str, accession_str=None):
    """Build a dot-delimited leaf name from a species name and, optionally, an accession.

    binomial_str  -- species name; only its first two whitespace-separated words
                     are kept (anything after them, e.g. strain, is dropped).
    accession_str -- optional string containing 'GCF_<digits>.'; only the digits
                     are appended to the name (e.g. 'GCF_000465235.1' -> '000465235').

    Returns e.g. 'Pseudomonas_E.avellanae.000465235'. Fields are joined with '.'
    because '_' is already used by GTDB inside the names themselves.

    Raises ValueError if `accession_str` is given but has no 'GCF_<digits>.' part.
    """
    binstr = ".".join(binomial_str.split()[:2])
    if accession_str:
        # raw string: '\d' in a plain string literal is an invalid escape sequence
        match = re.search(r'GCF_(\d+)\.', accession_str)
        if match is None:
            raise ValueError("accession without 'GCF_<digits>.' pattern: " + repr(accession_str))
        binstr += "." + match.group(1)
    return binstr

def subsample_accept_genus (sp_name, list_of_genera = bacteria_genera):
    """Return True if any genus name in `list_of_genera` occurs as a substring of `sp_name`.

    The match is a plain substring test, so a name such as 'Pseudomonas_E avellanae'
    is accepted for the genus 'Pseudomonas'. Returns False for an empty list.
    """
    return any(genus_name in sp_name for genus_name in list_of_genera)

def subsample_accept_counter (sp_name, collection_counter):
    """Randomly decide whether a sample of species `sp_name` is kept.

    sp_name            -- species name, used as key into `collection_counter`.
    collection_counter -- mapping from species name to its number of samples
                          (e.g. a collections.Counter).

    Returns False if the species is absent from the counter or its count is
    below 1. Otherwise the sample is kept with probability 2/count (via
    np.random.random()), so a species is always kept when it has one or two
    samples, and larger species are thinned down.
    """
    if sp_name not in collection_counter:
        return False
    n_samples = collection_counter[sp_name]
    if n_samples < 1:  ## threshold used only in 026, not here (since we may have several paralogs even for one sample)
        return False
    # in practice only subsamples P. aeruginosa, S. aureus, and maybe a few others
    return np.random.random() < 2/n_samples

### table with taxonomic classification

In [4]:
gbff_dir = "/media/deolivl/QIB_deolivl/bigdata/"   ## directory (or directories, in our case) with refseq genomes
#gbff_dir = "./bigdata/"

# Read the ';'-separated GTDB table; the context manager closes the file, and blank lines are dropped
with open("./bigdata/gtdb_list.csv", 'r') as gtdb_file:
    file_lines = [line.strip() for line in gtdb_file if line.strip()]
print_redblack("header: <file>  ", file_lines[0].split(';')[1:]) ## first column is data.frame index from R

table_filenames_species = [line.split(';')[1:] for line in file_lines[1:]] ## skip first line, with headers
print ("first element of list: ", table_filenames_species[0], "\n")

# Pair every table row with its genome file. At this point row[0] is the accession;
# once the file name is prepended the accession becomes row[1].
# Rows are matched one by one so that an accession with no file (or with several files)
# cannot shift the pairing of the remaining rows: rows without a file are reported and
# skipped, and the first matching file is used when there are several.
fnames = glob.glob(gbff_dir + "*/GCF_*.gbff.gz")
rows_with_files = []
for row in table_filenames_species:
    matching_file = next((fname for fname in fnames if row[0] in fname), None)
    if matching_file is None:
        print ("no genome file found for ", row[0])
        continue
    rows_with_files.append([matching_file] + row)
table_filenames_species = rows_with_files
print ("first element, with file location: ", table_filenames_species[0])

[0;1;31;1mheader: <file>  [0;1;30;1m['gtdb_taxonomy', 'lsu_silva_23s_taxonomy', 'ncbi_organism_name', 'ncbi_taxonomy', 'ssu_silva_taxonomy'][0;0;30;0m[0m
first element of list:  ['GCF_000465235.1', 'Campylobacter_D coli', 'Campylobacter coli CVM N29710', 'Campylobacter coli CVM N29710', 'Campylobacter coli', 'Campylobacter jejuni 30318'] 

first element, with file location:  ['/media/deolivl/QIB_deolivl/bigdata/Campylobacter/GCF_000465235.1_ASM46523v1_genomic.gbff.gz', 'GCF_000465235.1', 'Campylobacter_D coli', 'Campylobacter coli CVM N29710', 'Campylobacter coli CVM N29710', 'Campylobacter coli', 'Campylobacter jejuni 30318']


### subsample from database, for selected genera

In [7]:
# tbl_fl_sp maps each genus to the (randomly subsampled) table rows of that genus
tbl_fl_sp = {}
for genus in bacteria_genera:
    # keep only the rows whose GTDB name (column 2) belongs to the current, _arbitrarily_ chosen, genus
    tbl_tmp = [row for row in table_filenames_species
               if subsample_accept_genus (row[2], list_of_genera=[genus])]
    # number of samples per species, leaving out the generic "sp." entries
    species_counter = collections.Counter(row[2] for row in tbl_tmp if "sp." not in row[2])
    print_redblack ("\nMost common species in {} (from all samples): ".format(genus), "",
                    species_counter.most_common(10))

    # random subsampling: only species present in the counter can be accepted
    tbl_fl_sp[genus] = [row for row in table_filenames_species
                        if subsample_accept_counter (row[2], species_counter)]
    tmp_counter = collections.Counter(row[2] for row in tbl_fl_sp[genus])
    print_redblack("\nFinal set with {} samples ".format(len(tbl_fl_sp[genus])),
                   "({} species) ".format(len(tmp_counter)), tmp_counter)

[0;1;31;1m
Most common species in Pseudomonas (from all samples): [0;1;30;1m[0;0;30;0m[('Pseudomonas aeruginosa', 122), ('Pseudomonas_E avellanae', 13), ('Pseudomonas_E hunanensis', 11), ('Pseudomonas_E ficuserectae', 7), ('Pseudomonas_E monteilii_B', 6), ('Pseudomonas_A stutzeri', 5), ('Pseudomonas_E syringae_M', 5), ('Pseudomonas_E protegens', 5), ('Pseudomonas_E piscium', 5), ('Pseudomonas aeruginosa_A', 4)][0m
[0;1;31;1m
Final set with 150 samples [0;1;30;1m(113 species) [0;0;30;0mCounter({'Pseudomonas_E avellanae': 5, 'Pseudomonas_A stutzeri': 4, 'Pseudomonas_E hunanensis': 3, 'Pseudomonas_E syringae': 3, 'Pseudomonas_E cerasi': 3, 'Pseudomonas_E taiwanensis': 3, 'Pseudomonas aeruginosa_A': 2, 'Pseudomonas_E fluorescens_AR': 2, 'Pseudomonas_E simiae': 2, 'Pseudomonas_E lurida': 2, 'Pseudomonas_E sp002966775': 2, 'Pseudomonas_E sp000346225': 2, 'Pseudomonas_E mendocina_A': 2, 'Pseudomonas_E mendocina': 2, 'Pseudomonas_E putida_P': 2, 'Pseudomonas_E monteilii_B': 2, 'Pseudomo

### main functions
* find all operons, align sequences

In [8]:
def align_seqs (sequences=None, maxiters=12, infile=None, outfile=None, mafft = True):
    """Align sequences with MAFFT (default) or MUSCLE and return the alignment.

    sequences -- iterable of SeqRecord objects to align; may be None if `infile`
                 already holds the unaligned FASTA sequences.
    maxiters  -- value of MUSCLE's -maxiters option (ignored by MAFFT).
    infile    -- path of the unaligned FASTA file. If `sequences` is given, the
                 file is (over)written with them; if None, a temporary file is used.
    outfile   -- path where the aligned FASTA is written and kept; if None, a
                 temporary file is used and removed afterwards.
    mafft     -- MUSCLE is used only when this is exactly False; otherwise MAFFT.

    Returns the alignment read back with AlignIO, or [] (after printing an error)
    if neither `sequences` nor `infile` is given.
    Raises subprocess.CalledProcessError if the aligner exits with an error.
    """
    import tempfile  # only this function needs it

    print ("started aligning...", flush=True, end=" ")
    if (sequences is None) and (infile is None):
        print ("ERROR: You must give me an alignment object or file")
        return []

    ifl, ofl = infile, outfile
    try:
        # unique temporary files instead of fixed /tmp names, so concurrent runs cannot clash
        if ifl is None:
            handle, ifl = tempfile.mkstemp(suffix=".fas")
            os.close(handle)
        if ofl is None:
            handle, ofl = tempfile.mkstemp(suffix=".fas")
            os.close(handle)
        # write only when there is something to write; with infile alone the file is used as is
        if sequences is not None:
            SeqIO.write(sequences, ifl, "fasta")
        # commands are passed as argument lists (no shell), so paths with spaces are safe
        if (mafft is False):
            subprocess.check_output(["muscle", "-in", ifl, "-diags", "-maxiters", str(maxiters), "-out", ofl],
                                    universal_newlines=True)
        else: # "--parttree --6merpair" and 0.123 is to avoid very long alignments
            # mafft prints the alignment on stdout, which is sent to the output file
            with open(ofl, "w") as aligned_handle:
                subprocess.check_call(["mafft", "--ep", "0.3", "--op", "3.0", "--auto", ifl],
                                      stdout=aligned_handle)
        aligned = AlignIO.read(ofl, "fasta")
    finally:
        # remove only the files created here, even if the aligner failed
        for path, is_temporary in ((ifl, infile is None), (ofl, outfile is None)):
            if is_temporary and path is not None and os.path.exists(path):
                os.remove(path)
    print ("Finished",flush=True)
    return aligned

def return_contig_locations (featurelist, max_gap = 1000): # featurelist has list of [ [location1, location2], rnaname, strand ]
    """Group neighbouring features into blocks (candidate operons).

    featurelist -- list of [[start, end], name, strand] entries, in any order.
    max_gap     -- a feature joins the current block when the absolute difference
                   between its start and the last coordinate stored in the block
                   is smaller than this value; otherwise it opens a new block.

    Features are first sorted by start coordinate. Returns three parallel lists
    with one entry per block:
      rnablocks -- flat list of all coordinates [start1, end1, start2, end2, ...]
      rnabnames -- list of feature names, in sorted order
      rnasignal -- list of strands, in sorted order
    An empty `featurelist` gives three empty lists.
    """
    rnablocks, rnabnames, rnasignal = [], [], []
    if not featurelist:
        return rnablocks, rnabnames, rnasignal
    for feature in sorted(featurelist, key = lambda x: int(x[0][0])):
        # compare against the last coordinate added to the block being built
        opens_new_block = (not rnablocks) or (abs(rnablocks[-1][-1] - feature[0][0]) >= max_gap)
        if opens_new_block:
            rnablocks.append(list(feature[0]))
            rnabnames.append([feature[1]])
            rnasignal.append([feature[2]])
        else:
            rnablocks[-1].extend(list(feature[0]))
            rnabnames[-1].append(feature[1])
            rnasignal[-1].append(feature[2])
    return rnablocks, rnabnames, rnasignal
    
def get_rrna_from_genbank_to_list_and_dict (table_fs):
    """Extract all full rRNA operons (paralogs) from one gzipped GenBank genome.

    table_fs -- one table row; table_fs[0] is the path of the gbff.gz file,
                table_fs[1] the accession number and table_fs[2] the GTDB name
                (the last two are used to build the sequence names).

    Only the first record of the file is read. rRNA features whose 'product'
    mentions one of `rrna_types` are grouped into blocks of neighbouring
    features; a block is kept if it is longer than 3000 and shorter than 8500
    bases and its genes contain the pattern 16S-23S-5S in this order.

    Returns a list of SeqRecord objects (possibly empty), named
    '<species name>.<accession digits>.<two-digit block number>', or None if
    the file has no record or no usable rRNA annotation.
    """
    # the context manager closes the gzip handle once the record has been read
    with gzip.open(table_fs[0], "rt") as gbff_handle:
        # no point in iterating over all records, only the first has whole information
        genome = next(SeqIO.parse(gbff_handle, "genbank"), None)
    if genome is None:
        print ("no info from ", table_fs[0])
        return None

    list_of_features = []
    for feature in genome.features:
        if feature.type != "rRNA":
            continue
        product = feature.qualifiers.get('product', [""])[0]
        # skip rRNA features matching none of the types, instead of reusing the
        # product name of a previous feature (or failing on an undefined name)
        this_product = next((rna for rna in rrna_types if rna in product), None)
        if this_product is None:
            continue
        this_location = [int(i) for i in re.findall(r'\d+', str(feature.location))]
        if 0 not in this_location:
            list_of_features.append([this_location[:2], this_product, feature.strand])
    # now list_of_features is complete for this genome record
    if not list_of_features:
        print ("no info from ", table_fs[0])
        return None # some genomes have no relevant annotation

    location, gnames, strand = return_contig_locations (list_of_features)
    # table_fs[2] = GTDB, 3=lsuSILVA, 5=NCBI, 6=ssuSILVA; table_fs[1] = accession number
    seqprefix = species_name_from_binomial (table_fs[2], table_fs[1])
    list_of_seqs = []
    for i, (l,g,s) in enumerate(zip(location, gnames, strand)):
        i_start = min(l); i_end = max(l)
        if sum(s) < 0: ## hopefully they all have same signal
            contig = genome.seq[i_start:i_end].reverse_complement()
            seqgenes = "s" + "".join(g[::-1])
        else:
            contig = genome.seq[i_start:i_end]
            seqgenes = "s" + "".join(g)
        # some contigs have only 5S or only this 5S annotated
        if not ((len(contig) > 3000) and (len(contig) < 8500)):
            continue
        # only full operon, in proper order (to avoid e.g. 16S16S...)
        if "16S23S5S" not in seqgenes:
            continue
        seqname = seqprefix + '.{:02d}'.format(i) ## unlike pilot, here dot is the field delimiter
        list_of_seqs.append(SeqRecord(contig, id = seqname, description = seqname))
    return list_of_seqs

### sample genomes and create unaligned list
- also saves variables (intermediate results) to `xxx_fl_sp.pickle.gz` where xxx is the genus name ;) 

In [9]:
good_fl_sp = {}  # per genus: table rows of the genomes with at least one usable operon
rna_operon = {}  # includes all paralogs
os.makedirs(outdir, exist_ok=True)  # results are written below; make sure the directory exists
for genus in bacteria_genera:
    good_fl_sp[genus] = []
    rna_operon[genus] = []
    for file_counter, tfs in enumerate(tbl_fl_sp[genus]):
        if (len(good_fl_sp[genus]) == 200):  # at most 200 genomes per genus
            break
        if not (file_counter+1)%20:  # progress indicator, every 20 files
            print (str(file_counter+1), end=" ", flush=True)
        rna_list = get_rrna_from_genbank_to_list_and_dict (tfs)
        # (some genomes have no usable features, or only operons that are too long/too short)
        if rna_list:
            x = [len(i.seq) for i in rna_list]
            # extra columns: number of operons, then mean, min and max operon length
            good_fl_sp[genus].append(tfs + [len(rna_list), np.mean(x), np.min(x), np.max(x)])
            rna_operon[genus].extend(rna_list)
    outfile = outdir + genus
    # save table; note that it is stored wrapped in a one-element list
    with gzip.open(outfile + "_fl_sp.pickle.gz", "w") as fl:
        pickle.dump([good_fl_sp[genus]],fl,2)
    print_redblack(genus)
    if not rna_operon[genus]:
        print ("no full operon found for ", genus)
        continue
    print ("first sequence of operon alignment:\n", rna_operon[genus][0])
    x = [len(i.seq) for i in rna_operon[genus]]
    print ("percentiles:", np.mean(x), np.std(x), len(x), " | ", 
           np.min(x), np.percentile(x, 5), np.percentile(x, 95), np.max(x))

20 40 60 80 100 120 140 [0;1;31;1mPseudomonas[0;1;30;1m[0;0;30;0m[0m
first sequence of operon alignment:
 ID: Pseudomonas.aeruginosa_A.003025345.00
Name: <unknown name>
Description: Pseudomonas.aeruginosa_A.003025345.00
Number of features: 0
Seq('GAACTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACAC...AGC', IUPACAmbiguousDNA())
percentiles: 5215.84779706275 126.62942079984022 749  |  4957 4994.799999999999 5441.0 7127
20 40 [0;1;31;1mStaphylococcus[0;1;30;1m[0;0;30;0m[0m
first sequence of operon alignment:
 ID: Staphylococcus.aureus.001018685.00
Name: <unknown name>
Description: Staphylococcus.aureus.001018685.00
Number of features: 0
Seq('TTTATGGAGAGTTTGATCCTGGCTCAGGATGAACGCTGGCGGCGTGCCTAATAC...GGC', IUPACAmbiguousDNA())
percentiles: 5119.324468085107 289.88500294638857 188  |  4887 4906.35 5993.9 6211


### create spreadsheet for supplementary table

In [10]:
def create_dataframe_from_table_filename_species (table_fs):
    """Turn the list of table rows into a pandas DataFrame for the supplementary table.

    Each row of `table_fs` is indexed as follows: 0 = file path (only the part
    after the last '/' is kept), 1 = accession number, 2 = GTDB species name
    (underscores are replaced by spaces), 4 = NCBI organism name, and
    7, 8, 9, 10 = number of operons and their mean, min and max length.
    """
    columns = {
        "Organism (NCBI)":     [row[4] for row in table_fs],  ## species=[5], but organism=[4]
        "Accession Number":    [row[1] for row in table_fs],
        "File Name":           [row[0].rsplit("/", 1)[-1] for row in table_fs],
        "Species (GTDB)":      [row[2].replace("_", " ") for row in table_fs],
        "Number of Operons":   [row[7] for row in table_fs],
        "Average Operon Size": [row[8] for row in table_fs],
        "Min Operon Size":     [row[9] for row in table_fs],
        "Max Operon Size":     [row[10] for row in table_fs],
    }
    return pandas.DataFrame(data=columns)

# one CSV spreadsheet per genus, written as <outdir><genus>.csv
for genus in bacteria_genera:
    supplementary_csv = "{}{}.csv".format(outdir, genus)
    df = create_dataframe_from_table_filename_species (good_fl_sp[genus])
    df.to_csv(supplementary_csv)

In [11]:
# Align the unaligned operon sequences of each genus.
# The aligned FASTA is kept as <outdir><genus>.fasta and the alignment object
# is stored in operon_aligned under the genus name.
operon_aligned = {}
for genus in bacteria_genera:
    aligned_fasta = "{}{}.fasta".format(outdir, genus)
    unaligned_records = rna_operon[genus]
    operon_aligned[genus] = align_seqs(sequences=unaligned_records, outfile=aligned_fasta)

started aligning... Finished
started aligning... Finished


In [12]:
def create_species_genus_labels (seqlist, have_paralogs = False): 
    """Derive species, genus and (optionally) sample labels from sequence ids.

    The `id` of every element of `seqlist` is split at dots. The first field is
    the genus label; the first two fields joined by '_' are the species label;
    with `have_paralogs` the first three fields joined by '_' are the sample label.

    Returns (species, genus), or (species, genus, sample) if `have_paralogs`.
    """
    id_fields = [record.id.split('.') for record in seqlist]
    species = ['_'.join(fields[:2]) for fields in id_fields]
    genus = [fields[0] for fields in id_fields]
    if not have_paralogs:
        return species, genus
    sample = ['_'.join(fields[:3]) for fields in id_fields]
    return species, genus, sample

### if resuming from previous analysis, run code below to read genome names table and alignments

In [None]:
# read table and alignment of every genus from the files saved by a previous run
# (both dictionaries are created here, so this cell also works in a fresh session)
good_fl_sp = {}
operon_aligned = {}
for genus in bacteria_genera:
    with gzip.open(outdir + genus + "_fl_sp.pickle.gz", "r") as fl:
        # the table was pickled wrapped in a one-element list, thus the [0]
        good_fl_sp[genus] = pickle.load(fl)[0]
    # one alignment per genus (do not overwrite the whole dictionary)
    operon_aligned[genus] = AlignIO.read(outdir + genus + ".fasta", "fasta")

### estimates trees with UPGMA (muscle) and Likelihood (iqtree)

In [13]:
# Commands are given as argument lists (no shell), so file names with spaces or
# shell metacharacters cannot break or alter them.
# UPGMA tree with muscle, written to <outdir><genus>_upgma.treefile
for genus in bacteria_genera:
    fname = outdir + genus
    outcheck = subprocess.check_output(["muscle", "-maketree", "-in", fname + ".fasta",
                                        "-out", fname + "_upgma.treefile"],
                                       universal_newlines = True)
# Likelihood tree with iqtree (HKY+G model, 8 threads); the tree is later read
# from <outdir><genus>.fasta.treefile
for genus in bacteria_genera:
    fname = outdir + genus
    outcheck = subprocess.check_output(["iqtree", "-redo", "-s", fname + ".fasta", "-nt", "8", "-m", "HKY+G"],
                                       universal_newlines = True)

### calculates leaf-to-leaf distances from estimated trees

In [None]:
def patristic_distances_from_treefile (filename, have_paralogs = False, shuffle = False, field_sep = "."):
    """Read a newick tree and return leaf labels with leaf-to-leaf distance matrices.

    filename      -- path of the newick tree file.
    have_paralogs -- if True, also return the sample label of every leaf.
    shuffle       -- if True, the tree read from file is replaced by a random
                     (pure Kingman) tree on the same taxa.
    field_sep     -- delimiter between the fields of a leaf name. Leaf names in
                     this notebook are 'Genus.species.accession.copy', so the
                     default is '.'; use '_' for trees with underscore-delimited names.

    Labels follow the order of the tree's taxon namespace: genus is the first
    field, species the first two fields joined by '_', sample the first three.

    Returns (species, genus, distmat, nodemat), plus sample as fifth element if
    `have_paralogs`. distmat holds the distances given by the tree's phylogenetic
    distance matrix and nodemat the number of edges on the path between two
    leaves; both are symmetric with zero diagonal.
    """
    tree = dendropy.Tree.get(path=filename, schema="newick", preserve_underscores=True)
    if shuffle:
        from dendropy.simulate import treesim  # explicit import of the simulation subpackage
        tree = treesim.pure_kingman_tree(taxon_namespace=tree.taxon_namespace, pop_size = 1000)
    taxa = list(tree.taxon_namespace) ## all labels follow order of taxon_namespace
    label_fields = [t.label.split(field_sep) for t in taxa]
    species = ['_'.join(fields[:2]) for fields in label_fields]
    genus = [fields[0] for fields in label_fields]
    ntaxa = len(taxa)
    distmat = np.zeros((ntaxa,ntaxa)) # diagonals are zero
    nodemat = np.zeros((ntaxa,ntaxa))
    pdm = tree.phylogenetic_distance_matrix() # initialises class
    for i,j in itertools.combinations(range(ntaxa),2):
        distmat[i,j] = distmat[j,i] = pdm.distance(taxa[i], taxa[j])
        nodemat[i,j] = nodemat[j,i] = pdm.path_edge_count(taxa[i], taxa[j])
    if have_paralogs:
        sample = ['_'.join(fields[:3]) for fields in label_fields]
        return species, genus, distmat, nodemat, sample
    return species, genus, distmat, nodemat

In [None]:
def silhouette_str(dist, labels):
    """Summarise the per-sample silhouette scores as one fixed-width string.

    dist   -- precomputed distance matrix between samples.
    labels -- cluster label of each sample.

    The string holds three numbers, each formatted as '{:7.3f}': the 5th
    percentile, the 25th percentile, and the proportion of samples with a
    positive silhouette score. It ends with two spaces.
    """
    scores = metrics.silhouette_samples(dist, labels, metric="precomputed")
    pct05 = np.percentile(scores, 5)
    pct25 = np.percentile(scores, 25)
    frac_positive = sum(scores>0)/float(len(scores))
    return '{:7.3f}{:7.3f}{:7.3f}  '.format(pct05, pct25, frac_positive)

Each triplet from `silhouette_str()` has $5\%$, $25\%$ and proportion of positive silhouettes. And we have two triplets on the left for _species_ (S), and two on the right for _strain_ (T), one being from weighted (W) and one from unweighted (U) patristic distances. Therefore the header is: SW SU TW TU

By the way, having low silhouette scores is not necessarily bad, since
1. clusters of size one will have a score of _zero_ **by definition** (it should be _one_ given the silhouette function, but this would lead to ever-increasing clusterings)
2. clusters with within-distance zero are perfect, while for phylogenies we want non-zero diversification
3. they can be negative in a case like `((A:a,A:a,A:a),B:b)` with $a>b$, since $d(A,A)=2a > a+b = d(A,B)$ --- even though the taxonomic classification is fine for both A and B

In [None]:
# For every genus print one line per tree: a random tree on the same taxa as the
# ML tree first (baseline), then the UPGMA and the ML trees.
# Columns: species labels with weighted and unweighted distances, " | ", then
# the same for strain labels.
## operon doesn't have _long or _consensus
tree_runs = [("random", ".fasta.treefile", True),
             ("upgma ", "_upgma.treefile", False),
             ("ML    ", ".fasta.treefile", False)]
for genus in bacteria_genera:
    fname = outdir + genus
    print_redblack(genus)
    for title1, suffix1, randomise in tree_runs:
        sp, ge, mat, mat2, strain = patristic_distances_from_treefile (fname + suffix1, have_paralogs = True,
                                                                       shuffle = randomise)
        species_weighted = silhouette_str(mat, sp)
        species_unweighted = silhouette_str(mat2, sp)
        strain_weighted = silhouette_str(mat, strain)
        strain_unweighted = silhouette_str(mat2, strain)
        print (title1, "\t", species_weighted, species_unweighted, " | ",
               strain_weighted, strain_unweighted)