# Understanding phylogenetic trees

In this notebook, we will investigate how phylogenetic trees are shaped by the mutations that accumulate while bacteria multiply. 

#### Credits - code in this notebook was adapted from the sources below:
* Sequence alignment: https://dmnfarrell.github.io/bioinformatics/bokeh-sequence-aligner
* Sequence evolution simulation: https://hplgit.github.io/bioinf-py/doc/pub/html/main_bioinf.html

In [None]:
import random
import numpy as np

from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio import Phylo
from io import StringIO

import re

import panel as pn
import panel.widgets as pnw
pn.extension()

from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Plot, Grid, Range1d
from bokeh.models.glyphs import Text, Rect
from bokeh.layouts import gridplot

# Setting up functions

For this explainer, I have created functions to generate random DNA sequences, simulate the evolution of these sequences over time, build a phylogenetic tree of the simulated sequences and view the differences in the sequences. If you want to explore them, click the 'Show hidden cell' buttons to examine the code. 

For each mutation event, a random position in the sequence is chosen and a random base is drawn to replace it. Because the replacement is drawn from all four bases, there is a 1 in 4 chance that it matches the existing base, in which case there is no observable mutation.

In [None]:
# substitute a random base in a sequence n times

def mutate_n(dna, N):
    """Return a copy of ``dna`` with ``N`` distinct positions re-drawn at random.

    Each selected position is overwritten with a base drawn uniformly from
    ``'ATCG'``, so every mutation event has a 1 in 4 chance of leaving the
    base unchanged.

    Args:
        dna: The sequence to mutate, as a string of bases.
        N: Number of positions to re-draw; must satisfy ``0 <= N <= len(dna)``.

    Returns:
        The mutated sequence as a new string of the same length as ``dna``.

    Raises:
        ValueError: If ``N`` is negative or larger than ``len(dna)``
            (raised by ``random.sample``).
    """
    bases = list(dna)
    # Sample over *every* index so that the final base is eligible too;
    # positions are drawn without replacement, so no site is hit twice.
    mutation_sites = random.sample(range(len(bases)), N)
    # Draw new bases - 1 in 4 chance of no mutation
    new_bases = random.choices('ATCG', k=N)
    # Replace the mutated bases
    for site, base in zip(mutation_sites, new_bases):
        bases[site] = base
    return ''.join(bases)

In [None]:
# visualize a sequence alignment
# adapted from https://dmnfarrell.github.io/bioinformatics/bokeh-sequence-aligner

import panel as pn
import panel.widgets as pnw
pn.extension()

from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Plot, Grid, Range1d
from bokeh.models.glyphs import Text, Rect
from bokeh.layouts import gridplot

def view_alignment(aln, fontsize="9pt", plot_width=800):
    """Render a sequence alignment as a Bokeh grid of coloured base letters.

    Args:
        aln: Iterable of records exposing ``.seq`` and ``.id`` (for example
            the alignment returned by ``build_tree``). The grid is sized from
            the first sequence, so all sequences are expected to share its
            length.
        fontsize: Font size used for the base letters.
        plot_width: Width of the figure in pixels.

    Returns:
        The Bokeh ``gridplot`` layout wrapping the alignment figure.
    """
    # Walk the alignment once so one-shot iterables work as well
    records = list(aln)
    seqs = [rec.seq for rec in records]
    ids = [rec.id for rec in records]
    # One letter per cell, row by row - same order as the raveled grid below
    text = [base for seq in seqs for base in seq]
    colors = get_colors(seqs)
    n_cols = len(seqs[0])
    n_rows = len(seqs)

    # 2D grid of cell coordinates: x = 0.5, 1.5, ... and y = 0, 1, ...
    xx, yy = np.meshgrid(np.arange(0.5, n_cols + 0.5), np.arange(0, n_rows, 1))
    gx = xx.ravel()
    gy = yy.ravel()
    # rect centres are offset by 0.5 from the text y positions
    recty = gy + .5
    source = ColumnDataSource(dict(x=gx, y=gy, recty=recty, text=text, colors=colors))

    plot_height = n_rows * 15 + 50
    # Initial window shows at most the first 100 columns; xpan scrolls further
    view_range = (0, min(n_cols, 100))

    # sequence text view with ability to scroll along x axis
    # NOTE(review): the plot_width/plot_height keyword names belong to older
    # Bokeh releases; confirm against the installed version before upgrading.
    p = figure(title=None, plot_width=plot_width, plot_height=plot_height,
               x_range=view_range, y_range=ids, tools="xpan,reset",
               min_border=0, toolbar_location='below')
    glyph = Text(x="x", y="y", text="text", text_align='center', text_color="black",
                 text_font="monospace", text_font_size=fontsize)
    rects = Rect(x="x", y="recty", width=1, height=1, fill_color="colors",
                 line_color=None, fill_alpha=0.4)
    p.add_glyph(source, glyph)
    p.add_glyph(source, rects)

    p.grid.visible = False
    p.xaxis.major_label_text_font_style = "bold"
    p.yaxis.minor_tick_line_width = 0
    p.yaxis.major_tick_line_width = 0

    return gridplot([[p]], toolbar_location='below')

def get_colors(seqs):
    """Return one display colour per base, flattening ``seqs`` row by row.

    Raises ``KeyError`` for any character other than A, T, G, C or '-'.
    """
    palette = {'A':'red','T':'green','G':'orange','C':'blue','-':'white'}
    colors = []
    for seq in list(seqs):
        for base in seq:
            colors.append(palette[base])
    return colors


In [None]:
# build a phylogenetic tree from a list of sequences

def build_tree(sequences, method='upgma'):
    """Build a distance tree from ``sequences``, draw it, and return the alignment.

    Distances are computed with the 'identity' model, the tree is rooted on
    the record whose id is ``'Ancestor'``, and the result is drawn with
    ``Phylo.draw``.

    Args:
        sequences: Records to align; one of them must have the id
            ``'Ancestor'`` because it is used as the outgroup.
        method: Tree construction algorithm, ``'upgma'`` or ``'nj'``.

    Returns:
        The ``MultipleSeqAlignment`` built from ``sequences`` (not the tree).

    Raises:
        ValueError: If ``method`` is not ``'upgma'`` or ``'nj'``.
    """
    # Validate first so a typo fails with a clear message before any work is done
    if method not in ('upgma', 'nj'):
        raise ValueError("method must be 'upgma' or 'nj', got " + repr(method))

    alignment = MultipleSeqAlignment(sequences)

    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(alignment)

    constructor = DistanceTreeConstructor()
    # Compare by value: identity checks on string literals are unreliable
    if method == 'upgma':
        tree = constructor.upgma(dm)
    else:
        tree = constructor.nj(dm)

    tree.root_with_outgroup('Ancestor')
    tree.ladderize() # place longer branches on the bottom
    # Round-trip through Newick to strip the auto-generated "Inner<N>" labels
    newick = re.sub(r'Inner\d+', '', tree.format('newick'))
    tree = Phylo.read(StringIO(newick), 'newick')
    Phylo.draw(tree)

    return alignment


In [None]:
# simulate sequence evolution

def random_DNA(length):
    """Return a string of ``length`` bases, each drawn with ``random.choice`` from "CGTA"."""
    bases = [random.choice("CGTA") for _ in range(length)]
    return "".join(bases)

def make_random_seqs(num_mutations, max_mutations=10):
    """Generate ``n_seqs`` sequences, each mutated directly from the ancestor.

    Every sequence receives its own random number of mutation events, drawn
    uniformly from ``0..max_mutations`` (inclusive). Reads the module-level
    globals ``seq1`` (ancestor sequence) and ``n_seqs`` (number of sequences).

    Args:
        num_mutations: Not used; accepted so the signature matches the other
            ``make_*_seqs`` generators. The per-sequence count is random.
        max_mutations: Upper bound (inclusive) on the number of mutation
            events per sequence. It is passed on to ``mutate_n``, which
            samples sites without replacement, so it must not exceed what
            ``mutate_n`` accepts for ``seq1``.

    Returns:
        A list of ``SeqRecord`` objects: the ancestor (id ``"Ancestor"``)
        followed by ``Sequence_1`` .. ``Sequence_<n_seqs>``.
    """
    seq_recs_random = [SeqRecord(Seq(seq1, generic_dna), id="Ancestor")]
    for num in range(1, n_seqs + 1):
        # Separate local name so the caller's argument is not silently clobbered
        n_events = random.randint(0, max_mutations)
        print(f'Generating sequence {num}: {n_events} mutations from the Ancestor')
        sequence = mutate_n(seq1, n_events)
        seq_recs_random.append(SeqRecord(Seq(sequence, generic_dna), id=f"Sequence_{num}"))
    return seq_recs_random

def make_serial_seqs(num_mutations):
    """Generate a chain of ``n_seqs`` sequences, each mutated from the previous one.

    Starts from the global ``seq1`` (id ``"Ancestor"``) and applies
    ``num_mutations`` mutation events per step via ``mutate_n``. Returns the
    list of ``SeqRecord`` objects, ancestor first.
    """
    lineage = [SeqRecord(Seq(seq1, generic_dna), id="Ancestor")]
    for parent_index in range(n_seqs):
        child_number = parent_index + 1
        print(f'Generating sequence {child_number}: {num_mutations} mutations from sequence {parent_index}')
        # The most recently appended record is the parent of the next one
        parent_seq = str(lineage[-1].seq)
        child_seq = mutate_n(parent_seq, num_mutations)
        child = SeqRecord(Seq(child_seq, generic_dna), id="Sequence_" + str(child_number))
        lineage.append(child)
    return lineage

def make_bifurcating_seqs(num_mutations, num_gens=4):
    """Simulate a bifurcating population in which every sequence has two offspring.

    Starting from the global ``seq1`` (id ``"Ancestor"``), each sequence of a
    generation produces two offspring named ``<parent>.A`` and ``<parent>.B``,
    each independently mutated from its parent with ``mutate_n``.

    Args:
        num_mutations: Mutation events applied to each offspring.
        num_gens: Number of generations to simulate (default 4).

    Returns:
        A list of ``SeqRecord`` objects in creation order: the ancestor, then
        each generation in turn.
    """
    records = {'Ancestor': SeqRecord(Seq(seq1, generic_dna), id="Ancestor")}
    parents = ['Ancestor']

    for _ in range(num_gens):
        children = []
        for parent in parents:
            parent_seq = str(records[parent].seq)
            for suffix in ('A', 'B'):
                child = parent + "." + suffix
                sequence = mutate_n(parent_seq, num_mutations)
                records[child] = SeqRecord(Seq(sequence, generic_dna), id=child)
                children.append(child)
        # This generation's offspring are the next generation's parents
        parents = children
    return list(records.values())

def make_binary_fission(num_mutations, num_gens=4):
    """Simulate binary fission: each parent yields one identical and one mutated copy.

    Starting from the global ``seq1`` (id ``"Ancestor"``), every sequence of a
    generation produces ``<parent>same`` (an unmodified copy) and
    ``<parent>mut`` (mutated with ``mutate_n``). Returns all ``SeqRecord``
    objects in creation order, ancestor first.
    """
    records = {'Ancestor': SeqRecord(Seq(seq1, generic_dna), id="Ancestor")}
    parents = ['Ancestor']

    for gen in range(num_gens):
        children = []
        for parent in parents:
            # NOTE(review): this message is emitted once per parent, so it
            # repeats within a generation - confirm that is intended.
            print(f'Generating sequences for generation {gen}: {num_mutations} mutations from the previous generation')
            parent_seq = str(records[parent].seq)
            unchanged_id = parent + 'same'
            mutant_id = parent + 'mut'
            records[unchanged_id] = SeqRecord(Seq(parent_seq, generic_dna), id=unchanged_id)
            mutated_seq = mutate_n(parent_seq, num_mutations)
            records[mutant_id] = SeqRecord(Seq(mutated_seq, generic_dna), id=mutant_id)
            children += [unchanged_id, mutant_id]
        parents = children
    return list(records.values())


Here, we define the DNA sequence of our bacterial ancestor ('seq1'), the number of sequences to generate ('n_seqs'), and the number of mutations to simulate per step ('num_mutations'). These parameters can all be changed to explore the impact of each factor on the appearance of the tree.

In [None]:
# Simulation parameters. The make_*_seqs functions read `seq1` as a global,
# and make_serial_seqs / make_random_seqs also read `n_seqs`.
seq1 = random_DNA(length=40)
print("DNA sequence of our ancestor: " + seq1)
n_seqs = 20  # number of sequences generated after the ancestor
num_mutations = 2 # number of mutations to simulate per step (if applicable)

# Creating bacteria with a fixed number of mutations compared to their ancestor

Here, each sequence is a mutated version of the previous one. This results in a steady movement of sequences from the left to the right of the tree as they diverge further from the ancestor.

In [None]:
# Serial scenario: every sequence is mutated from the one generated before it
sequences = make_serial_seqs(num_mutations)

# build_tree draws the tree and returns the alignment, which is then shown
# base by base; the final expression displays the Bokeh layout in the notebook
alignment = build_tree(sequences, method='nj')
p = view_alignment(alignment, plot_width=1200)
pn.pane.Bokeh(p)

# Simulating a growing population of bacteria

Here, we look at a scenario that reflects the way bacteria replicate through binary fission. During replication, the ancestor's double-stranded DNA is peeled apart, and each strand is used as a template to create a complementary copy, resulting in two copies of the genome. Mistakes can occur during copying, which make the offspring's DNA different from the ancestor's.

<img src="https://cdn.kastatic.org/ka-perseus-images/3af458a8d1139517c80af9fff471aca53bab0dbf.png" width="400px">

In this simulation, we capture the genome of the ancestor, plus both of their offspring, which each have a chance of accumulating mutations during the copying of their ancestor's DNA, plus two of their offspring, and so on...

In [None]:
# Bifurcating scenario: every sequence gives rise to two independently
# mutated offspring per generation
sequences = make_bifurcating_seqs(num_mutations)

# build_tree draws the tree and returns the alignment for the viewer
alignment = build_tree(sequences, method='nj')
p = view_alignment(alignment, plot_width=1200)
pn.pane.Bokeh(p)

Some of this variation is reduced by simulating a longer genome and fewer mutations between samples.

In [None]:
# Re-run the bifurcating scenario with a longer (100-base) ancestor.
# Rebinding `seq1` here changes the ancestor used by every make_*_seqs call
# made afterwards.
seq1 = random_DNA(length=100)
print("DNA sequence of our ancestor: " + seq1)
n_seqs = 20  # not read by make_bifurcating_seqs
num_mutations = 2 # number of mutations to simulate per step

sequences = make_bifurcating_seqs(num_mutations)

# build_tree draws the tree and returns the alignment for the viewer
alignment = build_tree(sequences, method='nj')
p = view_alignment(alignment, plot_width=1200)
pn.pane.Bokeh(p)

# Creating bacteria with a random number of mutations compared to our ancestor

Here, we generate a collection of sequences with a random number of mutation events compared to our ancestor sequence. This is a better representation of the data we normally collect when we randomly collect a sample of bacteria, e.g. from a patient population or from the environment. 

In this scenario, we expect to see the ancestor on the left hand side of the tree, representing 0 mutations. We expect each generated sequence to sit a certain distance from the ancestor: sequences that are identical to the ancestor share its position on the x-axis, while sequences with the most chances to mutate sit further to the right.

The number of mutation opportunities and the distance from the ancestor won't line up perfectly for two reasons:
1. Each opportunity to mutate has a 1 in 4 chance of not resulting in a change in the DNA
2. In some cases, an early mutation might change the ancestral sequence and a later mutation might change it back

In [None]:
# Random scenario: each sequence is mutated directly from the ancestor.
# make_random_seqs draws its own number of mutation events per sequence,
# so the value of num_mutations passed here does not set that count.
sequences = make_random_seqs(num_mutations)

# build_tree draws the tree and returns the alignment for the viewer
alignment = build_tree(sequences, method='nj')
p = view_alignment(alignment, plot_width=1200)
pn.pane.Bokeh(p)