<a href="https://colab.research.google.com/github/sokrypton/ws2023/blob/main/Day1_phylogeny.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Download example dataset we'll be using this week!

In [None]:
%%bash
# Fetch the three example alignments used in this notebook.
# -q: quiet output, -nc: do not re-download a file that already exists locally.
base_url=https://raw.githubusercontent.com/sokrypton/ws2023/main/day1
for fasta in phy_msa.fasta phy_msa_ext.fasta phy_primates.fasta; do
  wget -qnc "${base_url}/${fasta}"
done

## BioPython Phylogeny
https://biopython.org/wiki/Phylo

In [None]:
# install biopython
!pip -q install biopython

In [None]:
##########################################################
# RUN THIS CELL!
##########################################################
import numpy as np
import matplotlib.pylab as plt
from Bio import Phylo

##########################################################
# functions we'll use later:
##########################################################

# Jukes-Cantor model
def jc(p):
  '''Jukes-Cantor distance for an observed proportion of differing sites.

  Computes d = -(3/4) * ln(1 - (4/3) * p).

  p: a number or NumPy array holding the fraction of sites that differ
     (arrays are processed element-wise by np.log).
  returns: the corrected distance, with the same shape as `p`. At p = 3/4
     the logarithm's argument reaches zero and the result is inf; for
     larger p the argument is negative and the result is nan.
  '''
  unsaturated = 1 - (4*p/3)
  return (-3/4) * np.log(unsaturated)

def jc_correction(mtx):
  '''Apply the Jukes-Cantor correction to every entry of a distance matrix.

  mtx: an iterable of rows, each row an iterable of distances. Rows may
       have different lengths (e.g. a lower-triangular nested list).
  returns: a new nested list with the same layout as `mtx`, where each
       entry p has been replaced by jc(p). The input is not modified.
  '''
  return [[jc(p) for p in row] for row in mtx]

# function that removes labels from inner nodes
def rm_inner(tree):
  '''Clear the name of every non-terminal (inner) node of `tree`, in place.

  tree: any object whose get_nonterminals() returns the inner nodes
        (here: the trees produced by Bio.Phylo).
  returns: None; the nodes are modified directly so that only the tips
        keep a label when the tree is drawn.
  '''
  inner_nodes = tree.get_nonterminals()
  for inner in inner_nodes:
    inner.name = None

# Parsimony method!

In [None]:
# library for loading alignments
from Bio import AlignIO

# lets try the alignment we walked through in class!
# read phy_msa.fasta (downloaded in the first cell) as a FASTA-format alignment
simple_aln = AlignIO.read('phy_msa.fasta', 'fasta')
# print the alignment object to inspect what was loaded
print(simple_aln)

In [None]:
# display the sequence of the third record (index 2) of the alignment
simple_aln[2].seq

In [None]:
# libraries for Parsimony Method
from Bio.Phylo.TreeConstruction import ParsimonyTreeConstructor
from Bio.Phylo.TreeConstruction import ParsimonyScorer
from Bio.Phylo.TreeConstruction import NNITreeSearcher

# Fitch algorithm for tracing characters and counting number of steps
scorer = ParsimonyScorer()

# Nearest Neighbor Interchanges (NNI) algorithm
# (the searcher is handed the scorer created above)
searcher = NNITreeSearcher(scorer) 

# get parsimony tree!
# starting_tree=None: no initial tree is supplied for the search
pars_tree = ParsimonyTreeConstructor(searcher, starting_tree=None).build_tree(simple_aln)

# print parsimony score
# (score of the tree we just built, evaluated on the same alignment)
print("steps",scorer.get_score(pars_tree,simple_aln))

# remove inner labels (optional)
rm_inner(pars_tree)

# draw the tree!
Phylo.draw(pars_tree)

# Distance Method!

In [None]:
# libraries for Distance Method
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceMatrix

In [None]:
# taxon names, one per row of the distance matrix below
sarich_names = ["Dog","Bear","Raccoon","Weasel","Seal","Sealion","Cat","Monkey"]
# pairwise distances in lower-triangular form: each row lists the distances
# to the preceding taxa and ends with the 0 on the diagonal
sarich_dist = [
    [0],
    [32, 0],
    [48, 26, 0],
    [51, 34, 42, 0],
    [50, 29, 44, 44, 0],
    [48, 33, 44, 38, 24, 0],
    [98, 84, 92, 86, 89, 90, 0],
    [148,136,152,142,142,142,148,0]]

# wrap names + distances in the DistanceMatrix object the tree builders expect
sarich_dist_mtx = DistanceMatrix(sarich_names,sarich_dist)

# build, clean up and draw a UPGMA tree; each branch is annotated with its
# length rounded to 2 decimals
print("UPGMA tree")
sarich_upgma_tree = DistanceTreeConstructor().upgma(sarich_dist_mtx)
rm_inner(sarich_upgma_tree)
Phylo.draw(sarich_upgma_tree,branch_labels=lambda x: round(x.branch_length,2))

# same steps with Neighbor Joining, from the same distance matrix, for comparison
print("NJ tree")
sarich_nj_tree = DistanceTreeConstructor().nj(sarich_dist_mtx)
rm_inner(sarich_nj_tree)
Phylo.draw(sarich_nj_tree,branch_labels=lambda x: round(x.branch_length,2))

Starting from an alignment

In [None]:
# load alignment
# this time, lets use an alignment with a bit more characters!
# (AlignIO was imported in the parsimony section above)

aln = AlignIO.read('phy_msa_ext.fasta', 'fasta')
# show the whole alignment, then the id and sequence of its first record
print(aln)
print("=================")
print(aln[0].id)
print(aln[0].seq)

In [None]:
# compute distance matrix
# uses the 'identity' model of DistanceCalculator (presumably the fraction of
# differing sites between each pair of sequences — see the Biopython docs)
dist_mtx = DistanceCalculator('identity').get_distance(aln)
print(dist_mtx)
# dist_mtx.names # names of elements, used for indexing
# dist_mtx.matrix # nested list of numerical lists in lower triangular format

In [None]:
# Construct the phylogenetic tree using Neighbor Joining algorithm
# (dist_mtx is the distance matrix computed in the previous cell)
nj_tree = DistanceTreeConstructor().nj(dist_mtx)

# remove inner labels (optional)
rm_inner(nj_tree)

# print the tree in newick format
print(nj_tree.format("newick"))

# draw the tree two different ways (image or as text)
Phylo.draw(nj_tree)
Phylo.draw_ascii(nj_tree)

In [None]:
# reroot the tree using "Alpha"
# (assumes "Alpha" is one of the sequence names in the alignment — TODO confirm)
nj_tree.root_with_outgroup("Alpha")
Phylo.draw(nj_tree)

# now root the same tree at its midpoint instead and draw it again;
# neither rooting call's result is assigned, so both act on nj_tree in place
nj_tree.root_at_midpoint()
Phylo.draw(nj_tree)

# Exercise 1:
1. Repeat with 'upgma' instead of 'nj'. Do the results change?
```Python
upgma_tree = DistanceTreeConstructor().upgma(dist_mtx)
```
2. What is the difference?

## BOOTSTRAP!

In [None]:
# libraries for bootstrap support
import random

from Bio.Phylo.Consensus import bootstrap, bootstrap_trees, get_support

# bootstrap resampling is random: fix the seed of Python's `random` module
# (which Biopython's bootstrap helpers draw from) so that re-running this cell
# reproduces the same replicates. Change the number to see other resamplings.
random.seed(0)

# function to randomly generate 100 alignments (sampled with replacement
# from our original alignment)
bootstrap_aln = bootstrap(aln, 100)
# bootstrap() yields the alignments lazily; list() materialises all 100
list_bootstrap_aln = list(bootstrap_aln)

# lets go through the first 3!
for n in range(3):
  # same pipeline as for the original alignment:
  # resampled alignment -> identity distance matrix -> NJ tree
  b_aln = list_bootstrap_aln[n]
  b_dist_mtx = DistanceCalculator('identity').get_distance(b_aln)
  b_nj_tree = DistanceTreeConstructor().nj(b_dist_mtx)
  rm_inner(b_nj_tree)

  print("=================================")
  print("Bootstrap replicate:",n+1)
  print("=================================")
  print(b_aln)
  print(b_dist_mtx)
  Phylo.draw(b_nj_tree)

In [None]:
import random

# the replicates are drawn at random: seed Python's `random` module (used by
# Biopython's bootstrap helpers) so the bootstrap trees, and the support values
# later derived from them, are the same on every run of this cell
random.seed(0)

# luckily BioPython provides a "wrapper" function "bootstrap_trees()" that will do
# all the steps in the previous cell for us!
# (the constructor argument bundles the 'identity' distance model with the
# 'nj' tree-building method)
nj_trees = bootstrap_trees(aln, 100, DistanceTreeConstructor(DistanceCalculator('identity'),'nj'))
# bootstrap_trees() yields trees lazily; list() builds all 100 of them
list_nj_trees = list(nj_trees)

# lets look at the first 3 trees
for n in range(3):
  print("=================================")
  print("Bootstrap replicate:",n+1)
  print("=================================")
  # inner labels are only cleared on the trees we actually draw
  rm_inner(list_nj_trees[n])
  Phylo.draw(list_nj_trees[n])


In [None]:
# now that we have our 100 bootstrap trees we can use the
# "get_support()" function to add support values to nj_tree!
# (nj_tree is the tree built from the full alignment; list_nj_trees holds
# the bootstrap replicate trees from the previous cell)
nj_tree = get_support(nj_tree, list_nj_trees)
# draw the tree returned by get_support()
Phylo.draw(nj_tree)

# Exercise 2:
1. Repeat with the `phy_primates.fasta` dataset
2. reroot using Mouse/Bovine
```Python
tree.root_with_outgroup("Mouse","Bovine")
```
3. With whom does Human share its most recent common ancestor?
4. Create a new distance matrix, but this time apply the Jukes-Cantor correction, and repeat steps 1-2. Does anything change?
```Python
jc_dist_mtx = DistanceMatrix(dist_mtx.names,jc_correction(dist_mtx.matrix))
```

In [None]:
############
## ANSWER ##
############

# load the primates alignment (downloaded in the first cell)
primates_aln = AlignIO.read('phy_primates.fasta', 'fasta')

# identity distances -> Neighbor Joining tree
primates_dist_mtx = DistanceCalculator('identity').get_distance(primates_aln)
primates_nj_tree = DistanceTreeConstructor().nj(primates_dist_mtx)
rm_inner(primates_nj_tree)
# root the tree using Mouse and Bovine together as the outgroup, then draw it
primates_nj_tree.root_with_outgroup("Mouse","Bovine")
Phylo.draw(primates_nj_tree)

In [None]:
# rebuild the distance matrix with the Jukes-Cantor correction applied to every
# entry, keeping the same names as the uncorrected matrix
primates_jc_dist_mtx = DistanceMatrix(primates_dist_mtx.names,jc_correction(primates_dist_mtx.matrix))
# same steps as for the uncorrected distances: NJ tree, clear inner labels,
# root with the Mouse/Bovine outgroup, draw
primates_jc_nj_tree = DistanceTreeConstructor().nj(primates_jc_dist_mtx)
rm_inner(primates_jc_nj_tree)
primates_jc_nj_tree.root_with_outgroup("Mouse","Bovine")
Phylo.draw(primates_jc_nj_tree)
