Phylogeny, part 3 - playing with small parsimony

In [11]:
# Let's do weighted small parsimony 

## Problem 1

import sys

class N:
  """ Internal node of a binary tree; leaves are represented by plain strings. """
  def __init__(self, left, right, leftBranchLength=1.0, rightBranchLength=1.0):
    """ Store the two children together with the length of the branch leading to each. """
    self.left = left
    self.right = right
    self.leftBranchLength = leftBranchLength
    self.rightBranchLength = rightBranchLength
    
# Example tree with seven leaves (three "A", four "T"); every branch keeps the default length of 1.0
tree = N(N("A", N("T", "T")), N(N(N("T", "T"), "A"), "A")) # Example tree

def subCost(ancestorChar, descendantChar, branchLength):
  """ Unit substitution cost: 0 when the character is conserved along the branch, 1 otherwise.

  branchLength is accepted so the signature matches other cost functions, but it is not used here. """
  if ancestorChar == descendantChar:
    return 0
  return 1

# Positive infinity: the cost parsimonyCost gives a leaf for every character other than the one it holds
p_inf = float("inf")

def parsimonyCost(t, alphabet="ACGT", subCostFn=subCost):
  """ Small-parsimony cost table for the subtree rooted at t.

  t is either a leaf (a str holding the observed character) or an internal node
  exposing left, right, leftBranchLength and rightBranchLength.
  subCostFn(ancestorChar, descendantChar, branchLength) gives the cost of one branch.

  Returns a dictionary mapping each character of alphabet to the minimum total
  cost of the subtree when its root is labelled with that character. """
  if isinstance(t, str):
    # Leaf: the observed character is free, any other labelling is impossible
    return {char: (0 if char == t else p_inf) for char in alphabet}

  # Internal node: solve both subtrees first
  leftCosts = parsimonyCost(t.left, alphabet=alphabet, subCostFn=subCostFn)
  rightCosts = parsimonyCost(t.right, alphabet=alphabet, subCostFn=subCostFn)

  nodeCosts = {}
  for ancestorChar in dict.fromkeys(alphabet):
    # For every possible child character: (cost via the left branch, cost via the right branch)
    options = [
      (subCostFn(ancestorChar, childChar, t.leftBranchLength) + leftCosts[childChar],
       subCostFn(ancestorChar, childChar, t.rightBranchLength) + rightCosts[childChar])
      for childChar in rightCosts
    ]
    # The two children are labelled independently, so each side takes its own best option
    nodeCosts[ancestorChar] = min(l for l, _ in options) + min(r for _, r in options)
  return nodeCosts
                                        
# Check on the example tree: total cost of the tree for each possible root character
print(parsimonyCost(tree)) # Should print {'A': 2, 'C': 4, 'G': 4, 'T': 3}

{'A': 2, 'C': 4, 'G': 4, 'T': 3}


Now let's make this fully probabilistic

In [12]:
# Problem 2: Adapt the substitution cost function to use the Jukes Cantor function, using -log probability of substitution
# for cost

import math

def jc(d):
  """ Jukes Cantor function which gives probability of observing a substitution for a given branch length

  Computes (3/4) * (1 - exp(-(4/3) * d)). The value is 0 for d == 0 and approaches 3/4 as d grows.

  math.expm1 is used instead of 1.0 - math.exp(...) because the subtraction cancels
  catastrophically for very short branches: the naive form returns exactly 0.0 for
  d around 1e-17 and below, and loses most of its significant digits well before that. """
  return -(3.0/4.0) * math.expm1(-(4.0/3.0) * d)

def jukesCantorSubCostFn(ancestorChar, descendantChar, branchLength):
  """ Jukes Cantor substitution cost: the negative log probability of seeing
  descendantChar at the end of a branch of the given length that starts with ancestorChar.

  With p = jc(branchLength) the probability of any substitution:
    - same character:      cost is -log(1 - p)
    - different character: cost is -log(p / 3), p being shared equally by the three other characters

  Returns float("inf") for a substitution whose probability is zero (e.g. a zero-length branch)
  rather than failing inside math.log. """
  pSub = jc(branchLength)
  if ancestorChar == descendantChar:
    # log1p keeps full precision when pSub is tiny, where 1 - pSub would round to 1.0
    return -math.log1p(-pSub)
  pSpecific = pSub / 3
  if pSpecific == 0:
    # Impossible event: infinite cost instead of a math domain error from log(0)
    return float("inf")
  return -math.log(pSpecific)

# A very short branch length, so jc(bl) is tiny and any substitution is very costly
bl = 0.0000001
# Same shape as the example tree of Problem 1, but every leaf is "A" and every branch has length bl
tree = N(N("A", N("A", "A", bl, bl), bl, bl), N(N(N("A", "A", bl, bl), "A", bl, bl), "A", bl, bl), bl, bl) # Example tree

# Costs are now -log probabilities, one per possible root character
print(parsimonyCost(tree, subCostFn=jukesCantorSubCostFn)) # Should print something like: {'A': 1.199999980764579e-06, 'C': 34.433417011866986, 'G': 34.433417011866986, 'T': 34.433417011866986}

# Things to consider: (this does not need to be completed for the homework)

## This is "Ancestral Maximum Likelihood", in that it takes a min over the -log probs (i.e. it keeps only the most probable character at each node)
## Q. How would you convert to ML? 
## A. Try changing the mins to log sums, e.g. to calculate log(x + y), given log(x) and log(y) 


{'A': 1.199999980764579e-06, 'C': 34.433417011866986, 'G': 34.433417011866986, 'T': 34.433417011866986}
