In [1]:
import numpy as np

# test data
t_A = '[&R] ((1:0.96,3:0.96):0.14,2:1.10,4:1.10);' # rooted
t_B = '[&R] (1:1.167,2:1.167,3:1.167:4:1.167);'

t_C = '[&R] ((1:0.5,2:0.5):0.60,(3:0.5,4:0.5):0.60);' # no branches
t_test = '[&R] (1:0.96,3:0.96):0.14;' 

# species by 3-site matrix
msa = np.array([[1,0,2], # species 1
                 [1,0,0], # species 2
                 [0,1,2], # species 3
                 [0,1,0]] # species 4
               )

Q = [[0.2, 0.3, 0.5], # site 1
     [0.2, 0.3, 0.5], # site 2
     [0.2, 0.3, 0.5]  # site 3
    ]

In [2]:
# https://bio.libretexts.org/Bookshelves/Evolutionary_Developmental_Biology/Phylogenetic_Comparative_Methods_(Harmon)/08%3A_Fitting_Models_of_Discrete_Character_Evolution/8.07%3A_Appendix_-_Felsenstein's_Pruning_Algorithm
# t = '((((1:1.0, 2:1.0):0.5, 3:1.5):1.0,((4:0.5, 5:0.5):2.0):0.5), 6:2.5);'
# msa = np.array([[0], [1], [0], [2], [2], [1]])


In [4]:

numsites = len(msa[0])
lambda_site = dict()

for site in range(numsites):
    lambda_site[site] = np.unique(msa.T[site])
    
    
char_probs = dict()
for site in range(numsites):
    char_probs[site] = dict()
    chars, counts = np.unique(msa.T[site], return_counts=True)
    for i in range(len(chars)):
        char_probs[site][chars[i]] = counts[i]/len(msa.T[site])

In [5]:
for site in range(numsites):
    print("site:", site)
    print(char_probs[0])

site: 0
{0: 0.5, 1: 0.5}
site: 1
{0: 0.5, 1: 0.5}
site: 2
{0: 0.5, 1: 0.5}


In [6]:
q_dict = dict()
for site in range(numsites):
    q_dict[site] = dict()
    # get alphabet
    for idx, char in enumerate(np.unique(msa.T[site])):
        q_dict[site][char] = Q[site][idx]

In [7]:
q_dict

{0: {0: 0.2, 1: 0.3}, 1: {0: 0.2, 1: 0.3}, 2: {0: 0.2, 2: 0.3}}

In [454]:
# nwkt.as_string(schema="newick")
# nwkt.as_ascii_plot()

In [8]:
def get_branchlen(child_node):
    # print("getting branch length", child_node.edge_length)
    if child_node.edge_length is None:
        print(child_node.child_nodes())
    return child_node.edge_length

In [106]:
2 * (math.exp(-0.5) * 1.0)

1.2130613194252668

In [99]:
import math

def prob_same(nodedict, node_likelihood, site, curr_node):
    # print("prob_same")
    # prob of staying in 0 
    all_child_prob = 1.0
    # print(curr_node.child_nodes())
    for c in curr_node.child_nodes():
        char_state_prob = 0.0
        for alpha in nodedict[c][site]:
            print("PS: get_branchlen(c)", c, get_branchlen(c))
            tp = math.exp(-get_branchlen(c))
            char_state_prob += tp * node_likelihood[c][site]
        all_child_prob *= char_state_prob
    return all_child_prob

def prob_change(q_dict, nodedict, node_likelihood, site, curr_state, curr_node):
    all_child_prob = 1.0
    for c in curr_node.child_nodes():
        char_state_prob = 0.0
        for alpha in nodedict[c][site]:
            q_ialpha = q_dict[site][alpha] 
            tp = q_ialpha * (1 - math.exp(get_branchlen(c)))
            char_state_prob += tp * node_likelihood[c][site]
        all_child_prob *= char_state_prob
    print("PC: get_branchlen(c)", c, all_child_prob)
    return all_child_prob


In [100]:
def likelihood_under_n(nodedict, node_likelihood, n, site):
    # n is an internal node
    child_states = set()
        
    if n not in nodedict:
        nodedict[n] = dict()
        nodedict[n][site] = dict()
        
    # identify all child states. 
    # this constrains n's possible states.
    child_states = set()
    for child in n.child_nodes():
        if child.is_leaf():
            child_states.add(get_char(msa, child, site))
        else:
            for x in nodedict[child][site]:
                state_prob = nodedict[child][site][x]
                if state_prob > 0.0:
                    child_states.add(x)
                    
    parent_poss_states = dict()
    if len(child_states) == 1:
        if 0 in child_states: # probability 0 -> 0
            parent_poss_states[0] = prob_same(nodedict, node_likelihood, site, n) 
        else:
            for c in child_states: # probability c -> c != 0
                parent_poss_states[c] = 1.0 
            # probability 0 -> c (alpha)
            parent_poss_states[0] = prob_change(q_dict, nodedict, node_likelihood, site, 0, n)  
    else:
        # probability 0 -> 1 and 0 -> 2 or
        # probability 0 -> 0 and 0 -> 1 WLOG
        parent_poss_states[0] = 1.0
    for x in parent_poss_states.keys():
        # save into nodedict
        nodedict[n][site][x] = parent_poss_states[x]
        # product over all possible states
        node_likelihood[n][site] *= parent_poss_states[x]
    
    return nodedict, node_likelihood

In [101]:
def get_char(msa, leaf_node, site):
    # print(leaf_node)
    return msa[int(leaf_node.taxon.__str__().replace("'", ""))-1][site]

In [102]:
# nodedict # node # site -> saves the probabilities of the possible states
# node_likelihood # node # site

In [109]:
import dendropy

# we take in a rooted tree
nwkt = dendropy.Tree.get(data=t_C, schema="newick")
print(nwkt)
nodedict = dict() # maps node to possible states, with probabilities
node_likelihood = dict() # maps node to likelihood of subtree under node

for n in nwkt.postorder_node_iter():
    print("node:", n)
    if n.taxon is not None: # must be a leaf node, set up 
        nodedict[n] = dict()
        node_likelihood[n] = dict()
        for site in range(numsites):
            char_state = get_char(msa, n, site)
            nodedict[n][site] = dict()
            nodedict[n][site][char_state] = 1.0
            node_likelihood[n][site] = 1.0
        
    elif n.taxon is None: # must be an internal node
        for site in range(numsites):
            print("site:", site)
            if n not in nodedict:
                nodedict[n] = dict()
                node_likelihood[n] = dict()
            
            nodedict[n][site] = dict()
            node_likelihood[n][site] = 1.0
            
            nodedict, node_likelihood = likelihood_under_n(nodedict, node_likelihood, n, site)

# last n is the root node 

print("Calculating likelihood according to a root node r*")
# SETTING UP r*, say r* -> r is dist 0.2
root_edge_len = 0.2
tree_likelihood = 1.0
for site in range(numsites):
    # under node_likelihood, calculate the prob    
    for rootchar in nodedict[n][site].keys():
        prob_rootchar = nodedict[n][site][x]
        if prob_rootchar > 0.0: 
            if rootchar == 0:
                tree_likelihood *= (math.exp(-root_edge_len)) * node_likelihood[n][site]
            else:
                q_ialpha = q_dict[site][rootchar]
                tree_likelihood *= (1 - math.exp(-root_edge_len)) * q_ialpha * node_likelihood[n][site]

print(nwkt)
print(tree_likelihood)


((1:0.5,2:0.5):0.6,(3:0.5,4:0.5):0.6)
node: <Node object at 0x7fa9e83fbd10: 'None' (<Taxon 0x7fa9e83fb790 '1'>)>
node: <Node object at 0x7fa9e7e71510: 'None' (<Taxon 0x7fa9e7e71490 '2'>)>
node: <Node object at 0x7fa9e83fb390: 'None' (None)>
site: 0
PC: get_branchlen(c) <Node object at 0x7fa9e7e71510: 'None' (<Taxon 0x7fa9e7e71490 '2'>)> 0.03787553583529101
site: 1
PS: get_branchlen(c) <Node object at 0x7fa9e83fbd10: 'None' (<Taxon 0x7fa9e83fb790 '1'>)> 0.5
PS: get_branchlen(c) <Node object at 0x7fa9e7e71510: 'None' (<Taxon 0x7fa9e7e71490 '2'>)> 0.5
site: 2
node: <Node object at 0x7fa9e7e66810: 'None' (<Taxon 0x7fa9e7f1be50 '3'>)>
node: <Node object at 0x7fa9e7f1b0d0: 'None' (<Taxon 0x7fa9e7f1be90 '4'>)>
node: <Node object at 0x7fa9e7e660d0: 'None' (None)>
site: 0
PS: get_branchlen(c) <Node object at 0x7fa9e7e66810: 'None' (<Taxon 0x7fa9e7f1be50 '3'>)> 0.5
PS: get_branchlen(c) <Node object at 0x7fa9e7f1b0d0: 'None' (<Taxon 0x7fa9e7f1be90 '4'>)> 0.5
site: 1
PC: get_branchlen(c) <Node obj

nodedict: index with node, then site, then possible alphabet characters, mapped to the probability of taking on that character.
node_likelihood: index with node, then site maps to probability.

In [17]:
t_A = 2.062468362106625e-05
t_B = 0.001953125
t_C = 8.236339393882161e-09

In [18]:
t_C < t_B

True

In [19]:
t_A > t_C

True

In [20]:
# ziheng yang - must always be less than f_max <= 
# as you increase the number of k, should converge, check paper... 

In [None]:
# # make this into a function 
# def felsenstein(T, Q, msa): 
#     # tree w/ branch lengths as input
#     # output a likelihood
#     ##
#     pass 


In [18]:
import math

p_0_to_1 = 0.3 * (1 - math.e**(-0.96))
p_0_to_0 = math.e**(-0.96)
p_0_to_2 = 0.5 * (1 - math.e**(-0.96))
p_2_to_2 = 1.0

In [41]:
p_0_to_1

0.18513213420746638

In [42]:
p_0_to_2

0.30855355701244397

In [19]:
pos_1 = p_0_to_1 * p_0_to_0
pos_2 = pos_1
pos_3 = p_0_to_2 ** 2 

In [20]:
pos_1

0.07088577715342857

In [43]:
pos_2

0.07088577715342857

In [21]:
pos_3

0.09520529754503151

In [45]:
a1 = 0.2 * math.e**(-0.14) * pos_1 # eq doesn't multiply by q_ialpha
a = math.e**(-0.14) * pos_1
a

0.06162513414097765

In [47]:
c = 0.5 * (1 - math.e**(-0.14)) * 1.0
c

0.06532088230059707

In [51]:
b1 = 0.2 * math.e**(-0.14) * pos_3
b = math.e**(-0.14) * pos_3

b

0.08276750947436685

In [54]:
a * a * b * c

2.053183117873688e-05

In [36]:
c

0.06532088230059707

In [None]:
Tree Likelihood: -14.621847872951003
