In [1]:
import numpy as np
from scipy.integrate import dblquad
from scipy.integrate import quad
import pandas as pd
from scipy.linalg import expm

In [57]:
def rate_mat_JC69(mu):
    """
    This function returns the rate matrix for the JC69 model. 
    
    Parameters
    ----------
    mu : numeric
        Mutation rate
    """
    return np.full((4, 4), mu/4)-np.diag([mu, mu, mu, mu])
def g_single_coal(Q, aa, bb, cc, dd, t, u):
    """
    This function calculates the probability of observing the 
    nucleotides b, c and d given a, t, u and Q. a and b are the
    nucleotides at the leaf nodes, while d is the nucleotide at 
    the coalescent node and c is the nucleotide at the root. t
    is the total time of the interval, and u is the time from the
    leaves to node d.
    
    P(b = bb, c = cc, d = dd | a == aa, Q, t, u)
    
          c     -> root node
          |
        __d__   -> coalescent node
        |   |   
        a   b   -> leaf nodes
    
    Parameters
    ----------
    Q : numpy array
        A 4x4 rate matrix for any substitution model
    aa : integer
        Index of the nucleotide corresponding to a
    bb : integer
        Index of the nucleotide corresponding to b
    cc : integer
        Index of the nucleotide corresponding to c
    dd : integer
        Index of the nucleotide corresponding to d
    t : numeric
        Total time of the interval (from a/b to c)
    u : numeric
        Total time until coalescence (from a/b to d)
    """
    return np.exp(-u)*(expm(u*Q)[aa,dd])*(expm(u*Q)[dd,bb])*(expm((t-u)*Q)[dd,cc])
def p_b_c_given_a_single_coal(t, Q):
    """
    This function calculates the probability of observing the 
    nucleotides b and c given a, t and Q. a and b are the
    nucleotides at the leaf nodes, while c is the nucleotide at the root. 
    t is the total time of the interval. It performs the integral 
    of g_single_coal from 0 to t with respect to u, and it sums
    over all possible nucleotides for d. 
    
    P(b = bb, c = cc | a == aa, Q, t)
    
          c     -> root node
          |
        __d__   -> coalescent node
        |   |   
        a   b   -> leaf nodes
    
    Parameters
    ----------
    t : numeric
        Total time of the interval (from a/b to c)
    Q : numpy array
        A 4x4 rate matrix for any substitution model
    """
    nt = ['A', 'G', 'C', 'T']
    arr = np.empty((4**3, 4))
    acc = 0
    for aa in range(4):
        for bb in range(4):
            for cc in range(4):
                cumsum = 0
                for dd in range(4):
                    res, err = quad(lambda u: g_single_coal(Q, aa, bb, cc, dd, t, u), 0, t)
                    cumsum += 1/(1-np.exp(-t))*res
                arr[acc] = [aa,bb,cc,cumsum]
                acc += 1
    df = pd.DataFrame(arr, columns = ['a', 'b', 'c', 'prob'])
    df['a'] = [nt[int(i)] for i in df['a']]
    df['b'] = [nt[int(i)] for i in df['b']]
    df['c'] = [nt[int(i)] for i in df['c']]
    return df
def p_b_given_a(t, Q):
    """
    This function calculates the probability of observing the 
    nucleotide b given a, t and Q. a is the starting nucleotide,
    while b is the end nucleotide. t is the total time of the interval. 
    
    P(b = bb | a == aa, Q, t)
    
    Parameters
    ----------
    t : numeric
        Total time of the interval (from a/b to c)
    Q : numpy array
        A 4x4 rate matrix for any substitution model
    """
    nt = ['A', 'G', 'C', 'T']
    mat = np.zeros((4, 4))
    for i in range(len(t)):
        mat = mat+t[i]*Q[i]
    arr = np.empty((4**2, 3))
    acc = 0
    mat = expm(mat)
    for aa in range(4):
        for bb in range(4):
            arr[acc] = [aa,bb,mat[aa,bb]]
            acc += 1
    df = pd.DataFrame(arr, columns = ['a', 'b', 'prob'])
    df['a'] = [nt[int(i)] for i in df['a']]
    df['b'] = [nt[int(i)] for i in df['b']]
    return df
def b_c_given_a_to_dict_a_b_c(df):
    """
    This function converts the data frame as outputted
    by p_b_c_given_a_single_coal or p_b_c_given_a_double_coal
    into a dictionary. How to use the dictionary:
    
        P(b, c | a) = dct[a][b][c]
    
    Parameters
    ----------
    df : data frame
        As outputted by p_b_c_given_a_single_coal or 
        p_b_c_given_a_double_coal
    """
    df = df.groupby(['a', 'b', 'c']).sum().reset_index()
    df = df.groupby(['a', 'b']).apply(lambda x: dict(zip(x.c, x.prob))).reset_index()
    df.columns = ['a', 'b', 'val']
    df = df.groupby('a').apply(lambda x: dict(zip(x.b, x.val))).to_dict()
    return df
def b_given_a_to_dict_a_b(df):
    """
    This function converts the data frame as outputted
    by p_b_given_a into a dictionary. How to use the dictionary:
    
        P(b | a) = dct[a][b]
    
    Parameters
    ----------
    df : data frame
        As outputted by p_b_given_a
    """
    return df.groupby(['a']).apply(lambda x: dict(zip(x.b, x.prob))).to_dict()

In [None]:
def calc_emissions_single(t_vec, theta_vec):
    return

In [43]:
def calc_emissions_single(t_vec, theta_vec):
    
    t1 = t_vec[0] # a to A
    t2 = t_vec[1] # ab to c
    t3 = t_vec[2] # c to a'
    t4 = t_vec[3] # a'b' to c'
    t5 = t1+t2+t3+t4+t_vec[4] # c' to D
    t6 = t1+t2+t3 # b' to C
    mu_vec = [(4/3)*theta for theta in theta_vec]
    mu1 = mu_vec[0]
    mu2 = mu_vec[1]
    mu3 = mu_vec[2]
    mu4 = mu_vec[3]
    mu5 = mu_vec[4]
    mu6 = mu_vec[5]
    
    # b,c | a
    df2 = calc_df(t2, mu2).groupby(['a', 'b', 'c']).sum().reset_index()
    df2_ = df2.groupby(['a', 'b']).apply(lambda x: dict(zip(x.c, x.prob))).reset_index()
    df2_.columns = ['a', 'b', 'val']
    df2_ = df2_.groupby('a').apply(lambda x: dict(zip(x.b, x.val))).to_dict()
    # df2_[a][b][c]

    # b',c' | a'
    df4 = calc_df(t4, mu4).groupby(['a', 'b', 'c']).sum().reset_index()
    df4_ = df4.groupby(['a', 'b']).apply(lambda x: dict(zip(x.c, x.prob))).reset_index()
    df4_.columns = ['a', 'b', 'val']
    df4_ = df4_.groupby('a').apply(lambda x: dict(zip(x.b, x.val))).to_dict()
    # df4_[a'][b'][c']
    
    # a | A
    df1 = wrapper_2(mu1, t1)
    df1_ = df1.groupby(['A']).apply(lambda x: dict(zip(x.a, x.prob))).to_dict()
    # df1_[A][a]

    # B | b
    df3 = wrapper_2(mu3, t3)
    df3_ = df3.groupby(['A']).apply(lambda x: dict(zip(x.a, x.prob))).to_dict()
    # df3_[b][B]

    # D | c'
    df5 = wrapper_2(mu5, t5)
    df5_ = df5.groupby(['A']).apply(lambda x: dict(zip(x.a, x.prob))).to_dict()
    # df5_[c'][D]

    # C | b'
    df6 = wrapper_2(mu6, t6)
    df6_ = df6.groupby(['A']).apply(lambda x: dict(zip(x.a, x.prob))).to_dict()
    # df5_[b'][C]
    
    emissions = {}
    for A in ['A', 'C', 'T', 'G']:
        for B in ['A', 'C', 'T', 'G']:
            for C in ['A', 'C', 'T', 'G']:
                for D in ['A', 'C', 'T', 'G']:
                    acc = 0
                    for a in ['A', 'C', 'T', 'G']:
                        for b in ['A', 'C', 'T', 'G']:
                            for c in ['A', 'C', 'T', 'G']:
                                for a_ in ['A', 'C', 'T', 'G']:
                                    for b_ in ['A', 'C', 'T', 'G']:
                                        for c_ in ['A', 'C', 'T', 'G']:
                                            res = 1
                                            res = res*df1_[A][a]
                                            res = res*df1_[b][B]
                                            res = res*df2_[a][b][c]
                                            res = res*df3_[c][a_]
                                            res = res*df4_[a_][b_][c_]
                                            res = res*df6_[b_][C]
                                            res = res*df5_[c_][D]
                                            acc += res
                    emissions[A+B+C+D] = acc/4
    
    return emissions

In [58]:
t = [0.2,0.2,0.3]
mu = [1,1.5,0.2]
Q = [rate_mat_JC69(i) for i in mu]
b_given_a_to_dict_a_b(p_b_c_given_a_single_coal(t,Q))

{'A': {'A': 0.6784067978866112,
  'G': 0.10719773403779628,
  'C': 0.10719773403779628,
  'T': 0.10719773403779628},
 'C': {'A': 0.10719773403779627,
  'G': 0.10719773403779626,
  'C': 0.6784067978866111,
  'T': 0.10719773403779627},
 'G': {'A': 0.10719773403779631,
  'G': 0.6784067978866111,
  'C': 0.10719773403779628,
  'T': 0.10719773403779628},
 'T': {'A': 0.10719773403779631,
  'G': 0.10719773403779628,
  'C': 0.10719773403779628,
  'T': 0.6784067978866113}}

In [49]:
t = 2
mu = 1
Q = rate_mat_JC69(mu)
df = p_b_c_given_a_single_coal(t, Q)
df.groupby(['a'])['prob'].sum()

a
A    1.0
C    1.0
G    1.0
T    1.0
Name: prob, dtype: float64

True

In [70]:
nt = ['A', 'G', 'C', 'T']


In [62]:
t = 0.01
mu = 1
Q = rate_mat_JC69(mu)
df2 = p_b_c_given_a(t, Q)
b_c_given_a_to_dict_a_b_c(df2)

{'A': {'A': {'A': 0.9888308144199243,
   'C': 0.0012437707813540001,
   'G': 0.0012437707813540001,
   'T': 0.001243770781354},
  'C': {'A': 0.0012355202973190673,
   'C': 0.0012355202973190669,
   'G': 4.125242017466484e-06,
   'T': 4.125242017466484e-06},
  'G': {'A': 0.0012355202973190673,
   'C': 4.125242017466484e-06,
   'G': 0.0012355202973190673,
   'T': 4.125242017466484e-06},
  'T': {'A': 0.0012355202973190673,
   'C': 4.125242017466484e-06,
   'G': 4.125242017466484e-06,
   'T': 0.0012355202973190673}},
 'C': {'A': {'A': 0.0012355202973190673,
   'C': 0.0012355202973190673,
   'G': 4.125242017466484e-06,
   'T': 4.125242017466484e-06},
  'C': {'A': 0.0012437707813540001,
   'C': 0.9888308144199243,
   'G': 0.0012437707813540001,
   'T': 0.0012437707813540001},
  'G': {'A': 4.125242017466484e-06,
   'C': 0.0012355202973190667,
   'G': 0.0012355202973190673,
   'T': 4.125242017466484e-06},
  'T': {'A': 4.125242017466484e-06,
   'C': 0.0012355202973190669,
   'G': 4.125242017466

In [51]:
df2_

{'A': {'A': {'A': 0.21416346174506642,
   'G': 0.1080830895954234,
   'C': 0.10808308959542343,
   'T': 0.10808308959542343},
  'C': {'A': 0.04577933356079768,
   'G': 0.031151878017312867,
   'C': 0.04577933356079768,
   'T': 0.031151878017312867},
  'G': {'A': 0.04577933356079768,
   'G': 0.045779333560797685,
   'C': 0.031151878017312867,
   'T': 0.031151878017312867},
  'T': {'A': 0.04577933356079768,
   'G': 0.031151878017312867,
   'C': 0.031151878017312867,
   'T': 0.045779333560797685}},
 'C': {'A': {'A': 0.04577933356079768,
   'G': 0.031151878017312867,
   'C': 0.04577933356079767,
   'T': 0.031151878017312867},
  'C': {'A': 0.10808308959542343,
   'G': 0.10808308959542343,
   'C': 0.21416346174506648,
   'T': 0.10808308959542343},
  'G': {'A': 0.031151878017312867,
   'G': 0.045779333560797685,
   'C': 0.04577933356079768,
   'T': 0.031151878017312874},
  'T': {'A': 0.031151878017312867,
   'G': 0.031151878017312867,
   'C': 0.045779333560797685,
   'T': 0.04577933356079769}

In [None]:
def g(u,v,prm=prmv,mu=mu,t=t):
    tmp = 1
    tm = [-mu*u,-mu*u,-mu*v,-mu*(v+u),-mu*(t-v-u)]
    for i in range(5):
        tmp = tmp*(1/4+prm[i]*np.exp(tm[i]))
    tmp = tmp*3*np.exp(-3*u)*np.exp(-(v-u))
    return tmp

In [121]:
t = 0.2
# t = np.inf
mu = 0.1
prmv = [0, 0, 0, 0, 0]
k = 1+0.5*np.exp(-3*t)-1.5*np.exp(-t)

In [116]:
def g(u,v,prm=prmv,mu=mu,t=t):
    tmp = 1
    tm = [-mu*u,-mu*u,-mu*v,-mu*(v+u),-mu*(t-v-u)]
    for i in range(5):
        tmp = tmp*(1/4+prm[i]*np.exp(tm[i]))
    tmp = tmp*3*np.exp(-3*u)*np.exp(-(v-u))
    return tmp
def g_wrapper(u,v):
    return g(u,v,prm=prmv,mu=mu,t=t)

In [122]:
# t = np.inf
arr = []
for aa in ['A', 'T', 'G', 'C']:
    for bb in ['A', 'T', 'G', 'C']:
        for cc in ['A', 'T', 'G', 'C']:
            for dd in ['A', 'T', 'G', 'C']:
                cumsum = 0
                for ee in ['A', 'T', 'G', 'C']:
                    for ff in ['A', 'T', 'G', 'C']:
                        prmv[0] = 3/4 if aa==ee else -1/4
                        prmv[1] = 3/4 if ee==bb else -1/4
                        prmv[2] = 3/4 if ee==ff else -1/4
                        prmv[3] = 3/4 if ff==cc else -1/4
                        prmv[4] = 3/4 if ff==dd else -1/4
                        res, err = dblquad(lambda x, y: g(y,x,prm=prmv,mu=mu,t=t), 0, t, lambda x: x, lambda x: t)
                        cumsum += res
                arr.append([aa, bb, cc, dd, cumsum/k])
df = pd.DataFrame(arr, columns = ['a', 'b', 'c', 'd', 'prob'])

$P(b,c,d|a)=$

In [123]:
df

Unnamed: 0,a,b,c,d,prob
0,A,A,A,A,0.195468
1,A,A,A,T,0.195468
2,A,A,A,G,0.195468
3,A,A,A,C,0.195468
4,A,A,T,A,0.014271
...,...,...,...,...,...
251,C,C,G,C,0.014271
252,C,C,C,A,0.195468
253,C,C,C,T,0.195468
254,C,C,C,G,0.195468
