In [1]:
import numpy as np
from scipy.integrate import dblquad
from scipy.integrate import quad
import pandas as pd
from scipy.linalg import expm

In [2]:
def rate_mat_JC69(mu):
    """
    This function returns the rate matrix for the JC69 model. 
    
    Parameters
    ----------
    mu : numeric
        Mutation rate
    """
    return np.full((4, 4), mu/4)-np.diag([mu, mu, mu, mu])
def g_single_coal(Q, aa, bb, cc, dd, t, u):
    """
    This function calculates the probability of observing the 
    nucleotides b, c and d given a, t, u and Q. a and b are the
    nucleotides at the leaf nodes, while d is the nucleotide at 
    the coalescent node and c is the nucleotide at the root. t
    is the total time of the interval, and u is the time from the
    leaves to node d.
    
    P(b = bb, c = cc, d = dd | a == aa, Q, t, u)
    
          c     -> root node
          |
        __d__   -> coalescent node
        |   |   
        a   b   -> leaf nodes
    
    Parameters
    ----------
    Q : numpy array
        A 4x4 rate matrix for any substitution model
    aa : integer
        Index of the nucleotide corresponding to a
    bb : integer
        Index of the nucleotide corresponding to b
    cc : integer
        Index of the nucleotide corresponding to c
    dd : integer
        Index of the nucleotide corresponding to d
    t : numeric
        Total time of the interval (from a/b to c)
    u : numeric
        Total time until coalescence (from a/b to d)
    """
    return np.exp(-u)*(expm(u*Q)[aa,dd])*(expm(u*Q)[dd,bb])*(expm((t-u)*Q)[dd,cc])
def g_double_coal(Q, aa, bb, cc, dd, ee, ff, t, u, v):
    tmp = 1
    tmp = tmp*(expm(Q*u)[aa,ee])
    tmp = tmp*(expm(Q*u)[ee,bb])
    tmp = tmp*(expm(Q*(v-u))[ee,ff])
    tmp = tmp*(expm(Q*v)[ff,cc])
    tmp = tmp*(expm(Q*(t-v))[ff,dd])
    tmp = tmp*3*np.exp(-3*u)*np.exp(-(v-u))
    return tmp
def p_b_c_given_a_single_coal(t, Q):
    """
    This function calculates the probability of observing the 
    nucleotides b and c given a, t and Q. a and b are the
    nucleotides at the leaf nodes, while c is the nucleotide at the root. 
    t is the total time of the interval. It performs the integral 
    of g_single_coal from 0 to t with respect to u, and it sums
    over all possible nucleotides for d. 
    
    P(b = bb, c = cc | a == aa, Q, t)
    
          c     -> root node
          |
        __d__   -> coalescent node
        |   |   
        a   b   -> leaf nodes
    
    Parameters
    ----------
    t : numeric
        Total time of the interval (from a/b to c)
    Q : numpy array
        A 4x4 rate matrix for any substitution model
    """
    nt = ['A', 'G', 'C', 'T']
    arr = np.empty((4**3, 4))
    acc = 0
    for aa in range(4):
        for bb in range(4):
            for cc in range(4):
                cumsum = 0
                for dd in range(4):
                    res, err = quad(lambda u: g_single_coal(Q, aa, bb, cc, dd, t, u), 0, t)
                    cumsum += 1/(1-np.exp(-t))*res
                arr[acc] = [aa,bb,cc,cumsum]
                acc += 1
    df = pd.DataFrame(arr, columns = ['a', 'b', 'c', 'prob'])
    df['a'] = [nt[int(i)] for i in df['a']]
    df['b'] = [nt[int(i)] for i in df['b']]
    df['c'] = [nt[int(i)] for i in df['c']]
    return df
def p_b_c_given_a_double_coal(t, Q):
    """
    P(b = bb, c = cc | a == aa, Q, t)
    
          c     -> root node
          |
        __d__   -> coalescent node
        |   |   
        a   b   -> leaf nodes
    
    Parameters
    ----------
    t : numeric
        Total time of the interval (from a/b to c)
    Q : numpy array
        A 4x4 rate matrix for any substitution model
    """
    nt = ['A', 'G', 'C', 'T']
    arr = np.empty((4**4, 5))
    acc = 0
    for aa in range(4):
        for bb in range(4):
            for cc in range(4):
                for dd in range(4):
                    cumsum = 0
                    for ee in range(4):
                        for ff in range(4):
                            res, err = dblquad(lambda u, v: g_double_coal(Q, aa, bb, cc, dd, ee, ff, t, u, v), 0, t, lambda u: u, t)
                            cumsum += res
                    arr[acc] = [aa,bb,cc,dd,cumsum]
                    acc += 1
    df = pd.DataFrame(arr, columns = ['a', 'b', 'c', 'd', 'prob'])
    df['a'] = [nt[int(i)] for i in df['a']]
    df['b'] = [nt[int(i)] for i in df['b']]
    df['c'] = [nt[int(i)] for i in df['c']]
    df['d'] = [nt[int(i)] for i in df['d']]
    return df

def p_b_given_a(t, Q):
    """
    This function calculates the probability of observing the 
    nucleotide b given a, t and Q. a is the starting nucleotide,
    while b is the end nucleotide. t is the total time of the interval. 
    
    P(b = bb | a == aa, Q, t)
    
    Parameters
    ----------
    t : numeric
        Total time of the interval (from a/b to c)
    Q : numpy array
        A 4x4 rate matrix for any substitution model
    """
    nt = ['A', 'G', 'C', 'T']
    mat = np.zeros((4, 4))
    for i in range(len(t)):
        mat = mat+t[i]*Q[i]
    arr = np.empty((4**2, 3))
    acc = 0
    mat = expm(mat)
    for aa in range(4):
        for bb in range(4):
            arr[acc] = [aa,bb,mat[aa,bb]]
            acc += 1
    df = pd.DataFrame(arr, columns = ['a', 'b', 'prob'])
    df['a'] = [nt[int(i)] for i in df['a']]
    df['b'] = [nt[int(i)] for i in df['b']]
    return df
def b_c_d_given_a_to_dict_a_b_c_d(df):
    """
    This function converts the data frame as outputted
    by p_b_c_given_a_single_coal or p_b_c_given_a_double_coal
    into a dictionary. How to use the dictionary:
    
        P(b, c, d | a) = dct[a][b][c][d]
    
    Parameters
    ----------
    df : data frame
        As outputted by p_b_c_given_a_double_coal
    """
    # df = df.groupby(['a', 'b', 'c', 'd']).sum().reset_index()
    df = df.groupby(['a', 'b', 'c']).apply(lambda x: dict(zip(x.d, x.prob))).reset_index()
    df.columns = ['a', 'b', 'c', 'val']
    df = df.groupby(['a', 'b']).apply(lambda x: dict(zip(x.c, x.val))).reset_index()
    df.columns = ['a', 'b', 'val']
    df = df.groupby('a').apply(lambda x: dict(zip(x.b, x.val))).to_dict()
    return df
def b_c_given_a_to_dict_a_b_c(df):
    """
    This function converts the data frame as outputted
    by p_b_c_given_a_single_coal or p_b_c_given_a_double_coal
    into a dictionary. How to use the dictionary:
    
        P(b, c | a) = dct[a][b][c]
    
    Parameters
    ----------
    df : data frame
        As outputted by p_b_c_given_a_single_coal
    """
    # df = df.groupby(['a', 'b', 'c']).sum().reset_index()
    df = df.groupby(['a', 'b']).apply(lambda x: dict(zip(x.c, x.prob))).reset_index()
    df.columns = ['a', 'b', 'val']
    df = df.groupby('a').apply(lambda x: dict(zip(x.b, x.val))).to_dict()
    return df
def b_given_a_to_dict_a_b(df):
    """
    This function converts the data frame as outputted
    by p_b_given_a into a dictionary. How to use the dictionary:
    
        P(b | a) = dct[a][b]
    
    Parameters
    ----------
    df : data frame
        As outputted by p_b_given_a
    """
    return df.groupby(['a']).apply(lambda x: dict(zip(x.b, x.prob))).to_dict()

In [102]:
t = [1, 0]
Q = [rate_mat_JC69(0.1), rate_mat_JC69(0.1)]
p_b_given_a(t, Q)

Unnamed: 0,a,b,prob
0,A,A,0.928628
1,A,G,0.023791
2,A,C,0.023791
3,A,T,0.023791
4,G,A,0.023791
5,G,G,0.928628
6,G,C,0.023791
7,G,T,0.023791
8,C,A,0.023791
9,C,G,0.023791


In [3]:
def g_single_coal_JC69(mu, aa, bb, cc, dd, t, u):
    prm = np.zeros(3)
    prm[0] = 3/4 if aa==dd else -1/4
    prm[1] = 3/4 if dd==bb else -1/4
    prm[2] = 3/4 if dd==cc else -1/4
    tmp = 1
    tm = [-mu*u,-mu*u,-mu*(t-u)]
    for i in range(3):
        tmp = tmp*(1/4+prm[i]*np.exp(tm[i]))
    tmp = tmp*np.exp(-u)
    return tmp
def p_b_c_given_a_JC69(t, mu):
    nt = ['A', 'G', 'C', 'T']
    arr = np.empty((4**3, 4))
    acc = 0
    for aa in range(4):
        for bb in range(4):
            for cc in range(4):
                cumsum = 0
                for dd in range(4):
                    res, err = quad(lambda u: g_single_coal_JC69(mu, aa, bb, cc, dd, t, u), 0, t)
                    cumsum += res/(1-np.exp(-t))
                arr[acc] = [aa,bb,cc,cumsum]
                acc += 1
    df = pd.DataFrame(arr, columns = ['a', 'b', 'c', 'prob'])
    df['a'] = [nt[int(i)] for i in df['a']]
    df['b'] = [nt[int(i)] for i in df['b']]
    df['c'] = [nt[int(i)] for i in df['c']]
    return df

In [4]:
def g_double_coal_JC69(mu, aa, bb, cc, dd, ee, ff, t, u, v):
    prm = np.zeros(5)
    prm[0] = 3/4 if aa==ee else -1/4
    prm[1] = 3/4 if ee==bb else -1/4
    prm[2] = 3/4 if ee==ff else -1/4
    prm[3] = 3/4 if ff==cc else -1/4
    prm[4] = 3/4 if ff==dd else -1/4
    tmp = 1
    tm = [-mu*u,-mu*u,-mu*(v-u),-mu*v,-mu*(t-v)]
    for i in range(5):
        tmp = tmp*(1/4+prm[i]*np.exp(tm[i]))
    tmp = tmp*3*np.exp(-3*u)*np.exp(-(v-u))
    return tmp
def p_b_c_d_given_a_JC69(t, mu):
    nt = ['A', 'G', 'C', 'T']
    arr = np.empty((4**4, 5))
    acc = 0
    for aa in range(4):
        for bb in range(4):
            for cc in range(4):
                for dd in range(4):
                    cumsum = 0
                    for ee in range(4):
                        for ff in range(4):
                            res, err = dblquad(lambda v, u: g_double_coal_JC69(mu, aa, bb, cc, dd, ee, ff, t, u, v), 0, t, lambda u: u, t)
                            cumsum += res
                    arr[acc] = [aa,bb,cc,dd,cumsum]
                    acc += 1
    df = pd.DataFrame(arr, columns = ['a', 'b', 'c', 'd', 'prob'])
    df['a'] = [nt[int(i)] for i in df['a']]
    df['b'] = [nt[int(i)] for i in df['b']]
    df['c'] = [nt[int(i)] for i in df['c']]
    df['d'] = [nt[int(i)] for i in df['d']]
    df['prob'] = [i/(1+0.5*np.exp(-3*t)-1.5*np.exp(-t)) for i in df['prob']]
    return df

In [69]:
def calc_emissions_single_JC69(
    a0_a1_t_vec, b0_b1_t_vec, a1b1_ab0_t, ab0_ab1_t_vec, 
    ab1c1_abc0_t, c0_c1_t_vec, d0_abc0_t_vec,
    a0_a1_mu_vec, b0_b1_mu_vec, a1b1_ab0_mu, ab0_ab1_mu_vec, 
    ab1c1_abc0_mu, c0_c1_mu_vec, d0_abc0_mu_vec
):
  
    # a0 to a1
    Q_vec = [rate_mat_JC69(i) for i in a0_a1_mu_vec]
    df_a = p_b_given_a(t = a0_a1_t_vec, Q = Q_vec)
    df_a = b_given_a_to_dict_a_b(df_a)
    # df_a[a0][a1]
    
    # b1 to b0
    Q_vec = [rate_mat_JC69(i) for i in list(reversed(b0_b1_mu_vec))]
    df_b = p_b_given_a(t = list(reversed(b0_b1_t_vec)), Q = Q_vec)
    df_b = b_given_a_to_dict_a_b(df_b)
    # df_b[b1][b0]
    
    # c1 to c0
    Q_vec = [rate_mat_JC69(i) for i in list(reversed(c0_c1_mu_vec))]
    df_c = p_b_given_a(t = list(reversed(c0_c1_t_vec)), Q = Q_vec)
    df_c = b_given_a_to_dict_a_b(df_c)
    # df_c[c1][c0]
    
    # abc0 to d0
    Q_vec = [rate_mat_JC69(i) for i in list(reversed(d0_abc0_mu_vec))]
    df_d = p_b_given_a(t = list(reversed(d0_abc0_t_vec)), Q = Q_vec)
    df_d = b_given_a_to_dict_a_b(df_d)
    # df_d[abc0][d0]
    
    # ab0 to ab1
    Q_vec = [rate_mat_JC69(i) for i in ab0_ab1_mu_vec]
    df_ab = p_b_given_a(t = ab0_ab1_t_vec, Q = Q_vec)
    df_ab = b_given_a_to_dict_a_b(df_ab)
    # df_ab[ab0][ab1]
    
    # First coalescent
    df_first = p_b_c_given_a_JC69(t = a1b1_ab0_t, mu = a1b1_ab0_mu)
    df_first = b_c_given_a_to_dict_a_b_c(df_first)
    # df_first[a1][b1][ab0]
    
    # Second coalescent
    df_second = p_b_c_given_a_JC69(t = ab1c1_abc0_t, mu = ab1c1_abc0_mu)
    df_second = b_c_given_a_to_dict_a_b_c(df_second)
    # df_second[a1][b1][ab0]
    
    emissions = {}
    for a0 in ['A', 'C', 'T', 'G']:
        for b0 in ['A', 'C', 'T', 'G']:
            for c0 in ['A', 'C', 'T', 'G']:
                for d0 in ['A', 'C', 'T', 'G']:
                    acc = 0
                    for a1 in ['A', 'C', 'T', 'G']:
                        for b1 in ['A', 'C', 'T', 'G']:
                            for c1 in ['A', 'C', 'T', 'G']:
                                for ab0 in ['A', 'C', 'T', 'G']:
                                    for ab1 in ['A', 'C', 'T', 'G']:
                                        for abc0 in ['A', 'C', 'T', 'G']:
                                            res = 1
                                            res = res*df_a[a0][a1]
                                            res = res*df_b[b1][b0]
                                            res = res*df_first[a1][ab0][b1]
                                            res = res*df_ab[ab0][ab1]
                                            res = res*df_second[ab1][abc0][c1]
                                            res = res*df_c[c1][c0]
                                            res = res*df_d[abc0][d0]
                                            acc += res
                    emissions[a0+b0+c0+d0] = acc/4
                
    return emissions

In [72]:
def calc_emissions_double_JC69(
    a0_a1_t_vec, b0_b1_t_vec, c0_c1_t_vec, a1b1c1_abc0_t, d0_abc0_t_vec,
    a0_a1_mu_vec, b0_b1_mu_vec, c0_c1_mu_vec, a1b1c1_abc0_mu, d0_abc0_mu_vec
):
    # a0 to a1
    Q_vec = [rate_mat_JC69(i) for i in a0_a1_mu_vec]
    df_a = p_b_given_a(t = a0_a1_t_vec, Q = Q_vec)
    df_a = b_given_a_to_dict_a_b(df_a)
    # df_a[a0][a1]
    
    # b1 to b0
    Q_vec = [rate_mat_JC69(i) for i in list(reversed(b0_b1_mu_vec))]
    df_b = p_b_given_a(t = list(reversed(b0_b1_t_vec)), Q = Q_vec)
    df_b = b_given_a_to_dict_a_b(df_b)
    # df_b[b1][b0]
    
    # c1 to c0
    Q_vec = [rate_mat_JC69(i) for i in list(reversed(c0_c1_mu_vec))]
    df_c = p_b_given_a(t = list(reversed(c0_c1_t_vec)), Q = Q_vec)
    df_c = b_given_a_to_dict_a_b(df_c)
    # df_c[c1][c0]
    
    # abc0 to d0 
    Q_vec = [rate_mat_JC69(i) for i in list(reversed(d0_abc0_mu_vec))]
    df_d = p_b_given_a(t = list(reversed(d0_abc0_t_vec)), Q = Q_vec)
    df_d = b_given_a_to_dict_a_b(df_d)
    # df_d[abc0][d0]
    
    # Double coalescent
    df_double = p_b_c_d_given_a_JC69(t = a1b1c1_abc0_t, mu = a1b1c1_abc0_mu)
    df_double = b_c_d_given_a_to_dict_a_b_c_d(df_double)
    # df_double[a1][b1][c1][abc0]
    
    emissions = {}
    for a0 in ['A', 'C', 'T', 'G']:
        for b0 in ['A', 'C', 'T', 'G']:
            for c0 in ['A', 'C', 'T', 'G']:
                for d0 in ['A', 'C', 'T', 'G']:
                    acc = 0
                    for a1 in ['A', 'C', 'T', 'G']:
                        for b1 in ['A', 'C', 'T', 'G']:
                            for c1 in ['A', 'C', 'T', 'G']:
                                for abc0 in ['A', 'C', 'T', 'G']:
                                    res = 1
                                    res = res*df_a[a0][a1]
                                    res = res*df_b[b1][b0]
                                    res = res*df_c[c1][c0]
                                    res = res*df_double[a1][b1][c1][abc0]
                                    res = res*df_d[abc0][d0]
                                    acc += res
                    emissions[a0+b0+c0+d0] = acc/4
    return emissions

In [89]:
mu = 0.1
single = calc_emissions_single_JC69(
a0_a1_t_vec = [1, 1], b0_b1_t_vec = [1, 1], a1b1_ab0_t = 1, ab0_ab1_t_vec = [1, 1], 
ab1c1_abc0_t = 1, c0_c1_t_vec = [1, 3, 1], d0_abc0_t_vec = [7, 2, 1],
a0_a1_mu_vec = [mu, mu], b0_b1_mu_vec = [mu, mu], a1b1_ab0_mu = mu, ab0_ab1_mu_vec = [mu, mu], 
ab1c1_abc0_mu = mu, c0_c1_mu_vec = [mu, mu, mu], d0_abc0_mu_vec = [mu, mu, mu]
)
np.array(list(single.values())).sum()

1.0000000000000007

In [90]:
mu = 0.1
double = calc_emissions_double_JC69(
a0_a1_t_vec = [1, 3, 1], b0_b1_t_vec = [1, 3, 1], c0_c1_t_vec = [1, 3, 1], a1b1c1_abc0_t = 1, d0_abc0_t_vec = [7, 2, 1],
a0_a1_mu_vec = [mu, mu, mu], b0_b1_mu_vec = [mu, mu, mu], c0_c1_mu_vec = [mu, mu, mu], a1b1c1_abc0_mu = mu, d0_abc0_mu_vec = [mu, mu, mu]
)
np.array(list(double.values())).sum()

1.0000000000000002

In [91]:
single

{'AAAA': 0.05021515598993559,
 'AAAC': 0.016614698055837995,
 'AAAT': 0.01661469805583799,
 'AAAG': 0.01661469805583799,
 'AACA': 0.009474845747568492,
 'AACC': 0.006841767729390447,
 'AACT': 0.0045359013512987646,
 'AACG': 0.004535901351298756,
 'AATA': 0.009474845747568494,
 'AATC': 0.004535901351298761,
 'AATT': 0.006841767729390435,
 'AATG': 0.004535901351298755,
 'AAGA': 0.009474845747568498,
 'AAGC': 0.0045359013512987515,
 'AAGT': 0.0045359013512987515,
 'AAGG': 0.006841767729390425,
 'ACAA': 0.003928209770324481,
 'ACAC': 0.0017746716594937372,
 'ACAT': 0.0014795355945930165,
 'ACAG': 0.0014795355945930165,
 'ACCA': 0.0017385989372207169,
 'ACCC': 0.0037090741186036804,
 'ACCT': 0.001418509811411126,
 'ACCG': 0.001418509811411126,
 'ACTA': 0.0010168073473405493,
 'ACTC': 0.000991854286431694,
 'ACTT': 0.001117214379851535,
 'ACTG': 0.0006967182215309853,
 'ACGA': 0.0010168073473405734,
 'ACGC': 0.0009918542864317044,
 'ACGT': 0.0006967182215309854,
 'ACGG': 0.001117214379851546

In [93]:
[3]+[3]*int(2!=0)

[3, 3]

In [94]:
import numpy as np
from scipy.stats import truncexpon
from scipy.stats import expon

def cutpoints_AB(n_int_AB, t_AB, coal_AB):
    """
    This function returns a the cutpoints for the
    intervals for the two-sequence CTMC. The cutpoints
    will be defined by the quantiles of a truncated
    exponential distribution. 
    
    Parameters
    ----------
    n_int_AB : integer
        Number of intervals in the two-sequence CTMC.
    t_AB : float
        Total time interval of the two-sequence CTMC
    coal_AB : float
        coalescent rate of the two-sequence CTMC.
    """
    # Define probabilities for quantiles
    quantiles_AB = np.array(list(range(n_int_AB+1)))/n_int_AB
    # Define truncexpon shape parameters
    lower, upper, scale = 0, t_AB, 1/coal_AB
    # Get quantiles
    cut_AB = truncexpon.ppf(quantiles_AB, b=(upper-lower)/scale, loc=lower, scale=scale)
    return cut_AB 

In [99]:
cutpoints_AB(3, 2, 1)

array([0.        , 0.33998861, 0.85906752, 2.        ])