In [128]:
from IPython.display import SVG
import numpy as np
import pandas as pd
from scipy.linalg import expm
import itertools
from scipy.stats import truncexpon
import ast
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [3]:
def iter_lst_to_ggplot(iter_lst_2):
    """
    Convert a list of states into a long-format data frame for plotting.

    Each element of `iter_lst_2` is one state, i.e. a sequence of
    (left, right) pairs. Every pair becomes one row describing a
    horizontal segment:

    - left == 0: the segment spans [0, 1] and only has a right dot.
    - right == 0 (and left != 0): the segment spans [-1, 0] and only has
      a left dot.
    - otherwise: the segment spans [-1, 1] and has both dots.

    A missing dot is encoded as the string '-', a present one as the
    string form of its value.

    Returns a DataFrame with columns 'id' (index of the state),
    'seg_y' (index of the pair within the state), 'seg_xmin', 'seg_xmax',
    'dot_color_left' and 'dot_color_right'. An empty input yields an
    empty frame with the same columns.
    """
    columns = ['id', 'seg_y', 'seg_xmin', 'seg_xmax', 'dot_color_left', 'dot_color_right']
    # Collect plain records and build the frame once: appending with
    # dat.loc[len(dat)] re-allocates on every row and is quadratic.
    rows = []
    for state_idx, state in enumerate(iter_lst_2):
        for seg_idx, pair in enumerate(state):
            left_val, right_val = pair[0], pair[1]
            if left_val == 0:
                xmin, xmax = 0, 1
                left, right = '-', str(right_val)
            elif right_val == 0:
                xmin, xmax = -1, 0
                left, right = str(left_val), '-'
            else:
                xmin, xmax = -1, 1
                left, right = str(left_val), str(right_val)
            rows.append([state_idx, seg_idx, xmin, xmax, left, right])
    return pd.DataFrame(rows, columns=columns)

In [4]:
%%R

library(tidyverse)

# Build a faceted ggplot of the states stored in `dat`.
# `dat` must provide the columns id, seg_y, seg_xmin, seg_xmax,
# dot_color_left and dot_color_right. Each row is drawn as one horizontal
# segment with a point at either end, and each `id` gets its own panel.
# Returns the ggplot object.
plot_states <- function(dat) {
    plt <- as_tibble(dat) %>%
        # '-' marks an endpoint without a value; recode it to NA so the
        # manual scales below can treat it separately
        mutate(
            dot_color_left = ifelse(dot_color_left == '-', NA, dot_color_left),
            dot_color_right = ifelse(dot_color_right == '-', NA, dot_color_right)
        ) %>%
        ggplot() +
        # One horizontal line per row, at height seg_y
        geom_segment(aes(x = seg_xmin, xend = seg_xmax,
                         y = seg_y, yend = seg_y)) +
        # Left endpoint: outline keyed on whether the value is NA, fill keyed
        # on the value itself, shape keyed on whether the value equals 1
        geom_point(aes(x = seg_xmin, y = seg_y, 
                       color = is.na(dot_color_left), 
                       fill = dot_color_left,
                  shape = dot_color_left == 1),
                  size = 4) +
        # Right endpoint: same mappings, using dot_color_right
        geom_point(aes(x = seg_xmax, y = seg_y, 
                       color = is.na(dot_color_right), 
                       fill = dot_color_right,
                   shape = dot_color_right == 1),
                   size = 4) +
        theme_void() +
        # One panel per state id
        facet_wrap(~id, scales = 'free') +
        # Fill palette for the values '1'..'7'; NA endpoints are transparent
        scale_fill_manual(
            na.value = 'transparent',
            values = c('black', 'white', 'purple', 'yellow', 'orange', 'green', 'black'),
            breaks = as.character(c(1, 2, 3, 4, 5, 6, 7))
        ) +
        # Outline colours for the logical is.na() mapping
        scale_color_manual(
            values = c('black', 'transparent')
        ) +
        # 24 and 21 are the fillable triangle and circle point shapes
        scale_shape_manual(values=c(24, 21)) +
        # Pad both axes so the points are not clipped by the panel border
        scale_x_continuous(expand = c(0.2, 0.2)) +
        scale_y_continuous(expand = c(0.2, 0.2)) +
        # No legend; keep only a black frame around each panel
        theme(legend.position = 'none',
              panel.border = element_rect(colour = "black", fill = NA, size = 1),
              axis.line=element_blank(),
              panel.background=element_blank(),panel.grid.major=element_blank(),
              panel.grid.minor=element_blank(),plot.background=element_blank()) 
    plt
}

R[write to console]: ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

R[write to console]: ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.5     ✔ dplyr   1.0.7
✔ tidyr   1.1.4     ✔ stringr 1.4.0
✔ readr   2.0.2     ✔ forcats 0.5.1

R[write to console]: ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()



In [5]:
def get_trans_mat(iter_lst, coal, rho):
    """
    Build the rate matrix of a CTMC over the states listed in `iter_lst`.

    For every state, the states returned by `recombination` receive the
    rate `rho` and the states returned by `coalescent` receive the rate
    `coal`. Coalescence is applied last, so it overrides recombination
    when both lead to the same target. Each diagonal entry is then set
    to minus the sum of its row.

    Returns a square float array with one row and column per state, in
    the order of `iter_lst`.
    """
    n_states = len(iter_lst)
    rate_mat = np.zeros((n_states, n_states))
    for row, state in enumerate(iter_lst):
        # Order matters: recombination first, coalescence second.
        for event, rate in ((recombination, rho), (coalescent, coal)):
            targets = event(state)
            # Resolve every target before writing, as a missing state
            # must raise before the row is touched for this event.
            target_cols = [iter_lst.index(target) for target in targets]
            for col in target_cols:
                rate_mat[row, col] = rate
    for row, row_rates in enumerate(rate_mat):
        rate_mat[row, row] = -sum(row_rates)
    return rate_mat

In [233]:
def combine_states(iter_lst_a, iter_lst_b, probs_a, probs_b):
    """
    Merge two independent state distributions into a joint one.

    Every state of `iter_lst_a` is concatenated with every state of
    `iter_lst_b` and sorted; the probability of the pair is the product
    of the two marginal probabilities. Pairs that produce the same
    combined state are collapsed by summing their probabilities.

    Returns a tuple (names, values): the string representation of each
    distinct combined state, in the order produced by the group-by, and
    the matching summed probabilities.
    """
    merged_states = []
    merged_probs = []
    for idx_a, state_a in enumerate(iter_lst_a):
        for idx_b, state_b in enumerate(iter_lst_b):
            merged_states.append(sorted(state_a + state_b))
            merged_probs.append(probs_a[idx_a] * probs_b[idx_b])
    # Key each combined state by its string form so it can be grouped
    table = pd.DataFrame()
    table['name'] = list(map(str, merged_states))
    table['value'] = merged_probs
    # Collapse duplicated states, adding up their probabilities
    collapsed = table.groupby("name", as_index=False).sum()
    names = list(collapsed['name'])
    values = list(collapsed['value'])
    return names, values

In [10]:
def trans_mat_num(trans_mat, coal, rho):
    """
    Turn a symbolic transition matrix into a numeric rate matrix.

    Parameters
    ----------
    trans_mat : 2-D array of str
        Each entry is either '0' (no transition) or an integer
        multiplier followed by 'C' (coalescence) or 'R' (recombination),
        e.g. '1C', '2R' or '10R'. A bare 'C' or 'R' means a multiplier
        of 1.
    coal : float
        Numeric coalescent rate substituted for 'C'.
    rho : float
        Numeric recombination rate substituted for 'R'.

    Returns
    -------
    numpy.ndarray
        Float matrix of the same shape. The diagonal is overwritten with
        (-1) * rowSums.

    Raises
    ------
    ValueError
        If an entry does not end in 'C' or 'R', or its multiplier is not
        an integer.
    """
    num_rows, num_cols = trans_mat.shape
    rates = np.zeros((num_rows, num_cols))
    rate_by_symbol = {'C': coal, 'R': rho}
    for i in range(num_rows):
        for j in range(num_cols):
            entry = str(trans_mat[i, j]).strip()
            if entry == '0':
                continue
            # The rate symbol is the LAST character and everything before
            # it is the multiplier, so multi-digit factors such as '10R'
            # are parsed correctly.
            factor, symbol = entry[:-1], entry[-1]
            if symbol not in rate_by_symbol:
                raise ValueError(
                    "unrecognised transition entry %r at (%d, %d)" % (entry, i, j)
                )
            rates[i, j] = (int(factor) if factor else 1) * rate_by_symbol[symbol]
    for i in range(num_rows):
        rates[i, i] = -sum(rates[i])
    return rates

In [45]:
def cutpoints_AB(n_int_AB, t_AB, coal_AB):
    """
    Cut points that split the interval [0, t_AB] into `n_int_AB` pieces
    of equal probability under an exponential distribution with rate
    `coal_AB` truncated at `t_AB`.

    Parameters
    ----------
    n_int_AB : int
        Number of intervals (must be positive).
    t_AB : float
        Upper truncation point of the distribution.
    coal_AB : float
        Coalescent rate. It is used as a rate, consistently with
        cutpoints_ABC, which divides by its coalescent rate.

    Returns
    -------
    numpy.ndarray
        The n_int_AB + 1 cut points, starting at 0 and ending at t_AB.
    """
    # Equally spaced probabilities 0, 1/n, ..., 1
    quantiles_AB = np.arange(n_int_AB + 1) / n_int_AB
    # scipy parameterises the exponential by its scale (the mean), which
    # is the inverse of the rate; passing the rate itself as the scale
    # gives wrong cut points whenever coal_AB != 1.
    lower, upper, scale = 0, t_AB, 1 / coal_AB
    # truncexpon's shape b is the truncation point in units of `scale`
    cut_AB = truncexpon.ppf(quantiles_AB, b=(upper - lower) / scale, loc=lower, scale=scale)
    return cut_AB

In [60]:
def cutpoints_ABC(n_int_ABC, coal_ABC):
    """
    Cut points that split [0, inf) into `n_int_ABC` intervals using the
    exponential quantile function -log(1 - p) / coal_ABC evaluated at
    p = 0, 1/n, ..., (n-1)/n.

    Returns a list of n_int_ABC + 1 values whose last element is inf.
    """
    finite_cuts = []
    for k in range(n_int_ABC):
        remaining_prob = 1 - k/n_int_ABC
        finite_cuts.append(-np.log(remaining_prob)/coal_ABC)
    # The last interval is open-ended
    return finite_cuts + [np.inf]

In [None]:
def get_tab_AB(n_states_AB):
    """
    Unfinished stub: it only allocates its working variables and
    implicitly returns None.

    TODO: complete the body or remove the cell; nothing in the visible
    notebook calls this function.
    """
    # Zero table with n_states_AB**2 + 2*n_states_AB + 1 rows and 9 columns
    tab = np.zeros((n_states_AB*n_states_AB+n_states_AB*2+1, 9))
    # Placeholder for row labels (never filled in)
    tab_names = []
    # Running counter (never used)
    acc = 0

In [115]:
def load_trans_mat(n_seq, trans_mat_dir='../02_state_space/trans_mats'):
    """
    Load the symbolic transition matrix for `n_seq` sequences.

    Reads `<trans_mat_dir>/trans_mat_simple_<n_seq>.csv`, a long-format
    table with the columns 'from', 'to' (state ids), 'from_str', 'to_str'
    (state names) and 'value' (the symbolic rate).

    Parameters
    ----------
    n_seq : int
        Number of sequences; selects the file to read.
    trans_mat_dir : str, optional
        Directory holding the CSV files. The default keeps the location
        that was previously hard-coded.

    Returns
    -------
    (numpy.ndarray, list of str)
        A square matrix of symbolic rates with '0' where no transition
        is listed, and the state names. Rows, columns and names all
        follow ascending state id.
    """
    df = pd.read_csv(f'{trans_mat_dir}/trans_mat_simple_{n_seq}.csv')
    # Every state seen as a source or a target, ordered by its id
    states = (
        pd.DataFrame({
            'names': pd.concat([df['from_str'], df['to_str']]),
            'values': pd.concat([df['from'], df['to']]),
        })
        .drop_duplicates()
        .sort_values(by=['values'])
    )
    state_ids = list(states['values'])
    # Reindex on the full id list: a plain pivot drops the row of any
    # state without outgoing transitions and the column of any state
    # without incoming ones, leaving a misaligned, non-square matrix.
    rate_table = (
        df.pivot(index='from', columns='to', values='value')
        .reindex(index=state_ids, columns=state_ids)
        .fillna('0')
    )
    return np.array(rate_table), list(states['names'])


In [243]:
def get_HMM_trans_mat(t_A,    t_B,    t_AB,    t_C, 
                      rho_A,  rho_B,  rho_AB,  rho_C,  rho_ABC, 
                      coal_A, coal_B, coal_AB, coal_C, coal_ABC,
                      n_int_AB, n_int_ABC):
    """
    Work in progress. Despite its name, this function currently returns
    `pi_AB`: the distribution over the two-sequence states obtained by
    combining the end-state distributions of the one-sequence CTMCs of
    A and B.

    Parameters
    ----------
    t_A, t_B, t_AB, t_C : float
        Waiting times; the t_A, t_B and t_C values multiply the
        one-sequence rate matrices before exponentiation.
    rho_A, rho_B, rho_AB, rho_C, rho_ABC : float
        Recombination rates.
    coal_A, coal_B, coal_AB, coal_C, coal_ABC : float
        Coalescent rates.
    n_int_AB, n_int_ABC : int
        Interval counts handed to cutpoints_AB / cutpoints_ABC.

    Returns
    -------
    list
        One probability per state name returned by load_trans_mat(2),
        in that order; 0 for states that cannot be reached by combining
        two one-sequence states.

    NOTE(review): only t_A, t_B, rho_A, rho_B, coal_A and coal_B affect
    the returned value at present. Everything after the `return` is
    unreachable.
    """
    
    ##########################
    ### One-sequence CTMCs ###
    ##########################
    
    # State names come back as strings; literal_eval turns them into
    # Python lists of tuples
    (trans_mat_1, state_space_1) = load_trans_mat(1)
    state_space_A = [ast.literal_eval(i) for i in state_space_1]
    (trans_mat_2, state_space_2) = load_trans_mat(2)
    # NOTE(review): state_space_AB and trans_mat_2 are not used yet
    state_space_AB = [ast.literal_eval(i) for i in state_space_2]
    
    # These are (2x2) matrices
    trans_mat_A = trans_mat_num(trans_mat_1, coal_A, rho_A)
    trans_mat_B = trans_mat_num(trans_mat_1, coal_B, rho_B)
    trans_mat_C = trans_mat_num(trans_mat_1, coal_C, rho_C)
    
    # These are (1x2) vectors
    # Row 0 of expm(Q*t) is the state distribution after time t when the
    # chain starts in the first state
    final_A = expm(trans_mat_A*t_A)[0]
    final_B = expm(trans_mat_B*t_B)[0]
    # NOTE(review): final_C is computed but not used yet
    final_C = expm(trans_mat_C*t_C)[0]
    
    # Joint distribution of A and B, keyed by the string form of the
    # combined state
    (comb_AB_name, comb_AB_value) = combine_states(state_space_A, state_space_A, final_A, final_B)
    # Align the joint distribution with the two-sequence state order,
    # giving probability 0 to states that were not produced
    pi_AB = [comb_AB_value[comb_AB_name.index(i)] if i in comb_AB_name else 0 for i in state_space_2]
        
    # NOTE(review): early return; the code below never runs
    return pi_AB
    
    #########################
    ### Two-sequence CTMC ###
    #########################
    
    cut_AB = cutpoints_AB(n_int_AB, t_AB, coal_AB)
    
    cut_ABC = cutpoints_ABC(n_int_ABC, coal_ABC)
    
    

In [242]:
# Example call: t_A = t_B = 1, t_AB = 2, t_C = 3, every recombination and
# coalescent rate set to 1, and 3 intervals for both discretisations.
# The displayed list is pi_AB, one entry per two-sequence state.
get_HMM_trans_mat(1, 1, 2, 3, 
                  1, 1, 1, 1, 1, 
                  1, 1, 1, 1, 1, 
                  3, 3)

[0.18691126810387715,
 0.4908421805556328,
 0.32224655134048985,
 0,
 0,
 0,
 0,
 0,
 0]

# Defining the coalescent hidden Markov model

The idea is to extend Mailund et al. (2011) to model ILS:
1. We will model the divergence of three isolated sequences using a two-nucleotide continuous-time Markov chain (CTMC). 
2. After some time, two of the sequences will be merged and their fate will be modeled with a two-nucleotide, two-sequence CTMC. The distribution of the starting states will be determined by the individual CTMCs in the previous step.
3. After some time, the resulting sequences will be mixed with the remaining isolated sequence. The starting distribution of the states will be determined by the two-sequence CTMC and the one-sequence CTMC of the last sequence. Now, we will define a discrete Markov jump process, where the states will correspond to either V0 (the coalescence has happened in the previous two-sequence CTMC), or a time-discretized state of other possible coalescences. 

## One-sequence CTMC

We will start by defining a one-sequence, two-nucleotide CTMC. The system will be formed of two possible states, either linked or unlinked:

In [22]:
%%R -o trans_mat_simple_1_edited -o states_simple_1

# Read the CSV file
# NOTE(review): this reads trans_mat_full_1.csv, whereas the Python helper
# load_trans_mat reads trans_mat_simple_<n>.csv; confirm which is intended.
trans_mat_simple_1 <- read_csv('../02_state_space/trans_mats/trans_mat_full_1.csv')

# Transform the transition matrix to a spread format
# (one row per `from` state, one column per `to` state). Missing
# transitions are filled with the string '0', the value trans_mat_num
# treats as "no transition"; the `from` label column is dropped at the end.
trans_mat_simple_1_edited <- trans_mat_simple_1 %>%
    select(from, to, value)  %>%
    mutate(
        from = as.factor(from), 
        to = as.factor(to)
    ) %>%
    spread(to, value, fill = '0') %>%
    select(-from) %>%
    as.data.frame()

# Save the state names
# (each distinct name/id pair once, ordered by state id)
states_simple_1 <- tibble(
        names = c(trans_mat_simple_1$from_str, trans_mat_simple_1$to_str),
        values = c(trans_mat_simple_1$from, trans_mat_simple_1$to)
    ) %>%
    unique() %>%
    arrange(values) %>%
    as.data.frame()

Rows: 2 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): from_str, to_str, value
dbl (2): from, to

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [23]:
# Convert the data frame exported by the R cell into a NumPy array, which
# is what trans_mat_num needs (it reads .shape and indexes with [i, j]).
trans_mat_simple_1_edited = np.array(trans_mat_simple_1_edited)

In [76]:
# Example rates for sequence A
coal_A = 1
rho_A = 2
# Substitute the numeric rates into the symbolic one-sequence matrix
trans_mat_A = trans_mat_num(trans_mat_simple_1_edited, coal_A, rho_A)
trans_mat_A

array([[-2.,  2.],
       [ 1., -1.]])

The end probabilities can therefore be obtained by matrix exponentiation. For example, for a waiting time of 0.1, and assuming we start in the linked state:

In [25]:
# Waiting time for sequence A
t_A = 0.1
# Row 0 of expm(Q*t): state probabilities after t_A when starting in the
# first state of the matrix
final_A = expm(trans_mat_A*t_A)[0]
final_A

array([0.82721215, 0.17278785])

We can do the same for the second sequence, here tagged as "B". Assuming a coalescent rate of 0.5, a recombination rate of 1 and a waiting time of 0.1 (same as before):

In [32]:
# Example rates for sequence B
coal_B = 0.5
rho_B = 1
trans_mat_B = trans_mat_num(trans_mat_simple_1_edited, coal_B, rho_B)
# NOTE: not the last expression of the cell, so this matrix is not displayed
trans_mat_B

# Same waiting time as for sequence A
t_B = 0.1
# Row 0 of expm(Q*t): state probabilities when starting in the first state
final_B = expm(trans_mat_B*t_B)[0]
final_B

array([0.90713865, 0.09286135])

In [62]:
states_simple_1

Unnamed: 0,names,values
1,"[(1, 1)]",1.0
2,"[(0, 1), (1, 0)]",2.0


In [94]:
# Scratch check: load the raw one-sequence transition table with pandas
# (the same file the R cell reads)
a = pd.read_csv('../02_state_space/trans_mats/trans_mat_full_1.csv')

In [105]:
# Scratch check: pandas equivalent of the R `states_simple_1` table;
# stack source and target columns, then keep each name/id pair once,
# ordered by state id
d = {'names': pd.concat([a['from_str'], a['to_str']]),
     'values': pd.concat([a['from'], a['to']])}
pd.DataFrame(data=d).drop_duplicates().sort_values(by=['values'])

Unnamed: 0,names,values
1,"[(1, 1)]",1
0,"[(0, 1), (1, 0)]",2


In [97]:

a



Unnamed: 0,from_str,to_str,value,from,to
0,"[(0, 1), (1, 0)]","[(1, 1)]",1C,2,1
1,"[(1, 1)]","[(0, 1), (1, 0)]",1R,1,2


In [81]:
# Rate matrix of sequence B. Uses the one-sequence symbolic matrix defined
# earlier in the notebook rather than `df`, which is never defined at
# notebook level and made this cell fail on a fresh kernel.
trans_mat_num(trans_mat_simple_1_edited, coal_B, rho_B)

array([[-1. ,  1. ],
       [ 0.5, -0.5]])