### imports

In [1]:
import numpy as np
import strange
import toytree
from scipy import stats
import ipyparallel as ipp
import h5py
from collections import Counter
from copy import deepcopy
from numba import jit

### make tree

In [2]:
rtr = toytree.rtree().coaltree(ntips=8,seed=42)
rtr3 = rtr.mod.node_scale_root_height(3)
rtr3.draw();

### simulate!

In [3]:
Ne = 50000
mut = 1e-8
kwargs = {
    "workdir": "../tests",
    "mutation_rate": mut,
    "recombination_rate": 1e-9,
    "theta": Ne*mut*4,
    "length": int(1e6), 
    "get_sequences": True,
    "random_seed": 42,
}

# simulation object
coal8 = strange.Coalseq(tree=rtr3, name="coal8", **kwargs)

In [5]:
coal8.tree_table.head(10)

Unnamed: 0,end,length,mstree,nsnps,start,treeheight
0,54,54,"((8:204293.32434399396880,(6:64174.00872287491...",0,0,473009
1,568,514,"((8:204293.32434399396880,(6:64174.00872287491...",9,54,473009
2,675,107,"((8:204293.32434399396880,(6:64174.00872287491...",4,568,473009
3,2454,1779,"((8:204293.32434399396880,(6:64174.00872287491...",30,675,473009
4,2515,61,"((8:204293.32434399396880,(6:64174.00872287491...",2,2454,473009
5,2655,140,"((8:204293.32434399396880,(6:64174.00872287491...",2,2515,546145
6,2960,305,"((8:204293.32434399396880,(6:64174.00872287491...",5,2655,546145
7,3420,460,"((8:204293.32434399396880,(6:64174.00872287491...",13,2960,546145
8,3690,270,"(((4:104748.95632937064511,5:104748.9563293706...",6,3420,546145
9,4288,598,"(((4:104748.95632937064511,5:104748.9563293706...",14,3690,546145


What's the average gene tree length?

In [8]:
stats.expon.fit(coal8.tree_table.length)

(-1.384177667422846e-12, 895.2550239284736)

### Let's run MrBayes on each gene tree, and then separately on sliding windows of average gene tree size.

In [11]:
ipyclient = ipp.Client()
ipyclient

<ipyparallel.client.client.Client at 0x151b796d90>

In [12]:
sliding_obj = strange.SlidingWindow(name='coal8',workdir='../tests/',ipyclient=ipyclient)

In [13]:
sliding_obj.run_mb_mstrees()

[####################] 100% 0:13:14 | inferring mb trees on mstrees 

In [14]:
sliding_obj.run_mb_sliding_windows(slide_interval=895,window_size=895)

[####################] 100% 0:13:31 | inferring mb trees 

### read in the mb results using our resampling class:

In [2]:
mb_results = strange.MB_posts('../tests/coal8_mb_mstrees.hdf5')

the "posterior_list" object is a list of posterior tree distributions for each gene tree. This format is the first thing to fix to make things run more quickly... Probably better to make a master list of all observed topologies in posteriors and index them.

it would be better to store the posterior dists as pandas dfs with a coulumn identifying the mbtree index, a column identifying the index corresponding to a particular topology, and a column corresponding to the associated posterior prob.

In [3]:
len(mb_results.posterior_list)

1116

In [4]:
mb_results.posterior_list[0]

array([['(8,(((7,4),(6,(5,2))),3),1);', '(2,(4,(8,(7,(5,(6,3))))),1);',
        '((5,(4,2)),(7,(6,(8,3))),1);', '((5,(7,2)),(6,((8,4),3)),1);',
        '(8,(((7,(6,5)),2),(4,3)),1);', '(((7,5),(8,(6,(4,2)))),3,1);',
        '((8,(5,2)),(4,(7,(6,3))),1);', '(7,((8,(5,2)),((6,4),3)),1);',
        '((5,((8,6),2)),(7,(4,3)),1);', '((5,2),(8,((7,4),(6,3))),1);',
        '((7,2),(8,(4,(5,(6,3)))),1);', '((6,2),(7,((8,4),(5,3))),1);',
        '(8,((7,2),(4,(5,(6,3)))),1);', '((7,2),(4,(8,(5,(6,3)))),1);',
        '(8,((5,2),(6,(4,(7,3)))),1);', '((5,2),(6,(4,(7,(8,3)))),1);',
        '(7,(6,(((8,5),2),(4,3))),1);', '((6,2),(4,(5,(7,(8,3)))),1);',
        '(((8,7),(6,(5,2))),(4,3),1);', '(8,(7,((5,(4,(6,2))),3)),1);',
        '((5,(8,(7,(4,2)))),(6,3),1);', '((5,(6,(4,(7,(8,2))))),3,1);',
        '(6,((5,((8,4),(7,2))),3),1);', '((5,((8,4),(7,(6,2)))),3,1);',
        '((5,(8,((7,4),(6,2)))),3,1);', '(5,((8,(7,(6,(4,2)))),3),1);',
        '((8,(5,(4,(7,2)))),(6,3),1);', '((7,(5,(4,(6,(8,2))))),

### now say we want to resample posterior vals for each gene tree by redrawing some proportion from its own posterior distribution and some proportion from its immediate neighbor trees:

In [7]:
neighbor_resampled_3_times = mb_results.resample_neighbors(num_times=3,prop=.8,resamp_num=300)

In [8]:
len(neighbor_resampled_3_times)

1116

In [9]:
neighbor_resampled_3_times[0]

[array(['(4,((7,5),((6,(8,2)),3)),1);', '((8,(6,(5,(7,(4,2))))),3,1);',
        '(2,(6,(7,(8,(5,(4,3))))),1);', '(((7,(8,6)),2),(4,(5,3)),1);',
        '((4,((8,(7,6)),(5,2))),3,1);', '(((7,(8,6)),5),(2,(4,3)),1);',
        '(7,(6,(2,(5,(8,(4,3))))),1);', '(4,(((8,(7,6)),5),(2,3)),1);',
        '(((6,((8,(7,5)),4)),2),3,1);', '(((8,7),6),(4,(2,(5,3))),1);',
        '((7,(8,(6,5))),(2,(4,3)),1);', '(2,(((7,(8,6)),(5,4)),3),1);',
        '(8,((7,(6,5)),(4,(2,3))),1);', '(6,(7,((8,5),((4,2),3))),1);',
        '((8,(6,(7,(5,4)))),(2,3),1);', '((5,2),((7,(8,6)),(4,3)),1);',
        '(((7,(8,6)),5),((4,2),3),1);', '((4,(((8,7),6),(5,2))),3,1);',
        '((8,(5,2)),((7,(6,4)),3),1);', '(5,(((7,6),2),(8,(4,3))),1);',
        '((7,4),(5,(6,(8,(2,3)))),1);', '(2,(6,((7,((8,5),4)),3)),1);',
        '((5,(6,(7,4))),(2,(8,3)),1);', '((5,((7,(8,6)),4)),(2,3),1);',
        '(2,((8,(7,6)),((5,4),3)),1);', '(((8,7),6),((5,2),(4,3)),1);',
        '(((8,5),4),(7,(6,(2,3))),1);', '(5,(8,(2,(7,(6,(4,3))))

### now say we want to resample posterior vals for each gene tree by redrawing some proportion from its own posterior distribution and some proportion from trees that are some integer number of gene trees away. This distance is determined by a draw from a normal distribution of variance that we can define:

In [4]:
normal_resampled_3_times = mb_results.resample_normal(num_times=3,prop=.8,scale=3,resamp_num=300)

In [5]:
len(normal_resampled_3_times)

1116

In [6]:
normal_resampled_3_times[0]

[array(['(2,(6,(7,(8,(5,(4,3))))),1);', '((5,2),((8,(7,6)),(4,3)),1);',
        '(((5,(8,((7,6),4))),2),3,1);', '((8,(6,2)),(5,((7,4),3)),1);',
        '((4,(6,2)),(5,(7,(8,3))),1);', '(4,((((8,(7,6)),5),2),3),1);',
        '(6,(7,(5,((4,2),(8,3)))),1);', '(((5,4),2),((8,(7,6)),3),1);',
        '(2,(5,(((8,(7,6)),4),3)),1);', '(5,(((8,6),4),(7,(2,3))),1);',
        '(7,(((8,(6,4)),(5,2)),3),1);', '(((7,(8,6)),5),((4,2),3),1);',
        '(4,(5,(((8,(7,6)),2),3)),1);', '((8,(7,6)),(((5,4),2),3),1);',
        '((5,(((7,(8,6)),4),2)),3,1);', '(2,(4,(8,(7,(5,(6,3))))),1);',
        '(7,(8,(4,((5,(6,2)),3))),1);', '(8,((5,(6,(7,(4,2)))),3),1);',
        '(6,((5,((8,4),(7,2))),3),1);', '(2,((7,4),(5,(8,(6,3)))),1);',
        '(2,(8,(6,(4,((7,5),3)))),1);', '((7,(8,6)),((4,(5,2)),3),1);',
        '(8,(((7,4),(6,(5,2))),3),1);', '(2,(5,((7,(8,6)),(4,3))),1);',
        '((4,2),((((8,7),6),5),3),1);', '(((8,(7,6)),4),(5,(2,3)),1);',
        '((((8,7),6),2),(4,(5,3)),1);', '((5,(((8,7),6),4)),(2,3

### We would expect that doing this very many times (and with low variance...) would eliminate some of the low-frequency trees.

In [3]:
# this takes __WAY__ too long if we're wanting MCMC........
normal_resampled_100_times = mb_results.resample_normal(num_times=100,prop=.8,scale=1,resamp_num=300)

In [8]:
normal_resampled_100_times[0]

[array(['(2,(((8,(7,6)),(5,4)),3),1);', '(((7,6),(5,(4,(8,2)))),3,1);',
        '(4,((8,7),((6,2),(5,3))),1);', '((8,(7,6)),(4,((5,2),3)),1);',
        '(2,(((8,(7,6)),4),(5,3)),1);', '(((7,(8,6)),2),(5,(4,3)),1);',
        '(4,((((8,(7,6)),5),2),3),1);', '((8,(5,((7,6),2))),(4,3),1);',
        '((((8,7),6),2),((5,4),3),1);', '((5,2),(((8,7),6),(4,3)),1);',
        '((4,(5,(((8,7),6),2))),3,1);', '(5,(4,(((8,(7,6)),2),3)),1);',
        '(((5,4),2),((8,(7,6)),3),1);', '(6,(((8,5),(7,(4,2))),3),1);',
        '((8,(7,6)),((5,(4,2)),3),1);', '((5,(4,(8,(7,(6,2))))),3,1);',
        '((8,4),((((7,6),5),2),3),1);', '(5,(((8,(7,6)),4),(2,3)),1);',
        '(((8,(7,6)),4),(2,(5,3)),1);', '(((5,(((8,7),6),4)),2),3,1);',
        '((((8,(7,6)),(5,4)),2),3,1);', '((((7,(8,6)),4),2),(5,3),1);',
        '((7,6),(5,((4,(8,2)),3)),1);', '(5,(((8,(7,6)),(4,2)),3),1);',
        '(((5,((8,(7,6)),4)),2),3,1);'], dtype='|S28'),
 array([0.02      , 0.00333333, 0.00666667, 0.03666667, 0.00333333,
        0.03

In [7]:
print(np.max(normal_resampled_3_times[1000][1]))
print(np.max(normal_resampled_100_times[1000][1]))

0.08666666666666667
0.18666666666666668


### next steps: 
* scoring for mcmc
* rewrite this with a new posterior dist storage method. Probably makes for faster sampling...