## Simulating data for intro-ml

### Required software

In [1]:
# simcat:   run `pip install -e .` inside a clone of the intro-ml repo
# dendropy: run `pip install dendropy`
# toytree:  run `pip install -e .` inside a clone of the toytree repo (toyplot is also required)

In [2]:
import simcat   
import toytree    ## requires github Master branch currently
import toyplot
from dendropy.simulate import treesim  
## would be nice to replace dendropy sims with toytree funcs

### `simcat` has two class objects

In [3]:
## show the two public classes exposed by simcat; the call form of print
## gives the same output on Python 2 and also runs on Python 3
print(simcat.Model)
print(simcat.DataBase)

<class 'simcat.simcat.Model'>
<class 'simcat.simcat.DataBase'>


### The `simcat.Model` object
This class is used to simulate count matrices for a given tree with provided admixture events. Trees are entered as either newick strings or as Toytree objects. Adding the `debug=True` argument prints more info about the generated data set. 

In [4]:
## build a random tree from toytree.rtree(4), convert it to ultrametric,
## then draw it with node idx labels and without tip labels
## NOTE(review): no random seed is set in the cells shown here, so the
## tree is presumably different on every run -- confirm
tree = toytree.rtree(4)
tree.tree.convert_to_ultrametric()
tree.draw(
    tip_labels=False,
    node_labels='idx',
    tree_style='c',
);

In [5]:
## no admixture edges: run 3 tests on the tree and report the shape
## of the resulting counts array
m1 = simcat.Model(
    tree,
    debug=True,
    ntests=3,
)
m1.run()
m1.counts.shape

demog div: (400000, 2, 0)
demog div: (200000, 3, 2)
demog div: (200000, 1, 0)

demog div: (400000, 2, 0)
demog div: (200000, 3, 2)
demog div: (200000, 1, 0)

demog div: (400000, 2, 0)
demog div: (200000, 3, 2)
demog div: (200000, 1, 0)



(3, 16, 16)

In [6]:
## one fixed admixture edge, given as a 5-tuple:
## [2->1, interval 0.2-0.5, rate 0.1]
m2 = simcat.Model(
    tree,
    debug=True,
    ntests=3,
    admixture_edges=[
        (2, 1, 0.2, 0.5, 0.1),
    ],
)
m2.run()
m2.counts.shape

demog div: (400000, 2, 0)
demog div: (200000, 3, 2)
demog div: (200000, 1, 0)
demog mig: (0.20000000000000001, 0.5, 0.10000000000000001, (2, 1))

demog div: (400000, 2, 0)
demog div: (200000, 3, 2)
demog div: (200000, 1, 0)
demog mig: (0.20000000000000001, 0.5, 0.10000000000000001, (2, 1))

demog div: (400000, 2, 0)
demog div: (200000, 3, 2)
demog div: (200000, 1, 0)
demog mig: (0.20000000000000001, 0.5, 0.10000000000000001, (2, 1))



(3, 16, 16)

In [7]:
## leave interval and rate as None so that each of the 'ntests' admix
## events is drawn uniformly across the edge [2->1]
m3 = simcat.Model(
    tree,
    debug=True,
    ntests=3,
    admixture_edges=[
        (2, 1, None, None, None),
    ],
)
m3.run()
m3.counts.shape

uniform testvals mig: (0, 200000) (0.0, 0.5)
demog div: (400000, 2, 0)
demog div: (200000, 3, 2)
demog div: (200000, 1, 0)
demog mig: (40912, 113545, 0.46480804640857393, (2, 1))

demog div: (400000, 2, 0)
demog div: (200000, 3, 2)
demog div: (200000, 1, 0)
demog mig: (119108, 192902, 0.15818777729089295, (2, 1))

demog div: (400000, 2, 0)
demog div: (200000, 3, 2)
demog div: (200000, 1, 0)
demog mig: (130635, 149781, 0.091959405838547226, (2, 1))



(3, 16, 16)

### Generate a list of trees to pass to `simcat.Model` 
Here it would be nice to have simple functions that generate species trees whose branch lengths are distributed according to different processes (e.g., coalescent, Yule, birth-death), with input parameters for those models.

In [8]:
## generate a list of 10 birth-death trees, each with exactly 4 tips
ntarget = 10           ## number of trees to collect
max_attempts = 1000    ## hard cap so repeated failures cannot loop forever
simtrees = []
tt = []                ## not used by the cells shown here; kept in case a later cell reads it
nattempts = 0
last_error = None
while len(simtrees) < ntarget:
    ## stop with a clear message instead of spinning forever when every
    ## attempt fails (e.g., a broken install or an API change)
    if nattempts >= max_attempts:
        raise RuntimeError(
            "collected only {} of {} trees after {} attempts; last error: {!r}"
            .format(len(simtrees), ntarget, nattempts, last_error))
    nattempts += 1
    try:
        ## generate tree
        stree = treesim.discrete_birth_death_tree(birth_rate=0.1, death_rate=0.01, ntax=4)

        ## convert to newick
        snewick = stree.as_string('newick', suppress_rooting=True)

        ## store if it has 4 tips (not sure why dendropy returns 5 tips sometimes)
        if len(toytree.tree(snewick)) == 4:
            simtrees.append(snewick)
    except Exception as err:
        ## best effort: skip a failed simulation/parse and try again, but
        ## remember the error so it can be reported if the cap is reached
        last_error = err

In [9]:
## draw the first 'ntrees' simulated b-d trees side by side, one set of
## cartesian axes per tree on a single-row grid
ntrees = 4
canvas = toyplot.Canvas(width=800, height=200)
axes = [
    canvas.cartesian(grid=(1, ntrees, idx))
    for idx in range(ntrees)
]
for i, ax in enumerate(axes):
    ## hide the axes themselves so only the tree drawing is visible
    ax.show = False
    toytree.tree(simtrees[i]).draw(
        axes=ax,
        tree_style='c',
        node_labels='name',
        tip_labels=False,
        node_size=16,
    )

## The `simcat.DataBase` object

The `DataBase` object, which is still in development, is intended to take the list of generated trees, loop over every tree and every edge on each tree, and call `simcat.Model` to sample a range of admixture events on those edges and generate count matrices. Its main job is to parallelize those simulations and store the results. The results should be stored in an HDF5 database, together with metadata about the parameters of each test.

In [10]:
## init a database
db1 = simcat.DataBase(
    name="test-1",
    workdir="databases",
)

The `DataBase` object should perform a loop somewhat like below...

In [11]:
## iterate across trees (enumerate works on both Python 2 and Python 3)
for sidx, tree_newick in enumerate(simtrees):

    ## iterate across possible admixture edges
    ttree = toytree.tree(tree_newick)
    for edge in simcat.get_all_admix_edges(ttree):

        ## create model object to sim: 1000 SNPS, 10 tests with admixture uniformly
        ## sampled across the selected edge length and with rate in U(0, 0.5).
        ## admixture_edges is passed as a list of 5-tuples, matching the form
        ## used by the other simcat.Model calls in this notebook
        carr = simcat.Model(
            ttree,
            admixture_edges=[(edge[0], edge[1], None, None, None)],
            ntests=10)

        ## run model to get counts (saving the array to HDF5 is not done yet)
        carr.run()

        ## print to stdout some info about this run
        label = "tree {}; {}->{}; {} snps; {} tests".format(
            sidx, edge[0], edge[1], carr.nsnps, carr.ntests)
        print(label)

        ## TODO: store the results
        ## .. hdf5 store array and info as metadata
        

tree 0; 1->2; 1000 snps; 10 tests
tree 0; 0->1; 1000 snps; 10 tests
tree 0; 3->2; 1000 snps; 10 tests
tree 0; 1->3; 1000 snps; 10 tests
tree 0; 0->3; 1000 snps; 10 tests
tree 0; 3->0; 1000 snps; 10 tests
tree 0; 3->1; 1000 snps; 10 tests
tree 0; 2->1; 1000 snps; 10 tests
tree 0; 0->2; 1000 snps; 10 tests
tree 0; 2->0; 1000 snps; 10 tests
tree 0; 2->3; 1000 snps; 10 tests
tree 0; 4->3; 1000 snps; 10 tests
tree 0; 4->2; 1000 snps; 10 tests
tree 0; 1->0; 1000 snps; 10 tests
tree 0; 3->5; 1000 snps; 10 tests
tree 0; 3->4; 1000 snps; 10 tests
tree 0; 2->4; 1000 snps; 10 tests
tree 0; 5->3; 1000 snps; 10 tests
tree 1; 1->2; 1000 snps; 10 tests
tree 1; 0->1; 1000 snps; 10 tests
tree 1; 3->2; 1000 snps; 10 tests
tree 1; 1->3; 1000 snps; 10 tests
tree 1; 0->3; 1000 snps; 10 tests
tree 1; 3->0; 1000 snps; 10 tests
tree 1; 3->1; 1000 snps; 10 tests
tree 1; 2->1; 1000 snps; 10 tests
tree 1; 0->2; 1000 snps; 10 tests
tree 1; 2->0; 1000 snps; 10 tests
tree 1; 2->3; 1000 snps; 10 tests
tree 1; 4->3; 