## Using simcat to produce training data

In [1]:
import toytree
import simcat_working
import toyplot
from dendropy.simulate import treesim  ## would be nice to replace this with toytree sims...
import numpy as np
import h5py

In [2]:
## generate a list of 1000 birth-death trees
simtrees = []
tt = []
while len(simtrees) < 1:
    try:
        ## generate tree
        stree = treesim.discrete_birth_death_tree(birth_rate=0.1, death_rate=0.01, ntax=4)
        
        ## convert to newick
        snewick = stree.as_string('newick', suppress_rooting=True)
        
        ## store if it has 4 tips (not sure why dendropy returns 5 tips sometimes)
        if len(toytree.tree(snewick)) == 4:
            simtrees.append(snewick)
    except Exception: 
        pass

In [4]:
## plot a couple of random b-d trees with node heights
ntrees = 4
canvas = toyplot.Canvas(width=800, height=200)
axes = [canvas.cartesian(grid=(1, ntrees, idx)) for idx in range(ntrees)]
for i in range(ntrees):
    ax = axes[i]
    ax.show = False
    toytree.tree(simtrees[i]).draw(
        axes=ax, tree_style='c', node_labels='name', tip_labels=False, node_size=16)

In [8]:
## init a database
db1 = simcat.DataBase(name="test-1")

In [5]:
## iterate across trees
for sidx in xrange(len(simtrees)):
    
    ## iterate across possible admixture edges
    ttree = toytree.tree(simtrees[sidx])
    for edge in simcat.get_all_admix_edges(ttree):
        
        ## create model object to sim: 1000 SNPS, 10 tests with admixture uniformly
        ## sampled across the selected edge length and with rate in U(0, 0.5)
        carr = simcat.Model(
            ttree, 
            admixture_edges=(edge[0], edge[1], None, None, None),
            ntests=10)
        
        ## run model to get counts and save array to HDF5
        carr.run()
        
        ## print to stdout some info about this run
        label = ("tree {}; {}->{}; {} snps; {} tests".format(
            sidx, edge[0], edge[1], carr.nsnps, carr.ntests))
        print label
        
        ## store the results
        ## .. hdf5 store array and info as metadata
        

NameError: name 'simcat' is not defined

In [16]:
testing=simcat_working.get_all_admix_edges(toytree.tree(simtrees[0]))

In [48]:
sorted(testing.items())

[((0, 1), (0.0, 8.0)),
 ((0, 2), (0.0, 8.0)),
 ((0, 3), (0.0, 8.0)),
 ((1, 0), (0.0, 8.0)),
 ((1, 2), (0.0, 8.0)),
 ((1, 3), (0.0, 8.0)),
 ((2, 0), (0.0, 8.0)),
 ((2, 1), (0.0, 8.0)),
 ((2, 3), (0.0, 12.0)),
 ((2, 4), (8.0, 12.0)),
 ((3, 0), (0.0, 8.0)),
 ((3, 1), (0.0, 8.0)),
 ((3, 2), (0.0, 12.0)),
 ((3, 4), (8.0, 12.0)),
 ((3, 5), (12.0, 15.0)),
 ((4, 2), (8.0, 12.0)),
 ((4, 3), (8.0, 12.0)),
 ((5, 3), (12.0, 15.0))]

In [54]:
testing.items()

[((1, 2), (0.0, 8.0)),
 ((0, 1), (0.0, 8.0)),
 ((3, 2), (0.0, 12.0)),
 ((1, 3), (0.0, 8.0)),
 ((0, 3), (0.0, 8.0)),
 ((3, 0), (0.0, 8.0)),
 ((3, 1), (0.0, 8.0)),
 ((2, 1), (0.0, 8.0)),
 ((0, 2), (0.0, 8.0)),
 ((2, 0), (0.0, 8.0)),
 ((2, 3), (0.0, 12.0)),
 ((4, 3), (8.0, 12.0)),
 ((4, 2), (8.0, 12.0)),
 ((1, 0), (0.0, 8.0)),
 ((3, 5), (12.0, 15.0)),
 ((3, 4), (8.0, 12.0)),
 ((2, 4), (8.0, 12.0)),
 ((5, 3), (12.0, 15.0))]

In [25]:
for i in testing:
    print(i[0])

1
0
3
1
0
3
3
2
0
2
2
4
4
1
3
3
2
5


In [None]:
    def _generate_database(self):
        """
        Parses parameters in self.params to create all combinations
        of parameter values to test. Returns the number of the simulations.
        Simulation metadata is appended to datasets. 
        """
        if len(self.database):
            groupnum = len(self.database)+1
        simnum = 0
        for treenum in range(len(self.trees)):
            groupname = '/tree'+str(groupnum)
            self.database.create_group(groupname)
            admixedges = simcat_working.get_all_admix_edges(self.trees[treenum])
            newfile[groupname].attrs['admix_intervals'] = testing.values()
            newfile[groupname].attrs['admix_branches'] = testing.keys()
            groupnum += 1
            simnum += len(admixedges)
        return simnum

In [108]:
groupnum = 3
('/tree'+str(groupnum))

'/tree3'

In [111]:
groupnum += 1

In [112]:
groupnum

2

In [None]:
    def run(self, ipyclient, force=False):
        """
        Distribute simulations across a parallel Client. If continuing
        a previous run then any unfinished simulation will be queued up
        to run. 
        """
        self.database = h5py.File('myfile.hdf5','r+')
        for treenum in xrange(len(self.database)):
            currtree = toytree.tree(self.trees[treenum])
            groupname = "/tree" + str(treenum)
            for sim in xrange(len(self.database[groupname].attrs['admix_branches']))
                carr = simcat.Model(
                    ttree, 
                    admixture_edges=(edge[0], edge[1], None, None, None),
                    ntests=10)

                ## run model to get counts and save array to HDF5
                carr.run()
                
                for testnum in range(carr.counts.shape[0]):
                    ## need to create a group for each test of each branch
                    self.database.create_group(groupname+'/test')
                
        ## wrapper for ipyclient to close nicely when interrupted

                carr.counts
        pass

In [43]:
carr.ntests

10

In [None]:
(3, 3)

In [12]:
#!/usr/bin/env python

"""
Generate large database of site counts from coalescent simulations 
based on msprime + toytree for using in machine learning algorithms. 
"""

## make py3 compatible
from __future__ import print_function

## imports
import os
import h5py
import numba
import sklearn
import toyplot
import toytree
import itertools
import numpy as np
import msprime as ms



class Model(object):
    """
    A coalescent model for returning ms simulations. 
    """
    def __init__(self, 
        tree, 
        admixture_edges=[], 
        Ne=int(1e5), 
        mut=1e-8, 
        nsnps=1000, 
        ntests=100, 
        seed=12345, 
        **kwargs):
        """
        An object for running simulations to attain genotype matrices for many
        independent runs to sample Nrep SNPs. 
        
        Parameters:
        -----------
        tree: (str)
            A newick string representation of a species tree with edges in 
            units of generations.
            
        admixture_edges (list):
            A list of admixture events in the format (source, dest, start, end, rate).
        
        Ne (int):
            Effective population size (single fixed value currently)
            
        mut (float):
            Mutation rate.   
        """
        ## init random seed
        np.random.seed(seed)

        ## hidden argument to turn on debugging
        self._debug = [True if kwargs.get("debug") else False][0]
        
        ## store sim params as attrs
        self.Ne = Ne
        self.mut = mut
        self.nsnps = nsnps
        self.ntests = ntests
        
        ## parse the input tree
        if isinstance(tree, toytree.tree):
            self.tree = tree
        elif isinstance(tree, str):
            self.tree = toytree.tree(tree)
        else:
            raise TypeError("input tree should be newick str or Toytree object")
        self.ntips = len(self.tree)

        ## parse the input admixture edges. It should a list of tuples, or list
        ## of lists where each element has five values. 
        if admixture_edges:
            if isinstance(admixture_edges, tuple):
                admixture_edges = [admixture_edges]
            if not isinstance(admixture_edges[0], (list, tuple)):
                admixture_edges = [admixture_edges]
        for event in admixture_edges:
            if len(event) != 5:
                raise ValueError(
                    "admixture events should each be a tuple with 5 values")
        self.admixture_edges = admixture_edges
        
        ## generate migration parameters from the tree and admixture_edges
        ## stores data in memory as self.test_values
        self.get_test_values()
        
        ## store results (empty until you run .run())
        self.counts = None
        
        
    def get_test_values(self): 
        """
        Generates mrates and mtimes arrays for a range of values (ns) where
        migration rate is uniformly sampled, and its start and end points are
        uniformly sampled but contained within 0.05-0.95% of the branch length. 
        Rates are drawn uniformly between 0.0 and 0.95. 
        """
        ## init a dictionary for storing arrays for each admixture scenario
        self.test_values = {}
        
        ## iterate over events in admixture list
        idx = 0
        for event in self.admixture_edges:
            
            ## if times and rate were provided then use em.
            if all(event[-3:]):
                mrates = np.repeat(event[4], self.ntests)
                mtimes = np.stack([
                    np.repeat(event[2], self.ntests), 
                    np.repeat(event[3], self.ntests)], axis=1)
                self.test_values[idx] = {"mrates": mrates, "mtimes": mtimes}

            ## otherwise generate uniform values across edges
            else:        
                ## get migration rates from zero to ~full
                minmig = 0.0
                maxmig = 0.5
                mrates = np.random.uniform(minmig, maxmig, self.ntests)

                ## get divergence times from source start to end
                self._intervals = get_all_admix_edges(self.tree)                
                snode = self.tree.tree.search_nodes(idx=event[0])[0]
                dnode = self.tree.tree.search_nodes(idx=event[1])[0]
                interval = self._intervals[snode.idx, dnode.idx]
                edge_min = int(interval[0] * 2. * self.Ne)
                edge_max = int(interval[1] * 2. * self.Ne)
                mtimes = np.sort(
                    np.random.uniform(edge_min, edge_max, self.ntests*2)
                    .reshape((self.ntests, 2)), axis=1).astype(int)
                self.test_values[idx] = {"mrates": mrates, "mtimes": mtimes}
                if self._debug:
                    print("uniform testvals mig:", (edge_min, edge_max), (minmig, maxmig))
            idx += 1
            
    
    def plot_test_values(self):
        """
        Returns a toyplot canvas 
        """
        ## setup canvas
        canvas = toyplot.Canvas(height=300, width=500)
        ax0 = canvas.cartesian(
            grid=(1, 2, 0), xlabel="migration durations", ylabel="simulation index")
        ax1 = canvas.cartesian(
            grid=(1, 2, 1), xlabel="proportion migrants", ylabel="frequency")

        if self.test_values.keys():
            ## get values for the first admixture edge
            mtimes = self.test_values[0]["mtimes"]
            mrates = self.test_values[0]["mrates"]
            mt = mtimes[mtimes[:, 1].argsort()]
            boundaries = np.column_stack((mt[:, 0], mt[:, 1]))
            #durations = mtimes[:, 1] - mtimes[:, 0]

            ## plot
            ax0.fill(boundaries, along='y')
            ax1.bars(np.histogram(mrates))# * durations))
            return canvas
        else:
            raise ValueError(
                "No test_values generated. Model object must have admixture edges")


    ## functions to build simulation options 
    def _get_demography(self, idx):
        """
        returns demography scenario based on an input tree and admixture
        edge list with events in the format (source, dest, start, end, rate)
        """
        ## Define demographic events for msprime
        demog = set()

        ## tag min index child for each node, since at the time the node is 
        ## called it may already be renamed by its child index b/c of divergence
        ## events.
        for node in self.tree.tree.traverse():
            if node.children:
                node._schild = min([i.idx for i in node.get_descendants()])
            else:
                node._schild = node.idx

        ## Add divergence events
        for node in self.tree.tree.traverse():
            if node.children:
                dest = min([i._schild for i in node.children])
                source = max([i._schild for i in node.children])
                time = int(node.height * 2. * self.Ne)
                demog.add(ms.MassMigration(time, source, dest))
                if self._debug:
                    print('demog div:', (time, source, dest))
        
        ## Add migration edges
        for key in self.test_values:
            mdict = self.test_values[key]
            time = mdict['mtimes'][idx]
            rate = mdict['mrates'][idx]
            source, dest = self.admixture_edges[key][:2]

            ## rename nodes at time of admix in case divergences renamed them
            snode = self.tree.tree.search_nodes(idx=source)[0]
            dnode = self.tree.tree.search_nodes(idx=dest)[0]

            demog.add(ms.MigrationRateChange(time[0], rate, (snode._schild, dnode._schild)))
            demog.add(ms.MigrationRateChange(time[1], 0, (snode._schild, dnode._schild)))
            if self._debug:
                print('demog mig:', (time[0], time[1], rate, (snode._schild, dnode._schild)))

        ## sort events by time
        demog = sorted(list(demog), key=lambda x: x.time)
        if self._debug:
            print("")
        return demog


    def _get_popconfig(self):
        """
        returns population_configurations for N tips of a tree
        """
        population_configurations = [
            ms.PopulationConfiguration(sample_size=1, initial_size=self.Ne) \
            for ntip in range(self.ntips)]
        return population_configurations

        
    def _simulate(self, idx):
        """
        performs simulations with params varied across input values.
        """       
        ## set up simulation
        sim = ms.simulate(
            num_replicates=self.nsnps * 100,
            mutation_rate=self.mut,
            migration_matrix=np.zeros((self.ntips, self.ntips), dtype=int).tolist(),
            population_configurations=self._get_popconfig(),
            demographic_events=self._get_demography(idx)
        )
        return sim

   
    def mutate_jc(self, geno):
        """
        mutates sites with 1 into a new base in {0, 1, 2, 3}
        """
        for ridx in range(geno.shape[0]):

            ## get an array of starting bases, e.g., [0, 0, 0, 0]
            init = np.repeat(np.random.randint(0, 4), self.ntips)

            ## get the base it will mutate to e.g., 1
            notinit = list(set([0, 1, 2, 3]) - set(init))

            ## change mutated bases to notinit if there is a mut
            if np.sum(geno[ridx]):
                init[geno[ridx]==1] = np.random.choice(notinit)
                geno[ridx] = init
                ## return 1 SNP 
                return geno[0, :]
            else:
                return np.array([])
            
        ## if geno shape is 0
        return np.array([])       
    

    def run(self):
        """
        run and parse results for nsamples simulations.
        """
        ## storage for output
        self.counts = np.zeros((self.ntests, 16, 16), dtype=int)    
        for ridx in range(self.ntests):
            ## run simulation for demography idx
            sims = self._simulate(ridx)
            
            ## array to store site counts
            carr = np.zeros((self.nsnps, 16, 16))
            
            ## continue until nsnps are simulated
            fidx = 0
            while fidx < self.nsnps:
                ## get just the first mutation
                bingenos = sims.next().genotype_matrix()
                ## convert to sequence under JC
                sitegenos = self.mutate_jc(bingenos)
                ## count it
                if sitegenos.size:
                    carr[fidx] = count(sitegenos)
                    fidx += 1
                    
            ## fill site counts into 16x16 matrix
            self.counts[ridx] = carr.sum(axis=0)



## jitted functions for running super fast
@numba.jit(nopython=True)
def count(i):
    """
    return a 16x16 matrix of site counts from sitegenos
    """
    arr = np.zeros((16, 16), dtype=np.uint16)
    arr[(4*i[0])+i[1], (4*i[2])+i[3]] += 1
    return arr    



class DataBase(object):
    """
    An object to parallelize simulations over many parameter settings
    and store finished reps in a HDF5 database    
    """
    def __init__(self, name, treelist, Ne, mut, nsnps, ntests, seed, workdir="sim-databases/", force=False, **kwargs):
        
        ## identify this set of simulations
        self.name = name
        self.workdir = workdir
        self.path = os.path.join(workdir, self.name+".hdf5")
        self.trees = treelist
        
        ## Define function to use below to make arguments the correct length
        def _make_vector(arg):
            if type(arg) is int or type(arg) is float:
                return([arg]*len(self.trees))
            if (type(arg) is list) and (len(arg) is len(self.trees)):
                return(arg)
            else: raise ValueError("Ne, mut, nsnps, and ntests arguments each must be an int/float (which will be applied to all trees) or be a list the same length as the tree list")

        
        ## Accept arguments of length 1 or of the same length as the treelist
        self.Ne = _make_vector(Ne)
        self.mut = _make_vector(mut)
        self.nsnps = _make_vector(nsnps)
        self.ntests = _make_vector(ntests)
        self.seed = _make_vector(seed)
        
        ## make sure workdir exists
        if not os.path.exists(workdir):
            os.makedirs(workdir)
        
        ## create database in 'w-' mode to prevent overwriting
        if os.path.exists(self.path):
            if force:
                ## exists and destroy it
                if raw_input("Are you sure you want to overwrite the database? "):
                    os.remove(self.path)
                    self.database = h5py.File(self.path, mode='w')                    
            else:
                ## exists append to it
                self.database = h5py.File(self.path, mode='a')
        else:
            ## does not exist
            self.database = h5py.File(self.path, mode='w-')     

        ## Create datasets for all planned simulations and write
        ## accompanying metadata for sim params in each data set
        self.ndatasets = self._generate_database()
        self.database.close()

        
    def _generate_database(self):
        """
        Parses parameters in self.params to create all combinations
        of parameter values to test. Returns the number of the simulations.
        Simulation metadata is appended to datasets. 
        """
        
        ## Does an arguments group already exist in your database file?
        try:
            self.database['args']
        except:
            total_treenum = 0
            args_array = np.empty(shape=[0,13])
            argsexists = False
        else:
            total_treenum = len(self.database['args'])
            args_array = self.database['args']
            argsexists = True
        for treenum in range(len(self.trees)):
            ## Get each tree
            currtree = toytree.tree(self.trees[treenum])
            
            ## Get each possible admixture event
            admixedges = get_all_admix_edges(currtree)
            intervals = admixedges.values()
            branches = admixedges.keys()
            onetreetests=np.empty(shape=[0,13])
            for event in range(len(branches)):
                ## initialize a model -- we'll use this to get parameters for each test on this admixture event
                carr = Model(Ne = self.Ne[treenum],
                             mut = self.mut[treenum],
                             nsnps = self.nsnps[treenum],
                             ntests = self.ntests[treenum],
                             seed = self.seed[treenum],
                             tree = currtree, 
                             admixture_edges=(branches[event][0], branches[event][1], None, None, None))
                ## save relevant parameters for each test
                eventtest = np.column_stack([[total_treenum]*carr.ntests, 
                                np.repeat(np.array([branches[event]]),carr.ntests,axis = 0),
                                np.repeat(np.array([intervals[event]]),carr.ntests,axis = 0),
                                carr.test_values[0]['mtimes'], 
                                carr.test_values[0]['mrates'],
                                [carr.Ne]*carr.ntests,
                                [carr.mut]*carr.ntests,
                                [carr.nsnps]*carr.ntests,
                                [carr.ntests]*carr.ntests,
                                [self.seed[treenum]]*carr.ntests])
                onetreetests=np.vstack([onetreetests,eventtest])

            total_treenum += 1
            ## Add to the overall args array
            args_array = np.vstack([args_array, np.array(onetreetests)])
            
            ## We should be holding the whole database in Python right now, so we want to add to a blank slate
            self.database.clear()
            self.database.create_dataset("args", data=args_array)
        return(len(args_array))



    def run(self, force=False):
        
        """
        Distribute simulations across a parallel Client. If continuing
        a previous run then any unfinished simulation will be queued up
        to run. 
        """
        
        def _add_mat(arr, numberdone):
            """
            Add one matrix to the HDF5 'counts' group. Collette book page 39.
            """
            counts_set[numberdone,:,:] = arr
            numberdone += 1
            return(numberdone)

        def _done(numberdone):
            """
            Resize your HDF5 'counts' group at the end to the same length as filled count matrices. Collette book page 40.
            """
            counts_set.resize((numberdone,16,16))
        
        #run(self, ipyclient, force=False):
        mydatabase = h5py.File(self.path, mode='r+')
        sizeargs = mydatabase['args'].len()
        
        ## Does a counts group already exist in your database file?
        try:
            mydatabase['counts']
        except:
            ## if 'counts' doesn't exist
            numberdone = 0 # will adjust this at the end of the loop
            countexists = False
            ## initialize the group
            counts_set = mydatabase.create_dataset('counts',(1,16,16),maxshape = (None, 16, 16), chunks = (4,16,16),dtype=int)
        else:
            numberdone = len(mydatabase['counts'])
            countexists = True
        

        
        trigger = 0 # will change this to 1 once we are done
        while not trigger:
            argsleft = sizeargs - numberdone # fill this at the beginning of each loop

            if argsleft > 1000:
                windowsize = 1000
            else:
                windowsize = argsleft

            ## create empty dataset to hold your set of int paras
            argsints = np.empty((windowsize,6),dtype=int)
            ## fill the dataset with the window of values you want
            mydatabase['args'].read_direct(argsints, np.s_[numberdone:(numberdone+windowsize),[0,1,2,8,10,12]])
            ## create empty dataset to hold your set of float paras
            argsflts = np.empty((windowsize,4),dtype=float)
            ## fill the dataset with the window of values you want
            mydatabase['args'].read_direct(argsflts, np.s_[numberdone:(numberdone+windowsize),[5,6,7,9]])


            # if it does exist
            #nmats = len(counts_set)


            counts_set.resize((len(counts_set)+1000,16,16))
            for idx in xrange(windowsize):
                treenum, sourcebr, destbr, Ne, nsnps, seed = argsints[idx,:]
                mtimerecent, mtimedistant, mrate, mut = argsflts[idx,:]
                mod = Model(tree = self.trees[treenum],
                            admixture_edges = [(sourcebr,destbr,mtimerecent,mtimedistant,mrate)],
                            Ne = Ne,
                            nsnps = nsnps,
                            mut = mut,
                            seed = seed,
                            ntests = 1)
                mod.run()
                numberdone = _add_mat(mod.counts,numberdone)
                #numberdone += 1
                print(numberdone)
            _done(numberdone)
            
            ## Exits the loop if we're out of parameter samples in the database 'args' group
            if numberdone is sizeargs:
                trigger = 1
            print(str(numberdone) + str(sizeargs))
        
        mydatabase.close()
        
        ## wrapper for ipyclient to close nicely when interrupted
        #pass
        return("Done writing database with " + str(numberdone) + " count matrices.")
    
   

### Convenience functions on toytrees
def get_all_admix_edges(ttree):
    """
    Find all possible admixture edges on a tree. Edges are unidirectional, 
    so the source and dest need to overlap in time interval.    
    """
    ## for all nodes map the potential admixture interval
    for snode in ttree.tree.traverse():
        if snode.is_root():
            snode.interval = (None, None)
        else:
            snode.interval = (snode.height, snode.up.height)
    
    ## for all nodes find overlapping intervals
    intervals = {}
    for snode in ttree.tree.traverse():
        for dnode in ttree.tree.traverse():
            if not snode.is_root() and (snode != dnode):
                ## check for overlap
                smin, smax = snode.interval
                dmin, dmax = dnode.interval

                ## find if nodes have interval where admixture can occur
                low_bin = max(smin, dmin)
                top_bin = min(smax, dmax)
                if top_bin > low_bin:
                    aedge = (snode.idx, dnode.idx, low_bin, top_bin)
                    intervals[(snode.idx, dnode.idx)] = (low_bin, top_bin)
    return intervals



NameError: name '__init__' is not defined

In [245]:
mydatabase.clear()

In [6]:
testing.ndatasets

90

In [7]:
testing.run()

'Done writing database with 90 count matrices.'

In [15]:
testing = simcat_working.DataBase(name = "testing",mut=1e-8,nsnps=1000,ntests=5,treelist=simtrees,Ne = 100000,seed=12345)

IOError: Unable to create file (File exists)

In [8]:
mydatabase = h5py.File("sim-databases/testing.hdf5",mode = 'r')

In [14]:
mydatabase['counts'][3]

array([[ 0, 30, 33, 29,  1,  0,  0,  0,  4,  0,  0,  0,  1,  0,  0,  0],
       [ 2,  0,  0,  0, 20, 22,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 2,  0,  0,  0,  0,  0,  0,  0, 25,  0, 23,  0,  0,  0,  0,  0],
       [ 2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 24,  0,  0, 22],
       [30, 23,  0,  0,  0,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  3,  0,  0, 21,  0, 32, 37,  0,  3,  0,  0,  0,  1,  0,  0],
       [ 0,  0,  0,  0,  0,  4,  0,  0,  0, 25, 32,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  3,  0,  0,  0,  0,  0,  0,  0, 28,  0, 21],
       [26,  0, 19,  0,  0,  0,  0,  0,  0,  0,  3,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, 25, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  2,  0,  0,  0,  2,  0, 30, 33,  0, 20,  0,  0,  5,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5,  0,  0,  0, 22, 29],
       [29,  0,  0, 24,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2],
       [ 0,  0,  0,  0,  0, 23,  0, 27,  0,  0,  0,

In [249]:
## create empty dataset to hold your set of int paras
argsints = np.empty((800,6),dtype=int)
## fill the dataset with the window of values you want
mydatabase['args'].read_direct(argsints, np.s_[0:800,[0,1,2,8,10,12]])
## create empty dataset to hold your set of float paras
argsflts = np.empty((800,4),dtype=float)
## fill the dataset with the window of values you want
mydatabase['args'].read_direct(argsflts, np.s_[0:800,[5,6,7,9]])

In [250]:
## function to add one matrix
def _add_mat(arr):
    global nmats
    counts_set[nmats,:,:] = arr
    nmats += 1
## resize at the end
def _done():
    counts_set.resize((nmats,16,16))

In [251]:
# if doesn't yet exist
counts_set = mydatabase.create_dataset('counts',(1,16,16),maxshape = (None, 16, 16), chunks = (4,16,16),dtype=int)
## this helps us keep up with number of mats
nmats = 0

# if it does exist
#nmats = len(counts_set)

In [252]:

counts_set.resize((len(counts_set)+1000,16,16))
for idx in xrange(800):
    treenum, sourcebr, destbr, Ne, nsnps, seed = argsints[idx,:]
    mtimerecent, mtimedistant, mrate, mut = argsflts[idx,:]
    mod = simcat_working.Model(tree = simtrees[treenum],
                admixture_edges = [(sourcebr,destbr,mtimerecent,mtimedistant,mrate)],
                Ne = Ne,
                nsnps = nsnps,
                mut = mut,
                seed = seed,
                ntests = 1)
    mod.run()
    add_mat(mod.counts)
    print(idx)
done()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [253]:
mydatabase.close()

In [244]:
np.shape(mod.counts)

(100, 16, 16)

In [232]:
mydatabase.clear()

In [None]:
treenum, sourcebr, destbr, Ne, nsnps, seed = mydatabase['args'][0,[0,1,2,8,10,12]]

In [198]:
mtimerecent, mtimedistant, mrate, mut = mydatabase['args'][0,[5,6,7,9]]

array([  2.38210000e+04,   3.85800000e+04,   4.64808046e-01,
         1.00000000e-08])

In [175]:
testarr= np.empty(shape = (1000,16,16),dtype=int)
for i in xrange(1000):
    testarr[i] = np.reshape(np.random.randint(100,size=256),(16,16))#mod.counts 

In [176]:
nmats = 0
counts_set.resize((len(counts_set)+1000,16,16))
for idx in xrange(1000):
#     treenum, sourcebr, destbr, mtimerecent, mtimedistant, mrate, Ne, mut, nsnps, seed = mydatabase['args'][1:20,[0,1,2,5,6,7,8,9,10,12]]
#    mod = Model(Ne[idx])
#    mod.run()
#    mod.counts
#    add_mat(mod.counts)
    add_mat(testarr[idx])
done()

In [177]:
counts_set[100]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [42]:
f['testappend'][0,]

array([ 0.1501835 ,  0.48962671,  0.37734497,  0.8486014 ,  0.91109723,
        0.38384873,  0.31549591,  0.56839412,  0.18781804,  0.12584154], dtype=float32)

In [26]:
f = h5py.File("newfile",'w')

In [28]:
f.create_dataset('testappend', (2000,10), maxshape = (None, 10))

<HDF5 dataset "testappend": shape (2000, 10), type "<f4">

In [32]:
add_trace_2(np.random.random(10))

NameError: global name 'ntraces' is not defined

In [10]:
type([4]) is list

True

In [16]:
testing = simcat_working.DataBase(name = "testing",mut=1e-8,nsnps=1000,ntests=5,treelist=simtrees,Ne = 100000,seed=12345)

In [10]:
testing.mut

[1e-08, 1e-08, 1e-08, 1e-08, 1e-08, 1e-08, 1e-08, 1e-08, 1e-08, 1e-08]

In [17]:
test1=h5py.File("sim-databases/testing.hdf5","r+")

In [18]:
np.array(test1['args'])[100]

array([  1.00000000e+00,   5.00000000e+00,   4.00000000e+00,
         4.00000000e+00,   6.00000000e+00,   1.03821000e+05,
         1.18580000e+05,   4.64808046e-01,   1.00000000e+04,
         1.00000000e-08,   1.00000000e+03,   5.00000000e+00,
         1.23450000e+04])

In [None]:
.reshape([1,256])

In [4]:
len(simtrees)

10

In [17]:
testing = simcat_working.DataBase(name="testing",treelist=simtrees)

In [18]:
testing.ndatasets

1360

In [6]:
np.vstack([np.empty(shape=[0,8]),testing.ndatasets])

array([[  0.00000000e+00,   1.00000000e+00,   2.00000000e+00,
          0.00000000e+00,   7.00000000e+00,   1.04680000e+06,
          1.34582900e+06,   4.64808046e-01],
       [  0.00000000e+00,   1.00000000e+00,   2.00000000e+00,
          0.00000000e+00,   7.00000000e+00,   1.17430000e+04,
          1.49022000e+05,   1.58187777e-01],
       [  0.00000000e+00,   1.00000000e+00,   2.00000000e+00,
          0.00000000e+00,   7.00000000e+00,   4.18185000e+05,
          9.18975000e+05,   9.19594058e-02],
       [  0.00000000e+00,   1.00000000e+00,   2.00000000e+00,
          0.00000000e+00,   7.00000000e+00,   1.13373700e+06,
          1.22104600e+06,   1.02280139e-01],
       [  0.00000000e+00,   1.00000000e+00,   2.00000000e+00,
          0.00000000e+00,   7.00000000e+00,   1.01315900e+06,
          1.35050600e+06,   2.83862515e-01],
       [  0.00000000e+00,   1.00000000e+00,   2.00000000e+00,
          0.00000000e+00,   7.00000000e+00,   8.99465000e+05,
          1.00443500e+06,   2.9

In [18]:
currtree = toytree.tree(simtrees[0])
admixedges = get_all_admix_edges(currtree)
intervals = admixedges.values()
branches = admixedges.keys()
carr = simcat_working.Model(
    currtree, 
    admixture_edges=(branches[0][0], branches[0][1], None, None, None),
    ntests=10)
np.column_stack([carr.test_values[0]['mtimes'],carr.test_values[0]['mrates']])

In [10]:
np.repeat(np.array([[2,5]]),2,axis = 0)

array([2, 5])

In [None]:
for snode in ttree.tree.traverse():
    if snode.is_root():
        snode.interval = (None, None)
    else:
        snode.interval = (snode.height, snode.up.height)

## for all nodes find overlapping intervals
intervals = {}
for snode in ttree.tree.traverse():
    for dnode in ttree.tree.traverse():
        if not snode.is_root() and (snode != dnode):
            ## check for overlap
            smin, smax = snode.interval
            dmin, dmax = dnode.interval

            ## find if nodes have interval where admixture can occur
            low_bin = max(smin, dmin)
            top_bin = min(smax, dmax)
            if top_bin > low_bin:
                aedge = (snode.idx, dnode.idx, low_bin, top_bin)
                intervals[(snode.idx, dnode.idx)] = (low_bin, top_bin)

In [66]:
for snode in newtest.tree.traverse():
    if snode.is_root():
        snode.interval = (None, None)
    else:
        snode.interval = (snode.height, snode.up.height)

In [19]:
newfile = h5py.File("sim-databases/testing.hdf5", mode='r+')     

In [7]:
len(list(np.array([np.array(newfile['args'])[i][0] for i in range(len(np.array(newfile['args'])))]).astype(int)))


1500

In [10]:
newfile.clear()

In [21]:
list(newfile['args'])

[array([  0.00000000e+00,   1.00000000e+00,   2.00000000e+00,
          0.00000000e+00,   8.00000000e+00,   1.19634300e+06,
          1.53809000e+06,   4.64808046e-01]),
 array([  0.00000000e+00,   1.00000000e+00,   2.00000000e+00,
          0.00000000e+00,   8.00000000e+00,   1.34210000e+04,
          1.70311000e+05,   1.58187777e-01]),
 array([  0.00000000e+00,   1.00000000e+00,   2.00000000e+00,
          0.00000000e+00,   8.00000000e+00,   4.77925000e+05,
          1.05025700e+06,   9.19594058e-02]),
 array([  0.00000000e+00,   1.00000000e+00,   2.00000000e+00,
          0.00000000e+00,   8.00000000e+00,   1.29570000e+06,
          1.39548100e+06,   1.02280139e-01]),
 array([  0.00000000e+00,   1.00000000e+00,   2.00000000e+00,
          0.00000000e+00,   8.00000000e+00,   1.15789600e+06,
          1.54343600e+06,   2.83862515e-01]),
 array([  0.00000000e+00,   1.00000000e+00,   2.00000000e+00,
          0.00000000e+00,   8.00000000e+00,   1.02796000e+06,
          1.14792500e+06, 

In [102]:
newfile.create_group("tree2")

<HDF5 group "/tree2" (0 members)>

In [54]:
try:
    newfile['args']
except:
    args_exists = False
else:
    args_exists = True

In [113]:
newfile.close()

In [55]:
args_exists

True

In [93]:
testing.keys()
testing.values()

[(0.0, 8.0),
 (0.0, 8.0),
 (0.0, 12.0),
 (0.0, 8.0),
 (0.0, 8.0),
 (0.0, 8.0),
 (0.0, 8.0),
 (0.0, 8.0),
 (0.0, 8.0),
 (0.0, 8.0),
 (0.0, 12.0),
 (8.0, 12.0),
 (8.0, 12.0),
 (0.0, 8.0),
 (12.0, 15.0),
 (8.0, 12.0),
 (8.0, 12.0),
 (12.0, 15.0)]

In [97]:
newfile['/tree1'].attrs['admix_intervals'] = testing.values()
newfile['/tree1'].attrs['admix_branches'] = testing.keys()



In [98]:
newfile['/tree1'].attrs['admix_branches']

array([[1, 2],
       [0, 1],
       [3, 2],
       [1, 3],
       [0, 3],
       [3, 0],
       [3, 1],
       [2, 1],
       [0, 2],
       [2, 0],
       [2, 3],
       [4, 3],
       [4, 2],
       [1, 0],
       [3, 5],
       [3, 4],
       [2, 4],
       [5, 3]])

In [99]:
newfile['/tree1'].attrs['admix_intervals']

array([[  0.,   8.],
       [  0.,   8.],
       [  0.,  12.],
       [  0.,   8.],
       [  0.,   8.],
       [  0.,   8.],
       [  0.,   8.],
       [  0.,   8.],
       [  0.,   8.],
       [  0.,   8.],
       [  0.,  12.],
       [  8.,  12.],
       [  8.,  12.],
       [  0.,   8.],
       [ 12.,  15.],
       [  8.,  12.],
       [  8.,  12.],
       [ 12.,  15.]])

In [78]:
newfile.create_dataset()

[(u'tree1', <HDF5 group "/tree1" (2 members)>)]

In [68]:
intervals = {}
for snode in newtest.tree.traverse():
    for dnode in newtest.tree.traverse():
        if not snode.is_root() and (snode != dnode):
            ## check for overlap
            smin, smax = snode.interval
            dmin, dmax = dnode.interval

            ## find if nodes have interval where admixture can occur
            low_bin = max(smin, dmin)
            top_bin = min(smax, dmax)
            if top_bin > low_bin:
                aedge = (snode.idx, dnode.idx, low_bin, top_bin)
                intervals[(snode.idx, dnode.idx)] = (low_bin, top_bin)