## Demonstrating phymsim

### Imports

In [18]:
import numpy as np
import toyplot
import toytree

import phymsim

### Define a tree. Node heights are in units of generations!

In [19]:
# Generate a 5-tip tree via toytree.rtree.coaltree, draw it, and report the
# height of its root node.
tree = toytree.rtree.coaltree(5)
tree.draw(
    node_labels=True,
    node_sizes=15,
    tip_labels_align=True,
);
print("tree height:", tree.treenode.height)

tree height: 0.3962441


In [20]:
# Rescale the tree so the root height becomes the current height * 10 * 10000 * 4.
# NOTE(review): the factors presumably convert the height into generations using
# Ne = 10000 -- confirm what the 10 and the 4 stand for.
# This cell overwrites `tree`, so running it a second time rescales it again.
tree = tree.mod.node_scale_root_height(
    tree.treenode.height * 10 * 10000 * 4
)
print("tree height:", tree.treenode.height)

tree height: 158497.64


### Define a phymsim model. 

In [21]:
# Instantiate the model on `tree` with a single Ne plus the `mut` and `recomb`
# settings (presumably mutation and recombination rates -- confirm units).
mod = phymsim.Model(
    tree,
    Ne=10000,
    mut=1e-7,
    recomb=9e-9,
)

### Generate gene trees and sequences for each locus. 

In [22]:
# Simulate 10 loci, each with size=5000.
mod.run(num_loci=10, size=5000)

We can look at starts and stops for the gene trees. Notice that the `inferred_trees` column has nothing for now.

In [23]:
mod.df

Unnamed: 0,locus_idx,starts,stops,bps,newicks,cumulative_bps,inferred_trees
0,0,0.000000,399.114605,399,"(r4:171878.93510039325338,(r3:135044.601240783...",399,
1,0,399.114605,2003.711868,1605,"(r4:171878.93510039325338,(r3:135044.601240783...",2004,
2,0,2003.711868,3215.876200,1212,"(r4:171878.93510039325338,((r0:31028.726571306...",3216,
3,0,3215.876200,4029.008186,813,"(r4:171878.93510039325338,(r2:135044.601240783...",4029,
4,0,4029.008186,4424.136472,395,"((r3:73409.16564434758038,(r0:31028.7265713068...",4424,
5,0,4424.136472,4469.888225,46,"(r4:203004.80203346943017,(r2:151000.469034697...",4470,
6,0,4469.888225,5000.000000,530,"(r4:203004.80203346943017,(r2:151000.469034697...",5000,
0,1,0.000000,68.697338,69,"(r4:170791.99710473464802,(r3:142028.118303298...",5069,
1,1,68.697338,341.245211,272,"(r4:170791.99710473464802,(r3:125979.354911937...",5341,
2,1,341.245211,382.042768,41,"(r4:170791.99710473464802,(r3:125979.354911937...",5382,


### Infer a tree at each locus with `iqtree`.

In [24]:
mod.infer_trees(method = 'iqtree')

In [25]:
mod.df

Unnamed: 0,locus_idx,starts,stops,bps,newicks,cumulative_bps,inferred_trees
0,0,0.000000,399.114605,399,"(r4:171878.93510039325338,(r3:135044.601240783...",399,"(r0:0.0026263194,r1:0.0036614296,((r2:0.009439..."
1,0,399.114605,2003.711868,1605,"(r4:171878.93510039325338,(r3:135044.601240783...",2004,"(r0:0.0026263194,r1:0.0036614296,((r2:0.009439..."
2,0,2003.711868,3215.876200,1212,"(r4:171878.93510039325338,((r0:31028.726571306...",3216,"(r0:0.0026263194,r1:0.0036614296,((r2:0.009439..."
3,0,3215.876200,4029.008186,813,"(r4:171878.93510039325338,(r2:135044.601240783...",4029,"(r0:0.0026263194,r1:0.0036614296,((r2:0.009439..."
4,0,4029.008186,4424.136472,395,"((r3:73409.16564434758038,(r0:31028.7265713068...",4424,"(r0:0.0026263194,r1:0.0036614296,((r2:0.009439..."
5,0,4424.136472,4469.888225,46,"(r4:203004.80203346943017,(r2:151000.469034697...",4470,"(r0:0.0026263194,r1:0.0036614296,((r2:0.009439..."
6,0,4469.888225,5000.000000,530,"(r4:203004.80203346943017,(r2:151000.469034697...",5000,"(r0:0.0026263194,r1:0.0036614296,((r2:0.009439..."
0,1,0.000000,68.697338,69,"(r4:170791.99710473464802,(r3:142028.118303298...",5069,"(r0:0.0028067520,r1:0.0034128597,(r2:0.0070096..."
1,1,68.697338,341.245211,272,"(r4:170791.99710473464802,(r3:125979.354911937...",5341,"(r0:0.0028067520,r1:0.0034128597,(r2:0.0070096..."
2,1,341.245211,382.042768,41,"(r4:170791.99710473464802,(r3:125979.354911937...",5382,"(r0:0.0028067520,r1:0.0034128597,(r2:0.0070096..."


### We can also look at the raw sequences:

In [26]:
mod.seqs.shape

(10, 5, 5000)

In [27]:
mod.seqs[0]

array([[2, 1, 0, ..., 0, 1, 2],
       [2, 1, 0, ..., 0, 1, 2],
       [2, 1, 0, ..., 0, 1, 2],
       [2, 1, 0, ..., 0, 1, 2],
       [2, 1, 0, ..., 0, 1, 2]], dtype=int8)

### If we want to generate SNPs in a simcat-like format, we can do that separately:

In [28]:
snps = mod._run_snps(1000)

In [29]:
snps

array([116,  16,  14, ...,  40,  39,  16])

In [30]:
# View the flat SNP count array as five 16x16 matrices and plot each one.
for idx, mat in enumerate(snps.reshape((5, 16, 16))):
    toyplot.matrix(mat, label="Matrix {}".format(idx), colorshow=True)

### If we want our model to have an admixture edge, we can do that too.

In [31]:
# One admixture edge handed to the model through `admixture_edges`.
# NOTE(review): the meaning of the four fields (presumably source, destination,
# timing, proportion) is not shown in this notebook -- confirm against phymsim.
adedge = (0, 4, 0.5, 0.4)
mod = phymsim.Model(
    tree,
    Ne=10000,
    mut=1e-7,
    recomb=9e-9,
    admixture_edges=adedge,
)

In [32]:
# Simulate 10 loci, each with size=5000, under the admixture model.
mod.run(num_loci=10, size=5000)

In [33]:
snps_admixed = mod._run_snps(1000)

In [34]:
# View the admixed SNP count array as five 16x16 matrices and plot each one.
for idx, mat in enumerate(snps_admixed.reshape((5, 16, 16))):
    toyplot.matrix(mat, label="Matrix {}".format(idx), colorshow=True)

#### notes from 10/3 meeting below.

In [None]:
# simulate data into a DF and seqarray (both stored on mod); loci can be invariant...
mod.run(...)

# alternate method for fast simulation of SNPs; makes DF and seqarray; loops to ensure nsnps=nloci
mod.run_snps(...)

# fastest simcat alternative (specialized to ensure SNPs; does not touch the DFs)
mod._run_snps(...)

In [None]:
# look at DF 
mod.df

# look at seq arr 0
mod.seqs[0]



In [None]:
# OPTIONAL
# write CSV to file (optionally)
mod.df.to_csv("filname.csv")

# write phylips to dir (calls write_phy() internally?)
mod.write_seqs(path="...", concat=True)
mod.write_snps(path="...", seed=123)


In [None]:
# infer trees for each locus 
mod.infer_trees(method="raxml")


In [None]:
# look at results
mod.df



In [11]:
mod.seqs

{'r1': array([3, 0, 3, ..., 2, 2, 3], dtype=int8),
 'r3': array([3, 0, 3, ..., 2, 2, 3], dtype=int8),
 'r2': array([3, 0, 2, ..., 2, 2, 3], dtype=int8),
 'r0': array([3, 0, 2, ..., 2, 2, 3], dtype=int8),
 'r4': array([3, 0, 2, ..., 2, 2, 3], dtype=int8)}

In [None]:
mod.write_phy(loc=0, path=...)

### goals
1. restructure Class object for API mode
2. complete write_seqs, write_phy, and infer_trees methods.
3. Run example code (init object, .run(), .infer_trees(), .df.to_csv()) for a couple of data sets (trees, Nes, admixture) for simcat
4. write data for simcat to analyze...
5. Test simcat on getting results correct

## Now make the node values correspond to thetas:

In [75]:
# Attach an 'Ne' feature to every node, taking values from `Nes` in the order
# that traverse() visits the nodes. This mutates `tree` in place.
# NOTE(review): `Nes` has 9 entries, which matches a 5-tip bifurcating tree;
# a tree with more nodes raises IndexError.
Nes = [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000]
for idx, node in enumerate(tree.treenode.traverse()):
    node.add_feature('Ne', Nes[idx])

tree height: 4145.004


In [76]:
# Draw the tree with each node labelled by its 'Ne' feature.
tree.draw(
    node_labels=tree.get_node_values("Ne", True, True),
    node_sizes=30,
    tip_labels_align=True,
);

In [77]:
# Divide the per-edge 'Ne' values by 10000 and use the results as edge widths.
norm_thetas = list(np.array(tree.get_edge_values("Ne")) / 10000)
tree.draw(edge_widths=norm_thetas, tip_labels=True);

In [78]:
# Rebuild the model with Ne=None.
# NOTE(review): presumably this makes the model take population sizes from the
# per-node 'Ne' features on `tree` -- confirm against phymsim.
mod = phymsim.Model(
    tree,
    Ne=None,
    mut=1e-8,
    recomb=1e-9,
)

In [79]:
mod._get_demography()

[{'type': 'mass_migration', 'time': 1386, 'source': 1, 'dest': 0, 'proportion': 1.0},
 {'type': 'population_parameters_change', 'time': 1386, 'growth_rate': None, 'initial_size': 70000, 'population': 0},
 {'type': 'mass_migration', 'time': 2095, 'source': 2, 'dest': 0, 'proportion': 1.0},
 {'type': 'population_parameters_change', 'time': 2095, 'growth_rate': None, 'initial_size': 50000, 'population': 0},
 {'type': 'mass_migration', 'time': 2889, 'source': 3, 'dest': 0, 'proportion': 1.0},
 {'type': 'population_parameters_change', 'time': 2889, 'growth_rate': None, 'initial_size': 30000, 'population': 0},
 {'type': 'mass_migration', 'time': 4145, 'source': 4, 'dest': 0, 'proportion': 1.0},
 {'type': 'population_parameters_change', 'time': 4145, 'growth_rate': None, 'initial_size': 10000, 'population': 0}]

In [80]:
[i.initial_size for i in mod._get_popconfig()] # this tells us the size of each population index

[90000, 80000, 60000, 40000, 20000]

In [81]:
# Run a single locus with size argument 50000 and seqgen=True, unpack the
# returned pair into a gene-tree table (gt_df2) and sequences (seqs2), then
# display the table.
gt_df2, seqs2 = mod.run_locus(50000,seqgen=True)
gt_df2

Unnamed: 0,starts,stops,bps,newicks
0,0.0,19455.908127,19456,"(r0:17304.66686415578806,((r1:6991.97506137920..."
1,19455.908127,24868.687562,5413,"((r1:6991.97506137920209,r4:6991.9750613792020..."
2,24868.687562,27339.247434,2470,"((r1:6991.97506137920209,r4:6991.9750613792020..."
3,27339.247434,31949.423471,4610,"((r1:6991.97506137920209,r4:6991.9750613792020..."
4,31949.423471,34001.19155,2052,"((r1:6991.97506137920209,r4:6991.9750613792020..."
5,34001.19155,43708.421009,9707,"((r1:6991.97506137920209,r4:6991.9750613792020..."
6,43708.421009,47286.784474,3579,"((r1:6991.97506137920209,r4:6991.9750613792020..."
7,47286.784474,50000.0,2713,"((r1:6991.97506137920209,r4:6991.9750613792020..."


In [89]:
# Scratch check: (k - 3) / 2 for k = 5..19.
(np.arange(5, 20) - 3) / 2

array([1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. , 6.5, 7. ,
       7.5, 8. ])

In [91]:
np.array(gt_df2)

array([[0.0, 19455.908127002396, 19456,
        '(r0:17304.66686415578806,((r1:6991.97506137920209,r4:6991.97506137920209):9186.02880066223588,(r2:7070.72458629240009,r3:7070.72458629240009):9107.27927574903879):1126.66300211435009);'],
       [19455.908127002396, 24868.687562381077, 5413,
        '((r1:6991.97506137920209,r4:6991.97506137920209):31581.69362858660679,(r0:17304.66686415578806,(r2:7070.72458629240009,r3:7070.72458629240009):10233.94227786338888):21269.00182581002082);'],
       [24868.687562381077, 27339.24743424618, 2470,
        '((r1:6991.97506137920209,r4:6991.97506137920209):31581.69362858660679,(r3:7070.72458629240009,(r0:6768.01376796623390,r2:6768.01376796623390):302.71081832616619):31502.94410367340970);'],
       [27339.24743424618, 31949.423470522608, 4610,
        '((r1:6991.97506137920209,r4:6991.97506137920209):76064.11481685213221,(r3:7070.72458629240009,(r0:6768.01376796623390,r2:6768.01376796623390):302.71081832616619):75985.36529193892784);'],
       [3

In [93]:
gt_df2['starts']

0        0.000000
1    19455.908127
2    24868.687562
3    27339.247434
4    31949.423471
5    34001.191550
6    43708.421009
7    47286.784474
Name: starts, dtype: float64

In [94]:
import h5py

In [149]:
# Open (or create) the HDF5 file in append mode. The mode is passed explicitly
# because h5py >= 3.0 defaults to read-only, under which creating groups or
# datasets on this handle fails.
here = h5py.File("./testit2.hdf5", "a")

In [150]:
# require_group returns the existing 'seqs' group when the file already has
# one, so re-running this cell does not raise the way create_group would.
seqs = here.require_group('seqs')

In [153]:
# List the keys available in seqs2, one per line.
for key in seqs2.keys():
    print(key)

r0
r1
r4
r2
r3


In [154]:
# Store every entry of seqs2 as a dataset in `seqs`, named after its key.
for name, arr in seqs2.items():
    seqs.create_dataset(name, data=arr)

In [99]:
# Store the 'starts' column of gt_df2 as a 1-D dataset named 'starts'.
# NOTE(review): `writeit` is not defined in any cell shown above (the h5py.File
# opened above is bound to `here`) -- confirm where this handle comes from.
writeit.create_dataset('starts',shape=(len(gt_df2['starts']),),data = gt_df2['starts'])

<HDF5 dataset "starts": shape (8,), type "<f8">

In [115]:
# Variable-length string dtype, used for the newick dataset below.
dt = h5py.special_dtype(vlen=str)

# Create a 1-D dataset 'newicks1' with one entry per row of gt_df2['newicks'];
# no data is passed here.
# NOTE(review): `writeit` is not defined in any cell shown above -- confirm.
dset = writeit.create_dataset('newicks1', (len(gt_df2['newicks']),), dtype=dt)

In [116]:
# NOTE(review): this rebinds the name `dset` to the pandas column; it does not
# write anything into an HDF5 dataset. If the intent was to fill a dataset
# previously bound to `dset`, an element assignment (e.g. `dset[:] = ...`)
# would be needed instead.
dset = gt_df2['newicks']

In [110]:
toytree.tree(dset[0])

<toytree.Toytree.ToyTree at 0x1204150f0>

In [123]:
# Cast the newick column to byte strings ('S') and store it under 'newicks3'.
# NOTE(review): `writeit` is not defined in any cell shown above -- confirm.
writeit['newicks3'] = gt_df2['newicks'].astype('S')

In [125]:
list(writeit['newicks3'])

[b'(r0:17304.66686415578806,((r1:6991.97506137920209,r4:6991.97506137920209):9186.02880066223588,(r2:7070.72458629240009,r3:7070.72458629240009):9107.27927574903879):1126.66300211435009);',
 b'((r1:6991.97506137920209,r4:6991.97506137920209):31581.69362858660679,(r0:17304.66686415578806,(r2:7070.72458629240009,r3:7070.72458629240009):10233.94227786338888):21269.00182581002082);',
 b'((r1:6991.97506137920209,r4:6991.97506137920209):31581.69362858660679,(r3:7070.72458629240009,(r0:6768.01376796623390,r2:6768.01376796623390):302.71081832616619):31502.94410367340970);',
 b'((r1:6991.97506137920209,r4:6991.97506137920209):76064.11481685213221,(r3:7070.72458629240009,(r0:6768.01376796623390,r2:6768.01376796623390):302.71081832616619):75985.36529193892784);',
 b'((r1:6991.97506137920209,r4:6991.97506137920209):95115.58412385180418,(r3:7070.72458629240009,(r0:6768.01376796623390,r2:6768.01376796623390):302.71081832616619):95036.83459893859981);',
 b'((r1:6991.97506137920209,r4:6991.97506137920