This notebook shows that we can take any phylogenetic tree as input, designate a time and proportion of migration, and get an msprime simulation from it.

In [80]:
import toytree
import phymsim
import numpy as np
# report the library versions this notebook was run with
print(toytree.__version__)
print(phymsim.__version__)

1.0.0
0.0.1-dev


### The species tree

In [81]:
# draw one random coalescent tree for 5 tips, then rescale it to a root height of 1.5
tree = toytree.rtree.coaltree(5, seed=333)
tree = tree.mod.node_scale_root_height(1.5)

# give each node visited by traverse() its own Ne, drawn uniformly from [250000, 1000000)
np.random.seed(3334)
for node in tree.treenode.traverse():
    ne_value = np.random.uniform(250000, 1000000)
    node.add_feature("Ne", ne_value)

# draw the tree with edge widths proportional to Ne (the smallest Ne gets width 2)
edges = np.array(tree.get_edge_values("Ne"))
widths = edges / edges.min() * 2
tree.draw(ts='c', edge_widths=list(widths));

### simulation object

In [105]:
# build the simulation model from the species tree; no other arguments are passed
mod = phymsim.Model(tree)


In [99]:
# the model's working tree uses integer node indices as its tip labels
mod.tree.get_tip_labels()

[0, 1, 2, 3, 4]

In [100]:
# the original (string) tip labels stay available on the model's `treeorig` attribute
mod.treeorig.get_tip_labels()

['r0', 'r1', 'r2', 'r3', 'r4']

### returns a tree-sequence dataframe and a dict of per-tip sequence arrays.

It's not great for a method to return sometimes one result and sometimes two. If the second result is sometimes not produced, the method should still return `None` in its place, so that the caller can always unpack the result with the same code.

In [87]:
# simulate one locus of 50000 sites with seqgen; unpack the genealogy table and the sequences
gt_df2, seqs2 = mod.run_locus(50000, seqgen=True)

It would be nice if the DF recorded how many SNPs are in each window as well (see SNP count code below).

In [88]:
# the tree-sequence dataframe: print its (rows, columns) shape, then show the first rows
# (columns in the output are starts, stops, bps, and the newick string of each genealogy)
print(gt_df2.shape)
gt_df2.head()

(315, 4)


Unnamed: 0,starts,stops,bps,newicks
0,0.0,81.086927,81,"((r2:966027.76357754739001,r4:966027.763577547..."
1,81.086927,184.401032,103,"(((r0:210078.66077706395299,r1:210078.66077706..."
2,184.401032,461.543433,278,"(((r3:493060.97795964800753,(r1:311496.9206811..."
3,461.543433,521.749992,60,"(r4:4397725.01832654513419,((r3:493060.9779596..."
4,521.749992,658.583595,137,"(r4:4060974.59099883772433,((r3:493060.9779596..."


In [89]:
# parse every genealogy in the table, then draw the leading trees as a grid
# (the original note says the grid shows the first 5 -- TODO confirm the default grid size)
all_newicks = gt_df2.newicks.tolist()
mtre = toytree.mtree(all_newicks)
mtre.draw_tree_grid(tree_style='c', shared_axis=True);

In [90]:
# draw a grid of 5 genealogies sampled at random (unseeded) from the table
random_newicks = gt_df2.newicks.sample(5).tolist()
mtre = toytree.mtree(random_newicks)
mtre.draw_tree_grid(tree_style='c', shared_axis=True);

In [91]:
# the sequence dict: maps each tip name to its integer-coded (int8) sequence array
seqs2

{'r2': array([0, 2, 1, ..., 3, 1, 2], dtype=int8),
 'r4': array([3, 1, 1, ..., 2, 2, 2], dtype=int8),
 'r3': array([1, 1, 2, ..., 1, 3, 3], dtype=int8),
 'r0': array([2, 2, 1, ..., 0, 1, 3], dtype=int8),
 'r1': array([2, 0, 3, ..., 0, 1, 2], dtype=int8)}

In [94]:
# a 2-D array may be more convenient than a dict: stack the per-tip arrays
# into one matrix, one row per tip, rows ordered by sorted tip name
phy = np.vstack([seqs2[name] for name in sorted(seqs2)])
print(phy.shape)
print(phy)

(5, 50000)
[[2 2 1 ... 0 1 3]
 [2 0 3 ... 0 1 2]
 [0 2 1 ... 3 1 2]
 [1 1 2 ... 1 3 3]
 [3 1 1 ... 2 2 2]]


In [95]:
# a site is variable (a SNP) when at least one row differs from the first row there
is_variable = (phy != phy[0]).any(axis=0)
snps = phy[:, is_variable]
print(snps.shape)
print(snps)

(5, 49631)
[[2 2 1 ... 0 1 3]
 [2 0 3 ... 0 1 2]
 [0 2 1 ... 3 1 2]
 [1 1 2 ... 1 3 3]
 [3 1 1 ... 2 2 2]]


## compare the in-house sequence simulation with seqgen

### simulate 25 loci on our tree with seqgen



In [16]:
# Simulate 25 loci with seqgen and, for each locus, count the sites at which
# each pair of tips carries the same base.
# Pair order (index 0..9):
#   (r0,r1) (r0,r2) (r0,r3) (r0,r4) (r1,r2) (r1,r3) (r1,r4) (r2,r3) (r2,r4) (r3,r4)
tip_names = ['r0', 'r1', 'r2', 'r3', 'r4']
tip_pairs = [
    (first, second)
    for pos, first in enumerate(tip_names)
    for second in tip_names[pos + 1:]
]

sumlists1 = []
for oneloop in range(25):
    gt_df1, seqs1 = mod.run_locus(50000, seqgen=True)
    # one count of identical sites per tip pair, in tip_pairs order
    sum_list = [np.sum(seqs1[first] == seqs1[second]) for first, second in tip_pairs]
    sumlists1.append(sum_list)
    print(oneloop)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


### simulate 25 loci on our tree with the in-house simulation

In [17]:
# Simulate 25 loci with the in-house simulator (seqgen=False) and, for each locus,
# count the sites at which each pair of tips carries the same base.
# Pair order (index 0..9):
#   (r0,r1) (r0,r2) (r0,r3) (r0,r4) (r1,r2) (r1,r3) (r1,r4) (r2,r3) (r2,r4) (r3,r4)
tip_names = ['r0', 'r1', 'r2', 'r3', 'r4']
tip_pairs = [
    (first, second)
    for pos, first in enumerate(tip_names)
    for second in tip_names[pos + 1:]
]

sumlists2 = []
for oneloop in range(25):
    gt_df1, seqs1 = mod.run_locus(50000, seqgen=False)
    # one count of identical sites per tip pair, in tip_pairs order
    sum_list = [np.sum(seqs1[first] == seqs1[second]) for first, second in tip_pairs]
    sumlists2.append(sum_list)
    print(oneloop)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


### compare the number of matching sites for each pair of taxa

In [22]:
# r0 vs r1 (pair index 0): mean matching-site count, seqgen runs first, in-house runs second
seqgen_mean = np.mean([counts[0] for counts in sumlists1])
inhouse_mean = np.mean([counts[0] for counts in sumlists2])
print(seqgen_mean)
print(inhouse_mean)

19372.0
19897.88


In [24]:
# r0 vs r2 (pair index 1): mean matching-site count, seqgen runs first, in-house runs second
seqgen_mean = np.mean([counts[1] for counts in sumlists1])
inhouse_mean = np.mean([counts[1] for counts in sumlists2])
print(seqgen_mean)
print(inhouse_mean)

16473.92
16348.84


In [26]:
# r0 vs r3 (pair index 2): mean matching-site count, seqgen runs first, in-house runs second
seqgen_mean = np.mean([counts[2] for counts in sumlists1])
inhouse_mean = np.mean([counts[2] for counts in sumlists2])
print(seqgen_mean)
print(inhouse_mean)

15259.6
15240.24


In [27]:
# r0 vs r4 (pair index 3): mean matching-site count, seqgen runs first, in-house runs second
seqgen_mean = np.mean([counts[3] for counts in sumlists1])
inhouse_mean = np.mean([counts[3] for counts in sumlists2])
print(seqgen_mean)
print(inhouse_mean)

15248.64
15272.88


In [28]:
# r1 vs r2 (pair index 4): mean matching-site count, seqgen runs first, in-house runs second
seqgen_mean = np.mean([counts[4] for counts in sumlists1])
inhouse_mean = np.mean([counts[4] for counts in sumlists2])
print(seqgen_mean)
print(inhouse_mean)

16400.16
16352.04


In [29]:
# r1 vs r3 (pair index 5): mean matching-site count, seqgen runs first, in-house runs second
seqgen_mean = np.mean([counts[5] for counts in sumlists1])
inhouse_mean = np.mean([counts[5] for counts in sumlists2])
print(seqgen_mean)
print(inhouse_mean)

15375.96
15223.4


In [30]:
# r1 vs r4 (pair index 6): mean matching-site count, seqgen runs first, in-house runs second
seqgen_mean = np.mean([counts[6] for counts in sumlists1])
inhouse_mean = np.mean([counts[6] for counts in sumlists2])
print(seqgen_mean)
print(inhouse_mean)

15220.04
15172.12


In [31]:
# r2 vs r3 (pair index 7): mean matching-site count, seqgen runs first, in-house runs second
seqgen_mean = np.mean([counts[7] for counts in sumlists1])
inhouse_mean = np.mean([counts[7] for counts in sumlists2])
print(seqgen_mean)
print(inhouse_mean)

15296.56
15289.84


In [32]:
# r2 vs r4 (pair index 8): mean matching-site count, seqgen runs first, in-house runs second
seqgen_mean = np.mean([counts[8] for counts in sumlists1])
inhouse_mean = np.mean([counts[8] for counts in sumlists2])
print(seqgen_mean)
print(inhouse_mean)

15204.88
15213.92


In [33]:
# r3 vs r4 (pair index 9): mean matching-site count, seqgen runs first, in-house runs second
seqgen_mean = np.mean([counts[9] for counts in sumlists1])
inhouse_mean = np.mean([counts[9] for counts in sumlists2])
print(seqgen_mean)
print(inhouse_mean)

15200.2
15160.92


## Make a tree where node values correspond to thetas:

In [184]:
# Random 5-tip coalescent tree. It is seeded (like the first tree in this notebook)
# so that the tree, and everything derived from it, is the same on every
# Restart & Run All. Outputs recorded from an earlier unseeded run will differ.
tree = toytree.rtree.coaltree(5, seed=123)
tree.draw(node_labels=True, node_sizes=15, tip_labels_align=True);

In [185]:
# one Ne value per node, handed out in traverse() order
Nes = [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000]
nodes = list(tree.treenode.traverse())

# fail before touching the tree if there are more nodes than Ne values,
# so a mismatch can never leave the tree only partly annotated
assert len(nodes) <= len(Nes), "need one Ne value per node"

for node, ne_value in zip(nodes, Nes):
    node.add_feature('Ne', ne_value)

In [186]:
# label every node, root and tips included, with its Ne value
ne_labels = tree.get_node_values("Ne", show_root=True, show_tips=True)
tree.draw(node_labels=ne_labels, node_sizes=15, tip_labels_align=True);

In [196]:
# Ne on each edge, expressed in multiples of 10000
edge_ne = np.array(tree.get_edge_values("Ne"))
norm_thetas = list(edge_ne / 10000)

# draw the tree with those scaled values as the edge widths
tree.draw(ts='c', edge_widths=norm_thetas, tip_labels=True);

In [32]:
# build a new simulation model on the tree whose nodes now carry Ne values
mod = phymsim.Model(
    tree,
    theta=0.1,
    mut=1e-8,
)

In [33]:
# inspect the demographic events the model generates
# (the output below lists mass migrations and population-size changes, sorted by time);
# note this calls a private method, used here only for inspection
mod._get_demography()

[{'type': 'mass_migration', 'time': 73247, 'source': 4, 'dest': 3, 'proportion': 1.0},
 {'type': 'population_parameters_change', 'time': 73247, 'growth_rate': None, 'initial_size': 20000, 'population': 3},
 {'type': 'mass_migration', 'time': 465510, 'source': 1, 'dest': 0, 'proportion': 1.0},
 {'type': 'population_parameters_change', 'time': 465510, 'growth_rate': None, 'initial_size': 70000, 'population': 0},
 {'type': 'mass_migration', 'time': 1212747, 'source': 2, 'dest': 0, 'proportion': 1.0},
 {'type': 'population_parameters_change', 'time': 1212747, 'growth_rate': None, 'initial_size': 30000, 'population': 0},
 {'type': 'mass_migration', 'time': 1448542, 'source': 3, 'dest': 0, 'proportion': 1.0},
 {'type': 'population_parameters_change', 'time': 1448542, 'growth_rate': None, 'initial_size': 10000, 'population': 0}]

In [34]:
# initial size of every population configuration, listed by population index
popconfigs = mod._get_popconfig()
[config.initial_size for config in popconfigs]

[90000, 80000, 60000, 50000, 40000]

In [35]:
# simulate one 50000-site locus with the in-house simulator (no seqgen),
# then display the resulting genealogy table
gt_df2, seqs2 = mod.run_locus(
    50000,
    seqgen=False,
)
gt_df2

Unnamed: 0,starts,stops,bps,newicks
0,0.0,1674.852893,1675,"((r3:79277.25752043664397,r4:79277.25752043664..."
1,1674.852893,3521.611228,1847,"((r3:79277.25752043664397,r4:79277.25752043664..."
2,3521.611228,4479.862529,958,"((r3:79277.25752043664397,r4:79277.25752043664..."
3,4479.862529,4611.946795,132,"((r3:79277.25752043664397,r4:79277.25752043664..."
4,4611.946795,5361.744518,750,"((r3:79277.25752043664397,r4:79277.25752043664..."
5,5361.744518,5999.55111,638,"((r3:79277.25752043664397,r4:79277.25752043664..."
6,5999.55111,7611.306048,1611,"((r3:79277.25752043664397,r4:79277.25752043664..."
7,7611.306048,9469.025468,1858,"((r3:79277.25752043664397,r4:79277.25752043664..."
8,9469.025468,10423.312467,954,"((r3:79277.25752043664397,r4:79277.25752043664..."
9,10423.312467,11512.812265,1090,"((r3:79277.25752043664397,r4:79277.25752043664..."


In [40]:
# draw a single genealogy (row 23 of the table) from the simulated locus;
# the trailing semicolon keeps the (canvas, axes) tuple repr out of the cell output
toytree.tree(gt_df2.newicks[23]).draw();

(<toyplot.canvas.Canvas at 0x120d519e8>,
 <toyplot.coordinates.Cartesian at 0x120d51a58>)

### Species trees should have branch lengths in units of generations, because then the tips are guaranteed to line up at the present. This is not guaranteed for trees in coalescent units.

5000
5000
5000
5000
5000
