# simulating demography changes in four-taxon trees

imports:

In [414]:
import numpy as np
from copy import deepcopy
from numba import jit

defs:

In [767]:
@jit
def add_generation(population_obj):
    '''
    resamples from previous generation for each 
    '''
    obj=deepcopy(population_obj)
    newdat = np.zeros((num_genes, 1, anc_pop_size),dtype=np.int8)
    for gene_idx in range(len(obj)):
        newdat[gene_idx][0] = np.random.choice(obj[gene_idx][0],replace=True,size=len(obj[gene_idx][0]))
    return np.hstack([newdat,obj])

In [766]:
@jit
def run_a_bunch(starting_pop, num_gens):
    '''
    uses the add_generation function to resample across a bunch of generations.
    '''
    p = deepcopy(starting_pop)
    for _ in xrange(num_gens):
        p = add_generation(p)
    return p

#### Start by thinking about what we'll need to start with:

In [1047]:
num_genes = 50000
anc_pop_size = 20

In [1048]:
population_alleles = np.zeros((num_genes,1,anc_pop_size),dtype = np.int8)
# num_alleles should change to num_genetrees, number of generations, and num individuals

In [1049]:
#num_muts = np.repeat(1,num_genes) 
num_muts = np.random.normal(float(anc_pop_size)/2,size=num_genes).astype(int)

In [1050]:
num_muts[(num_muts > anc_pop_size)] = anc_pop_size

In [1051]:
for gene, muts in zip(population_alleles,num_muts):
    gene[0][np.random.choice(anc_pop_size,size=muts,replace=False)] = 1

Figure out the expected length to coalescence:

In [1052]:
coal_len = 4*anc_pop_size

Now let them drift:

In [1053]:
p3 = run_a_bunch(population_alleles, coal_len)
p12_anc = run_a_bunch(population_alleles, coal_len/2)
p1 = run_a_bunch(deepcopy(p12_anc), coal_len/2)
p2 = run_a_bunch(deepcopy(p12_anc), coal_len/2)

## Now we've simulated p1, p2, and p3. 

Let's run abba baba.

In [1059]:
sampled_genes = np.array(zip([np.random.choice(i[0]) for i in p1],[np.random.choice(i[0]) for i in p2],[np.random.choice(i[0]) for i in p3],np.repeat(0,num_genes)))

In [1060]:
nBABA = sum(np.equal(sampled_genes, np.array([1,0,1,0])).all(axis=1))

In [1061]:
nABBA = sum(np.equal(sampled_genes, np.array([0,1,1,0])).all(axis=1))

In [1062]:
nBABA

773

In [1063]:
nABBA

724

# Now start over with binomial draws rather than bullshit resampling

In [1214]:
num_genes = 2000
anc_pop_size = 10000

In [1215]:
freqs = np.random.uniform(0,1,size=num_genes).reshape(num_genes,1,1)
population_variants = (freqs*anc_pop_size).astype(int)

In [1217]:
@jit
def add_one_gen(population_obj, anc_pop_size, num_genes):
    # we just want the latest gen from our population obj
    obj = np.zeros((num_genes),dtype = int)
    for gene_idx in range(len(population_obj)):
        obj[gene_idx] = population_obj[gene_idx][0]
    
    # now convert the number of variants to frequencies
    obj_freqs = np.divide(obj.astype(float), anc_pop_size)
    
    # now sample a new variant number for each gene based on these frequencies
    newgen = np.random.binomial(n=anc_pop_size,p=obj_freqs).reshape(num_genes,1,1)
    
    # return the full array with the additional generation
    #return(np.hstack([newgen,population_obj]))
    
    # return just a single row, replacing the previous
    return(newgen)

In [1218]:
@jit
def run_a_bunch(population_obj, 
                ngens,
               anc_pop_size):
    '''
    uses the add_generation function to resample across a bunch of generations.
    '''
    p = deepcopy(population_obj)
    num_genes = population_obj.shape[0]
    for _ in xrange(ngens):
        p = add_one_gen(p,anc_pop_size,num_genes)
    return p

In [1219]:
run_a_bunch(population_variants,2000,anc_pop_size)

array([[[ 9037]],

       [[10000]],

       [[10000]],

       ...,

       [[10000]],

       [[ 7468]],

       [[ 1569]]])

In [1187]:
add_one_gen(population_variants,anc_pop_size,num_genes)

array([[[16],
        [16]],

       [[ 8],
        [ 9]],

       [[ 7],
        [ 5]],

       [[12],
        [12]],

       [[15],
        [16]],

       [[ 4],
        [ 4]],

       [[ 9],
        [10]],

       [[15],
        [15]],

       [[ 9],
        [ 9]],

       [[ 0],
        [ 0]]])

In [1163]:
obj=np.array([_[0] for _ in population_variants])

In [1136]:
obj_freqs = np.divide(obj.astype(float), anc_pop_size)

In [1138]:
newgen = np.random.binomial(n=anc_pop_size,p=obj_freqs).reshape(num_genes,1,1)

In [1141]:
np.hstack([newgen,population_variants])[0]

array([[14],
       [16]])

In [1123]:
obj_freqs

array([[0.8 ],
       [0.45],
       [0.25],
       [0.6 ],
       [0.8 ],
       [0.2 ],
       [0.5 ],
       [0.75],
       [0.45],
       [0.  ]])

In [1093]:
@jit
def add_generation(population_obj):
    '''
    resamples from previous generation for each 
    '''
    [_[0] for _ in population_variants]
    obj=deepcopy(population_obj)
    newdat = np.zeros((num_genes, 1, anc_pop_size),dtype=np.int8)
    for gene_idx in range(len(obj)):
        newdat[gene_idx][0] = np.random.choice(obj[gene_idx][0],replace=True,size=len(obj[gene_idx][0]))
    return np.hstack([newdat,obj])

array([[[0.95749479]],

       [[0.56996451]],

       [[0.12863604]],

       ...,

       [[0.05566363]],

       [[0.84168353]],

       [[0.93016868]]])

In [1064]:
float(np.absolute(nABBA-nBABA))/np.sum([nABBA,nBABA])

0.032732130928523714

In [1075]:
np.random.binomial(n=100,p=.5)

52

In [975]:
#@jit
def change_population_size(population,newsize):
    p = deepcopy(population)
    newshape = list(population.shape)
    newshape[1] = 1
    newshape[2] = newsize
    newpop = np.zeros(tuple(newshape))
    for idx in range(len(population)):
        newpop[idx][0] = np.random.choice(population[idx],newsize,replace = True)
    return(newpop)

In [722]:
change_population_size(population_alleles,200)

ValueError: a must be 1-dimensional

In [522]:
test = deepcopy(population_alleles)
counter = 0
while len(np.concatenate([np.unique(i[0]) for i in test])) != num_genes:
    test = add_generation(test)
    counter += 1
print(counter)

954


In [520]:
test

array([[[1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        ...,
        [1, 0, 1, ..., 0, 1, 1],
        [0, 1, 0, ..., 0, 0, 1],
        [1, 0, 1, ..., 0, 1, 1]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 0, 1, ..., 0, 0, 1],
        [1, 1, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 1],
        [1, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 1, 1]],

       ...,

       [[1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 1, 1],
        [1, 0, 1, ..., 1, 0, 0]],

       [[1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        ...,
        [0, 1, 0, ..., 

# Now think about what this means...

This is simulating backwards in time. We observe a polymorphism and randomly select parents for each individual. This is directional -- every starting individual has to have come from the previous generation, but the indivs in the previous generation don't have to be present in the current one.

Maybe a good approach to this would be to simulate single mutations. Most of them will immediately disappear, and we'll throw them away. But if the polymorphism persists for a specific amount of time (i.e. between the height of the tipmost node and the height of our phylogeny), then we can use this to model ILS and fixation.

Game plan: