In [2]:
import numpy as np
import msprime
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.utils import resample
from statistics import mean, stdev
from math import sqrt
import seaborn as sns

In [4]:
seq_len = 1e8
rec_rate = 1e-8
mut_rate = 1e-8
split_time = 50 # in generations

In [5]:
demography = msprime.Demography()
demography.add_population(name="A", initial_size=1000) 
demography.add_population(name="B", initial_size=1000)
demography.add_population(name="C", initial_size=1000)
demography.add_population_split(time=split_time, derived=["A", "B"], ancestral="C")

PopulationSplit(time=50, derived=['A', 'B'], ancestral='C')

In [6]:
def get_fst(dA, dB, dAB):
    mean_within = (dA + dB) / 2
    between = dAB 
    Fst = 1 - mean_within.sum() / between.sum()
    return Fst

In [17]:
def simulation(replicate_times, resample_times, resample_size, sample_size, method=0):
    
    '''
    @replicate_times: times of replication of simulation
    @resample_times: times of resampling over each simulation  
    @resample_size: size of each resampling 
    @sample_size: a list of sample size
    @method: 0: resample with individuals, 1: resample with sites
    
    Return a dictionary that records coverage of all sample_size for each simulation
    and a list contains the fst value for each simulation.
    For example, coverage_rate[0] contains coverage value of all input sample_size of the first simulation
    '''

    fst = [] # fst values for all simulations 
    coverage_rate = {}
    
    # we will find the coverage for each simulation
    for i in range(replicate_times): 
        ts = msprime.sim_ancestry(
            samples={'A':1000, 'B':1000}, # diploid samples
            demography=demography,
            ploidy=2,
            sequence_length=seq_len,
            discrete_genome=False,
            recombination_rate=rec_rate, 
            model='dtwf',
        )

        ts = msprime.sim_mutations(
            ts, 
            rate=mut_rate, 
            discrete_genome=False,
            start_time=split_time,
            )
        
        popA = ts.samples(population = 0)
        popB = ts.samples(population = 1)
        
        dA = ts.diversity(popA, windows = 'sites', span_normalise=False)
        dB = ts.diversity(popB, windows = 'sites', span_normalise=False)
        dAB = ts.divergence([popA, popB], windows = 'sites', span_normalise=False)
        
        # fst values for the present realization
        this_fst = get_fst(dA, dB, dAB)
        fst.append(this_fst)
        
        print(f'Simulaiton {i}: {this_fst}')
        coverage_rate[i] = []
        
        for size in sample_size:
        
            within = []

            for j in range(resample_times):
                # resample over individuals 
                if method == 0:
                    sample_fst = resample_ind(ts, popA, popB, resample_size, n=size)
                else: # over sites 
                    sample_fst = resample_sites(dA, dB, dAB, resample_size, n=size)
                
                # find the 95% CI for the resample fst value 
                est_mean = mean(sample_fst)
                upper = est_mean + 1.96 * (stdev(sample_fst) / sqrt(resample_size))
                lower = est_mean - 1.96 * (stdev(sample_fst) / sqrt(resample_size))

                if this_fst < upper and this_fst > lower:
                    within.append(1)
                else:
                    within.append(0)
                
            coverage_rate[i].append(mean(within))
            
        print(coverage_rate[i])
        
        
    return coverage_rate, fst

def resample_ind(ts, popA, popB, resample_size, n):
    '''
    Resample individuals of the given size
    ''' 
    
    fst = []
    for i in range(resample_size):
        popA_random = np.random.choice(popA, size=n, replace=False)
        popB_random = np.random.choice(popB, size=n, replace=False)
        dA = ts.diversity(popA_random, windows = 'sites', span_normalise=False)
        dB = ts.diversity(popB_random, windows = 'sites', span_normalise=False)
        dAB = ts.divergence([popA_random, popB_random], windows = 'sites', span_normalise=False)
        fst.append(get_fst(dA, dB, dAB))
    
    return fst


def resample_sites(dA, dB, dAB, resample_size, n):
    '''
    Resample sites of the given size
    ''' 
    
    fst = []
    index = [i for i in range(len(dA))]
    
    for i in range(resample_size):
        random_index = np.random.choice(index, size=n, replace=False)
        dA_sites = dA[random_index]
        dB_sites = dB[random_index]
        dAB_sites = dAB[random_index]

        fst.append(get_fst(dA_sites, dB_sites, dAB_sites))
    
    return fst

In [12]:
n_ind = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
#n_sites = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]

coverage_ind, fst_ind = simulation(replicate_times=10, resample_times=10, resample_size=20, sample_size=n_ind)


Simulaiton 0: 0.022263782796614895
[1, 0.8, 1, 1, 1, 1, 0.9, 1, 1, 1]
Simulaiton 1: 0.023483878181320028
[1, 0.9, 1, 0.8, 0.9, 0.9, 1, 0.9, 1, 1]
Simulaiton 2: 0.02355898508037979
[0.8, 1, 0.8, 0.9, 0.8, 1, 0.7, 0.8, 1, 0.9]
Simulaiton 3: 0.023244908934006214
[0.9, 1, 1, 0.9, 1, 0.9, 0.9, 1, 1, 0.9]
Simulaiton 4: 0.027021323465028724
[0.9, 1, 0.9, 1, 1, 1, 0.9, 0.9, 0.9, 1]
Simulaiton 5: 0.02330099623566173
[1, 0.8, 1, 1, 1, 0.9, 0.8, 1, 0.9, 1]
Simulaiton 6: 0.025683856495070212
[1, 1, 0.9, 0.9, 1, 0.9, 0.8, 1, 1, 0.8]
Simulaiton 7: 0.022461913110756293
[1, 0.8, 1, 0.8, 1, 1, 1, 1, 1, 1]
Simulaiton 8: 0.027578603162416004
[0.7, 1, 0.9, 0.8, 1, 0.9, 1, 0.8, 0.8, 1]
Simulaiton 9: 0.022686692641967987
[1, 1, 1, 0.9, 1, 0.9, 1, 0.9, 0.9, 0.8]


In [13]:
coverage_ind

{0: [1, 0.8, 1, 1, 1, 1, 0.9, 1, 1, 1],
 1: [1, 0.9, 1, 0.8, 0.9, 0.9, 1, 0.9, 1, 1],
 2: [0.8, 1, 0.8, 0.9, 0.8, 1, 0.7, 0.8, 1, 0.9],
 3: [0.9, 1, 1, 0.9, 1, 0.9, 0.9, 1, 1, 0.9],
 4: [0.9, 1, 0.9, 1, 1, 1, 0.9, 0.9, 0.9, 1],
 5: [1, 0.8, 1, 1, 1, 0.9, 0.8, 1, 0.9, 1],
 6: [1, 1, 0.9, 0.9, 1, 0.9, 0.8, 1, 1, 0.8],
 7: [1, 0.8, 1, 0.8, 1, 1, 1, 1, 1, 1],
 8: [0.7, 1, 0.9, 0.8, 1, 0.9, 1, 0.8, 0.8, 1],
 9: [1, 1, 1, 0.9, 1, 0.9, 1, 0.9, 0.9, 0.8]}

In [18]:
n_sites = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]

coverage_sites, fst_sites = simulation(replicate_times=10, resample_times=10, resample_size=20, sample_size=n_sites, method=1)


Simulaiton 0: 0.023774599212821212
[0.9, 0.9, 1, 0.9, 0.9, 1, 1, 0.8, 0.9, 1]
Simulaiton 1: 0.024372596484649844
[1, 1, 0.9, 1, 1, 0.8, 1, 0.9, 1, 0.9]
Simulaiton 2: 0.02363257683131692
[1, 1, 0.9, 0.9, 1, 1, 0.9, 1, 0.9, 0.7]
Simulaiton 3: 0.024679965076707666
[0.9, 0.9, 0.9, 0.9, 1, 0.9, 1, 1, 0.8, 1]
Simulaiton 4: 0.02393117949089496
[0.9, 1, 0.9, 1, 0.8, 0.8, 1, 1, 0.9, 1]
Simulaiton 5: 0.022885717137219097
[0.9, 1, 0.8, 0.8, 0.9, 1, 1, 0.9, 1, 0.9]
Simulaiton 6: 0.021095052605427655
[0.9, 1, 1, 0.9, 0.9, 1, 0.8, 1, 0.9, 0.9]
Simulaiton 7: 0.023479512887795928
[1, 1, 1, 1, 1, 0.9, 1, 0.9, 0.8, 1]
Simulaiton 8: 0.021391165822375213
[1, 0.9, 1, 1, 0.9, 0.8, 0.9, 1, 1, 0.9]
Simulaiton 9: 0.02326111604684411
[1, 0.8, 1, 1, 0.9, 0.9, 0.9, 1, 1, 1]


In [16]:
coverage_sites

{0: [1, 1, 0.9, 1, 1, 0.8, 1, 1, 0.9, 0.9],
 1: [0.9, 1, 1, 1, 0.9, 1, 1, 0.9, 1, 1],
 2: [0.9, 0.9, 0.9, 1, 0.9, 1, 1, 1, 1, 0.9],
 3: [0.9, 0.8, 0.7, 0.9, 1, 1, 1, 1, 0.8, 0.8],
 4: [1, 0.9, 1, 0.8, 0.9, 1, 1, 1, 1, 0.9],
 5: [0.9, 0.8, 0.9, 0.9, 1, 0.9, 1, 0.9, 0.9, 0.9],
 6: [1, 1, 1, 1, 0.8, 1, 0.8, 1, 0.9, 1],
 7: [1, 0.8, 0.9, 1, 1, 0.8, 0.9, 1, 0.9, 1],
 8: [1, 1, 0.9, 0.9, 1, 1, 1, 1, 0.9, 0.9],
 9: [1, 1, 1, 0.9, 0.8, 1, 1, 0.8, 0.9, 1]}

## Each row is a simulation and column is the coverage rate for that sample size. There are in total 10 different simulations. 

In [31]:
df_ind = pd.DataFrame.from_dict(coverage_ind, orient='index', columns=n_ind)
df_ind

Unnamed: 0,100,200,300,400,500,600,700,800,900,1000
0,1.0,0.8,1.0,1.0,1.0,1.0,0.9,1.0,1.0,1.0
1,1.0,0.9,1.0,0.8,0.9,0.9,1.0,0.9,1.0,1.0
2,0.8,1.0,0.8,0.9,0.8,1.0,0.7,0.8,1.0,0.9
3,0.9,1.0,1.0,0.9,1.0,0.9,0.9,1.0,1.0,0.9
4,0.9,1.0,0.9,1.0,1.0,1.0,0.9,0.9,0.9,1.0
5,1.0,0.8,1.0,1.0,1.0,0.9,0.8,1.0,0.9,1.0
6,1.0,1.0,0.9,0.9,1.0,0.9,0.8,1.0,1.0,0.8
7,1.0,0.8,1.0,0.8,1.0,1.0,1.0,1.0,1.0,1.0
8,0.7,1.0,0.9,0.8,1.0,0.9,1.0,0.8,0.8,1.0
9,1.0,1.0,1.0,0.9,1.0,0.9,1.0,0.9,0.9,0.8


In [29]:
df_ind.mean()

1000     0.93
2000     0.93
3000     0.95
4000     0.90
5000     0.97
6000     0.94
7000     0.90
8000     0.93
9000     0.95
10000    0.94
dtype: float64

In [32]:
df_sites = pd.DataFrame.from_dict(coverage_ind, orient='index', columns=n_sites)
df_sites

Unnamed: 0,1000,2000,3000,4000,5000,6000,7000,8000,9000,10000
0,1.0,0.8,1.0,1.0,1.0,1.0,0.9,1.0,1.0,1.0
1,1.0,0.9,1.0,0.8,0.9,0.9,1.0,0.9,1.0,1.0
2,0.8,1.0,0.8,0.9,0.8,1.0,0.7,0.8,1.0,0.9
3,0.9,1.0,1.0,0.9,1.0,0.9,0.9,1.0,1.0,0.9
4,0.9,1.0,0.9,1.0,1.0,1.0,0.9,0.9,0.9,1.0
5,1.0,0.8,1.0,1.0,1.0,0.9,0.8,1.0,0.9,1.0
6,1.0,1.0,0.9,0.9,1.0,0.9,0.8,1.0,1.0,0.8
7,1.0,0.8,1.0,0.8,1.0,1.0,1.0,1.0,1.0,1.0
8,0.7,1.0,0.9,0.8,1.0,0.9,1.0,0.8,0.8,1.0
9,1.0,1.0,1.0,0.9,1.0,0.9,1.0,0.9,0.9,0.8


In [33]:
df_sites.mean()

1000     0.93
2000     0.93
3000     0.95
4000     0.90
5000     0.97
6000     0.94
7000     0.90
8000     0.93
9000     0.95
10000    0.94
dtype: float64