In [2]:
import numpy as np
import msprime
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.utils import resample
from statistics import mean, stdev
from math import sqrt
import seaborn as sns

In [23]:
import allel

In [24]:
def sim():
    demography = msprime.Demography()
    demography.add_population(name="A", initial_size=1000) 
    demography.add_population(name="B", initial_size=1000)
    demography.add_population(name="C", initial_size=1000)
    demography.add_population_split(time=50, derived=["A", "B"], ancestral="C")
    
    ts = msprime.sim_ancestry(
            samples={'A':1000, 'B':1000}, # diploid samples
            demography=demography,
            ploidy=2,
            sequence_length=1e8,
            discrete_genome=False,
            recombination_rate=1e-8, 
            model='dtwf',
        )

    ts = msprime.sim_mutations(
            ts, 
            rate=1e-8, 
            discrete_genome=False,
            start_time=50,
        )
    return ts

In [48]:
def get_fst_with_duplicate_samples(ts, pop1_samples, pop2_samples):
    """return fst 
    ts = tree seqeunce
    pop1_samples - the samples from the first popualtion to be used
    pop2_samples - the samples from the second popualtion to be used
    
    both (pop1_samples and pop2_samples) may have duplicates.
    uses all available sites from the ts.    
    """    
    
    ga = allel.GenotypeArray(
        ts.genotype_matrix().reshape(
            ts.num_sites, ts.num_samples, 1), 
        dtype='i1')
    ac1 = ga[:, pop1_samples].count_alleles()
    ac2 = ga[:, pop2_samples].count_alleles()
    num, denom = allel.hudson_fst(ac1, ac2)
    fst = np.sum(num)/np.sum(denom)
    return(fst)

In [73]:
# requires scikit-allele
# install with conda install -c conda-forge scikit-allel

def get_fst_general(ts, pop1_samples, pop2_samples, sites_index):
    """return fst 
    ts = tree seqeunce
    pop1_samples = the samples from the first popualtion to be used
    pop2_samples = the samples from the second population to be used
    sites_index = the indexs of the sites to be used. 
    
    all of  (pop1_samples, pop2_samples, and sites_index) may have duplicates.
    """
    
    ga = allel.GenotypeArray(
        ts.genotype_matrix().reshape(
            ts.num_sites, ts.num_samples, 1), 
        dtype='i1')
    ac1 = ga[sites_index][:, pop1_samples, :].count_alleles()
    ac2 = ga[sites_index][:, pop2_samples].count_alleles()
    num, denom = allel.hudson_fst(ac1, ac2)
    fst = np.sum(num)/np.sum(denom)
    return(fst)


In [74]:
get_fst_general(ts, pop1_indices, pop2_indices, sites_index=np.arange(ts.num_sites))

0.025465254515040107

In [78]:
get_fst_general(ts, [0,0,2,3,4,2], pop2_indices, sites_index=[0,2,2,2])

0.017624196713741484

In [None]:
def get_hudson_fst(ac1, ac2):
    num, denom = allel.hudson_fst(ac1, ac2)
    return(np.sum(num)/np.sum(denom))

def get_fst2(ts):
    popA = ts.samples(population = 0)
    popB = ts.samples(population = 1)
    
    ga = allel.GenotypeArray(ts.genotype_matrix(), dtype='i1')

    pop1_indices = ts.samples(population=1)
    pop2_indices = ts.samples(population=2)
    subpops = [pop1_indices, pop2_indices]
    ac1 = ga[:, pop1_indices].count_alleles()
    ac2 = ga[:, pop2_indices].count_alleles()
    
   
    # Hudson Fst
    a, b = get_hudson_fst(ac1, ac2)
    hudson_fst_weighted = np.sum(a)/np.sum(b)
    hudson_fst_unweighted = np.nanmean(a/b)

In [3]:
def get_fst(ts, samples1, samples_2):
    dA = ts.diversity(samples1, windows='sites', span_normalise=False)
    dB = ts.diversity(samples_2, windows='sites', span_normalise=False)
    dAB = ts.divergence([samples1, samples_2], windows='sites', span_normalise=False)
    mean_within = (dA + dB) / 2
    between = dAB 
    Fst = 1 - mean_within.sum() / between.sum()
    return Fst

In [4]:
def get_fst_sites(dA, dB, dAB):
    mean_within = (dA + dB) / 2
    between = dAB 
    Fst = 1 - mean_within.sum() / between.sum()
    return Fst

In [None]:
def bootstrap_fst_inds(ts, nsamples, ntimes):
    popA = ts.samples(population = 0)
    popB = ts.samples(population = 1)
    for i in range(ntimes):

In [17]:
ts = sim()
ts.allele_frequency_spectrum(
    sample_sets = [[0,1,1,2,4,5], [18,19,21,40]],
    mode = 'site',
    span_normalise=False, polarised=True)

LibraryError: Duplicate sample value

In [18]:
gt = ts.genotype_matrix()

In [21]:
import allel

ModuleNotFoundError: No module named 'allel'

In [20]:
gt.shape

(22420, 4000)

In [12]:
def simulation(sim_times, 
               resample_times, 
               resample_size, 
               sample_size, 
               method=0):
    
    '''
    @replicate_times: times of replication of simulation
    @resample_times: times of resampling over each simulation  
    @resample_size: size of each resampling 
    @sample_size: a list of sample size
    @method: 0: resample with individuals, 1: resample with sites
    
    Return a dictionary that records coverage of all sample_size for each simulation
    and a list contains the fst value for each simulation.
    For example, coverage_rate[0] contains coverage value of all input sample_size of the first simulation
    '''

    fst = [] # fst values for each simulation
    coverage_rate = {}
    
    # we will find the coverage for each simulation
    for i in range(sim_times):
        ts = sim()
        popA = ts.samples(population = 0)
        popB = ts.samples(population = 1)
        # fst values for the present realization
        this_fst = get_fst(ts, popA, popB)
        fst.append(this_fst)
       
        print(f'Simulation {i}: {this_fst}')
        coverage_rate[i] = []
        
        for size in sample_size:
            within = []
            for j in range(resample_times):
                # resample over individuals 
                if method == 0:
                    sample_fst = resample_ind(ts, popA, popB, resample_size, n=size)
                else: # over sites 
                    sample_fst = resample_sites(ts, popA, popB, resample_size, n=size)
                
                # find the 95% CI for the resample fst value 
                est_mean = mean(sample_fst)
                upper = est_mean + 1.96 * (stdev(sample_fst) / sqrt(resample_size))
                lower = est_mean - 1.96 * (stdev(sample_fst) / sqrt(resample_size))

                if this_fst < upper and this_fst > lower:
                    within.append(1)
                else:
                    within.append(0)
                
            coverage_rate[i].append(mean(within))
            
        print(coverage_rate[i])
        
        
    return coverage_rate, fst

def resample_ind(ts, popA, popB, resample_size, n):
    '''
    Resample individuals of the given size
    ''' 
    
    resample_fst = []
    for i in range(resample_size):
        popA_random = np.random.choice(popA, size=n, replace=True)
        popB_random = np.random.choice(popB, size=n, replace=True)
        resample_fst.append(get_fst(ts, popA_random, popB_random))
    
    return resample_fst


def resample_sites(ts, popA, popB, resample_size, n):
    '''
    Resample sites of the given size
    ''' 
    dA = ts.diversity(popA, windows='sites', span_normalise=False)
    dB = ts.diversity(popB, windows='sites', span_normalise=False)
    dAB = ts.divergence([popA, popB], windows='sites', span_normalise=False)
    
    resample_fst = []
    
    index = [i for i in range(len(dA))]
    for i in range(resample_size):
        random_index = np.random.choice(index, size=n, replace=True)
        dA_sites = dA[random_index]
        dB_sites = dB[random_index]
        dAB_sites = dAB[random_index]
        resample_fst.append(get_fst_sites(dA_sites, dB_sites, dAB_sites))
    
    return resample_fst

In [13]:
#n_ind = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
#n_sites = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
n_ind = [100, 200]

coverage_ind, fst_ind = simulation(
    sim_times=10, 
    resample_times=10, 
    resample_size=20, sample_size=n_ind)


Simulation 0: 0.02426645673750283


LibraryError: Duplicate sample value

In [11]:
coverage_ind

{0: [1, 1],
 1: [1, 0.9],
 2: [1, 1],
 3: [1, 0.9],
 4: [1, 1],
 5: [0.9, 0.9],
 6: [0.8, 1],
 7: [0.9, 0.8],
 8: [1, 0.9],
 9: [0.8, 0.9]}

In [18]:
n_sites = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]

coverage_sites, fst_sites = simulation(replicate_times=10, resample_times=10, resample_size=20, sample_size=n_sites, method=1)


Simulaiton 0: 0.023774599212821212
[0.9, 0.9, 1, 0.9, 0.9, 1, 1, 0.8, 0.9, 1]
Simulaiton 1: 0.024372596484649844
[1, 1, 0.9, 1, 1, 0.8, 1, 0.9, 1, 0.9]
Simulaiton 2: 0.02363257683131692
[1, 1, 0.9, 0.9, 1, 1, 0.9, 1, 0.9, 0.7]
Simulaiton 3: 0.024679965076707666
[0.9, 0.9, 0.9, 0.9, 1, 0.9, 1, 1, 0.8, 1]
Simulaiton 4: 0.02393117949089496
[0.9, 1, 0.9, 1, 0.8, 0.8, 1, 1, 0.9, 1]
Simulaiton 5: 0.022885717137219097
[0.9, 1, 0.8, 0.8, 0.9, 1, 1, 0.9, 1, 0.9]
Simulaiton 6: 0.021095052605427655
[0.9, 1, 1, 0.9, 0.9, 1, 0.8, 1, 0.9, 0.9]
Simulaiton 7: 0.023479512887795928
[1, 1, 1, 1, 1, 0.9, 1, 0.9, 0.8, 1]
Simulaiton 8: 0.021391165822375213
[1, 0.9, 1, 1, 0.9, 0.8, 0.9, 1, 1, 0.9]
Simulaiton 9: 0.02326111604684411
[1, 0.8, 1, 1, 0.9, 0.9, 0.9, 1, 1, 1]


In [16]:
coverage_sites

{0: [1, 1, 0.9, 1, 1, 0.8, 1, 1, 0.9, 0.9],
 1: [0.9, 1, 1, 1, 0.9, 1, 1, 0.9, 1, 1],
 2: [0.9, 0.9, 0.9, 1, 0.9, 1, 1, 1, 1, 0.9],
 3: [0.9, 0.8, 0.7, 0.9, 1, 1, 1, 1, 0.8, 0.8],
 4: [1, 0.9, 1, 0.8, 0.9, 1, 1, 1, 1, 0.9],
 5: [0.9, 0.8, 0.9, 0.9, 1, 0.9, 1, 0.9, 0.9, 0.9],
 6: [1, 1, 1, 1, 0.8, 1, 0.8, 1, 0.9, 1],
 7: [1, 0.8, 0.9, 1, 1, 0.8, 0.9, 1, 0.9, 1],
 8: [1, 1, 0.9, 0.9, 1, 1, 1, 1, 0.9, 0.9],
 9: [1, 1, 1, 0.9, 0.8, 1, 1, 0.8, 0.9, 1]}

## Each row is a simulation and column is the coverage rate for that sample size. There are in total 10 different simulations. 

In [31]:
df_ind = pd.DataFrame.from_dict(coverage_ind, orient='index', columns=n_ind)
df_ind

Unnamed: 0,100,200,300,400,500,600,700,800,900,1000
0,1.0,0.8,1.0,1.0,1.0,1.0,0.9,1.0,1.0,1.0
1,1.0,0.9,1.0,0.8,0.9,0.9,1.0,0.9,1.0,1.0
2,0.8,1.0,0.8,0.9,0.8,1.0,0.7,0.8,1.0,0.9
3,0.9,1.0,1.0,0.9,1.0,0.9,0.9,1.0,1.0,0.9
4,0.9,1.0,0.9,1.0,1.0,1.0,0.9,0.9,0.9,1.0
5,1.0,0.8,1.0,1.0,1.0,0.9,0.8,1.0,0.9,1.0
6,1.0,1.0,0.9,0.9,1.0,0.9,0.8,1.0,1.0,0.8
7,1.0,0.8,1.0,0.8,1.0,1.0,1.0,1.0,1.0,1.0
8,0.7,1.0,0.9,0.8,1.0,0.9,1.0,0.8,0.8,1.0
9,1.0,1.0,1.0,0.9,1.0,0.9,1.0,0.9,0.9,0.8


In [29]:
df_ind.mean()

1000     0.93
2000     0.93
3000     0.95
4000     0.90
5000     0.97
6000     0.94
7000     0.90
8000     0.93
9000     0.95
10000    0.94
dtype: float64

In [32]:
df_sites = pd.DataFrame.from_dict(coverage_ind, orient='index', columns=n_sites)
df_sites

Unnamed: 0,1000,2000,3000,4000,5000,6000,7000,8000,9000,10000
0,1.0,0.8,1.0,1.0,1.0,1.0,0.9,1.0,1.0,1.0
1,1.0,0.9,1.0,0.8,0.9,0.9,1.0,0.9,1.0,1.0
2,0.8,1.0,0.8,0.9,0.8,1.0,0.7,0.8,1.0,0.9
3,0.9,1.0,1.0,0.9,1.0,0.9,0.9,1.0,1.0,0.9
4,0.9,1.0,0.9,1.0,1.0,1.0,0.9,0.9,0.9,1.0
5,1.0,0.8,1.0,1.0,1.0,0.9,0.8,1.0,0.9,1.0
6,1.0,1.0,0.9,0.9,1.0,0.9,0.8,1.0,1.0,0.8
7,1.0,0.8,1.0,0.8,1.0,1.0,1.0,1.0,1.0,1.0
8,0.7,1.0,0.9,0.8,1.0,0.9,1.0,0.8,0.8,1.0
9,1.0,1.0,1.0,0.9,1.0,0.9,1.0,0.9,0.9,0.8


In [33]:
df_sites.mean()

1000     0.93
2000     0.93
3000     0.95
4000     0.90
5000     0.97
6000     0.94
7000     0.90
8000     0.93
9000     0.95
10000    0.94
dtype: float64