In [1]:
import numpy as np
import msprime
import matplotlib.pyplot as plt
import seaborn as sns
msprime.__version__

'1.0.1'

In [2]:
# Genome-wide settings used by the simulations below:
# sequence length, recombination rate and mutation rate.
seq_len, rec_rate, mut_rate = 1e8, 1e-8, 1e-8
# Time of the population split, in generations.
split_time = 50

In [3]:
# Three populations of equal size: A and B are the populations that get sampled,
# C is the ancestral population they both derive from.
demography = msprime.Demography()
for pop_name in ("A", "B", "C"):
    demography.add_population(name=pop_name, initial_size=1000)
# A and B split from C `split_time` generations ago.
demography.add_population_split(
    time=split_time,
    derived=["A", "B"],
    ancestral="C",
)

PopulationSplit(time=50, derived=['A', 'B'], ancestral='C')

In [4]:
# debugging the demographic history
# (not needed for now)
#print(demography)
#print(demography.debug())

In [5]:
# Ancestry simulation: 1000 diploid individuals sampled from each of A and B
# (2000 haploid sample nodes per population), using the "dtwf" model.
ts = msprime.sim_ancestry(
    samples={"A": 1000, "B": 1000},  # counts are diploid individuals
    ploidy=2,
    demography=demography,
    model="dtwf",
    sequence_length=seq_len,
    recombination_rate=rec_rate,
    discrete_genome=False,
    random_seed=42,
)

In [6]:
# Overlay mutations on the simulated ancestry.
# NOTE: this rebinds `ts`, so the cell is not idempotent -- re-run the
# sim_ancestry cell before re-running this one.
ts = msprime.sim_mutations(
    ts,
    rate=mut_rate,
    # NOTE(review): presumably restricts mutations to times at least
    # `split_time` generations ago (i.e. before the split) -- confirm
    # against the msprime documentation.
    start_time=split_time,
    discrete_genome=False,
    random_seed=42,
)

### The results are stored in a tree-sequence object

you can read more about the data model here:
https://tskit.dev/tskit/docs/stable/data-model.html#sec-data-model

In [7]:
# Display the summary of the tree sequence (tree/sample counts and table sizes).
ts

Tree Sequence,Unnamed: 1
Trees,46065
Sequence Length,100000000.0
Sample Nodes,4000
Total Size,7.7 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,173380,4.6 MiB,
Individuals,2000,31.3 KiB,
Migrations,0,4 Bytes,
Mutations,21942,621.4 KiB,
Nodes,30867,723.4 KiB,
Populations,3,278 Bytes,✅
Provenances,2,2.4 KiB,
Sites,21942,364.3 KiB,


## the genotype matrix represents the genotype of each (haploid) sample at each of the variable sites

In [8]:
# Genotype matrix: one row per variable site, one column per haploid sample
# (see gt.shape in the next cell).
gt = ts.genotype_matrix()
gt 

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [9]:
# (number of variable sites, number of haploid samples)
gt.shape

(21942, 4000)

### representing the same data as a genotype matrix is *much* larger in memory

In [10]:
# Approximate size of the genotype matrix in megabytes (bytes / 1e6).
# Compare this to the "Total Size" of the tree sequence displayed above.
# Careful: the matrix holds sites x samples entries, so it can get very big.
gt.nbytes/1e6

87.768

## Nearby sites are correlated, and so not independent

In [11]:
# Correlation between the genotypes at the first two sites (rows 0 and 1 of gt).
np.corrcoef(gt[0,:], gt[1,:])

array([[ 1.       , -0.1021119],
       [-0.1021119,  1.       ]])

In [12]:
# Sites that are farther apart are less correlated:
# compare the first site (row 0) with the site at row 10000.
np.corrcoef(gt[0,:], gt[10000,:])

array([[1.        , 0.00511539],
       [0.00511539, 1.        ]])

In [13]:
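# We will use Fst as a measure of how differentiated the two populations are.

# Calculate Fst from the pairwise differences within and between populations.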

In [14]:
# Haploid sample node IDs for populations A (id 0) and B (id 1).
popA_samples, popB_samples = (
    ts.samples(population=pop_id) for pop_id in (0, 1)
)

## count the fraction of pairwise differences at each variable site
#### this uses functions from tskit


within each population

In [15]:
# Pairwise diversity within population A, one value per variable site
# (windows="sites"), not divided by the window span.
dA = ts.diversity(
    popA_samples,
    windows="sites",
    span_normalise=False,
)
print(len(dA))  # number of sites

21942


In [16]:
# Pairwise diversity within population B, one value per variable site
# (windows="sites"), not divided by the window span.
dB = ts.diversity(
    popB_samples,
    windows="sites",
    span_normalise=False,
)
print(len(dB))  # number of sites

21942


between populations

In [17]:
# Pairwise divergence between populations A and B, one value per variable
# site (windows="sites"), not divided by the window span.
dAB = ts.divergence(
    [popA_samples, popB_samples],
    windows="sites",
    span_normalise=False,
)
print(len(dAB))  # number of sites

21942


In [18]:
# Fst as a ratio of sums over sites:
# 1 - (summed mean within-population diversity) / (summed between-population divergence).
between = dAB
mean_within = 0.5 * (dA + dB)
Fst = 1 - mean_within.sum() / between.sum()
Fst

0.023818257651805785

# Fst with just a subset of individuals
Remember that each 'individual' is made up of two haploid 'samples'.

In [19]:
# Subsample the first 50 haploid samples of each population
# (np.random.choice could be used to draw a random subset instead).
popA_first50, popB_first50 = (
    ts.samples(population=pop_id)[:50] for pop_id in (0, 1)
)

# With the two new sample sets specified, repeat the per-site calculation.
dA_first50 = ts.diversity(
    popA_first50, windows="sites", span_normalise=False
)
dB_first50 = ts.diversity(
    popB_first50, windows="sites", span_normalise=False
)
dAB_first50 = ts.divergence(
    [popA_first50, popB_first50], windows="sites", span_normalise=False
)

# Ratio-of-sums Fst on the subsample.
between_first50 = dAB_first50
mean_within_first50 = 0.5 * (dA_first50 + dB_first50)
Fst_first50 = 1 - mean_within_first50.sum() / between_first50.sum()
Fst_first50

0.020776558273834178

# Fst across a subset of sites
One or more sites can be excluded from the Fst calculation by simply not including their diversity in the sums above.

In [34]:
# Exclude the first site by slicing it off each per-site array
# (these are the arrays computed from the 50-haploid subsamples).
dA_drop1site, dB_drop1site, dAB_drop1site = (
    per_site[1:] for per_site in (dA_first50, dB_first50, dAB_first50)
)

In [36]:
# Recompute Fst without the first site; excluding a single site
# has only a small effect on the estimate.
between_drop1site = dAB_drop1site
mean_within_drop1site = 0.5 * (dA_drop1site + dB_drop1site)
Fst_drop1site = 1 - mean_within_drop1site.sum() / between_drop1site.sum()
Fst_drop1site

0.020777382462107696

# exclude genomic regions
this is a new way for me to do this, but I expect it to work well.

I can see how this could be useful for the jackknife, but would take further work to get this to apply to the bootstrap

In [42]:
# Remove all information (and all sites) within the interval from 0 to 5e6.
# Each row of the array is one (left, right) interval to delete.
interval_to_remove = np.array([(0, 5e6)])
ts_interval1_dropped = ts.delete_intervals(interval_to_remove)
ts_interval1_dropped

Tree Sequence,Unnamed: 1
Trees,43842
Sequence Length,100000000.0
Sample Nodes,4000
Total Size,7.3 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,165289,4.4 MiB,
Individuals,2000,31.3 KiB,
Migrations,0,4 Bytes,
Mutations,20979,594.1 KiB,
Nodes,29893,700.6 KiB,
Populations,3,278 Bytes,✅
Provenances,3,2.9 KiB,
Sites,20979,348.3 KiB,


Then proceed as above

In [43]:
# Same calculation as before, but on ts_interval1_dropped instead of ts.
dA_interval1_dropped = ts_interval1_dropped.diversity(
    popA_samples, windows="sites", span_normalise=False
)
dB_interval1_dropped = ts_interval1_dropped.diversity(
    popB_samples, windows="sites", span_normalise=False
)
dAB_interval1_dropped = ts_interval1_dropped.divergence(
    [popA_samples, popB_samples], windows="sites", span_normalise=False
)

# Ratio-of-sums Fst over the sites that remain.
between_interval1_dropped = dAB_interval1_dropped
mean_within_interval1_dropped = 0.5 * (dA_interval1_dropped + dB_interval1_dropped)
Fst_interval1_dropped = (
    1 - mean_within_interval1_dropped.sum() / between_interval1_dropped.sum()
)
Fst_interval1_dropped

0.02399311094553991

## starter project outline
    - run the base simulation above, 
        - use all individuals and all loci - take this as the 'true' Fst.
    - replicate taking observations of 50 individuals from each population and 5000 sites.
        - use resampling methods to put 95% confidence intervals on Fst based on these 50 inds and 5000 sites.
        - resampling over sites:
            - single-site jackknife / bootstrap
            - block jackknife / bootstrap | using blocks of adjacent loci, maybe try 100 loci or a genomic region of length 5e6. 
        - resampling over individuals:
            - single-individual jackknife / bootstrap. 
        - repeat ~500 times and record how many times the 'True' value is within the confidence interval. 


# most diversity is within populations

In [22]:
# Ratio of summed within-population diversity to summed between-population
# divergence; this is 1 - Fst as computed above.
mean_within.sum()/between.sum()

0.9761817423481942

# there are different interpretations of Fst

In [23]:
# tskit has a built-in Fst function, but (confusingly) it calculates a
# different quantity: its value does not match the ratio-of-sums Fst above.
ts.Fst([popA_samples, popB_samples], mode='site')

array(0.01205267)

# We can use population genetics theory to generate the expected value of Fst

Here we are using the Hudson Fst from Bhatia et al 2013
https://genome.cshlp.org/content/23/9/1514.long


In [26]:
# Expected Fst formulas from the Supplemental Info of Bhatia et al 2013.
# In msprime the population size N is given as the diploid size (with ploidy=2),
# and in the equations below N is also the diploid size.

def Xi(N):
    """Return the per-generation factor 1 - 1/(2N).

    Parameters
    ----------
    N : number
        Diploid population size.

    Returns
    -------
    The value 1 - 1/(2N).
    """
    return 1 - 1/(2*N)


def E_Fst(N, t):
    """Return 1 - Xi(N)**t for a population of diploid size N after t generations.

    Equal to ``E_hudson(N, N, t)``, i.e. the case where both populations
    have the same size.

    Parameters
    ----------
    N : number
        Diploid population size.
    t : number
        Number of generations.
    """
    # Reuse Xi so the 1 - 1/(2N) term is defined in exactly one place.
    return 1 - Xi(N)**t


def E_hudson(N1, N2, t):
    """Return the expected Hudson Fst for two populations t generations after a split.

    Computed as 1 - (Xi(N1)**t + Xi(N2)**t) / 2.

    Parameters
    ----------
    N1, N2 : number
        Diploid sizes of the two populations.
    t : number
        Number of generations since the split.
    """
    return 1 - (Xi(N1)**t + Xi(N2)**t)/2

In [27]:
# Expected Hudson Fst for two populations of diploid size 1000,
# 50 generations after the split (the values used in the simulation above).
E_hudson(1000, 1000, 50)

0.0246961856722111

In [28]:
# Compare to the realized value of Fst from above

In [29]:
# (realized Fst from the simulation, theoretical expectation)
Fst, E_hudson(1000, 1000, 50)

(0.023818257651805685, 0.0246961856722111)

# can also get Fst from the allele frequency spectrum

In [46]:
# Joint allele frequency spectrum of populations A and B: afs[i, j] is the
# count of sites with i derived copies in A and j derived copies in B.
afs = ts.allele_frequency_spectrum(
    sample_sets=[popA_samples, popB_samples],
    mode='site',
    span_normalise=False,
    polarised=True,
)

# Hudson's Fst from the joint spectrum.
# n0 and n1 are the numbers of haploid samples in each sample set.
n0 = afs.shape[0] - 1
n1 = afs.shape[1] - 1
if n0 < 2 or n1 < 2:
    # The estimator divides by (n0 - 1) and (n1 - 1).
    raise ValueError("Hudson's Fst needs at least 2 samples in each sample set")

# Allele frequencies for every cell of the spectrum, shaped so that
# broadcasting gives (n0 + 1, n1 + 1) arrays. This replaces an explicit
# Python double loop over every (i, j) pair and yields the same values.
p0 = (np.arange(n0 + 1) / n0)[:, np.newaxis]
p1 = (np.arange(n1 + 1) / n1)[np.newaxis, :]

# Per-cell numerator (N) and denominator (D) of the estimator.
N = (p0 - p1)**2 - (p0*(1 - p0))/(n0 - 1) - (p1*(1 - p1))/(n1 - 1)
D = p0*(1 - p1) + p1*(1 - p0)

# Stored under its own name so that the `Fst` computed from per-site
# diversity earlier in the notebook is not silently overwritten.
Fst_afs = (N*afs).sum()/(D*afs).sum()

Fst_afs

0.023818257651805685