In [48]:
import numpy as np
import tskit
import fwdpy11
import gzip
import demes
import pandas as pd

## Calculate the SFS from the msprime simulation

I will look at the SFS for the followin pops.

- AFR
- IBS (EUR)
- MXB (NAT)
- MXL (Admixed)

In [139]:
ts_msprime = tskit.load('data/ts-msprime.ts')

In [140]:
ts_msprime

Tree Sequence,Unnamed: 1
Trees,6707
Sequence Length,1000000.0
Time Units,generations
Sample Nodes,500
Total Size,1.6 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,26744,835.8 KiB,
Individuals,250,6.9 KiB,
Migrations,0,8 Bytes,
Mutations,7094,256.3 KiB,
Nodes,5350,146.3 KiB,
Populations,8,731 Bytes,✅
Provenances,2,6.7 KiB,
Sites,7070,172.6 KiB,


In [141]:
# What are the populations?
for x in ts_msprime.populations():
    print(x)

Population(id=0, metadata={'description': 'Equilibrium/root population', 'name': 'ancestral'})
Population(id=1, metadata={'description': 'Anatomically modern humans', 'name': 'AMH'})
Population(id=2, metadata={'description': 'Bottleneck out-of-Africa population', 'name': 'OOA'})
Population(id=3, metadata={'description': 'Yoruba in Ibadan, Nigeria', 'name': 'YRI'})
Population(id=4, metadata={'description': 'Iberian populations in Spain (IBS).', 'name': 'IBS'})
Population(id=5, metadata={'description': 'Han Chinese in Beijing, China', 'name': 'CHB'})
Population(id=6, metadata={'description': 'Native American,  Mexico.', 'name': 'MXB'})
Population(id=7, metadata={'description': 'Admixed population in Mexico', 'name': 'MXL'})


The function `ts.samples` returns an array of the sample node IDs in this tree sequence. 
NOTE: you can pass the population to get the list of nodes in that population.

In [144]:
# For example to get the individuals from MXL
ts_msprime.samples(7)

array([400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412,
       413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425,
       426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438,
       439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451,
       452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
       465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477,
       478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490,
       491, 492, 493, 494, 495, 496, 497, 498, 499], dtype=int32)

In [148]:
# We can get the SFS with
mxl_nodes = ts_msprime.samples(3)
ts_msprime.allele_frequency_spectrum(sample_sets=[mxl_nodes],span_normalise=False)


array([2.7045e+03, 1.1875e+03, 5.6150e+02, 3.0900e+02, 2.8200e+02,
       1.5650e+02, 1.7350e+02, 1.2100e+02, 1.2100e+02, 9.3500e+01,
       7.7000e+01, 8.0500e+01, 6.9000e+01, 5.6000e+01, 8.0000e+01,
       6.7000e+01, 5.2500e+01, 6.4000e+01, 4.2500e+01, 5.4500e+01,
       5.0000e+01, 2.3000e+01, 3.4000e+01, 3.3000e+01, 2.7000e+01,
       3.1000e+01, 2.4000e+01, 2.5000e+01, 2.6000e+01, 2.2500e+01,
       1.5000e+01, 3.3000e+01, 3.2000e+01, 3.5000e+01, 2.1000e+01,
       2.0000e+01, 2.6000e+01, 3.4000e+01, 1.8500e+01, 1.4000e+01,
       2.3000e+01, 1.1000e+01, 3.1000e+01, 2.8000e+01, 1.9000e+01,
       1.7000e+01, 1.3000e+01, 1.5000e+01, 1.5000e+01, 6.0000e+00,
       2.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e

In [55]:
def get_single_sfs(ts, pop_id):
    # this line tells us the ids from 
    # the given populations.
    sample_nodes = ts.samples(pop_id)
    sf = ts.allele_frequency_spectrum(sample_sets=[sample_nodes],span_normalise=False)
    return pd.DataFrame(
        {'F': sf,
         'derived_allel_freq': range(len(sf)),
         'pop_id': pop_id
        }
    )
    return sf

In [60]:
sfs_msprime = pd.concat([get_single_sfs(ts_msprime, i) for i in [3, 4, 6, 7]])

## Calculate the SFS from the fwdpy11 simulation


I will use `fwdpy11` functions to compute the SFS.

The other alternative is to conver the `fwdpy11-pop` to a `tskit` and simplify (remove samples) the tree
and then use the same approach as above to compute the SFS.

In [149]:
# load back the simuation results from fwdpy11
with gzip.open('data/sim-pop.gz', 'rb') as f:
    pop = fwdpy11.DiploidPopulation.load_from_pickle_file(f)

In [110]:
nodes = np.array(pop.tables.nodes, copy=False)
alive_nodes = pop.alive_nodes
deme3_nodes = alive_nodes[np.where(nodes["deme"][alive_nodes] == 3)[0]]

In [119]:
pop.tables.fs([deme3_nodes[:100]])

masked_array(data=[--, 116, 57, 34, 22, 25, 18, 11, 9, 14, 9, 3, 8, 8, 9,
                   4, 7, 9, 8, 9, 2, 6, 2, 5, 3, 2, 1, 3, 4, 2, 4, 2, 3,
                   4, 1, 1, 1, 5, 0, 2, 1, 2, 0, 1, 3, 2, 2, 1, 1, 1, 0,
                   0, 2, 1, 0, 0, 1, 0, 2, 0, 2, 0, 0, 3, 2, 2, 3, 0, 2,
                   1, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 2, 1, 0, 0, 0, 0, 0,
                   1, 1, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, --],
             mask=[ True, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False,

In [133]:
def sfs_single(apop, pop_id, N):
    """
    Args:
        apop: DiploidPopulation
        pop_id: the deme id
        N: number of nodes (diploid number) to include in the computation
    """
    nodes = np.array(apop.tables.nodes, copy=False)
    alive_nodes = apop.alive_nodes
    deme_nodes = alive_nodes[np.where(nodes["deme"][alive_nodes] == pop_id)[0]]
    sf = apop.tables.fs([deme_nodes[:N]])
    sf = sf.data
    return pd.DataFrame(
        {'F': sf,
         'derived_allel_freq': range(len(sf)),
         'pop_id': pop_id
        }
    )

In [134]:
sf = sfs_single(pop, 4, 100)

In [136]:
sfs_fwd = pd.concat([sfs_single(pop, i, 100) for i in [3, 4, 6, 7]])

In [138]:
# combine the SFS and save them to a frame
sfs_fwd['Simulator'] = 'fwdpy11'
sfs_msprime['Simulator'] = 'msprime'
pd.concat([sfs_fwd, sfs_msprime]).to_csv('results/simulated-sfss.csv', index=False)