In [1]:
import numpy as np
import tskit
import fwdpy11
import gzip
import demes
import pandas as pd

## Calculate the SFS from the msprime simulation

I will look at the SFS for the followin pops.

- AFR
- IBS (EUR)
- MXB (NAT)
- MXL (Admixed)

In [2]:
ts_msprime = tskit.load('data/ts-msprime.ts')

In [3]:
ts_msprime

Tree Sequence,Unnamed: 1
Trees,65892
Sequence Length,10000000.0
Time Units,generations
Sample Nodes,500
Total Size,14.9 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,252634,7.7 MiB,
Individuals,250,6.9 KiB,
Migrations,0,8 Bytes,
Mutations,70476,2.5 MiB,
Nodes,39682,1.1 MiB,
Populations,8,731 Bytes,✅
Provenances,2,6.7 KiB,
Sites,70236,1.7 MiB,


In [4]:
# What are the populations?
for x in ts_msprime.populations():
    print(x)

Population(id=0, metadata={'description': 'Equilibrium/root population', 'name': 'ancestral'})
Population(id=1, metadata={'description': 'Anatomically modern humans', 'name': 'AMH'})
Population(id=2, metadata={'description': 'Bottleneck out-of-Africa population', 'name': 'OOA'})
Population(id=3, metadata={'description': 'Yoruba in Ibadan, Nigeria', 'name': 'YRI'})
Population(id=4, metadata={'description': 'Iberian populations in Spain (IBS).', 'name': 'IBS'})
Population(id=5, metadata={'description': 'Han Chinese in Beijing, China', 'name': 'CHB'})
Population(id=6, metadata={'description': 'Native American,  Mexico.', 'name': 'MXB'})
Population(id=7, metadata={'description': 'Admixed population in Mexico', 'name': 'MXL'})


The function `ts.samples` returns an array of the sample node IDs in this tree sequence. 
NOTE: you can pass the population to get the list of nodes in that population.

In [5]:
# For example to get the individuals from MXL
ts_msprime.samples(7)

array([400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412,
       413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425,
       426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438,
       439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451,
       452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
       465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477,
       478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490,
       491, 492, 493, 494, 495, 496, 497, 498, 499], dtype=int32)

In [6]:
# We can get the SFS with
mxl_nodes = ts_msprime.samples(3)
ts_msprime.allele_frequency_spectrum(sample_sets=[mxl_nodes],span_normalise=False)


array([26905. , 10573.5,  5084.5,  3281. ,  2488. ,  1937.5,  1525. ,
        1258. ,  1189.5,   995.5,   976. ,   815.5,   723.5,   665.5,
         610.5,   584.5,   547.5,   570.5,   463. ,   447. ,   397. ,
         373. ,   383.5,   410. ,   369.5,   339. ,   303. ,   337. ,
         275.5,   252.5,   311. ,   264.5,   252. ,   278.5,   284. ,
         263.5,   281. ,   278. ,   277.5,   261. ,   249. ,   242. ,
         245. ,   205.5,   273.5,   226. ,   215.5,   244.5,   259.5,
         216.5,   104.5,     0. ,     0. ,     0. ,     0. ,     0. ,
           0. ,     0. ,     0. ,     0. ,     0. ,     0. ,     0. ,
           0. ,     0. ,     0. ,     0. ,     0. ,     0. ,     0. ,
           0. ,     0. ,     0. ,     0. ,     0. ,     0. ,     0. ,
           0. ,     0. ,     0. ,     0. ,     0. ,     0. ,     0. ,
           0. ,     0. ,     0. ,     0. ,     0. ,     0. ,     0. ,
           0. ,     0. ,     0. ,     0. ,     0. ,     0. ,     0. ,
           0. ,     

In [7]:
def get_single_sfs(ts, pop_id):
    # this line tells us the ids from 
    # the given populations.
    sample_nodes = ts.samples(pop_id)
    sf = ts.allele_frequency_spectrum(sample_sets=[sample_nodes],span_normalise=False)
    return pd.DataFrame(
        {'F': sf,
         'derived_allel_freq': range(len(sf)),
         'pop_id': pop_id
        }
    )
    return sf

In [8]:
sfs_msprime = pd.concat([get_single_sfs(ts_msprime, i) for i in [3, 4, 6, 7]])

## Calculate the SFS from the fwdpy11 simulation


I will use `fwdpy11` functions to compute the SFS.

The other alternative is to conver the `fwdpy11-pop` to a `tskit` and simplify (remove samples) the tree
and then use the same approach as above to compute the SFS.

In [9]:
# load back the simuation results from fwdpy11
with gzip.open('data/sim-pop.gz', 'rb') as f:
    pop = fwdpy11.DiploidPopulation.load_from_pickle_file(f)

In [10]:
nodes = np.array(pop.tables.nodes, copy=False)
alive_nodes = pop.alive_nodes
deme3_nodes = alive_nodes[np.where(nodes["deme"][alive_nodes] == 3)[0]]

In [11]:
pop.tables.fs([deme3_nodes[:100]])

masked_array(data=[--, 10939, 5392, 3418, 2555, 1823, 1499, 1340, 1145,
                   969, 783, 685, 670, 633, 581, 535, 508, 427, 424, 386,
                   362, 331, 305, 304, 283, 259, 271, 232, 232, 247, 197,
                   192, 200, 196, 216, 193, 181, 143, 174, 173, 175, 179,
                   165, 155, 119, 116, 150, 131, 124, 131, 145, 107, 106,
                   113, 100, 103, 110, 120, 74, 97, 98, 95, 76, 82, 99,
                   92, 79, 74, 85, 79, 73, 55, 70, 60, 75, 68, 63, 70, 68,
                   55, 53, 69, 69, 67, 61, 52, 63, 73, 74, 52, 56, 35, 55,
                   55, 55, 43, 58, 58, 44, 48, --],
             mask=[ True, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, Fals

In [12]:
def sfs_single(apop, pop_id, N):
    """
    Args:
        apop: DiploidPopulation
        pop_id: the deme id
        N: number of nodes (diploid number) to include in the computation
    """
    nodes = np.array(apop.tables.nodes, copy=False)
    alive_nodes = apop.alive_nodes
    deme_nodes = alive_nodes[np.where(nodes["deme"][alive_nodes] == pop_id)[0]]
    sf = apop.tables.fs([deme_nodes[:N]])
    sf = sf.data
    return pd.DataFrame(
        {'F': sf,
         'derived_allel_freq': range(len(sf)),
         'pop_id': pop_id
        }
    )

In [13]:
sf = sfs_single(pop, 4, 100)

In [14]:
sfs_fwd = pd.concat([sfs_single(pop, i, 100) for i in [3, 4, 6, 7]])

In [15]:
# combine the SFS and save them to a frame
sfs_fwd['Simulator'] = 'fwdpy11'
sfs_msprime['Simulator'] = 'msprime'
pd.concat([sfs_fwd, sfs_msprime]).to_csv('results/simulated-sfss.csv', index=False)