In [1]:
import fwdpy11
import numpy as np
import pandas as pd
import os
from itertools import chain
import gzip
from collections import Counter
import sys
sys.path.append('../')

from simutils.utils import simuldata
from simutils.simulation import simulate_neutral_variation

In [2]:
def load_pop(sim_id):
    """
    Load the population saved for one simulation.

    Args:
        sim_id: identifier interpolated into the file name
            results/simulations/sim-seed-<sim_id>-pop.bin
    Returns:
        whatever fwdpy11.DiploidPopulation.load_from_file returns for that file
    """
    pop_path = f'results/simulations/sim-seed-{sim_id}-pop.bin'
    population = fwdpy11.DiploidPopulation.load_from_file(pop_path)
    return population

def id_exits(sim_id):
    """
    Report whether the population file of a simulation is present on disk.

    Args:
        sim_id: identifier interpolated into the file name
            results/simulations/sim-seed-<sim_id>-pop.bin
    Returns:
        bool, True when that path exists
    """
    return os.path.exists(f'results/simulations/sim-seed-{sim_id}-pop.bin')


def simplify_tree(ts, n=50, seed=None):
    """
    Choose n random individuals from the ts
    and simplify the tree sequence to only the nodes of those individuals.

    Args:
        ts: tree sequence
        n: int, number of individuals to draw (without replacement)
        seed: optional seed for the draw. When None (the default) the global
            NumPy random state is used, exactly as before; pass an int to make
            the sampled individuals reproducible.
    Returns:
        the result of ts.simplify() on the nodes of the sampled individuals
    Raises:
        ValueError: raised by NumPy when n is larger than the number of
            individuals in ts.
    """
    rng = np.random if seed is None else np.random.default_rng(seed)

    # Sample individual IDs rather than the Individual objects themselves, so
    # NumPy never has to coerce tskit objects into an array.
    chosen_ids = rng.choice(ts.num_individuals, n, replace=False)

    nodes_ids = []
    for ind_id in chosen_ids:
        nodes_ids.extend(ts.individual(int(ind_id)).nodes)

    return ts.simplify(nodes_ids)

In [3]:
# Let's create a dict to keep the data, grouped by mutation category


def get_sfs_for_variant_categories(ts_simplified):
    """
    Compute the allele frequency spectrum separately for each mutation
    category and return the spectra of the missense and LOF categories.

    Each mutation's category is read from its metadata field 'label'
    (0: neutral, 1: missense, 2: synonymous, 3: LOF).

    Args:
        ts_simplified: tree sequence whose mutations carry a 'label' entry
            in their metadata
    Returns:
        long-format pandas DataFrame with columns
        'DerivedFreq' (position in the spectrum), 'MutType' and 'Frequency'
        (the spectrum value), restricted to MutType 'LOF' and 'missense'.
    Raises:
        KeyError: if a mutation has no 'label' or a label outside 0..3.
    """
    # label code stored in the mutation metadata -> category name
    labs = {
        0: 'neutral',
        1: 'missense',
        2: 'synonymous',
        3: 'LOF'
    }

    # For every category, the IDs of the sites that carry one of its mutations.
    # delete_sites() works on *site* IDs, so m.site is recorded, not m.id
    # (the two only coincide when every site holds exactly one mutation).
    sites_by_type = {name: set() for name in labs.values()}
    for m in ts_simplified.mutations():
        sites_by_type[labs[m.metadata['label']]].add(m.site)

    def get_sites(mut_type):
        """
        This function returns a tree sequence without the sites that
        carry a mutation of any category other than mut_type.
        Note that a site shared with another category is removed as well.
        Args:
            mut_type: str, neutral, LOF, etc.
        """
        ids_ex = set()
        for name, site_ids in sites_by_type.items():
            if name != mut_type:
                ids_ex |= site_ids
        return ts_simplified.delete_sites(sorted(ids_ex))

    ts_by_mutation = {x: get_sites(x) for x in sites_by_type}
    sfs_by_mutation = {x: t.allele_frequency_spectrum(polarised=True, span_normalise=False)
                       for (x, t) in ts_by_mutation.items()}

    sfs = pd.DataFrame(sfs_by_mutation)
    sfs['DerivedFreq'] = list(range(sfs.shape[0]))

    sfs = sfs.melt(id_vars=['DerivedFreq'], var_name='MutType', value_name='Frequency')

    # keep only the selected categories: the neutral and synonymous spectra
    # are dropped here. Return a copy so callers can add columns safely.
    return sfs[sfs['MutType'].isin(['LOF', 'missense'])].copy()

In [4]:
def get_neutral_sf(ts, simdat):
    """
    Build the allele frequency spectra of the neutral variation that
    simulate_neutral_variation() produces for a tree sequence.

    Args:
        ts: tree sequence handed to simulate_neutral_variation
        simdat: simulation data object handed to simulate_neutral_variation
    Returns:
        long-format pandas DataFrame with columns 'DerivedFreq' (position in
        the spectrum), 'MutType' ('synonymous' or 'noncoding') and 'Frequency'
        (the spectrum value).
    """
    ts_noncoding, ts_synonymous = simulate_neutral_variation(ts, simdat)

    def _spectrum(tree_seq):
        # same spectrum settings for both kinds of neutral variation
        return tree_seq.allele_frequency_spectrum(polarised=True, span_normalise=False)

    # one column per neutral category, synonymous first
    spectra = pd.DataFrame({
        'synonymous': _spectrum(ts_synonymous),
        'noncoding': _spectrum(ts_noncoding),
    })
    spectra['DerivedFreq'] = list(range(len(spectra)))

    return spectra.melt(id_vars=['DerivedFreq'], var_name='MutType', value_name='Frequency')

In [5]:
# Simulation data object shared by every get_sfs() call below, which passes it
# on to get_neutral_sf().
# NOTE(review): the meaning of the three arguments ('test-data/', 23,
# 'test-data/') is defined by simutils.utils.simuldata and is not visible
# here — confirm what the two paths and the number 23 select.
simdat = simuldata('test-data/', 23, 'test-data/')

def get_sfs(sim_id, n=50):
    """
    Build the allele frequency spectra of one simulation replicate.

    Loads the saved population, converts it to a tree sequence, keeps a
    random sample of individuals, and stacks the spectra of the selected
    categories with those of the simulated neutral variation.
    Reads the module-level `simdat` object.

    Args:
        sim_id: identifier of the simulation; also written to the 'sim_id'
            column of the result
        n: int, number of individuals to sample (default 50)
    Returns:
        pandas DataFrame: the rows from get_sfs_for_variant_categories()
        followed by the rows from get_neutral_sf(), each with an added
        'sim_id' column. Row labels of the two parts are kept as they are.
    """
    pop = load_pop(sim_id)
    ts = pop.dump_tables_to_tskit()
    ts_simplified = simplify_tree(ts, n)

    # .assign builds new frames instead of writing a column into what may be
    # a filtered slice of another frame (avoids chained-assignment warnings).
    sf_selected = get_sfs_for_variant_categories(ts_simplified).assign(sim_id=sim_id)

    # get the SFS for neutral variation
    sf_neutral = get_neutral_sf(ts_simplified, simdat).assign(sim_id=sim_id)

    return pd.concat([sf_selected, sf_neutral])

In [6]:
# Stack the SFS tables of simulations 1..100, skipping every seed whose
# population file is missing.
sf = pd.concat([get_sfs(sim_id) for sim_id in range(1, 101) if id_exits(sim_id)])
sf

Unnamed: 0,DerivedFreq,MutType,Frequency,sim_id
101,0,missense,0.0,1
102,1,missense,0.0,1
103,2,missense,3.0,1
104,3,missense,0.0,1
105,4,missense,0.0,1
...,...,...,...,...
197,96,noncoding,2.0,100
198,97,noncoding,0.0,100
199,98,noncoding,1.0,100
200,99,noncoding,2.0,100


In [7]:
# Pool the replicates: add up 'Frequency' over all simulations for each
# (DerivedFreq, MutType) pair, then write the pooled spectra to disk.
sf = (
    sf.groupby(['DerivedFreq', 'MutType'])['Frequency']
    .sum()
    .reset_index()
)
sf.to_csv('results/SFSs_simulation.csv', index=False)