# Add neutral mutations with a rate map


[Check these notes from Aaron](https://github.com/santiago1234/TheSciJournal/blob/main/journal-2022/Aaron-Notes/220721-DFE-expected-SFS-scaling.pdf)

In [1]:
import tskit
import fwdpy11
import msprime
import sys
import pandas as pd
import pybedtools

sys.path.append('../')
from simutils.utils import simuldata
from simutils.simulation import neutral_rate_maps

In [2]:
# Load the genomic region to simulate
simdat = simuldata('test-data/', sample_id=23, path_to_genetic_maps='test-data/')

In [3]:
# Load the fwdpy11 simulation output (a DiploidPopulation saved to file)
pop = fwdpy11.DiploidPopulation.load_from_file('results/simulations/sim-seed-1-pop.bin')
# Export the population's tables as a tskit tree sequence; this is the object
# whose mutations are cleared and re-simulated further down
ts = pop.dump_tables_to_tskit()

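## Setting the mutation rate map

For each mutation rate map, say synonymous, we set the mutation rate of coding intervals to $u$ (the synonymous mutation rate)
and for the complement region (intergenic and intronic) we set $u$ to zero. The noncoding map is the mirror image: its rate applies in the complement region and is zero in coding intervals.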


See `neutral_rate_maps` in `simutils.simulation`, where I generate the msprime rate maps from the mutation rates of the BED intervals.

In [4]:
# Build the rate maps for the neutral categories; the result is indexed
# below by 'noncoding' and 'synonymous'
nr_maps = neutral_rate_maps(simdat)

In [5]:
print(nr_maps['noncoding'])


┌─────────────────────────────────────────────┐
│left    │right    │       mid│   span│   rate│
├─────────────────────────────────────────────┤
│0       │42494    │     21247│  42494│  1e-08│
│42494   │42569    │   42531.5│     75│      0│
│42569   │43298    │   42933.5│    729│  1e-08│
│43298   │43430    │     43364│    132│      0│
│43430   │44779    │   44104.5│   1349│  1e-08│
│44779   │44890    │   44834.5│    111│      0│
│44890   │46715    │   45802.5│   1825│  1e-08│
│46715   │46883    │     46799│    168│      0│
│46883   │48388    │   47635.5│   1505│  1e-08│
│48388   │48491    │   48439.5│    103│      0│
│⋯       │⋯        │         ⋯│      ⋯│      ⋯│
│883154  │883359   │  883256.5│    205│      0│
│883359  │978441   │    930900│  95082│  1e-08│
│978441  │978531   │    978486│     90│      0│
│978531  │978935   │    978733│    404│  1e-08│
│978935  │979052   │  978993.5│    117│      0│
│979052  │991503   │  985277.5│  12451│  1e-08│
│991503  │991670   │  991586.5│    167│

In [6]:
print(nr_maps['synonymous'])


┌───────────────────────────────────────────────┐
│left    │right    │       mid│   span│     rate│
├───────────────────────────────────────────────┤
│0       │42494    │     21247│  42494│        0│
│42494   │42569    │   42531.5│     75│  2.7e-09│
│42569   │43298    │   42933.5│    729│        0│
│43298   │43430    │     43364│    132│  2.7e-09│
│43430   │44779    │   44104.5│   1349│        0│
│44779   │44890    │   44834.5│    111│  2.7e-09│
│44890   │46715    │   45802.5│   1825│        0│
│46715   │46883    │     46799│    168│  2.7e-09│
│46883   │48388    │   47635.5│   1505│        0│
│48388   │48491    │   48439.5│    103│  2.7e-09│
│⋯       │⋯        │         ⋯│      ⋯│        ⋯│
│978441  │978531   │    978486│     90│  2.7e-09│
│978531  │978935   │    978733│    404│        0│
│978935  │979052   │  978993.5│    117│  2.7e-09│
│979052  │988479   │  983765.5│   9427│        0│
│988479  │988562   │  988520.5│     83│  2.7e-09│
│988562  │991503   │  990032.5│   2941│        0│

## Clear mutations

In [7]:
# an alternative approach to clear mutations
#ts # the tree sequence with mutations
#tables = ts.dump_tables() # get a copy of the data for editing
#tables.mutations.clear() # remove mutations
#tables.sites.clear() # remove sites
#ts_no_mutations = tables.tree_sequence() # new tree without mutations

In [8]:
# Remove mutations from the tree sequence by deleting every site
# (deleting a site also drops the mutations attached to it).
# The site ids come from ts.sites() rather than ts.variants(): the latter
# decodes the genotypes of all samples at each site, which is wasted work
# when only the id is needed.
all_ids = [site.id for site in ts.sites()]
ts_clear = ts.delete_sites(all_ids)
ts_clear

Tree Sequence,Unnamed: 1
Trees,1414
Sequence Length,1000000.0
Time Units,unknown
Sample Nodes,10000
Total Size,2.0 MiB
Metadata,dict  generation: 50000

Table,Rows,Size,Has Metadata
Edges,23428,732.1 KiB,
Individuals,5000,596.4 KiB,✅
Migrations,0,8 Bytes,
Mutations,0,447 Bytes,
Nodes,19412,530.8 KiB,
Populations,1,133 Bytes,✅
Provenances,2,1.1 KiB,
Sites,0,16 Bytes,


## Add the neutral mutations

In [9]:
# Add mutations to the mutation-free tree sequence, with per-interval rates
# taken from the 'synonymous' rate map
ts_syn = msprime.sim_mutations(tree_sequence=ts_clear, rate=nr_maps['synonymous'])
ts_syn

Tree Sequence,Unnamed: 1
Trees,1414
Sequence Length,1000000.0
Time Units,unknown
Sample Nodes,10000
Total Size,2.0 MiB
Metadata,dict  generation: 50000

Table,Rows,Size,Has Metadata
Edges,23428,732.1 KiB,
Individuals,5000,596.4 KiB,✅
Migrations,0,8 Bytes,
Mutations,19,1.1 KiB,
Nodes,19412,530.8 KiB,
Populations,1,133 Bytes,✅
Provenances,3,11.2 KiB,
Sites,19,491 Bytes,


In [10]:
# Add mutations to the mutation-free tree sequence, with per-interval rates
# taken from the 'noncoding' rate map
ts_noncoding = msprime.sim_mutations(tree_sequence=ts_clear, rate=nr_maps['noncoding'])
ts_noncoding

Tree Sequence,Unnamed: 1
Trees,1414
Sequence Length,1000000.0
Time Units,unknown
Sample Nodes,10000
Total Size,2.1 MiB
Metadata,dict  generation: 50000

Table,Rows,Size,Has Metadata
Edges,23428,732.1 KiB,
Individuals,5000,596.4 KiB,✅
Migrations,0,8 Bytes,
Mutations,1803,65.6 KiB,
Nodes,19412,530.8 KiB,
Populations,1,133 Bytes,✅
Provenances,3,8.8 KiB,
Sites,1801,44.0 KiB,


In [11]:
def simulate_neutral_variation(ts, simdat):
    """
    Simulate neutral genetic variation (noncoding and synonymous).

    Every site already present in ``ts`` (and therefore every mutation) is
    deleted, and new mutations are then placed on the resulting tree
    sequence with ``msprime.sim_mutations``, once per neutral category.

    Args:
        ts: tree sequence from the forward-in-time simulation
        simdat: the simulation data used to generate ts
    Returns:
        (ts_noncoding, ts_synonymous): the tree sequences with the
            neutral mutations only
    """
    # dict with the rate maps for the neutral categories, keyed by
    # 'noncoding' and 'synonymous'
    nr_maps = neutral_rate_maps(simdat)

    # Remove the existing (selected) mutations from ts by deleting all sites.
    # Site ids are read from ts.sites() rather than ts.variants(): the latter
    # decodes the genotypes of all samples at each site, which is wasted work
    # when only the id is needed.
    all_ids = [site.id for site in ts.sites()]
    ts_clear = ts.delete_sites(all_ids)

    # Each category is simulated separately on the same mutation-free
    # tree sequence.
    ts_nocd = msprime.sim_mutations(tree_sequence=ts_clear, rate=nr_maps['noncoding'])
    ts_syn = msprime.sim_mutations(tree_sequence=ts_clear, rate=nr_maps['synonymous'])

    return ts_nocd, ts_syn