## Setting the simulation

All the data needed to run this notebook is here [test-data](test-data/)

In [1]:
import pandas as pd
import numpy as np
import fwdpy11
import msprime
fwdpy11.__version__

'0.17.2.dev14+g560c1da'

## Loading the data

I have generated 200 random samples of 1Mb length from the genome. Here, I take one of those samples, as an example, to set the simulation.
**NOTE:**

- I subtract the start position of the region from the intervals and the recombination map. So, the start position is zero. (Not sure If I need to do this).

In [2]:
# The 1Mb sampled region

chromosome, start, end = np.loadtxt('test-data/region_region_4.bed', dtype=np.int64)
print(f'position chr{chromosome}, {start}, {end}')

position chr22, 48000000, 49000000


In [3]:
# exonic intervals in the sampled region

exons = pd.read_csv('test-data/region_exons_4.bed', sep='\t', names=['chro', 'start', 'end'])
exons.head()

Unnamed: 0,chro,start,end
0,22,48013740,48013936
1,22,48015558,48015636
2,22,48015810,48017540
3,22,48017838,48023004
4,22,48044709,48045001


In [4]:
# non coding intervals (intronic and exonic) in the sampled region
nonexonic = pd.read_csv('test-data/region_intronANDinterg_4.bed', sep='\t', names=['chro', 'start', 'end'])
nonexonic.head()

Unnamed: 0,chro,start,end
0,22,48000000,48013740
1,22,48013936,48015558
2,22,48015636,48015810
3,22,48017540,48017838
4,22,48023004,48044709


In [5]:
# mutation rates within the region
ml_coding = pd.read_csv('test-data/region_mlcoding_4.csv')
ml_non_coding = np.loadtxt('test-data/region_mlnoncoding_4.txt', dtype=np.object0)[1]
ml_non_coding = float(ml_non_coding)

## Create a dict mapping class name to value

ml = {}
for x, y in ml_coding.iterrows():
    ml[y.Q] = y.mL

ml['noncoding'] = ml_non_coding
print(ml)

{'LOF': 3.96848e-07, 'missense': 4.28265764e-06, 'synonymous': 1.73085646e-06, 'noncoding': 0.009854629053900001}


In [6]:
# substract start position so the intial positon is zero
exons['start'] = exons['start'] - start
exons['end'] = exons['end'] - start

nonexonic['start'] = nonexonic['start'] - start
nonexonic['end'] = nonexonic['end'] - start

nonexonic.head()

Unnamed: 0,chro,start,end
0,22,0,13740
1,22,13936,15558
2,22,15636,15810
3,22,17540,17838
4,22,23004,44709


In [7]:
## Recombination map
## Here I use msprime function: msprime.RateMap.read_hapmap to load the recombination map
rmap = msprime.RateMap.read_hapmap('test-data/chr22.b38.gmap', position_col=0, map_col=2)
print(rmap)


┌───────────────────────────────────────────────────────┐
│left      │right     │         mid│      span│     rate│
├───────────────────────────────────────────────────────┤
│0         │15287922  │     7643961│  15287922│  9.5e-10│
│15287922  │16370978  │    15829450│   1083056│  1.2e-09│
│16370978  │16372046  │    16371512│      1068│    5e-09│
│16372046  │16373044  │    16372545│       998│    5e-09│
│16373044  │16373740  │    16373392│       696│  5.1e-09│
│16373740  │16374612  │    16374176│       872│    6e-09│
│16374612  │16374641  │  16374626.5│        29│  2.7e-08│
│16374641  │16374814  │  16374727.5│       173│  2.7e-08│
│16374814  │16374956  │    16374885│       142│  8.6e-09│
│16374956  │16375126  │    16375041│       170│  4.2e-09│
│⋯         │⋯         │           ⋯│         ⋯│        ⋯│
│50739662  │50744589  │  50742125.5│      4927│  6.5e-09│
│50744589  │50757736  │  50751162.5│     13147│  6.4e-09│
│50757736  │50761423  │  50759579.5│      3687│  6.4e-09│
│50761423  │5

In [8]:
## we can take a slice from the map to get the coordinates in the sampled region
# with set trim=True 
rmap = rmap.slice(left=start, right=end, trim=True)
print(rmap)


┌───────────────────────────────────────────────┐
│left    │right    │       mid│   span│     rate│
├───────────────────────────────────────────────┤
│0       │33       │      16.5│     33│  8.9e-09│
│33      │1030     │     531.5│    997│  7.2e-09│
│1030    │1148     │      1089│    118│  2.7e-09│
│1148    │1302     │      1225│    154│  1.4e-09│
│1302    │1313     │    1307.5│     11│  1.8e-09│
│1313    │1627     │      1470│    314│  9.2e-10│
│1627    │1853     │      1740│    226│  8.4e-10│
│1853    │1874     │    1863.5│     21│  9.5e-10│
│1874    │1925     │    1899.5│     51│  7.8e-10│
│1925    │2103     │      2014│    178│  6.7e-10│
│⋯       │⋯        │         ⋯│      ⋯│        ⋯│
│984458  │984548   │    984503│     90│  1.1e-07│
│984548  │984844   │    984696│    296│  1.1e-07│
│984844  │984898   │    984871│     54│  1.3e-07│
│984898  │984941   │  984919.5│     43│  1.4e-07│
│984941  │985051   │    984996│    110│  1.6e-07│
│985051  │997194   │  991122.5│  12143│  1.8e-07│

# Set simulation 

## Neutral regions

In [9]:
## we will label the mutations according to the functional category

mut_labels = {
    'neutral': 0,
    'missense': 1,
    'synonymous': 2,
    'LOF': 3,
}

In [10]:
nonexonic.head()

Unnamed: 0,chro,start,end
0,22,0,13740
1,22,13936,15558
2,22,15636,15810
3,22,17540,17838
4,22,23004,44709


In [11]:
# Construct the neutral regions from the non-exonic intervals
nregions = []
for _, noexon in nonexonic.iterrows():
    nregions.append(
        fwdpy11.Region(beg=noexon.start, end=noexon.end, weight=1, label=mut_labels['neutral'])
    
    )


In [12]:
nregions[:10]

[fwdpy11.Region(beg=0, end=13740, weight=1, coupled=True, label=0),
 fwdpy11.Region(beg=13936, end=15558, weight=1, coupled=True, label=0),
 fwdpy11.Region(beg=15636, end=15810, weight=1, coupled=True, label=0),
 fwdpy11.Region(beg=17540, end=17838, weight=1, coupled=True, label=0),
 fwdpy11.Region(beg=23004, end=44709, weight=1, coupled=True, label=0),
 fwdpy11.Region(beg=45001, end=48469, weight=1, coupled=True, label=0),
 fwdpy11.Region(beg=48549, end=52645, weight=1, coupled=True, label=0),
 fwdpy11.Region(beg=58275, end=139460, weight=1, coupled=True, label=0),
 fwdpy11.Region(beg=139696, end=140739, weight=1, coupled=True, label=0),
 fwdpy11.Region(beg=141001, end=141541, weight=1, coupled=True, label=0)]

## Distributions of effect sizes | Selected regions

- For now I will assume a constant effect size.
- The weights establish the relative probability that a mutation comes from a given region.

**NOTE:**

- When multiple “sregion” objects are used, the default behavior is to multiply the input weight by end-beg:
- The weights should depend on the mutation type (i.e. synonymous, missense). We could make the weight
proportional to ml.

**Comments:**

- The selection and dominance should also depend on the mutation class. We'll need to pick an appropiate DFEs for each case.


In [13]:
# Define the Weights
total_weigth = ml['synonymous'] + ml['missense'] + ml['LOF']


w_syn = ml['synonymous'] / total_weigth
w_mis = ml['missense'] / total_weigth
w_lof = ml['LOF'] / total_weigth

print(f'total weight: {total_weigth}\n\n\nsynonymous={w_syn}\nmissense={w_mis}\nlof={w_lof}')

total weight: 6.4103621e-06


synonymous=0.2700091559570403
missense=0.6680835767452201
lof=0.0619072672977397


In [14]:
# selection and dominance for now
s = -0.01
h = 0.25

In [15]:
# Construct the selected regions from the exonic intervals
sregions = []
for _, exon in exons.iterrows():
    # missense
    sregions.append(
        fwdpy11.ConstantS(beg=exon.start, end=exon.end, weight=w_mis, s=s, label=mut_labels['missense'])
    
    )
    # synonymous
    sregions.append(
        fwdpy11.ConstantS(beg=exon.start, end=exon.end, weight=w_syn, s=s, label=mut_labels['synonymous'])
    
    )
    # loss of function
    sregions.append(
        fwdpy11.ConstantS(beg=exon.start, end=exon.end, weight=w_lof, s=s, label=mut_labels['LOF'])
    
    )

In [16]:
sregions[:5]

[fwdpy11.ConstantS(beg=13740, end=13936, weight=0.6680835767452201, s=-0.01, h=1.0, coupled=True, label=1, scaling=1.0),
 fwdpy11.ConstantS(beg=13740, end=13936, weight=0.2700091559570403, s=-0.01, h=1.0, coupled=True, label=2, scaling=1.0),
 fwdpy11.ConstantS(beg=13740, end=13936, weight=0.0619072672977397, s=-0.01, h=1.0, coupled=True, label=3, scaling=1.0),
 fwdpy11.ConstantS(beg=15558, end=15636, weight=0.6680835767452201, s=-0.01, h=1.0, coupled=True, label=1, scaling=1.0),
 fwdpy11.ConstantS(beg=15558, end=15636, weight=0.2700091559570403, s=-0.01, h=1.0, coupled=True, label=2, scaling=1.0)]

## Recombination

In [17]:
rmap

left,right,mid,span,rate
0,33,16.5,33,8.9e-09
33,1030,531.5,997,7.2e-09
1030,1148,1089,118,2.7e-09
1148,1302,1225,154,1.4e-09
1302,1313,1307.5,11,1.8e-09
1313,1627,1470,314,9.2e-10
1627,1853,1740,226,8.4e-10
1853,1874,1863.5,21,9.5e-10
1874,1925,1899.5,51,7.8e-10
1925,2103,2014,178,6.7e-10


In [18]:
nrec = len(rmap) - 1

In [19]:
recregions = []
for i in range(nrec):
    recregions.append(
     fwdpy11.PoissonInterval(
         beg=rmap.left[i],
         end=rmap.right[i],
         mean=rmap.rate[i]
     )   
    )

In [20]:
recregions[:10]

[fwdpy11.PoissonInterval(beg=0.0, end=33.0, mean=8.9208633094493e-09, discrete=False),
 fwdpy11.PoissonInterval(beg=33.0, end=1030.0, mean=7.161484453376708e-09, discrete=False),
 fwdpy11.PoissonInterval(beg=1030.0, end=1148.0, mean=2.711864405352255e-09, discrete=False),
 fwdpy11.PoissonInterval(beg=1148.0, end=1302.0, mean=1.4285714292613398e-09, discrete=False),
 fwdpy11.PoissonInterval(beg=1302.0, end=1313.0, mean=1.818181807131871e-09, discrete=False),
 fwdpy11.PoissonInterval(beg=1313.0, end=1627.0, mean=9.235668793787024e-10, discrete=False),
 fwdpy11.PoissonInterval(beg=1627.0, end=1853.0, mean=8.40707964404883e-10, discrete=False),
 fwdpy11.PoissonInterval(beg=1853.0, end=1874.0, mean=9.52380951879661e-10, discrete=False),
 fwdpy11.PoissonInterval(beg=1874.0, end=1925.0, mean=7.84313725077368e-10, discrete=False),
 fwdpy11.PoissonInterval(beg=1925.0, end=2103.0, mean=6.741573036396607e-10, discrete=False)]

## Rates

We need to specify the total rates

In [21]:
#  The neutral mutation rate, selected mutation rate, and total recombination rate, respectively.
neutral_ml = ml['noncoding']
selected_ml = ml['missense'] + ml['synonymous'] + ml['LOF']


# recomb_rate = ??? | I'm not sure how to set this value
rates = fwdpy11.MutationAndRecombinationRates(
    neutral_mutation_rate=neutral_ml,
    selected_mutation_rate=selected_ml,
    recombination_rate=None)
