In [2]:
import numpy as np
import nlopt
import pandas as pd
from math import log, exp
from multiprocessing import Pool, cpu_count
import time
import matplotlib.pyplot as plt
%matplotlib inline

## Functions for heRho analysis

In [3]:
def rho(rbp,g,L,d):
    """Return the combined recombination term for two sites ``d`` apart.

    The value is ``rbp*d + 2*g*L*(1 - exp(-d/L))``. This is algebraically
    the same as the earlier ``rbp*(d + 2*(g/rbp)*L*(1 - exp(-d/L)))`` but
    never divides by ``rbp``, so ``rbp == 0`` is handled instead of raising
    ``ZeroDivisionError``.

    Parameters
    ----------
    rbp : float
        Rate multiplying the distance ``d`` (the crossover rate in the
        notebook's narrative).
    g : float
        Rate multiplying the tract-length term (the gene-conversion rate in
        the notebook's narrative).
    L : float
        Length scale of the exponential term (mean tract length in the
        notebook's narrative). Must be non-zero.
    d : float
        Distance between the two sites.

    Returns
    -------
    float
    """
    # The second term saturates at 2*g*L once d is much larger than L.
    return rbp*d + 2*g*L*(1 - exp(-d/L))

In [4]:
def single_dist_obj_fun(r,x):
    """Log-likelihood contribution of one observation at a single distance.

    ``x`` unpacks to ``(d, h0, h1, h2, th)``. The rate ``r`` is scaled by the
    distance ``d``; three class probabilities are then computed from the
    scaled rate and ``th``, and the result is
    ``h0*log(p0) + h1*log(p1) + h2*log(p2)``.

    NOTE(review): h0/h1/h2 are presumably the counts of windows with zero,
    one and two heterozygous sites, and ``th`` the heterozygosity -- confirm
    against the caller's column selection.
    """
    dist, n0, n1, n2, theta = x
    rd = r*(dist)

    # Both class probabilities share this denominator.
    denom = ((1+theta)*(18+13*rd+rd**2+54*theta + 40*theta**2+8*theta**3+rd*(rd*theta+19*theta+6*theta**2)))
    p0 = (18+13*rd + rd**2 + 36*theta + 22*theta**2 + 4*theta**3 + rd*(6*theta+theta**2))/denom
    p2 = (theta**2*(36+14*rd+rd**2+36*theta+6*theta*rd+8*theta**2))/denom
    # The middle class takes whatever probability mass is left.
    p1 = 1 - p0 - p2

    return n0*log(p0) + n1*log(p1)+n2*log(p2)

In [5]:
def single_dist_multi_ind_obj_fun(r,xl):
    """Sum ``single_dist_obj_fun(r, xs)`` over every observation ``xs`` in ``xl``."""
    total = 0
    for observation in xl:
        total = total + single_dist_obj_fun(r, observation)
    return total

In [6]:
def find_max_like_single_dist(args):
    """Maximise ``single_dist_obj_fun`` over a single rate for one observation.

    ``args`` is a 2-item sequence; only its second item (the observation
    passed to ``single_dist_obj_fun``) is used. The search is a bounded
    Nelder-Mead run on [1e-8, 0.5] starting from 0.001.

    Returns the optimised rate (first element of the optimiser result).
    """
    _, observation = args  # the first item is unpacked but never used

    def objective(params, grad):
        # Derivative-free algorithm: `grad` is ignored.
        return single_dist_obj_fun(params, observation)

    optimizer = nlopt.opt(nlopt.LN_NELDERMEAD, 1)
    optimizer.set_lower_bounds([0.00000001])
    optimizer.set_upper_bounds([0.5])
    optimizer.set_max_objective(objective)

    best = optimizer.optimize([0.001])
    return best[0]

In [7]:
def calc_single_dist(x,rbp,g,L):
    """Log-likelihood of one ``(d, h0, h1, h2, th)`` row given ``rbp``, ``g``, ``L``.

    The recombination term for distance ``d`` is
    ``rbp*(d + 2*(g/rbp)*L*(1 - exp(-d/L)))``. Three class probabilities are
    computed from it and ``th``, and the function returns
    ``h0*log(p0) + h1*log(p1) + h2*log(p2)``.
    """
    dist, n0, n1, n2, theta = x

    # Distance-dependent recombination term (requires rbp != 0 and L != 0).
    rd = rbp*(dist+2*(g/rbp)*L*(1 - exp(-dist/L)))

    # Both class probabilities share this denominator.
    denom = ((1+theta)*(18+13*rd+rd**2+54*theta + 40*theta**2+8*theta**3+rd*(rd*theta+19*theta+6*theta**2)))
    p0 = (18+13*rd + rd**2 + 36*theta + 22*theta**2 + 4*theta**3 + rd*(6*theta+theta**2))/denom
    p2 = (theta**2*(36+14*rd+rd**2+36*theta+6*theta*rd+8*theta**2))/denom
    p1 = 1 - p0 - p2

    return n0*log(p0) + n1*log(p1)+n2*log(p2)

In [8]:
def single_chr_obj_fun(x_,arr):
    """Total log-likelihood of one chromosome's table for parameters ``x_``.

    ``x_`` unpacks to ``(rbp, g, L)``. ``calc_single_dist`` is applied to each
    row of the 2-D array ``arr`` (along axis 1) and the per-row values are
    summed.
    """
    rbp, g, L = x_

    def row_loglik(row):
        return calc_single_dist(row, rbp, g, L)

    per_row = np.apply_along_axis(row_loglik, 1, arr)
    return np.sum(per_row)

In [9]:
def across_chr_obj_fun(x,dfArrList,numChrs):
    """Joint objective across chromosomes that share a single ``L``.

    ``x`` is laid out as: the first ``numChrs`` entries are the per-chromosome
    first rate, the next ``numChrs`` entries are the per-chromosome second
    rate, and the last entry is the shared ``L``. Each chromosome's array in
    ``dfArrList`` is scored with ``single_chr_obj_fun`` and the scores are
    added up.
    """
    firstRates = x[:numChrs]
    secondRates = x[numChrs:numChrs+numChrs]
    sharedL = x[-1]

    total = 0
    # zip stops at the shortest input, exactly as in the list-based form.
    for rs, gs, arr in zip(firstRates, secondRates, dfArrList):
        total = total + single_chr_obj_fun([rs, gs, sharedL], arr)

    return total

In [10]:
def find_max_like_single_ind(rep):
    """Jointly fit per-chromosome rates and a shared ``L`` for 19 chromosomes.

    Reads the module-level ``dfListAuto`` (a list of per-chromosome
    DataFrames; it is not defined in the visible cells, so it must exist in
    the kernel before this is called), converts each frame to an array with
    columns ``[0, 4, 5, 6, 11]``, and maximises ``across_chr_obj_fun`` with a
    bounded Nelder-Mead search.

    Parameters
    ----------
    rep
        Unused; kept so existing callers (e.g. a ``Pool.map`` over replicate
        indices) continue to work.

    Returns
    -------
    The optimiser result: 19 first-rate values, 19 second-rate values, then
    the shared ``L``.
    """
    numberChromsomes = 19

    # Bind the converted arrays to a NEW local name. Assigning to
    # `dfListAuto` here would make that name local to the function, and the
    # comprehension below would then raise UnboundLocalError instead of
    # reading the module-level list.
    chrArrays = [x.iloc[:,[0,4,5,6,11]].to_numpy() for x in dfListAuto]

    # Parameter layout: [rate_1 x 19] + [rate_2 x 19] + [L]
    lowerBounds = [0.0001 for x in range(numberChromsomes)]+ [0.0001 for x in range(numberChromsomes)]+[10]
    upperBounds = [0.1 for x in range(numberChromsomes)]+[0.1 for x in range(numberChromsomes)]+[2000]
    startPoints = [0.001 for x in range(numberChromsomes)]+[0.001 for x in range(numberChromsomes)]+[50]

    opt=nlopt.opt(nlopt.LN_NELDERMEAD,numberChromsomes+numberChromsomes+1)

    opt.set_lower_bounds(lowerBounds)
    opt.set_upper_bounds(upperBounds)

    # Derivative-free algorithm: the `grad` argument is ignored.
    opt.set_max_objective(lambda x,grad: across_chr_obj_fun(x,chrArrays,numberChromsomes))

    res = opt.optimize(startPoints)
    return(res)

In [11]:
def find_max_like_group_ind(numberChromosomes):
    """Jointly fit the first ``numberChromosomes`` autosomes with a shared ``L``.

    Uses the module-level ``df``: for each of chr1..chr19 (truncated to
    ``numberChromosomes``) it keeps rows with name ``'H30'`` and
    ``distances >= 100``, collapses them with ``combine_across_individuals``,
    and maximises ``across_chr_obj_fun`` with a bounded Nelder-Mead search.

    Returns the optimiser result: per-chromosome first rates, per-chromosome
    second rates, then the shared ``L``.
    """
    autoNames = [
        'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
        'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
        'chr16', 'chr17', 'chr18', 'chr19',
    ]
    n = numberChromosomes

    # Row filter shared by every chromosome.
    keepRows = (df['name']=='H30') & (df['distances']>=100)
    perChrFrames = [df.loc[(df['chrom']== chromName) & keepRows] for chromName in autoNames[0:n]]
    perChrArrays = [combine_across_individuals(frame) for frame in perChrFrames]

    # Parameter layout: [rate_1 x n] + [rate_2 x n] + [L]
    lowerBounds = [0.0001]*n + [0.0001]*n + [10]
    upperBounds = [0.1]*n + [0.1]*n + [2000]
    startPoints = [0.001]*n + [0.001]*n + [50]

    def objective(params, grad):
        # Derivative-free algorithm: `grad` is ignored.
        return across_chr_obj_fun(params, perChrArrays, n)

    optimizer = nlopt.opt(nlopt.LN_NELDERMEAD, n+n+1)
    optimizer.set_lower_bounds(lowerBounds)
    optimizer.set_upper_bounds(upperBounds)
    optimizer.set_max_objective(objective)

    return optimizer.optimize(startPoints)

In [12]:
def combine_across_individuals(thisDF):
    """Pool per-individual rows into one row per distance.

    For every value of ``distances`` the bin counts (``no_het_bins``,
    ``one_het_bins``, ``two_het_bins``) are summed across individuals and
    ``H`` is averaged (sum divided by the number of distinct ``name`` values
    in ``thisDF``).

    Columns are selected by name rather than by position. The previous
    positional ``iloc[:, [0, 1, 2, 3, 7]]`` only worked when
    ``groupby(...).sum()`` silently dropped the string columns (``name``,
    ``chrom``); pandas >= 2.0 keeps them, which shifts the positions.

    Parameters
    ----------
    thisDF : pandas.DataFrame
        Must contain ``name``, ``distances``, ``no_het_bins``,
        ``one_het_bins``, ``two_het_bins`` and ``H``.

    Returns
    -------
    numpy.ndarray
        One row per distance, columns
        ``(distances, no_het_bins, one_het_bins, two_het_bins, H)``,
        ordered by increasing distance.
    """
    numSamples = len(thisDF['name'].unique())

    valueCols = ['no_het_bins', 'one_het_bins', 'two_het_bins', 'H']
    # Restrict the aggregation to the numeric columns we actually need.
    combined = thisDF.groupby('distances', as_index=False)[valueCols].sum()
    # H was summed over individuals above; turn it back into a mean.
    combined['H'] = combined['H']/numSamples

    return combined[['distances'] + valueCols].to_numpy()

In [103]:
def find_max_like_single_chr(args):
    """Fit ``(rbp, g, L)`` for one chromosome by bounded Nelder-Mead.

    ``args`` is ``(inList, minDist)``. Rows of ``inList`` from index
    ``minDist - 1`` onward are scored with ``single_chr_obj_fun``.

    NOTE(review): the positional slice presumably relies on row ``i`` holding
    distance ``i + 1`` -- confirm against the input table.

    Returns the three optimised parameters as a list.
    """
    inList, minDist = args
    usedRows = inList[minDist-1:]

    def objective(params, grad):
        # Derivative-free algorithm: `grad` is ignored.
        return single_chr_obj_fun(params, usedRows)

    optimizer = nlopt.opt(nlopt.LN_NELDERMEAD, 3)
    optimizer.set_lower_bounds([ 1e-7, 1e-7,10])
    optimizer.set_upper_bounds([ 0.5, 0.5, 10000])
    optimizer.set_max_objective(objective)

    best = optimizer.optimize([ 0.0071, 0.0071, 66])
    return list(best)

In [48]:
def find_max_like_L_given_single_chr(args):
    """Fit ``(rbp, g)`` for one chromosome with ``L`` pinned to 108.

    ``args`` is ``(inList, minDist)``. Rows of ``inList`` from index
    ``minDist - 1`` onward are scored with ``single_chr_obj_fun``. The third
    parameter has identical lower and upper bounds (108), so only the two
    rates can move.

    Returns the three parameters (the last one being 108) as a list.
    """
    inList, minDist = args
    usedRows = inList[minDist-1:]

    def objective(params, grad):
        # Derivative-free algorithm: `grad` is ignored.
        return single_chr_obj_fun(params, usedRows)

    optimizer = nlopt.opt(nlopt.LN_NELDERMEAD, 3)
    optimizer.set_lower_bounds([ 1e-7, 1e-7,108])
    # NOTE(review): the two rate bounds differ (.05 vs .5) -- confirm this
    # asymmetry is intended and not a typo.
    optimizer.set_upper_bounds([ .05, .5, 108])
    optimizer.set_max_objective(objective)

    best = optimizer.optimize([ 0.000011, 0.000011, 108])
    return list(best)

## Analysis

Here we estimate the rates of CO and GC and the mean GC tract length for the X chromosome and compare these estimates to those of the autosomes.

Importing the data set

In [14]:
# Load the tab-separated input table used by every cell below.
df = pd.read_csv('../mathematicaInputMice_unbinned.tsv', sep='\t')

The data set restricted to the X chromosome (shown at distance 1000) is below.

Note that we include the condition that H != 0 to exclude males from the analysis

In [64]:
# X-chromosome rows with non-zero H at distance 1000.
df[df['chrom'].eq('chrX') & df['H'].ne(0) & df['distances'].eq(1000)]

Unnamed: 0,name,chrom,distances,no_het_bins,one_het_bins,two_het_bins,H0Prop,H1Prop,H2Prop,H
39999,H14,chrX,1000,33306238,254232.0,740,0.992403,0.007575,2.2e-05,0.00381
59999,H15,chrX,1000,33311772,248574.0,708,0.992572,0.007407,2.1e-05,0.003724
79999,H24,chrX,1000,33320017,242344.0,693,0.992759,0.007221,2.1e-05,0.003631
99999,H26,chrX,1000,33315277,247968.0,717,0.992591,0.007388,2.1e-05,0.003715
119999,H27,chrX,1000,33315911,248566.0,797,0.992571,0.007405,2.4e-05,0.003726
159999,H30,chrX,1000,33290628,268685.0,794,0.99197,0.008006,2.4e-05,0.004027
199999,H36,chrX,1000,33312857,249551.0,625,0.992546,0.007435,1.9e-05,0.003736


In [100]:
# Distinct chromosome labels, in order of first appearance in the table.
chrNames = pd.unique(df['chrom'])

We obtain the maximum-likelihood co-estimates of kappa, gamma, and L (the three output columns, in that order) for each chromosome separately. Chromosomes are ordered 1, 2, ..., 18, 19, X below.

We combine data across all individuals, and we exclude the observations for distances d<100.

We see that the separate estimates for each autosome largely agree with those obtained when all are co-estimated with a single global tract length.

In contrast, for the X chromosome, heRho estimates little-to-no crossover, a low gene conversion rate, and elevated mean tract length (approximately 10x larger than in autosomes).  Note however, that the sample size for the X chromosome is smaller than for autosomes, and estimates may be poorer.

In [126]:
# Fit (rbp, g, L) independently for every chromosome in chrNames.
singleChrEstimates = []
for chrom in chrNames:
    # Keep this chromosome's rows, dropping records with H == 0.
    cdf = df.loc[(df['chrom']==chrom) & (df['H'] != 0)]
    # Pool individuals into one row per distance.
    testChr = combine_across_individuals(cdf)
    # minDist=100: the fit uses rows from index 99 onward.
    res = find_max_like_single_chr([testChr,100])
    singleChrEstimates.append(res)

# One row per chromosome (same order as chrNames); columns are the three fitted parameters.
singleChrEstimates = np.array(singleChrEstimates)
singleChrEstimates

[[0.001384837691525137, 0.00229706219324625, 121.68920267069825],
 [0.0017470584829202458, 0.0036556340064994083, 106.28930762135163],
 [0.001959082007698121, 0.003761761479066707, 93.77449481007031],
 [0.0015779155113453515, 0.0020519367872829926, 125.72009401078947],
 [0.0019120552217971268, 0.0028915847764834756, 121.29164539275982],
 [0.0017272640901598458, 0.002692051817189848, 107.3719809243635],
 [0.0017567056896119885, 0.0028809171249618544, 89.2414423643061],
 [0.002308650833813212, 0.004531226779882902, 89.13799268047438],
 [0.001552332288937549, 0.003076478655734998, 149.52271168621763],
 [0.0021427925679586178, 0.0047297792562389675, 78.33231968331602],
 [0.0015409335259186354, 0.0024662653148779306, 151.9438798734167],
 [0.0016854736309942849, 0.0034295013616810442, 120.95895736360305],
 [0.002106715787268283, 0.005164335897339648, 79.67833877489745],
 [0.0018697811287865095, 0.003886423423758529, 116.02129530967964],
 [0.001374771998842264, 0.001832639405114471, 169.70405

The tract length distribution is thought to be constant throughout the genome. However, the rates of recombination and mutation on the X chromosome inherently differ from those of the autosomes. Therefore, in the main analysis, we co-estimated all _autosome_ parameters with a global tract length L, and we found L=108 to be the most likely.

Here, we condition on the mean tract length of 108 base pairs and estimate the CO and GC rates, again separately for each chromosome. Although we have global co-estimates for the autosomes, here we obtain single-chromosome estimates conditioned on L in order to make a fair comparison to the estimates we obtain for the X chromosome.

We now find a (perhaps more realistic) estimate for the rates of recombination in the X chromosome:

In [127]:
# Same per-chromosome fit as before, but with L pinned to 108 inside
# find_max_like_L_given_single_chr, so only the two rates are free.
singleChrEstimatesGivenL = []
for chrom in chrNames:
    # Keep this chromosome's rows, dropping records with H == 0.
    cdf = df.loc[(df['chrom']==chrom) & (df['H'] != 0)]
    # Pool individuals into one row per distance.
    testChr = combine_across_individuals(cdf)
    # minDist=100: the fit uses rows from index 99 onward.
    res = find_max_like_L_given_single_chr([testChr,100])
    singleChrEstimatesGivenL.append(res)
# One row per chromosome (same order as chrNames); the last column is the fixed L.
singleChrEstimatesGivenL = np.array(singleChrEstimatesGivenL)
singleChrEstimatesGivenL


[[0.0014273285279010391, 0.0024443788382259623, 108.0],
 [0.0017390796002656015, 0.0036240526366245014, 108.0],
 [0.0018967490573164607, 0.0034687425046281513, 108.0],
 [0.0016297407536095593, 0.0022143503835071986, 108.0],
 [0.0019672767161944874, 0.003064108539165159, 108.0],
 [0.0017251008127854564, 0.0026835445533939894, 108.0],
 [0.0016972388556097998, 0.002573148973450077, 108.0],
 [0.002210028871586287, 0.00405584120083239, 108.0],
 [0.0017556825058882182, 0.0035570599329271748, 108.0],
 [0.0020023351583798644, 0.0038793705931818943, 108.0],
 [0.0017156136392875976, 0.002866851229960386, 108.0],
 [0.0017475014698493942, 0.0036329275332614916, 108.0],
 [0.0019583328059637234, 0.0042855663098083484, 108.0],
 [0.001912588997403473, 0.004032659020748657, 108.0],
 [0.001563901971825664, 0.0022113608955336966, 108.0],
 [0.0019421396755438065, 0.0036274597822906606, 108.0],
 [0.0018160875020002206, 0.002910349662893056, 108.0],
 [0.0020627639278327464, 0.0027249862876087324, 108.0],
 [

The mean genetic diversity among autosomes

In [145]:
# Mean H over every row that is not on the X chromosome.
autoHets = df.loc[df['chrom'] != "chrX", 'H']
sum(autoHets)/len(autoHets)

0.007063150054605022

The scaled kappa/theta for each chromosome:

In [159]:
# First-column estimates for all rows but the last, divided by 0.00706
# (the rounded mean autosomal H).
list(singleChrEstimatesGivenL[:-1, 0] / 0.00706)

[0.2021711795893823,
 0.2463285552784138,
 0.26866133956323807,
 0.23084146651693474,
 0.2786510929453948,
 0.24434855705176434,
 0.24040210419402264,
 0.3130352509329018,
 0.24868024162722635,
 0.28361687795748786,
 0.2430047647716144,
 0.2475214546528887,
 0.2773842501365047,
 0.2709049571393021,
 0.22151586003196372,
 0.2750906056011057,
 0.2572361900850171,
 0.29217619374401504,
 0.38342318166560196]

The scaled gamma/theta for each chromosome:

In [162]:
# Second-column estimates for all rows but the last, divided by 0.00706
# (the rounded mean autosomal H).
list(singleChrEstimatesGivenL[:-1, 1] / 0.00706)

[0.3462292971991448,
 0.5133219032046036,
 0.4913233009388316,
 0.3136473631030026,
 0.434009708097048,
 0.38010546082067836,
 0.3644686931232404,
 0.5744817564918399,
 0.5038328516894015,
 0.5494859197141493,
 0.4060695793145022,
 0.5145789707169252,
 0.6070207237688878,
 0.5711981615791298,
 0.31322392288012696,
 0.5138045017408868,
 0.4122308304381099,
 0.3859753948454295,
 0.644847622455813]

The mean kappa, gamma, and L obtained across the autosomes, conditioned on L = 108

In [148]:
# Column-wise mean over every row except the last one.
singleChrEstimatesGivenL[:-1].mean(axis=0)

array([1.86718203e-03, 3.28470437e-03, 1.08000000e+02])

The heterozygosity theta for the X chromosome is below. 

Note that the mean heterozygosity theta on the X chromosome is expected to be 3/4 that of the autosomes.

In [146]:
# Mean H over X-chromosome rows, skipping rows where H is exactly 0.
xHets = df.loc[(df['chrom'] == "chrX") & (df['H'] != 0), 'H']
sum(xHets)/len(xHets)

0.003777521934248191

the expected diversity, however, is 

In [153]:
# 3/4 of 0.00706, the rounded mean autosomal H computed earlier in the notebook.
0.00706*3/4

0.005295

How do these estimates for the X chromosome compare to those of the autosomes?

We expect the X chromosome to experience 2/3 the crossover rate compared to the autosomes. 

Indeed, we found an average CO rate of approximately 0.0012 for the X compared to 0.0019 for the autosomes.

We found that the GC rate is relatively low on the X chromosome compared to the autosomes. The X chromosome is the only one for which the GC rate is lower than the CO rate. 

The recombination rate estimates from heRho are dependent on the observed level of heterozygosity. For the autosomes, kappa/theta = k/mu and gamma/theta = g/mu. That is, heRho tells us the ratio of the rates, so that if we know mu, we can estimate the underlying values of k and g (which are independent of the effective population size). However, X chromosomes experience only 2/3 the rate of autosomal recombination and 3/4 the rate of new mutations.

$$\frac{k}{\mu} = \frac{\kappa_{\text{auto}}}{\theta_{\text{auto}}} = \frac{(3/2)\,\kappa_X}{(4/3)\,\theta_X} = \frac{9}{8}\,\frac{\kappa_X}{\theta_X}.$$

With this adjustment, we see that among all chromosomes, the X chromosome has the second-highest estimated rate of recombination (relative to mutation), but it experiences approximately 3-fold less gene conversion than autosomes.

In [149]:
# Hand-copied, rounded inputs: 0.001867 and 0.003285 are the autosome means
# printed by the np.mean cell (1.86718203e-03, 3.28470437e-03); 0.00706 is the
# rounded mean autosomal H. Re-running upstream cells does not update them.
print("auto kappa ratio:  ", 0.001867/0.00706)
print("auto gamma ratio:  ", 0.003285/0.00706)

auto kappa ratio:   0.2644475920679886
auto gamma ratio:   0.4652974504249292


In [160]:
# Hand-copied inputs: 0.00377 approximates the mean X-chromosome H printed
# earlier (0.0037775...). 0.001228 and 0.000597 are presumably the last (X)
# row of singleChrEstimatesGivenL, whose printed output is truncated -- TODO confirm.
print("X kappa ratio:  ", 0.001228/0.00377)
print("x gamma ratio ",  0.000597/0.00377)

X kappa ratio:   0.3257294429708223
x gamma ratio  0.1583554376657825


In [163]:
# Same hand-copied X-chromosome ratios as in the previous cell, multiplied by
# 9/8 (the (3/2)/(4/3) adjustment derived in the text).
print("X k/mu ratio:  ", 0.001228/0.00377*9/8)
print("x g/mu ratio ",  0.000597/0.00377*9/8)

X k/mu ratio:   0.36644562334217506
x g/mu ratio  0.17814986737400532
