-
-
Notifications
You must be signed in to change notification settings - Fork 1.9k
/
model.py
229 lines (187 loc) · 9.32 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# Author: Anand Patil
# Date: 6 Feb 2009
# License: Creative Commons BY-NC-SA
####################################
# The idea: There are two mutations involved, one (Nr33) converting Fya to Fyb and one (Nr125)
# silencing expression. The latter mutation tends to occur only with the former, but the converse
# is not true.
#
# The model for an individual chromosomal genotype is that the first mutation is a spatially-
# correlated random field, but the second occurs independently with some probability in Africa
# and another probability outside Africa.
#
# There are two types of datapoints:
# - one testing individuals for phenotype (a-b-), meaning both chromosomes have the silencing
# mutation.
# - one testing individuals for expression of Fya on either chromosome.
# It's easy to make a likelihood model for either of these, just a bit complicated.
# The maps we'll eventually want to make will be of (a-b-) frequency, meaning the postprocessing
# function will need to close on model variables. The generic package doesn't currently support this.
import numpy as np
import pymc as pm
import gc
from map_utils import *
from generic_mbg import *
import generic_mbg
__all__ = ['make_model']
def make_gp_submodel(suffix, mesh, africa_val=None, with_africa_covariate=False):
    """
    Create the mean and covariance objects of a Gaussian random field,
    plus a GP submodel evaluated on 'mesh'.

    Parameters
    ----------
    suffix : str
        Appended to every PyMC variable name so several submodels can
        coexist in one model ('b' and '0' in this module).
    mesh : ndarray
        (n,2) array of [lon,lat] in RADIANS at which the field is evaluated.
    africa_val : array or None
        Value of the Africa indicator covariate at each mesh point.
        Required for the covariate mean to take effect.
    with_africa_covariate : bool
        If True (and africa_val is given), the mean function is
        beta * africa_val on the mesh instead of zero.

    Returns
    -------
    dict
        locals() of this function; callers index it by variable name
        (e.g. 'sp_sub', 'V').
    """
    # The partial sill.
    amp = pm.Exponential('amp_%s'%suffix, .1, value=1.)

    # The range parameter. Units are RADIANS.
    # 1 radian = the radius of the earth, about 6378.1 km
    scale = pm.Exponential('scale_%s'%suffix, 1, value=.08)
    scale_in_km = scale*6378.1

    # The nugget variance. Lower-bounded to preserve mixing.
    V = pm.Exponential('V_%s'%suffix, 1, value=1.)
    @pm.potential
    def V_bound(V=V):
        # Reject nugget variances below .1; tiny nuggets stall the MCMC.
        if V<.1:
            return -np.inf
        else:
            return 0

    # Create the covariance & its evaluation at the data locations.
    @pm.deterministic(trace=True,name='C_%s'%suffix)
    def C(amp=amp, scale=scale):
        return pm.gp.FullRankCovariance(pm.gp.exponential.geo_rad, amp=amp, scale=scale)

    # Create the mean function
    if with_africa_covariate:
        # Regression coefficient of the Africa indicator covariate.
        beta = pm.Normal('beta_%s'%suffix, 0, .01, value=1)

        @pm.deterministic(trace=True, name='M_%s'%suffix)
        def M(mesh=mesh, africa_val=africa_val, beta=beta):
            def f(x, mesh=mesh, africa_val=africa_val, beta=beta):
                # The covariate's value is only known on the original mesh;
                # evaluating the mean anywhere else is an error.
                if np.all(x == mesh):
                    return africa_val*beta
                else:
                    raise ValueError('Value of Africa covariate on input array is unknown.')
            if africa_val is None:
                # No covariate values supplied: fall back to a zero mean.
                # This preserves the historical behaviour of callers that
                # request the covariate without passing africa_val.
                return pm.gp.Mean(pm.gp.zero_fn)
            # BUGFIX: the original returned pm.gp.Mean(pm.gp.zero_fn) here,
            # leaving f and beta as dead code and silently dropping the
            # covariate term from the mean function.
            return pm.gp.Mean(f)
    else:
        @pm.deterministic(trace=True, name='M_%s'%suffix)
        def M():
            return pm.gp.Mean(pm.gp.zero_fn)

    # Create the GP submodel
    sp_sub = pm.gp.GPSubmodel('sp_sub_%s'%suffix,M,C,mesh)
    # Center the initial field evaluation so the chain starts near zero mean.
    sp_sub.f_eval.value = sp_sub.f_eval.value - sp_sub.f_eval.value.mean()

    return locals()
# =========================
# = Haplotype frequencies =
# =========================
# pb: frequency of the Fyb mutation (Nr33); p0: probability of the
# silencing mutation (Nr125) given Fyb; p1: probability of silencing
# given Fya. The four haplotype frequencies below sum to 1 by construction.
h_freqs = {'a': lambda pb, p0, p1: (1-pb)*(1-p1),
            'b': lambda pb, p0, p1: pb*(1-p0),
            '0': lambda pb, p0, p1: pb*p0,
            '1': lambda pb, p0, p1: (1-pb)*p1}
hfk = ['a','b','0','1']
hfv = [h_freqs[key] for key in hfk]

# ========================
# = Genotype frequencies =
# ========================
# Hardy-Weinberg: heterozygote i,j has frequency 2*p_i*p_j, homozygote p_i**2.
# The i=i, j=j defaults bind the loop variables at definition time (avoids
# the late-binding closure pitfall).
# BUGFIX: float(...) replaces np.asscalar(...), which was removed in
# NumPy 1.23; the conversion is identical for scalar inputs.
g_freqs = {}
for i in range(4):
    for j in range(i,4):
        if i != j:
            g_freqs[hfk[i]+hfk[j]] = lambda pb, p0, p1, i=i, j=j: 2 * float(hfv[i](pb,p0,p1) * hfv[j](pb,p0,p1))
        else:
            g_freqs[hfk[i]*2] = lambda pb, p0, p1, i=i: float(hfv[i](pb,p0,p1))**2

# Import-time sanity check: genotype frequencies must sum to 1 for any
# (pb, p0, p1) in the unit cube.
for i in range(1000):
    pb,p0,p1 = np.random.random(size=3)
    np.testing.assert_almost_equal(np.sum([gfi(pb,p0,p1) for gfi in g_freqs.values()]),1.)
def make_model(lon,lat,africa,n,datatype,
               genaa,genab,genbb,gen00,gena0,genb0,gena1,genb1,gen01,gen11,
               pheab,phea,pheb,
               phe0,prom0,promab,
               aphea,aphe0,
               bpheb,bphe0):
    """
    Build the PyMC model dict. This function is required by the generic
    MBG code.

    Parameters
    ----------
    lon, lat : arrays of degrees; converted to radians internally.
    africa : Africa-indicator covariate value at each data location.
    n : array of per-location sample sizes (its length sets the number
        of data locations).
    datatype : array of strings in {'prom','aphe','bphe','phe','gen'}
        selecting the likelihood used at each location.
    gen*, phe*, prom*, aphe*, bphe* : per-location counts for the
        corresponding datatype. See duffy/doc/model.tex.

    Returns
    -------
    dict : locals(), as expected by generic_mbg.
    """
    # Step method granularity: locations are processed in clusters of this
    # size so the nugget variables stay in small blocks.
    grainsize = 5

    # Convert degree coordinates to radians for the geographic covariance.
    logp_mesh = np.vstack((lon,lat)).T*np.pi/180.

    # Probability of mutation in the promoter region, given that the other thing is a.
    p1 = pm.Uniform('p1', 0, .04, value=.01)

    # Spatial submodels for the Fyb ('b') and silencing ('0') mutations.
    # BUGFIX: pass the africa covariate values through; the original
    # requested the covariate but left africa_val=None, so the 'africa'
    # argument was silently unused.
    spatial_b_vars = make_gp_submodel('b',logp_mesh,africa,with_africa_covariate=True)
    spatial_0_vars = make_gp_submodel('0',logp_mesh)
    sp_sub_b = spatial_b_vars['sp_sub']
    sp_sub_0 = spatial_0_vars['sp_sub']

    # Loop over data clusters, adding nugget and applying link function.
    eps_p_f0_d = []
    p0_d = []
    eps_p_fb_d = []
    pb_d = []
    V_b = spatial_b_vars['V']
    V_0 = spatial_0_vars['V']

    # int() because np.ceil returns a float, which xrange only tolerates
    # with a deprecation warning.
    for i in xrange(int(np.ceil(len(n)/float(grainsize)))):
        sl = slice(i*grainsize,(i+1)*grainsize,None)
        if sl.stop>sl.start:
            this_fb = pm.Lambda('fb_%i'%i, lambda f=sp_sub_b.f_eval, sl=sl: f[sl], trace=False)
            this_f0 = pm.Lambda('f0_%i'%i, lambda f=sp_sub_0.f_eval, sl=sl: f[sl], trace=False)

            # Nuggeted field in this cluster.
            # BUGFIX: size the f0 initial value from this_f0 (the original
            # used this_fb; same shape, but inconsistent).
            eps_p_fb_d.append(pm.Normal('eps_p_fb_%i'%i, this_fb, 1./V_b, value=np.random.normal(size=np.shape(this_fb.value)), trace=False))
            eps_p_f0_d.append(pm.Normal('eps_p_f0_%i'%i, this_f0, 1./V_0, value=np.random.normal(size=np.shape(this_f0.value)), trace=False))

            # The allele frequencies: inverse-logit link of the nuggeted fields.
            pb_d.append(pm.Lambda('pb_%i'%i,lambda lt=eps_p_fb_d[-1]: invlogit(np.atleast_1d(lt)),trace=False))
            p0_d.append(pm.Lambda('p0_%i'%i,lambda lt=eps_p_f0_d[-1]: invlogit(np.atleast_1d(lt)),trace=False))

    # The fields plus the nugget
    @pm.deterministic
    def eps_p_fb(eps_p_fb_d = eps_p_fb_d):
        """Concatenated version of eps_p_fb, for postprocessing & Gibbs sampling purposes"""
        return np.hstack(eps_p_fb_d)

    @pm.deterministic
    def eps_p_f0(eps_p_f0_d = eps_p_f0_d):
        """Concatenated version of eps_p_f0, for postprocessing & Gibbs sampling purposes"""
        return np.hstack(eps_p_f0_d)

    init_OK = True

    # The likelihoods. NOTE: the per-location sample size is kept in n_obs;
    # the original assigned it to 'n', shadowing the parameter array and
    # corrupting the returned locals().
    data_d = []

    for i in xrange(len(n)):
        sl_ind = int(i/grainsize)
        sub_ind = i%grainsize
        if sl_ind == len(p0_d):
            break

        # See duffy/doc/model.tex for explanations of the likelihoods.
        p0 = pm.Lambda('p0_%i_%i'%(sl_ind,sub_ind), lambda p=p0_d[sl_ind], j=sub_ind: p[j], trace=False)
        pb = pm.Lambda('pb_%i_%i'%(sl_ind,sub_ind), lambda p=pb_d[sl_ind], j=sub_ind: p[j], trace=False)

        if datatype[i]=='prom':
            cur_obs = [prom0[i], promab[i]]
            # Need to have either b and 0 or a and 1 on both chromosomes
            p = pm.Lambda('p_%i'%i, lambda pb=pb, p0=p0, p1=p1: (pb*p0+(1-pb)*p1)**2, trace=False)
            n_obs = np.sum(cur_obs)
            data_d.append(pm.Binomial('data_%i'%i, p=p, n=n_obs, value=prom0[i], observed=True))

        elif datatype[i]=='aphe':
            cur_obs = [aphea[i], aphe0[i]]
            n_obs = np.sum(cur_obs)
            # Need to have (a and not 1) on either chromosome, or not (not (a and not 1) on both chromosomes)
            p = pm.Lambda('p_%i'%i, lambda pb=pb, p0=p0, p1=p1: 1-(1-(1-pb)*(1-p1))**2, trace=False)
            data_d.append(pm.Binomial('data_%i'%i, p=p, n=n_obs, value=aphea[i], observed=True))

        elif datatype[i]=='bphe':
            cur_obs = [bpheb[i], bphe0[i]]
            n_obs = np.sum(cur_obs)
            # Need to have (b and not 0) on either chromosome
            p = pm.Lambda('p_%i'%i, lambda pb=pb, p0=p0, p1=p1: 1-(1-pb*(1-p0))**2, trace=False)
            # BUGFIX: the observed count is bpheb[i]; the original passed
            # aphea[i], a copy-paste error from the 'aphe' branch.
            data_d.append(pm.Binomial('data_%i'%i, p=p, n=n_obs, value=bpheb[i], observed=True))

        elif datatype[i]=='phe':
            cur_obs = np.array([pheab[i],phea[i],pheb[i],phe0[i]])
            n_obs = np.sum(cur_obs)
            # Phenotype cell probabilities: ab, any a-expressing, any
            # b-expressing, silent. They sum to 1 by construction.
            p = pm.Lambda('p_%i'%i, lambda pb=pb, p0=p0, p1=p1: np.array([\
                g_freqs['ab'](pb,p0,p1),
                g_freqs['a0'](pb,p0,p1)+g_freqs['a1'](pb,p0,p1)+g_freqs['aa'](pb,p0,p1),
                g_freqs['b0'](pb,p0,p1)+g_freqs['b1'](pb,p0,p1)+g_freqs['bb'](pb,p0,p1),
                g_freqs['00'](pb,p0,p1)+g_freqs['01'](pb,p0,p1)+g_freqs['11'](pb,p0,p1)]), trace=False)
            np.testing.assert_almost_equal(p.value.sum(), 1)
            data_d.append(pm.Multinomial('data_%i'%i, p=p, n=n_obs, value=cur_obs, observed=True))

        elif datatype[i]=='gen':
            cur_obs = np.array([genaa[i],genab[i],gena0[i],gena1[i],genbb[i],genb0[i],genb1[i],gen00[i],gen01[i],gen11[i]])
            # BUGFIX: validate the counts BEFORE constructing the likelihood;
            # the original checked for NaNs only after appending the data.
            if np.any(np.isnan(cur_obs)):
                raise ValueError('NaN in genotype counts at location %i'%i)
            n_obs = np.sum(cur_obs)
            p = pm.Lambda('p_%i'%i, lambda pb=pb, p0=p0, p1=p1, g_freqs=g_freqs: \
                np.array([g_freqs[key](pb,p0,p1) for key in ['aa','ab','a0','a1','bb','b0','b1','00','01','11']]), trace=False)
            np.testing.assert_almost_equal(p.value.sum(), 1)
            data_d.append(pm.Multinomial('data_%i'%i, p=p, n=n_obs, value=cur_obs, observed=True))

    return locals()