% rss_ash.m
function [bsam, zsam, wsam, lsam, Naccept] = rss_ash(betahat, se, R, Nsnp, sigma_beta, Ndraw, Nburn, Nthin)
% USAGE: simulate posterior draws under the RSS model with ASH prior
% INPUT:
% betahat: 1 by p, effect size estimates under single-SNP model
% se: 1 by p, standard errors of betahat
% R: p by p, population LD, symmetric, positive definite matrix
% Nsnp: 1 by p, the number of individuals for each SNP
%       NOTE(review): Nsnp is normalized to a row vector below but is not
%       referenced anywhere else in this function -- confirm this is intended
% sigma_beta: 1 by K, ordered and non-negative, prior SD on beta
% Ndraw: scalar, the total number of MCMC draws
% Nburn: scalar, the length of burn-in
% Nthin: scalar, the length of thinning interval
% OUTPUT:
% bsam: Nsam by p, the output chain of beta
% zsam: Nsam by p, the output chain of zlabel
% wsam: Nsam by K, the output chain of omega
% lsam: Nsam by 1, the output chain of lambda
% Naccept: scalar, the total number of acceptance in MH step

% make sure the input summary-level data are row vectors
if (rows(betahat) ~= 1)
  betahat = betahat';
end
if (rows(se) ~= 1)
  se = se';
end
if (rows(Nsnp) ~= 1)
  Nsnp = Nsnp';
end

p = length(betahat);
fprintf('total number of variants analyzed: %d \n', p);
if length(se) ~= p
  error('length of betahat and se must be the same!');
end

K = length(sigma_beta);
fprintf('number of mixture components in ASH prior: %d \n', K);

% number of retained samples after burn-in and thinning:
% draws Nburn+Nthin, Nburn+2*Nthin, ..., up to Ndraw are kept
Nstart = Nburn + Nthin;
Nsam = length(Nstart:Nthin:Ndraw);
tsam = 0;
Naccept = 0;

% preallocate memory for mcmc output
% labels are small positive integers, so store them compactly as uint8
zsam = zeros(Nsam, p, 'uint8');
bsam = zeros(Nsam, p);
wsam = zeros(Nsam, K);
lsam = zeros(Nsam, 1);

% pre-compute some quantities used in mcmc iterations
q = betahat ./ (se.^2);  % appears in the linear term of the log likelihood
zs = betahat ./ se;      % marginal z-scores

% lower/upper bound on lambda and step size of proposal
lambda_LB = 1e-12;
lambda_UB = 1e1;
sigma_lambda = 0.01;

% initiate latent label based on marginal z-scores
abz = abs(zs);
[gamma_start, snp_rank] = initiate_model(p, abz);
z = gamma_start + 1;

% set starting labels for the large effect
z(z==2) = round(2*K/3) + 1;

% initiate the label counts given z
zc = zeros(1, K);
zc(1) = sum(z==1);
zc(round(2*K/3)+1) = p - zc(1);

% initiate beta given the initial z
b = sigma_beta(z) .* randn(1, p);
bs = b ./ se;

% initiate lambda by 1
l = 1;
% logit-type transform of lambda onto the real line for the MH random walk
lt = log((l-lambda_LB)/(lambda_UB-l));

% initiate omega by Dir(1,...,1)
w = dirichlet_sample(ones(1, K));
fprintf('model initiation done \n')

% rank-based proposal distribution
p_gamma = pmf_ugmix(p, 2e3);
p_gamma = p_gamma(snp_rank);

% if R is an identity matrix, then we have a faster implementation
% use isequal rather than `if R == eye(p)`: isequal states the intent
% explicitly instead of relying on `if` requiring all matrix entries true
matrix_type = 1;
if isequal(R, eye(p))
  matrix_type = 0;
  fprintf('faster computation available if R is identity matrix \n')
end

% viz the mcmc progress
progress_bar = progress('init','start MCMC iteration');

% the FIRST mcmc step is run outside FOR loop so that the (beta, z)
% log likelihood/posterior can be initialized once via compute_likpos
% update (beta, zlabel) given data and (omega, lambda)
[b_loglik, b_logpos] = compute_likpos(q, se, R, sigma_beta, b, z, zc, w, matrix_type);
[b, bs, z, zc, b_loglik, b_logpos] = update_bz(zs, se, R, b, bs, z, zc, b_loglik, b_logpos, w, sigma_beta, matrix_type, p_gamma);
% update omega given data and (beta, zlabel, lambda)
w = update_omega(zc, l, K);
% update lambda given data and (beta, zlabel, omega)
[lt_loglik, l] = loglik_theta(lt, w, K, lambda_LB, lambda_UB);
[l, lt, lt_loglik, Naccept] = update_lambda(w, l, lt, lt_loglik, sigma_lambda, K, lambda_LB, lambda_UB, Naccept);
% record draw t=1 only when every draw is kept (no burn-in, no thinning)
if Nburn == 0 && Nthin == 1
  tsam = tsam + 1;
  bsam(tsam, :) = b;
  zsam(tsam, :) = uint8(z);
  wsam(tsam, :) = w;
  lsam(tsam) = l;
end
progress_bar = progress(progress_bar, 1/Ndraw);

%% mcmc rountine to sample (beta, z, omega, lambda)
for t=2:Ndraw
  % update (beta, zlabel) given data and (omega, lambda)
  [b, bs, z, zc, b_loglik, b_logpos] = update_bz(zs, se, R, b, bs, z, zc, b_loglik, b_logpos, w, sigma_beta, matrix_type, p_gamma);
  % update omega given data and (beta, zlabel, lambda)
  w = update_omega(zc, l, K);
  % update lambda given data and (beta, zlabel, omega)
  [l, lt, lt_loglik, Naccept] = update_lambda(w, l, lt, lt_loglik, sigma_lambda, K, lambda_LB, lambda_UB, Naccept);
  % record the MCMC simulated draws of parameters
  if t > Nburn && mod(t-Nburn, Nthin) == 0
    tsam = tsam + 1;
    bsam(tsam, :) = b;
    zsam(tsam, :) = uint8(z);
    wsam(tsam, :) = w;
    lsam(tsam) = l;
  end
  progress_bar = progress(progress_bar, t/Ndraw);
end
end
function [gamma_start, snp_rank] = initiate_model(p, abz)
% USAGE: assign initial value to gamma for MCMC run by marginal association p-value/z-score
% INPUT:
% p: the number of snps analyzed
% abz: the absolute z-score obtained from single-marker model
% OUTPUT:
% gamma_start: the initial state of the inclusion vector, 0/1, 1 by p
% snp_rank: the rank of each snp based on its marginal p-value, integer, 1 by p
gamma_start = zeros(1, p);

% two-sided 0.05 significance threshold on |z|, Bonferroni-corrected over p SNPs
z_threshold = invnormcdf( 1 - (0.025/p) ); % or use matlab stat tool box: norminv
is_hit = abz > z_threshold;

% rank SNPs from most to least marginally significant
[~, snp_rank] = sort(abz, 'descend');

% switch on all genome-wide hits, but always start with at least 10 SNPs
if sum(is_hit) < 10
  gamma_start(snp_rank(1:10)) = 1;
else
  gamma_start(is_hit) = 1;
end
end
function p_gamma = pmf_ugmix(p, geomean)
% USAGE: find pmf of mixture of uniform (0.3) and truncated geometric (0.7) rvs
% INPUT:
% p: scalar, specifying the support for the mixture rv on 1, 2, ... p
% geomean: the mean parameter of the truncated geometrical rv, scalar
% OUTPUT:
% p_gamma: pmf vector, 1 by p

% geometric success probability implied by the target mean (same as piMASS)
succ_prob = 1 - exp(-1/geomean);

% uniform component: mass 0.3 spread evenly over {1, ..., p}
uniform_part = repmat(0.3/p, 1, p);

% geometric component: mass 0.7, truncated to {1, ..., p} and renormalized
decay = (1 - succ_prob) .^ (0:p-1);
geometric_part = (0.7 * succ_prob / (1 - (1-succ_prob)^p)) * decay;

% mix the two components and normalize to guard against rounding drift
p_gamma = uniform_part + geometric_part;
p_gamma = p_gamma / sum(p_gamma);
end
function [loglik, logpos] = compute_likpos(q, se, R, sigmabeta, beta, zlabel, labelcounts, omega, matrix_type)
% USAGE: compute log p(betahat|beta) and log p(beta, zlabel|betahat, theta)
% INPUT:
% q: 1 by p, betahat ./ (se.^2)
% se: 1 by p
% R: p by p correlation matrix
% sigmabeta: 1 by K, prior SD on beta
% beta: 1 by p
% zlabel: 1 by p
% labelcounts: 1 by K
% omega: 1 by K, postive and sum to 1
% matrix_type: 0 if R is identity; 1 otherwise
% OUTPUT:
% loglik: log p(betahat|beta) up to some constant
% logpos: log p(beta, z|betahat, theta) up to some constant
% p(beta, z|betahat, theta) is proportional to
% p(betahat|beta) * p(beta|z, theta) * p(z|theta)

% log p(z|theta): multinomial label probabilities
logpos = dot(labelcounts, log(omega));

% log p(beta|z, theta): independent zero-mean normals with SD sigmabeta(z)
prior_sd = sigmabeta(zlabel);
logpos = logpos - dot(labelcounts, log(sigmabeta));
logpos = logpos - 0.5 * sum((beta ./ prior_sd).^2);

% log p(betahat|beta): the quadratic term needs R only when R is not identity
sb = beta ./ se;
if matrix_type == 0
  loglik = dot(q, beta) - 0.5*dot(sb, sb);
else
  loglik = dot(q, beta) - 0.5*(sb * R * sb');
end

logpos = logpos + loglik;
end
function omega = update_omega(labelcounts, lambda, K)
% USAGE: Gibbs update of omega
% INPUT:
% labelcounts: 1 by K
% lambda: current value of lambda
% K: the number of label types
% OUTPUT:
% omega: updated value of omega, 1 by K, positive and sum to 1

% posterior Dirichlet parameter: symmetric prior concentration lambda plus label counts
dir_par = labelcounts + lambda * ones(1, K);

% draw from the updated Dirichlet, then renormalize to guard against rounding drift
omega = dirichlet_sample(dir_par);
omega = omega ./ sum(omega);
end
function r = dirichlet_sample(a,n)
% Author: Thomas Minka
% Source: http://research.microsoft.com/en-us/um/people/minka/software/fastfit/
% DIRICHLET_SAMPLE Sample from Dirichlet distribution.
%
% DIRICHLET_SAMPLE(a) returns a probability vector sampled from a
% Dirichlet distribution with parameter vector A.
% DIRICHLET_SAMPLE(a,n) returns N samples, collected into a matrix, each
% vector having the same orientation as A.
%
% References:
% [1] L. Devroye, "Non-Uniform Random Variate Generation",
% Springer-Verlag, 1986
% This is essentially a generalization of the method for Beta rv's.
% Theorem 4.1, p.594: draw independent Gamma(a_i) rvs and normalize by their sum.
if nargin < 2
  n = 1;
end

% remember the orientation of the input so the output matches it
was_row = (rows(a) == 1);
a = a(:);

% y = gamrnd(repmat(a, 1, n),1);
% randgamma is faster
gam = randgamma(repmat(a, 1, n));

% normalize each column; protect against a degenerate all-zero column
totals = col_sum(gam);
totals(totals == 0) = 1;
r = gam ./ repmat(totals, rows(gam), 1);

if was_row
  r = r';
end
end
function [loglik, lambda] = loglik_theta(theta, omega, K, a, b)
% USAGE: transform theta to lambda; compute log p(theta|omega)
% INPUT:
% theta: scalar, logit transformation of lambda
% omega: 1 by K, all positive and sum 1
% K: scalar
% a & b: lower and upper bound of lambda
% OUTPUT:
% loglik: scalar, log p(theta|omega)
% lambda: scalar

% inverse-logit map from the real line back into the interval (a, b)
lambda = (a + b*exp(theta)) / (1 + exp(theta));

% log of the symmetric Dirichlet(lambda, ..., lambda) density evaluated at omega
loglik = gammaln(K*lambda) - K * gammaln(lambda);
loglik = loglik + lambda * sum(log(omega));

% log Jacobian of the logit change of variables: log(e^theta / (1+e^theta)^2)
loglik = loglik + theta - 2 * log(1 + exp(theta));
end
function [lambda, theta, theta_loglik, Naccept] = update_lambda(omega, lambda_old, theta_old, loglik_old, sigma_theta, K, a, b, Naccept)
% USAGE: update theta:=logit(lambda); use Metropolis-Hastings (MH) normal random walk
% INPUT:
% omega: 1 by K, positive and sum to 1
% lambda_old: scalar, the current value of lambda
% theta_old: scalar, logit transform of lambda_old
% loglik_old: scalar, the log likelihood at theta_old
% sigma_theta: scalar, the SD of the normal random walk
% K: number of mixture components
% a & b: the lower and upper bounds of lambda
% Naccept: the total number of acceptance up til now
% OUTPUT:
% lambda: the inverse logit transform of theta
% theta: the MH updated value of theta
% theta_loglik: the log likelihood at updated value of theta
% Naccept: the total number of acceptance after this MH update

% with probability 0.33 compound between 2 and 20 local moves to improve mixing
if rand < 0.33
  perm = randperm(19);
  nsteps = 1 + perm(1);
else
  nsteps = 1;
end

% propose a new theta and evaluate its log likelihood / implied lambda
theta_prop = propose_theta(theta_old, sigma_theta, nsteps);
[loglik_prop, lambda_prop] = loglik_theta(theta_prop, omega, K, a, b);

% symmetric proposal, so the log MH ratio is just the loglik difference;
% the short-circuit avoids drawing a uniform when acceptance is certain
log_mh_ratio = loglik_prop - loglik_old;
if log_mh_ratio > 0 || log(rand) < log_mh_ratio
  Naccept = Naccept + 1;
  lambda = lambda_prop;
  theta = theta_prop;
  theta_loglik = loglik_prop;
else
  lambda = lambda_old;
  theta = theta_old;
  theta_loglik = loglik_old;
end
end
function theta_new = propose_theta(theta_old, sigma_theta, repeat)
% USAGE: propose the gaussian random walk update for theta in the MH step
% INPUT:
% theta_old: the current value of theta, scalar
% sigma_theta: the sd of the normal move added to theta_old, scalar, positive
% repeat: the number of local proposals to compound, scalar
% OUTPUT:
% theta_new: the newly proposed value of theta, scalar
% NB: this is a symmetric jump, so the contribution to log MH ratio is ZERO
% NOTE(review): an earlier comment mentioned reflecting at the boundary, but
% no reflection is implemented here -- theta appears unconstrained; confirm intended
theta_new = theta_old;
for step = 1:repeat
  % compound one scalar normal increment per step (one randn draw each)
  theta_new = theta_new + randn * sigma_theta;
end
end