/
gsea_wrapper.m
194 lines (166 loc) · 9.73 KB
/
gsea_wrapper.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
function [log10bf,logw1,alpha,mu,s] = gsea_wrapper(method,betahat,se,SiRiS,se_all,Nsnp_all,snps,h,theta0,theta,logw0,alpha0,mu0,alpha,mu)
% USAGE: perform the full variational inference for the RSS-BVSR baseline model
% enrichment model: the candidate gene set is enriched for genotype-phenotype association
% INPUT:
% method: the implementation of rss-varbvsr, string
% betahat: effect size estimates under single-SNP model, p by 1 array or C by 1 cell array
% se: standard errors of betahat, p by 1 array or C by 1 cell array
% SiRiS: inv(S)*R*inv(S), sparse matrix (CCS format), p by p array or C by 1 cell array
% se_all: standard errors for all SNPs in the genome, numsnp by 1
% Nsnp_all: sample sizes for all SNPs in the genome, numsnp by 1
% snps: indices of SNPs on the genome that are assigned to the gene set, p by 1
% h: the grid of proportion of phenotypic variance explained by available genotypes, nh by 1
% theta0: the grid of the genome-wide log-odds (base 10), n0 by 1
% theta: the grid of the enrichment (base 10), n1 by 1
% logw0: log-importance weights for the grid of theta0 under the baseline model, n0 by nh
% alpha0: variational posterior inclusion probabilities under the baseline model, p by n0 by nh
% mu0: variational posterior expected additive effects (if the SNP is included) under the baseline model, p by n0 by nh
% alpha: initial values of the posterior inclusion probabilities under the erichment model, p by n0 by n1 by nh
% mu: initial values of the expected additive effects (if the SNP is included) under the enrichment model, p by n0 by n1 by nh
% OUTPUT:
% log10bf: log 10 of gene set enrichment Bayes factor (BF), scalar
% logw1: log-importance weights under the enrichment model, n0 by n1 by nh
% alpha: the variational posterior inclusion probabilities under the enrichment model, p by n0 by n1 by nh
% mu: the variational posterior means of the additive effects (if the SNP is included) under the enrichment model, p by n0 by n1 by nh
% s: the variational posterior variances of the additive effects (if the SNP is included) under the enrichment model, p by n0 by n1 by nh
% NOTE:
% To calculate the BF, we integrate over the hyperparameters using importance sampling under the
% assumption that SNPs outside the enriched pathways are unaffected by pathways (a posteriori).
% See pp 5-6 of Supplementary text of Carbonetto and Stephens (PLoS Genetics, 2013).
% get the number of settings of the genome-wide log-odds (n0)
n0 = numel(theta0);
% get the number of settings of the enrichment parameter (n1)
n1 = numel(theta);
% get the number of settings of the heritability parameter (nh)
nh = numel(h);
% sanity check on subsetting the se data
if iscell(se)
se_vec = cell2mat(se);
else
se_vec = se;
end
if all(se_vec==se_all(snps)) ~= 1
error('Inconsistent data input ...');
end
clear se_vec;
% first get the best initialization for the variational parameters
fprintf('Finding best initialization for %d combinations \n',n0*n1*nh);
fprintf('of hyperparameters.\n');
[logw1,alpha,mu] = gsea_outerloop(method,betahat,se,SiRiS,se_all,Nsnp_all,snps,h,theta0,theta,alpha,mu,logw0,alpha0,mu0);
% next choose an initialization common to all the runs of the coordinate ascent algorithm
% this is chosen from the hyperparameters with the highest variational estimate of the log-importance weight
[~, i] = max(logw1(:));
alpha = repmat(alpha(:,i),[1 n0 n1 nh]);
mu = repmat(mu(:,i),[1 n0 n1 nh]);
% compute the unnormalized log-importance weights
fprintf('Computing importance weights for %d combinations \n',n0*n1*nh);
fprintf('of hyperparameters.\n');
[logw1,alpha,mu,s] = gsea_outerloop(method,betahat,se,SiRiS,se_all,Nsnp_all,snps,h,theta0,theta,alpha,mu,logw0,alpha0,mu0);
% compute the marginal log-likelihood under null using importance sampling
c = max(logw0(:));
logZ0 = c + log(mean(exp(logw0(:) - c)));
% compute the marginal log-likelihood under enrichment using importance sampling
c = max(logw1(:));
logZ1 = c + log(mean(exp(logw1(:) - c)));
% get the numerical estimate of the Bayes Factor (BF)
log10bf = (logZ1 - logZ0) / log(10);
% print the BF value.
fprintf('Log 10 BF = %0.2e\n',log10bf);
end
function [logw1,alpha,mu,s] = gsea_outerloop(method,betahat,se,SiRiS,se_all,Nsnp_all,snps,h,theta0,theta,alpha,mu,logw0,alpha0,mu0)
% USAGE: the outer loop of the variational inference for the RSS-BVSR enrichment model
% INPUT:
% method: the implementation of rss-varbvsr, character
% betahat: effect size estimates under single-SNP model, p by 1 array or C by 1 cell array
% se: standard errors of betahat, p by 1 array or C by 1 cell array
% SiRiS: inv(S)*R*inv(S), sparse matrix (ccs format), p by p array or C by 1 cell array
% se_all: standard errors for all SNPs in the genome, numsnp by 1
% Nsnp_all: sample sizes for all SNPs in the genome, numsnp by 1
% snps: indices of SNPs on the genome that are assigned to the gene set, p by 1
% h: the grid of proportion of phenotypic variance explained by available genotypes, nh by 1
% theta0: the grid of the genome-wide log-odds (base 10), n0 by 1
% theta: the grid of the enrichment (base 10), n1 by 1
% logw0: log-importance weights for the grid of theta0 under the baseline model, n0 by nh
% alpha0: the variational posterior inclusion probabilities under the baseline model, p by n0 by nh
% mu0: the variational posterior means of the additive effects (if the SNP is included) under the baseline model, p by n0 by nh
% alpha: initial values of the posterior inclusion probabilities under the erichment model, p by n0 by n1 by nh
% mu: initial values of the expected additive effects (if the SNP is included) under the enrichment model, p by n0 by n1 by nh
% OUTPUT:
% logw1: log-importance weights under the enrichment model, n0 by n1 by nh
% alpha: the variational posterior inclusion probabilities under the enrichment model, p by n0 by n1 by nh
% mu: the variational posterior means of the additive effects under the enrichment model, p by n0 by n1 by nh
% s: the variational posterior variances of the additive effects under the enrichment model, p by n0 by n1 by nh
% get the number of SNPs assigned to the enriched pathway (p)
if iscell(betahat)
p = length(cell2mat(betahat));
else
p = length(betahat);
end
% get the number of settings of the genome-wide log-odds (n0)
n0 = numel(theta0);
% get the number of settings of the enrichment parameter (n1)
n1 = numel(theta);
% get the number of settings of the heritability parameter (nh)
nh = numel(h);
% get the number of SNPs in the whole genome (ng)
ng = length(se_all);
% initialize storage for log-importance weights under the alternative
% hypothesis, and for the variances of the additive effects
logw1 = zeros(n0,n1,nh);
s = zeros(p,n0,n1,nh);
iter = 0;
% repeat for each setting of the heritability (h)
for k = 1:nh
% repeat for each setting of the genome-wide log-odds (theta0)
for i = 1:n0
% NOTE:
% First compute the variational lower bound to the marginal log-likelihood
% under the null hypothesis when there is no enrichment, only for SNPs in
% the gene set. Note that rss_varbvsr defines the log-odds ratio using the
% natural logarithm, so we need to multiply theta0 by log(10).
log10odds_null = theta0(i) * ones(ng,1); % genome-wide baseline
sigb0 = calc_beta_sd(se_all, Nsnp_all, sigmoid10(log10odds_null), h(k));
logodds0 = log(10) * theta0(i);
options = struct('alpha',alpha0(:,i,k),'mu',mu0(:,i,k),'verbose',true);
% run rss-varbvsr under null
F0 = rss_varbvsr_wrapper(method,betahat,se,SiRiS,sigb0,logodds0,options);
% repeat for each setting of the enrichment parameter (theta)
for j = 1:n1
% display the current status of the inference procedure
iter = iter + 1;
fprintf('(%04d) theta0 = %+0.2f, theta = %0.2f, h = %0.2f \n',iter,theta0(i),theta(j),h(k));
% NOTE:
% Next compute the marginal log-likelihood under the alternative
% hypothesis only for SNPs in the pathway. Note that rss_varbvsr
% defines the log-odds ratio using the natural logarithm, so we
% need to multiply (theta0+theta) by log(10).
log10odds_gsea = theta0(i) * ones(ng,1); % genome-wide baseline
log10odds_gsea(snps) = theta0(i) + theta(j); % enriched in the pathway
sigb1 = calc_beta_sd(se_all, Nsnp_all, sigmoid10(log10odds_gsea), h(k));
logodds1 = log(10) * (theta0(i) + theta(j));
options = struct('alpha',alpha(:,i,j,k),'mu',mu(:,i,j,k),'verbose',false);
% run rss-varbvsr under the enrichment model
[F1,alpha(:,i,j,k),mu(:,i,j,k),s(:,i,j,k)] = rss_varbvsr_wrapper(method,betahat,se,SiRiS,sigb1,logodds1,options);
% NOTE:
% Compute the importance weight under the alternative. The prior and
% proposal for the genome-wide log-odds and enrichment parameters are
% assumed to be uniform, so they cancel out from the importance
% weights. For an explanation why we can decompose the variational
% lower bound in this way, see pp 5-6 of Supplementary text of
% Carbonetto and Stephens (PLoS Genetics, 2013).
logw1(i,j,k) = logw0(i,k) + F1 - F0;
end
end
end
fprintf('\n');
end
function sigb = calc_beta_sd(se, Nsnp, pival, h)
% USAGE: calculate sigma_beta using summary data, pi and pve under enrichment
pxy = pival ./ (Nsnp .* (se.^2));
psi = h ./ sum(pxy);
sigb = sqrt(psi);
end
function y = sigmoid10(x)
% USAGE: compute the inverse of logit10(x)
y = 1./(1 + 10.^(-x));
end