-
Notifications
You must be signed in to change notification settings - Fork 44
/
ldsc.R
252 lines (215 loc) · 8.37 KB
/
ldsc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
################################################################################
# heteroscedasticity and overcounting weights
WEIGHTS <- function(pred, w_ld) {
1 / (pred^2 * w_ld)
}
crossprod2 <- function(x, y) drop(base::crossprod(x, y))
# equivalent to stats::lm.wfit(cbind(1, x), y, w)
wlm <- function(x, y, w) {
wx <- w * x
W <- sum(w)
WX <- sum(wx)
WY <- crossprod2(w, y)
WXX <- crossprod2(wx, x)
WXY <- crossprod2(wx, y)
alpha <- (WXX * WY - WX * WXY) / (W * WXX - WX^2)
beta <- (WXY * W - WX * WY) / (W * WXX - WX^2)
list(intercept = alpha, slope = beta, pred = x * beta + alpha)
}
# equivalent to stats::lm.wfit(as.matrix(x), y, w)
wlm_no_int <- function(x, y, w) {
wx <- w * x
WXX <- crossprod2(wx, x)
WXY <- crossprod2(wx, y)
beta <- WXY / WXX
list(slope = beta, pred = x * beta)
}
################################################################################
#' LD score regression
#'
#' @param ld_score Vector of LD scores.
#' @param ld_size Number of variants used to compute `ld_score`.
#' @param chi2 Vector of chi-squared statistics.
#' @param sample_size Sample size of GWAS corresponding to chi-squared statistics.
#' Possibly a vector, or just a single value.
#' @param chi2_thr1 Threshold on `chi2` in step 1. Default is `30`.
#' This is equivalent to parameter `--two-step`.
#' @param chi2_thr2 Threshold on `chi2` in step 2. Default is `Inf` (none).
#' @param blocks Either a single number specifying the number of blocks,
#' or a vector of integers specifying the block number of each `chi2` value.
#' Default is `200` for `snp_ldsc()`, dividing into 200 blocks of approximately
#' equal size. `NULL` can also be used to skip estimating standard errors,
#' which is the default for `snp_ldsc2()`.
#' @param intercept You can constrain the intercept to some value (e.g. 1).
#' Default is `NULL` in `snp_ldsc()` (the intercept is estimated)
#' and is `1` in `snp_ldsc2()` (the intercept is fixed to 1).
#' This is equivalent to parameter `--intercept-h2`.
#' @inheritParams bigsnpr-package
#'
#' @return Vector of 4 values (or only the first 2 if `blocks = NULL`):
#' - `[["int"]]`: LDSC regression intercept,
#' - `[["int_se"]]`: SE of this intercept,
#' - `[["h2"]]`: LDSC regression estimate of (SNP) heritability (also see
#' [coef_to_liab]),
#' - `[["h2_se"]]`: SE of this heritability estimate.
#'
#' @importFrom bigassertr assert_one_int
#'
#' @export
#'
snp_ldsc <- function(ld_score, ld_size, chi2, sample_size,
blocks = 200,
intercept = NULL,
chi2_thr1 = 30,
chi2_thr2 = Inf,
ncores = 1) {
chi2 <- chi2 + 1e-8
assert_pos(chi2, strict = TRUE)
assert_lengths(chi2, ld_score)
assert_one_int(ld_size)
M <- length(chi2)
if (length(sample_size) == 1) {
sample_size <- rep(sample_size, M)
} else {
assert_lengths(sample_size, chi2)
}
if (is.null(blocks)) {
#### step 1 ####
step1_int <- if (is.null(intercept)) {
ind_sub1 <- which(chi2 < chi2_thr1)
w_ld <- pmax(ld_score[ind_sub1], 1)
x1 <- (ld_score / ld_size * sample_size)[ind_sub1]
y1 <- chi2[ind_sub1]
pred0 <- y1
for (i in 1:100) {
pred <- wlm(x1, y1, w = WEIGHTS(pred0, w_ld))$pred
if (max(abs(pred - pred0)) < 1e-6) break
pred0 <- pred
}
wlm(x1, y1, w = WEIGHTS(pred0, w_ld))$intercept
} else intercept
#### step 2 ####
ind_sub2 <- which(chi2 < chi2_thr2)
w_ld <- pmax(ld_score[ind_sub2], 1)
x <- (ld_score / ld_size * sample_size)[ind_sub2]
y <- chi2[ind_sub2]
yp <- y - step1_int
pred0 <- y
for (i in 1:100) {
pred <- step1_int + wlm_no_int(x, yp, w = WEIGHTS(pred0, w_ld))$pred
if (max(abs(pred - pred0)) < 1e-6) break
pred0 <- pred
}
step2_h2 <- wlm_no_int(x, yp, w = WEIGHTS(pred0, w_ld))$slope
c(int = step1_int, h2 = step2_h2)
} else {
#### delete-a-group jackknife variance estimator ####
if (length(blocks) == 1) {
blocks <- sort(rep_len(seq_len(blocks), M))
} else {
assert_lengths(blocks, chi2)
}
ind_blocks <- split(seq_along(blocks), blocks)
h_blocks <- M / lengths(ind_blocks)
bigparallelr::register_parallel(ncores)
delete_values <- foreach(ind_rm = c(list(NULL), ind_blocks), .combine = "cbind") %dopar% {
keep <- which(!seq_along(ld_score) %in% ind_rm)
snp_ldsc(ld_score[keep], ld_size, chi2[keep], sample_size[keep],
NULL, intercept, chi2_thr1, chi2_thr2)
}
estim <- delete_values[, 1]
# https://doi.org/10.1023/A:1008800423698
int_pseudovalues <- h_blocks * estim[1] - (h_blocks - 1) * delete_values[1, -1]
h2_pseudovalues <- h_blocks * estim[2] - (h_blocks - 1) * delete_values[2, -1]
int_J <- sum(int_pseudovalues / h_blocks)
h2_J <- sum( h2_pseudovalues / h_blocks)
c(int = int_J,
int_se = sqrt(mean((int_pseudovalues - int_J)^2 / (h_blocks - 1))),
h2 = h2_J,
h2_se = sqrt(mean(( h2_pseudovalues - h2_J)^2 / (h_blocks - 1))))
}
}
################################################################################
#' @rdname snp_ldsc
#'
#' @param corr Sparse correlation matrix. Can also be an [SFBM][bigsparser::SFBM-class].
#' @param df_beta A data frame with 3 columns:
#' - `$beta`: effect size estimates
#' - `$beta_se`: standard errors of effect size estimates
#' - `$n_eff`: either GWAS sample size(s) when estimating `beta` for a
#' continuous trait, or in the case of a binary trait, this is
#' `4 / (1 / n_control + 1 / n_case)`; in the case of a meta-analysis, you
#' should sum the effective sample sizes of each study instead of using the
#' total numbers of cases and controls, see \doi{10.1016/j.biopsych.2022.05.029};
#' when using a mixed model, the effective sample size needs to be adjusted
#' as well, see \doi{10.1016/j.xhgg.2022.100136}.
#' @param ind.beta Indices in `corr` corresponding to `df_beta`. Default is all.
#'
#' @export
#'
#' @examples
#' bigsnp <- snp_attachExtdata()
#' G <- bigsnp$genotypes
#' y <- bigsnp$fam$affection - 1
#' corr <- snp_cor(G, ind.col = 1:1000)
#'
#' gwas <- big_univLogReg(G, y, ind.col = 1:1000)
#' df_beta <- data.frame(beta = gwas$estim, beta_se = gwas$std.err,
#' n_eff = 4 / (1 / sum(y == 0) + 1 / sum(y == 1)))
#'
#' snp_ldsc2(corr, df_beta)
#' snp_ldsc2(corr, df_beta, blocks = 20, intercept = NULL)
#'
snp_ldsc2 <- function(corr, df_beta,
blocks = NULL,
intercept = 1,
ncores = 1,
ind.beta = cols_along(corr),
chi2_thr1 = 30,
chi2_thr2 = Inf) {
assert_df_with_names(df_beta, c("beta", "beta_se", "n_eff"))
assert_lengths(ind.beta, rows_along(df_beta))
stopifnot(all(ind.beta %in% cols_along(corr)))
assert_pos(df_beta$beta_se, strict = TRUE)
full_ld <- if (inherits(corr, "dsCMatrix")) {
sp_colSumsSq_sym(corr@p, corr@i, corr@x)
} else if (inherits(corr, "SFBM")) {
ld_scores_sfbm(corr, ind_sub = cols_along(corr) - 1L, ncores = ncores)
} else {
Matrix::colSums(corr^2)
}
snp_ldsc(
ld_score = full_ld[ind.beta],
ld_size = ncol(corr),
chi2 = (df_beta$beta / df_beta$beta_se)^2,
sample_size = df_beta$n_eff,
blocks = blocks,
intercept = intercept,
ncores = ncores,
chi2_thr1 = chi2_thr1,
chi2_thr2 = chi2_thr2
)
}
################################################################################
#' Liability scale
#'
#' Coefficient to convert to the liability scale. E.g. h2_liab = coef * h2_obs.
#'
#' @param K_pop Prevalence in the population.
#' @param K_gwas Prevalence in the GWAS. You should provide this if you used
#' (`n_case + n_control`) as sample size. If using the effective sample size
#' `4 / (1 / n_case + 1 / n_control)` instead, you should keep the default
#' value of `K_gwas = 0.5` as the GWAS case-control ascertainment is already
#' accounted for in the effective sample size.
#'
#' @return Scaling coefficient to convert e.g. heritability to the liability scale.
#' @export
#'
#' @examples
#' h2 <- 0.2
#' h2 * coef_to_liab(0.02)
coef_to_liab <- function(K_pop, K_gwas = 0.5) {
z <- stats::dnorm(stats::qnorm(min(K_pop, 1 - K_pop)))
(K_pop * (1 - K_pop) / z)^2 / (K_gwas * (1 - K_gwas))
}
################################################################################