In [1]:
require(tidyverse)
require(data.table)

Loading required package: tidyverse
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.0       ✔ purrr   0.3.1  
✔ tibble  2.0.1       ✔ dplyr   0.8.0.1
✔ tidyr   0.8.3       ✔ stringr 1.4.0  
✔ readr   1.3.1       ✔ forcats 0.4.0  
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: data.table

Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, first, last

The following object is masked from ‘package:purrr’:

    transpose



In [2]:
sscore_f <- '/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_PRS/HC_20190303/8_score_all/HC382.sscore'
sscore_df <- fread(
    cmd=paste0('cat ', sscore_f, ' | cut -f2,6'), sep='\t', data.table=F
)


In [5]:
phe_f <- '/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/extras/highconfidenceqc/phe/HC382.phe'
phe_df <- fread(
    cmd=paste0('cat ', phe_f, ' | cut -f2,3'), sep='\t', data.table=F
)
colnames(phe_df) <- c('IID', 'HC382')


In [31]:
combined_df <- sscore_df %>% rename(snpnet_PRS = SCORE1_AVG) %>% 
left_join(phe_df, by='IID') %>%
rename(outcome = HC382) %>% 
filter(outcome != -9) %>%
drop_na() %>%
mutate(Percentile = rank(-snpnet_PRS) / n())


In [32]:
combined_df %>% dim()

In [34]:
cnt_middle <- combined_df %>% 
filter(0.4 <= Percentile, Percentile <= 0.6) %>% 
count(outcome) %>% rename(n_40_60 = n)

In [35]:
cnt_middle

outcome,n_40_60
1,87152
2,10522


In [96]:
compute_or <- function(df){
    joined_df <- df %>% count(outcome) %>% inner_join(cnt_middle, by='outcome') %>% gather(bin, cnt, -outcome)
    n_TP <- joined_df %>% filter(bin == 'n', outcome == 2) %>% select(cnt) %>% pull()
    n_TN <- joined_df %>% filter(bin != 'n', outcome == 1) %>% select(cnt) %>% pull()
    n_FP <- joined_df %>% filter(bin != 'n', outcome == 2) %>% select(cnt) %>% pull()
    n_FN <- joined_df %>% filter(bin == 'n', outcome == 1) %>% select(cnt) %>% pull()
    or <- (n_TP * n_TN) / (n_FP * n_FN)
    lor <- log(or)
    se_lor <- sqrt((1/n_TP) + (1/n_TN) + (1/n_FP) + (1/n_FN))
    lor <- exp(lor - 1.96 * se_lor)
    uor <- exp(lor + 1.96 * se_lor)
    c(or, lor, uor, n_TP / (n_TP + n_FN), n_TP, n_FN)
#     %>% print()    
}

In [99]:
compute_or_wrapper <- function(idx, df){
    bin_l = idx * 0.1
    bin_u = (idx + 1) * 0.1
    c(
        bin_l, bin_u, 
        df %>% filter(bin_l < Percentile, Percentile <= bin_u) 
        %>% compute_or()
    )    
}

In [100]:
res <- lapply(0:9, compute_or_wrapper, combined_df)

In [101]:
dff <- as.data.frame(res) 
colnames(dff) <- lapply(0:9, function(x){x * 10 + 5})


In [102]:
rownames(dff) <- c(
    'l', 'u', 'OR', 'OR_l', 'OR_u', 
    'Prevalence', 
    'n_cases', 'n_conts'
)

In [103]:
dff

Unnamed: 0,5,15,25,35,45,55,65,75,85,95
l,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
u,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
OR,3.757193,2.073254,1.576222,1.259013,1.085349,0.9161921,0.8041877,0.6944641,0.5670122,0.435018
OR_l,3.654001,2.011964,1.527283,1.218016,1.048743,0.8839145,0.7748165,0.6679848,0.5439712,0.415677
OR_u,39.71982,7.705791,4.753225,3.494257,2.953681,2.508739,2.25246,2.027614,1.795809,1.585906
Prevalence,0.3120585,0.2001966,0.1598755,0.1319464,0.1158548,0.09959662,0.08849847,0.0773578,0.06407027,0.04989967
n_cases,15240.0,9777.0,7808.0,6444.0,5658.0,4864.0,4322.0,3778.0,3129.0,2437.0
n_conts,33597.0,39060.0,41030.0,42394.0,43179.0,43973.0,44515.0,45060.0,45708.0,46401.0


In [104]:
dff %>% fwrite('HC382.csv', row.names = T, col.names=T)

In [105]:
combined_df %>% filter(
    .2 < Percentile, Percentile <= .3
)  %>% compute_or() %>% print()

[1] 1.576222e+00 1.527283e+00 4.753225e+00 1.598755e-01 7.808000e+03
[6] 4.103000e+04


In [85]:
combined_df %>% filter(
    .2 < Percentile, Percentile <= .3
) %>% count(outcome)

outcome,n
1,41030
2,7808


In [77]:
combined_df %>% filter(
    .2 < Percentile, Percentile <= .3
) %>%  count(outcome) %>% inner_join(cnt_middle, by='outcome') %>% gather(bin, cnt, -outcome)

outcome,bin,cnt
1,n,41030
2,n,7808
1,n_40_60,87152
2,n_40_60,10522
