In [1]:
require(tidyverse)
require(data.table)

Loading required package: tidyverse
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.0       ✔ purrr   0.3.1  
✔ tibble  2.0.1       ✔ dplyr   0.8.0.1
✔ tidyr   0.8.3       ✔ stringr 1.4.0  
✔ readr   1.3.1       ✔ forcats 0.4.0  
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: data.table

Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, first, last

The following object is masked from ‘package:purrr’:

    transpose



In [2]:
sscore_f <- '/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_PRS/HC_20190303/8_score_all/HC382.sscore'
sscore_df <- fread(
    cmd=paste0('cat ', sscore_f, ' | cut -f2,6'), sep='\t', data.table=F
)


In [5]:
phe_f <- '/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/extras/highconfidenceqc/phe/HC382.phe'
phe_df <- fread(
    cmd=paste0('cat ', phe_f, ' | cut -f2,3'), sep='\t', data.table=F
)
colnames(phe_df) <- c('IID', 'HC382')


In [31]:
combined_df <- sscore_df %>% rename(snpnet_PRS = SCORE1_AVG) %>% 
left_join(phe_df, by='IID') %>%
rename(outcome = HC382) %>% 
filter(outcome != -9) %>%
drop_na() %>%
mutate(Percentile = rank(-snpnet_PRS) / n())


In [32]:
combined_df %>% dim()

In [34]:
cnt_middle <- combined_df %>% 
filter(0.4 <= Percentile, Percentile <= 0.6) %>% 
count(outcome) %>% rename(n_40_60 = n)

In [35]:
cnt_middle

outcome,n_40_60
1,87152
2,10522


In [106]:
compute_or <- function(df){
    joined_df <- df %>% count(outcome) %>% inner_join(cnt_middle, by='outcome') %>% gather(bin, cnt, -outcome)
    n_TP <- joined_df %>% filter(bin == 'n', outcome == 2) %>% select(cnt) %>% pull()
    n_TN <- joined_df %>% filter(bin != 'n', outcome == 1) %>% select(cnt) %>% pull()
    n_FP <- joined_df %>% filter(bin != 'n', outcome == 2) %>% select(cnt) %>% pull()
    n_FN <- joined_df %>% filter(bin == 'n', outcome == 1) %>% select(cnt) %>% pull()
    or <- (n_TP * n_TN) / (n_FP * n_FN)
    lor <- log(or)
    se_lor <- sqrt((1/n_TP) + (1/n_TN) + (1/n_FP) + (1/n_FN))
    lor <- exp(lor) - 1.96 * se_lor
    uor <- exp(lor) + 1.96 * se_lor
    c(or, lor, uor, n_TP / (n_TP + n_FN), n_TP, n_FN)
#     %>% print()    
}

In [121]:
compute_or_wrapper <- function(idx, df){
    bin_u = 1 - idx * 0.1
    bin_l = 1 - (idx + 1) * 0.1
    c(
        1 - bin_u, 1 - bin_l, 
        df %>% filter(bin_l < Percentile, Percentile <= bin_u) 
        %>% compute_or()
    )    
}

In [122]:
res <- lapply(0:9, compute_or_wrapper, combined_df)

In [123]:
df <- as.data.frame(res) 
colnames(df) <- lapply(0:9, function(x){x * 10 + 5})
rownames(df) <- c(
    'l', 'u', 'OR', 'OR_l', 'OR_u', 
    'Prevalence', 
    'n_cases', 'n_conts'
)
dff <- df %>%
rownames_to_column('id')

In [124]:
dff %>% fwrite('HC382.csv', col.names=T)

In [126]:
dff

id,5,15,25,35,45,55,65,75,85,95
l,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
u,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
OR,0.435018,0.5670122,0.6944641,0.8041877,0.9161921,1.085349,1.259013,1.576222,2.073254,3.757193
OR_l,0.3895391,0.5255277,0.6555891,0.7669812,0.8803264,1.05104,1.225908,1.544681,2.043246,3.729344
OR_u,1.521779,1.732836,1.965152,2.190463,2.447552,2.894933,3.440363,4.718018,7.745622,41.67961
Prevalence,0.04989967,0.06407027,0.0773578,0.08849847,0.09959662,0.1158548,0.1319464,0.1598755,0.2001966,0.3120585
n_cases,2437.0,3129.0,3778.0,4322.0,4864.0,5658.0,6444.0,7808.0,9777.0,15240.0
n_conts,46401.0,45708.0,45060.0,44515.0,43973.0,43179.0,42394.0,41030.0,39060.0,33597.0
