In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# Evaluate 'paths.sh' as R code in the current session.
# NOTE(review): later cells read `out_d`, which is presumably defined by this
# file -- confirm. The file has a shell extension, so it must contain only
# statements that are also valid R for this call to succeed.
source('paths.sh')

In [3]:
# Read the residual-regression table. The directory is `out_d` with its first
# '/scratch' occurrence replaced by '/oak/stanford'. Column names that start
# with '#' have that '#' removed.
df <- rename_with(
    fread(file.path(
        str_replace(out_d, '/scratch', '/oak/stanford'),
        'residual_regression.tsv.gz'
    )),
    function(col_name) str_replace(col_name, '#', ''),
    starts_with("#")
)


In [4]:
# Keep the rows whose `variable` starts with 'center_id' and attach a
# Benjamini-Hochberg adjusted value (BHq) computed from P over those rows.
center_df <- df %>%
    filter(str_detect(variable, '^center_id')) %>%
    mutate(BHq = p.adjust(P, method = 'BH'))

# Tabulate how many rows pass each cutoff (BHq < 0.05, BHq < 0.01, and
# P below 0.05 divided by the number of rows), plus the fraction of all rows.
# The count() expressions are left verbatim: they become the column names.
mutate(
    count(center_df, BHq < 0.05, BHq < 0.01, P < (0.05 / nrow(center_df))),
    frac = n / nrow(center_df)
)


BHq < 0.05,BHq < 0.01,P < (0.05/nrow(center_df)),n,frac
<lgl>,<lgl>,<lgl>,<int>,<dbl>
False,False,False,27815,0.87249059
True,False,False,1274,0.03996236
True,True,False,1494,0.04686324
True,True,True,1297,0.04068381


In [5]:
# Histogram of -log10(P) with an overlaid cumulative count.
#
# A throwaway ggplot is built first so that stat_bin (bin width 1) does the
# binning; layer_data() then pulls the computed per-bin table of that layer.
# The running total is taken over all bins *before* bins with x >= 20 are
# dropped, so the line still accounts for every bin at or below each x shown.
p_assessment_center <- (
    ggplot(center_df, aes(x = -log10(P))) +
        stat_bin(binwidth = 1)
) %>%
    layer_data(1) %>%
    arrange(x) %>%
    mutate(cumsum_y = cumsum(y)) %>%
    filter(x < 20) %>%
    ggplot(aes(x = x, y = y)) +
    # Red vertical line: -log10 of 0.05 divided by the number of rows.
    geom_vline(xintercept = -log10(0.05 / nrow(center_df)), color = 'red') +
    # Gray horizontal line: total number of rows (1.0 on the secondary axis).
    geom_hline(yintercept = nrow(center_df), color = 'gray') +
    geom_col() +
    geom_line(aes(y = cumsum_y), color = 'black') +
    scale_y_continuous(
        name = 'Number of (trait, assessment center) pairs',
        # Secondary axis rescales counts by the total number of rows.
        sec.axis = sec_axis(
            ~ . / nrow(center_df),
            name = "Cumulative frequency"
        )
    ) +
    labs(
        title = 'Significance of the assessment centers\nin phenotype prediction',
        x = latex2exp::TeX('$-\\log_{10}(P)$ of assessment center terms in regression model')
    ) +
    theme_bw(base_size = 18)


In [6]:
# Save the figure once per extension (PNG, then PDF) at width 8 and height 8
# in ggsave's default units, into the current working directory.
for (ext in c('png', 'pdf')) {
    ggsave(
        filename = sprintf('assessment_center_pvals.%s', ext),
        plot = p_assessment_center,
        width = 8,
        height = 8
    )
}