In [2]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
    library(pROC)
}))


In [3]:
# Run configuration: which PRS-map run and phenotype this notebook inspects.
run_name <- '1_p_factor_v1'
phenotype <- 'HC269'

# `refit` is TRUE only while `data_d` is being built, so that the path points
# at the '2_refit' output directory.
refit <- TRUE

data_d <- file.path(
    '/oak/stanford/groups/mrivas/projects/PRS/private_output/20200908_PRS_map_test',
    run_name, phenotype, if (refit) '2_refit' else '1_fit_w_val'
)

# From here on `refit` is FALSE: code that branches on it (the train/val merge
# applied after the phenotype table is read) keeps 'train' and 'val' separate.
refit <- FALSE

# output (relative paths, i.e. written to the working directory)
# eval_f : per-lambda predictive performance table
# p_val_f: ROC-test p-value table (the file `roc_p_vals` is written to)
eval_f  <- sprintf('AUC_diff_p_eval-refit.%s.tsv', phenotype)
p_val_f <- sprintf('AUC_diff_p_vals-refit.%s.tsv', phenotype)

In [4]:
# input and parameters

#data_d <- '/oak/stanford/groups/mrivas/projects/PRS/private_output/20200908_PRS_map_test'
phe_f <- '/scratch/groups/mrivas/ukbb24983/phenotypedata/master_phe/master.20200828.phe.zst'
covariates       <- c('age', 'sex', paste0('PC', 1:10))
refit_split_strs <- c('non_british_white', 'african', 's_asian', 'e_asian')
# '__PHENOTYPE__' is a placeholder that is substituted with `phenotype` when
# the paths are used.
sscore_f             <- file.path(data_d, '__PHENOTYPE__.sscore.zst')
snpnet_BETAs_f       <- file.path(data_d, 'snpnet.tsv')
snpnet_covar_BETAs_f <- file.path(data_d, 'snpnet.covars.tsv')
# Phenotype IDs starting with 'INI' or 'QT_FC' use the 'gaussian' family;
# every other phenotype uses 'binomial'. `phenotype` is a single string, so a
# scalar `if` with short-circuit `||` is used instead of vectorized `ifelse`.
family <- if (startsWith(phenotype, 'INI') || startsWith(phenotype, 'QT_FC')) {
    'gaussian'
} else {
    'binomial'
}

# output
# `eval_f` is intentionally NOT reassigned here: it is already set alongside
# the run configuration, and pointing it at file.path(data_d, 'snpnet.eval.tsv')
# would make the per-lambda evaluation table written further down overwrite
# that file inside `data_d`.
plot_f <- file.path(data_d, 'snpnet.plot.pdf')
percentile_f <- file.path(data_d, 'snpnet.percentile.tsv')


In [5]:
# Load the snpnet helper script; cat_or_zcat(), compute_phe_score_df() and the
# other helpers called in this notebook are not defined in the notebook itself.
source('/oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/snpnet/helpers/snpnet_misc.R')

# Read the raw phenotype file: stream it through a shell pipeline that strips a
# leading '#' from lines, and keep only the ID, split, covariate and phenotype
# columns. FID/IID are forced to character.
phe_df <- fread(
    cmd = paste(cat_or_zcat(phe_f), phe_f,  '|', 'sed -e "s/^#//g"'),
    select = c('FID', 'IID', 'split', covariates, phenotype),
    colClasses = c('FID'='character', 'IID'='character'),
    data.table = FALSE
)

# In refit mode the 'train' and 'val' splits are collapsed into 'train_val'.
if (refit) {
    phe_df <- mutate(
        phe_df,
        split = if_else(split %in% c('train', 'val'), 'train_val', split)
    )
}

# Read the PRS and compute the covariate-based score for every individual.
phe_score_df <- compute_phe_score_df(
    phe_df,
    phenotype,
    str_replace_all(sscore_f, '__PHENOTYPE__', phenotype),
    str_replace_all(snpnet_covar_BETAs_f, '__PHENOTYPE__', phenotype),
    covariates, family, refit_split_strs
)


In [5]:
# Collect the lambda indices for which a per-lambda score file exists under
# <data_d>/results. File names look like 'snpnet.lambda<idx>.sscore.zst'; the
# prefix and suffix are stripped (dots escaped so they match literally) and the
# remainder is parsed as an integer. All steps are vectorized, so an empty glob
# yields integer(0) rather than an untyped list.
lambda_idxs <- file.path(data_d, 'results', '*.sscore.zst') %>%
    Sys.glob(dirmark = FALSE) %>%
    basename() %>%
    str_replace_all('^snpnet\\.lambda|\\.sscore\\.zst$', '') %>%
    as.integer() %>%
    sort()


In [6]:
# Show the sorted lambda indices found among the per-lambda score files.
lambda_idxs 

In [7]:
# For every lambda index, read its PRS file and rename the 'geno_score' column
# to 'geno_<idx>'; then inner-join all per-lambda tables on (FID, IID) so each
# individual ends up with one row holding the score at every lambda.
geno_score_intermediates_df <- reduce(
    lapply(lambda_idxs, function(idx) {
        score_path <- file.path(data_d, 'results', sprintf('snpnet.lambda%d.sscore.zst', idx))
        score_col  <- sprintf('geno_%d', idx)
        rename(read_PRS(score_path), !!score_col := 'geno_score')
    }),
    function(acc, nxt) inner_join(acc, nxt, by = c('FID', 'IID'))
)


In [8]:
# Attach the per-lambda genetic scores to the phenotype/score table, keeping
# every row of phe_score_df.
phe_score_intermediate_df <- left_join(
    phe_score_df,
    geno_score_intermediates_df,
    by = c('FID', 'IID')
)


In [9]:
# Restrict to the validation split and add the combined
# genetic + covariate score used in the ROC comparisons.
sdf <- phe_score_intermediate_df %>%
    filter(split == 'val') %>%
    mutate(geno_covar_score = geno_score + covar_score)


In [10]:
# Empty container; ROC comparison results are stored in it by name.
roc_tests <- vector('list', length = 0L)


In [11]:
# Compare the ROC curve of the covariate-only score against that of the
# combined genetic + covariate score on `sdf`. Responses are coded
# 1 = control, 2 = case, and higher scores are taken to indicate cases.
roc_tests[['geno_covar_vs_covar']] <- roc.test(
    roc(
        sdf$phe, sdf[['covar_score']],
        levels=c('control'=1, 'case'=2), direction='<'
    ),
    roc(
        sdf$phe, sdf[['geno_covar_score']],
        levels=c('control'=1, 'case'=2), direction='<'
    )
)


In [12]:
# Compare the ROC curve of the covariate-only score against that of the
# genetic score at lambda index 2 ('geno_2') on `sdf`.
roc_tests[['geno_2_vs_covar']] <- roc.test(
    roc(
        sdf$phe, sdf[['covar_score']],
        levels=c('control'=1, 'case'=2), direction='<'
    ),
    roc(
        sdf$phe, sdf[['geno_2']],
        levels=c('control'=1, 'case'=2), direction='<'
    )
)


In [13]:
# Compare the genetic score at each lambda index against the score at the
# index that immediately precedes it in `lambda_idxs` (sorted ascending).
# Pairing by position instead of `lambda_idx - 1` means a gap in the available
# indices, or a first index other than 2, no longer looks up a score column
# that does not exist. For contiguous indices starting at 2 the set of tests
# and their names are the same as with `lambda_idx - 1`.
for (i in seq_along(lambda_idxs)[-1]) {
    lambda_idx <- lambda_idxs[[i]]
    prev_idx   <- lambda_idxs[[i - 1]]
    roc_tests[[sprintf('geno_%d_vs_geno_%d', lambda_idx, prev_idx)]] <- roc.test(
        roc(sdf$phe, sdf[[sprintf('geno_%d', prev_idx)]], levels=c('control'=1, 'case'=2), direction='<'),
        roc(sdf$phe, sdf[[sprintf('geno_%d', lambda_idx)]], levels=c('control'=1, 'case'=2), direction='<')
    )
}


In [14]:
# Tabulate the p-value of every stored ROC comparison.
# Test names have the form '<score2>_vs_<score1>'; they are split on '_vs_'
# into the two score labels while the full name is kept as well.
# vapply() enforces exactly one numeric p-value per test, so a malformed entry
# fails loudly instead of silently turning the column into a list.
roc_p_vals <- data.frame(
    name = names(roc_tests),
    p = vapply(
        roc_tests,
        function(tst) tst$p.value,
        numeric(1),
        USE.NAMES = FALSE
    ),
    stringsAsFactors = FALSE
) %>%
    separate(name, c('score2', 'score1'), sep='_vs_', remove=FALSE) %>%
    select(name, score1, score2, p)


In [16]:
# Write the p-value table as a tab-separated file whose header line starts
# with '#'. `p_val_f` must already hold the output path when this cell runs.
fwrite(
    rename(roc_p_vals, '#name' = 'name'),
    file = p_val_f,
    sep = '\t',
    na = "NA",
    quote = FALSE
)


In [15]:
# Show the covariate-only vs. genetic + covariate ROC comparison.
roc_tests$geno_covar_vs_covar


	DeLong's test for two correlated ROC curves

data:  roc(sdf$phe, sdf[["covar_score"]], levels = c(control = 1, case = 2),  and roc(sdf$phe, sdf[["geno_covar_score"]], levels = c(control = 1,     direction = "<") and     case = 2), direction = "<")
Z = -13.391, p-value < 2.2e-16
alternative hypothesis: true difference in AUC is not equal to 0
sample estimates:
AUC of roc1 AUC of roc2 
  0.6814867   0.7149091 


In [16]:
# Show the last ten rows of the p-value table.
tail(roc_p_vals, n = 10)

Unnamed: 0_level_0,name,score1,score2,p
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>
47,geno_47_vs_geno_46,geno_46,geno_47,0.00403915
48,geno_48_vs_geno_47,geno_47,geno_48,0.008727713
49,geno_49_vs_geno_48,geno_48,geno_49,0.011980257
50,geno_50_vs_geno_49,geno_49,geno_50,0.01629996
51,geno_51_vs_geno_50,geno_50,geno_51,0.120738613
52,geno_52_vs_geno_51,geno_51,geno_52,0.565755808
53,geno_53_vs_geno_52,geno_52,geno_53,0.711787302
54,geno_54_vs_geno_53,geno_53,geno_54,0.965750323
55,geno_55_vs_geno_54,geno_54,geno_55,0.472972236
56,geno_56_vs_geno_55,geno_55,geno_56,0.179458603


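- The model currently selected as best is at `lambda idx = 54`.
- `lambda idx = 50` may be the optimal choice: from `geno_51_vs_geno_50` onward, the test between consecutive lambda indices no longer shows a significant difference in validation-set AUC (all p > 0.05 in the table above).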

In [18]:
# Evaluate the predictive performance for lambda indices 50 through 56.
# For each index, the score column of that lambda is swapped in as
# 'geno_score', the helper eval_performance() is run with the matching
# per-lambda 'snpnet.lambda<idx>.tsv' file, and the result is tagged with the
# index. The per-lambda tables are then stacked into one data frame.
eval_intermediate_df <- bind_rows(lapply(50:56, function(la_idx) {
    lambda_tsv <- file.path(data_d, 'results', sprintf('snpnet.lambda%d.tsv', la_idx))

    per_lambda_df <- phe_score_intermediate_df %>%
        select(-geno_score) %>%
        rename('geno_score'=sprintf('geno_%d', la_idx))

    per_lambda_df %>%
        eval_performance(phenotype, lambda_tsv, family) %>%
        mutate(lambda_idx = la_idx) %>%
        select(
            phenotype_name, split, lambda_idx,
            geno, covar, geno_covar, geno_delta,
            n_variables, case_n, control_n
        )
}))


In [21]:
# Write the per-lambda evaluation table to `eval_f` as a tab-separated file
# whose header line starts with '#'.
fwrite(
    rename(eval_intermediate_df, '#phenotype_name' = 'phenotype_name'),
    file = eval_f,
    sep = '\t',
    na = "NA",
    quote = FALSE
)


In [26]:
# Show the train/val/test rows, ordered by lambda index and, within each
# index, in the fixed order train -> val -> test.
eval_intermediate_df %>%
    filter(split %in% c('train', 'test', 'val')) %>%
    arrange(lambda_idx, match(split, c('train', 'val', 'test')))

phenotype_name,split,lambda_idx,geno,covar,geno_covar,geno_delta,n_variables,case_n,control_n
<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>
HC269,train,50,0.7064346,0.6897751,0.7652736,0.07549851,3653,31710,204281
HC269,val,50,0.6125953,0.6814867,0.7137185,0.03223179,3653,4517,29196
HC269,test,50,0.616216,0.6926424,0.7258807,0.03323835,3653,8999,58426
HC269,train,51,0.7195777,0.6897751,0.7734151,0.08363995,4401,31710,204281
HC269,val,51,0.6131447,0.6814867,0.7143009,0.03281424,4401,4517,29196
HC269,test,51,0.6163943,0.6926424,0.7262443,0.0336019,4401,8999,58426
HC269,train,52,0.7329157,0.6897751,0.7820876,0.09231251,5250,31710,204281
HC269,val,52,0.6133462,0.6814867,0.7146238,0.03313711,5250,4517,29196
HC269,test,52,0.616406,0.6926424,0.7264053,0.03376293,5250,8999,58426
HC269,train,53,0.7462051,0.6897751,0.791185,0.10140992,6152,31710,204281


In [28]:
# Show the ROC comparison between lambda indices 51 and 50.
roc_tests$geno_51_vs_geno_50



	DeLong's test for two correlated ROC curves

data:  roc(sdf$phe, sdf[[sprintf("geno_%d", lambda_idx - 1)]], levels = c(control = 1,  and roc(sdf$phe, sdf[[sprintf("geno_%d", lambda_idx)]], levels = c(control = 1,     case = 2), direction = "<") and     case = 2), direction = "<")
Z = -1.5517, p-value = 0.1207
alternative hypothesis: true difference in AUC is not equal to 0
sample estimates:
AUC of roc1 AUC of roc2 
  0.6125953   0.6131447 
