In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
library(pROC)

Type 'citation("pROC")' for a citation.


Attaching package: ‘pROC’


The following objects are masked from ‘package:stats’:

    cov, smooth, var




In [3]:
# Notebook parameters.
# phenotype: code of the phenotype analysed in this notebook. It is the single
#            source of truth; the paths below are derived from it so that
#            changing it cannot leave stale file names behind.
phenotype <- 'HC382'
# data_d: directory with the snpnet results for `phenotype`.
data_d <- file.path(
    '/oak/stanford/groups/mrivas/projects/PRS/private_output/20200908_PRS_map_test/1_p_factor_v1',
    phenotype,
    '1_fit_w_val'
)
# refit: when TRUE, the 'train' and 'val' splits are merged into 'train_val'
#        after the phenotype table is loaded.
refit <- FALSE

# output: table of p-values for the pairwise AUC comparisons
p_val_f <- sprintf('AUC_diff_p.%s.tsv', phenotype)

In [5]:
# input and parameters

#data_d <- '/oak/stanford/groups/mrivas/projects/PRS/private_output/20200908_PRS_map_test'
# master phenotype table (zstd-compressed)
phe_f <- '/scratch/groups/mrivas/ukbb24983/phenotypedata/master_phe/master.20200828.phe.zst'
# covariate columns: age, sex and the first 10 PCs
covariates       <- c('age', 'sex', paste0('PC', 1:10))
# additional population splits passed to compute_phe_score_df()
refit_split_strs <- c('non_british_white', 'african', 's_asian', 'e_asian')
# '__PHENOTYPE__' is a placeholder substituted with `phenotype` before use
sscore_f             <- file.path(data_d, '__PHENOTYPE__.sscore.zst')
snpnet_BETAs_f       <- file.path(data_d, 'snpnet.tsv')
snpnet_covar_BETAs_f <- file.path(data_d, 'snpnet.covars.tsv')
# Phenotype codes starting with 'INI' or 'QT_FC' use the gaussian family;
# everything else uses binomial. `phenotype` is a single string, so a scalar
# if/else with short-circuit `||` is used rather than the vectorised ifelse().
family <- if (startsWith(phenotype, 'INI') || startsWith(phenotype, 'QT_FC')) {
    'gaussian'
} else {
    'binomial'
}

# output
eval_f <- file.path(data_d, 'snpnet.eval.tsv')
plot_f <- file.path(data_d, 'snpnet.plot.pdf')
percentile_f <- file.path(data_d, 'snpnet.percentile.tsv')


In [6]:
# Helper functions used below (cat_or_zcat, compute_phe_score_df, ...) are
# expected to come from this script -- TODO confirm against snpnet_misc.R.
source('/oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/snpnet/helpers/snpnet_misc.R')

# Load the raw phenotype table. The file is streamed through a shell pipeline
# whose sed step strips a leading '#' from each line (e.g. the header), and
# only the ID, split, covariate and phenotype columns are kept.
phe_df <- fread(
    cmd = paste(cat_or_zcat(phe_f), phe_f, '|', 'sed -e "s/^#//g"'),
    select = c('FID', 'IID', 'split', covariates, phenotype),
    colClasses = c('FID' = 'character', 'IID' = 'character'),
    data.table = FALSE
)

# In refit mode the 'train' and 'val' splits are pooled into 'train_val'.
if (refit) {
    phe_df <- mutate(
        phe_df,
        split = if_else(split %in% c('train', 'val'), 'train_val', split)
    )
}

# Attach the PRS and the covariate-based score to the phenotype table.
phe_score_df <- compute_phe_score_df(
    phe_df,
    phenotype,
    str_replace_all(sscore_f, '__PHENOTYPE__', phenotype),
    str_replace_all(snpnet_covar_BETAs_f, '__PHENOTYPE__', phenotype),
    covariates, family, refit_split_strs
)


“glm.fit: fitted probabilities numerically 0 or 1 occurred”


In [7]:
# Read the score file of every intermediate lambda index (1..60), rename its
# 'geno_score' column to 'geno_<idx>', and inner-join all of them on FID/IID
# into one wide table with one score column per lambda index.
# read_PRS() is presumably provided by the sourced snpnet helpers -- verify.
geno_score_intermediates_df <- reduce(
    lapply(1:60, function(lambda_idx) {
        sscore_path <- file.path(
            data_d, 'results', sprintf('snpnet.lambda%d.sscore.zst', lambda_idx)
        )
        rename(
            read_PRS(sscore_path),
            !!sprintf('geno_%d', lambda_idx) := 'geno_score'
        )
    }),
    function(joined, nxt) {
        inner_join(joined, nxt, by = c('FID', 'IID'))
    }
)


In [8]:
# Add the per-lambda score columns to the phenotype/score table; every row of
# phe_score_df is kept (left join on FID/IID).
phe_score_intermediate_df <- left_join(
    phe_score_df,
    geno_score_intermediates_df,
    by = c('FID', 'IID')
)


In [9]:
# Validation-split rows only, plus the combined genotype + covariate score
# used in the ROC comparisons.
sdf <- phe_score_intermediate_df %>%
    filter(split == 'val') %>%
    mutate(geno_covar_score = geno_score + covar_score)


In [10]:
# Empty container for the roc.test() results, filled in by name below.
roc_tests <- vector('list', length = 0L)


In [11]:
# Does adding the genotype score to the covariate score change the AUC?
# levels lists the control value (1) first and the case value (2) second;
# direction '<' means controls are expected to have the lower scores.
roc_tests[['geno_covar_vs_covar']] <- roc.test(
    roc(sdf$phe, sdf[['covar_score']],
        levels=c('control'=1, 'case'=2), direction='<'),
    roc(sdf$phe, sdf[['geno_covar_score']],
        levels=c('control'=1, 'case'=2), direction='<')
)


In [12]:
# Compare the covariate-only score with the genotype score of the first
# lambda index (geno_1).
roc_tests[['geno_1_vs_covar']] <- roc.test(
    roc(sdf$phe, sdf[['covar_score']],
        levels=c('control'=1, 'case'=2), direction='<'),
    roc(sdf$phe, sdf[['geno_1']],
        levels=c('control'=1, 'case'=2), direction='<')
)


In [13]:
# Step through the lambda path: test the score at each lambda index (2..60)
# against the score at the previous index. Results are stored under
# 'geno_<idx>_vs_geno_<idx - 1>'.
for (lambda_idx in 2:60) {
    roc_tests[[sprintf('geno_%d_vs_geno_%d', lambda_idx, lambda_idx - 1)]] <- roc.test(
        roc(sdf$phe, sdf[[sprintf('geno_%d', lambda_idx - 1)]],
            levels=c('control'=1, 'case'=2), direction='<'),
        roc(sdf$phe, sdf[[sprintf('geno_%d', lambda_idx)]],
            levels=c('control'=1, 'case'=2), direction='<')
    )
}


In [14]:
# Tabulate the p-value of every stored ROC test, one row per comparison.
# Each name has the form '<score2>_vs_<score1>', which is split into the two
# score columns; the final column order is name, score1, score2, p.
roc_p_vals <- data.frame(
    name = names(roc_tests),
    # vapply() enforces exactly one numeric p-value per test (it errors
    # instead of silently returning a list) and replaces the deprecated
    # purrr::simplify(). USE.NAMES = FALSE keeps the vector unnamed so the
    # data frame keeps its default integer row names.
    p = vapply(
        roc_tests,
        function(roc_test) roc_test$p.value,
        numeric(1),
        USE.NAMES = FALSE
    ),
    stringsAsFactors = FALSE
) %>%
separate(name, c('score2', 'score1'), sep = '_vs_', remove = FALSE) %>%
select(name, score1, score2, p)


In [15]:
# Save the p-value table as a tab-separated file; the first header field is
# written as '#name'.
fwrite(
    rename(roc_p_vals, '#name' = 'name'),
    p_val_f,
    sep = '\t', na = "NA", quote = FALSE
)


In [16]:
# Show the test comparing the covariate-only score with the genotype + covariate score.
roc_tests[['geno_covar_vs_covar']]


	DeLong's test for two correlated ROC curves

data:  roc(sdf$phe, sdf[["covar_score"]], levels = c(control = 1, case = 2),  and roc(sdf$phe, sdf[["geno_covar_score"]], levels = c(control = 1,     direction = "<") and     case = 2), direction = "<")
Z = -16.507, p-value < 2.2e-16
alternative hypothesis: true difference in AUC is not equal to 0
sample estimates:
AUC of roc1 AUC of roc2 
  0.5281458   0.6189194 


In [17]:
# Last rows of the p-value table, i.e. the comparisons at the largest lambda indices.
tail(roc_p_vals)

Unnamed: 0_level_0,name,score1,score2,p
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>
56,geno_55_vs_geno_54,geno_54,geno_55,0.01316363
57,geno_56_vs_geno_55,geno_55,geno_56,0.10500133
58,geno_57_vs_geno_56,geno_56,geno_57,0.17819255
59,geno_58_vs_geno_57,geno_57,geno_58,0.35657788
60,geno_59_vs_geno_58,geno_58,geno_59,0.76033081
61,geno_60_vs_geno_59,geno_59,geno_60,0.36920904


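- We currently have `lambda idx = 58` as the best model.
- `lambda idx = 55` may be the optimal choice instead: in the table above it is the last lambda index whose validation-set AUC differs significantly from the previous index (`geno_55` vs. `geno_54`, p = 0.013); none of the later steps (56–60) shows a significant change (all p > 0.1).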

In [22]:
# Evaluate the predictive performance of the default geno_score column,
# using the coefficient file of the selected snpnet model.
eval_58_df <- eval_performance(
    phe_score_intermediate_df,
    phenotype,
    str_replace_all(snpnet_BETAs_f, '__PHENOTYPE__', phenotype),
    family
)


In [28]:
# Same evaluation with the lambda-index-55 score swapped in: drop the default
# geno_score column, rename geno_55 to geno_score, and use the lambda 55
# coefficient file.
eval_55_df <- eval_performance(
    phe_score_intermediate_df %>%
        select(-geno_score) %>%
        rename(geno_score = geno_55),
    phenotype,
    file.path(data_d, 'results', 'snpnet.lambda55.tsv'),
    family
)


In [29]:
# Display the per-split evaluation table computed with the default geno_score column.
eval_58_df

phenotype_name,split,geno,covar,geno_covar,geno_delta,n_variables,case_n,control_n
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>
HC382,train,0.7731822,0.5387748,0.7722529,0.233478152,7595,31741,204250
HC382,test,0.6116628,0.5352619,0.6161877,0.080925727,7595,8934,58491
HC382,val,0.6153153,0.5281458,0.6189194,0.090773671,7595,4553,29160
HC382,non_british_white,0.6110596,0.5676678,0.6276089,0.059941082,7595,3243,21662
HC382,s_asian,0.5784973,0.5760666,0.6068172,0.03075052,7595,1171,6660
HC382,african,0.5530495,0.6095357,0.6150603,0.005524621,7595,871,5626
HC382,e_asian,0.4973209,0.5933069,0.5660849,-0.027222022,7595,166,1538


In [30]:
# Display the per-split evaluation table computed with the geno_55 score.
eval_55_df

phenotype_name,split,geno,covar,geno_covar,geno_delta,n_variables,case_n,control_n
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>
HC382,train,0.7323389,0.5387748,0.7312488,0.19247405,4764,31741,204250
HC382,test,0.6120284,0.5352619,0.6163027,0.08104073,4764,8934,58491
HC382,val,0.6139289,0.5281458,0.6172363,0.08909049,4764,4553,29160
HC382,non_british_white,0.6093796,0.5676678,0.6265839,0.05891606,4764,3243,21662
HC382,s_asian,0.5801543,0.5760666,0.6076715,0.03160488,4764,1171,6660
HC382,african,0.5543122,0.6095357,0.6185394,0.00900363,4764,871,5626
HC382,e_asian,0.5086445,0.5933069,0.5790888,-0.01421812,4764,166,1538
