In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [13]:
# input
# Root directory under which all input files for this notebook are resolved.
data_d_root <- '/oak/stanford/groups/mrivas/projects/biobank-methods-dev'
# Phenotype table; read later with FID/IID as character columns.
phe_f <- file.path(data_d_root, 'snpnet-elastic-net/phenotype.phe')
# Directory holding the per-phenotype, per-P-threshold score (.sscore.zst)
# and coefficient (.plink.tsv) files.
PRS_d <- file.path(data_d_root, 'snpnet-P_and_T', 'train')
# Directory holding the per-phenotype covariate-only score files.
covar_score_d <- file.path(data_d_root, 'snpnet-PRScs/covar_betas_train_val')

# constants
# Covariate names (age, sex, PC1..PC10).
# NOTE(review): not referenced in the visible part of this notebook -- confirm whether it is still needed.
covars <- c('age', 'sex', paste0('PC', 1:10))

# output
# Tab-separated file that receives the final evaluation table.
out_f <- 'snpnet-P_and_T.train.eval.tsv'


In [3]:
# Load a coefficient (BETA) table from `beta_f`.
#
# beta_f: path (or any other input accepted by fread) of the table to read.
# Returns the parsed table as a data.table.
read_BETAs <- function(beta_f){
    beta_tbl <- data.table::fread(beta_f)
    beta_tbl
}


In [4]:
# Read the genotype score from a zstd-compressed score file.
#
# sscore_f: path to a zstd-compressed table containing the columns
#           '#FID', 'IID' and 'SCORE1_SUM' (presumably a PLINK2 .sscore
#           file -- confirm against the producer).
# Returns a table with columns FID, IID (both character) and geno_score.
read_PRS <- function(sscore_f){
    fread(
        # The path is interpolated into a shell command, so it is quoted to
        # survive spaces and shell metacharacters. path.expand() resolves a
        # leading '~' first because the shell will not expand it inside quotes.
        cmd=paste('zstdcat', shQuote(path.expand(sscore_f))),
        select=c('#FID', 'IID', 'SCORE1_SUM'),
        # Keep IDs as strings so they join exactly with the phenotype table.
        colClasses=c('#FID'='character', 'IID'='character')
    ) %>%
    rename('FID'='#FID', 'geno_score'='SCORE1_SUM')
}


In [5]:
# Read the covariate-only score from a score file.
#
# covar_score_f: path to a table (streamed through `zstdcat`) containing the
#                columns '#FID', 'IID' and 'Estimate'.
# Returns a table with columns FID, IID (both character) and covar_score.
read_covar_score <- function(covar_score_f){
    fread(
        # The path is interpolated into a shell command, so it is quoted to
        # survive spaces and shell metacharacters. path.expand() resolves a
        # leading '~' first because the shell will not expand it inside quotes.
        cmd=paste('zstdcat', shQuote(path.expand(covar_score_f))),
        select=c('#FID', 'IID', 'Estimate'),
        # Keep IDs as strings so they join exactly with the phenotype table.
        colClasses=c('#FID'='character', 'IID'='character')
    ) %>%
    rename('FID'='#FID', 'covar_score'='Estimate')
}


In [6]:
# Score a predictor against an observed response.
#
# response:    observed phenotype values.
# pred:        predicted score, same length as `response`.
# metric.type: 'r2' selects the R-squared of the linear fit
#              response ~ 1 + pred; any other value selects the ROC AUC
#              computed with ROCR.
# Returns a single numeric value.
perform_eval <- function(response, pred, metric.type){
    if(metric.type == 'r2'){
        linear_fit <- lm(response ~ 1 + pred)
        return(summary(linear_fit)$r.squared)
    }
    # Labels are shifted by -1 before being turned into a factor
    # (presumably mapping a 1/2 control/case coding to 0/1 -- confirm).
    # An equivalent alternative would be pROC::auc(pROC::roc(response, pred)).
    rocr_pred <- ROCR::prediction(pred, factor(response - 1))
    rocr_perf <- ROCR::performance(rocr_pred, measure = 'auc')
    rocr_perf@y.values[[1]]
}


In [7]:
# Load the phenotype table with FID/IID kept as strings, then use the
# combined "<FID>_<IID>" identifier as the row names.
phe_df <- column_to_rownames(
    mutate(
        fread(phe_f, colClasses=c('FID'='character', 'IID'='character')),
        ID = paste(FID, IID, sep='_')
    ),
    var = 'ID'
)


In [8]:
# Build one row of the evaluation table for a single phenotype and split.
#
# score_test_df: table with the columns phe (observed response), geno_score,
#                covar_score and geno_covar_score.
# phe:           phenotype identifier recorded in the output row.
# metric.type:   metric selector forwarded to perform_eval().
# split_string:  split label recorded in the output row.
# Returns a one-row data.frame with columns phe, split, geno, covar and
# geno_covar, where the last three hold the metric for each score column.
eval_line_build <- function(score_test_df, phe, metric.type, split_string){
    # Evaluate one score column against the observed phenotype.
    eval_score <- function(score_col){
        perform_eval(score_test_df$phe, score_test_df[[score_col]], metric.type)
    }
    data.frame(
        phe        = phe,
        split      = split_string,
        geno       = eval_score('geno_score'),
        covar      = eval_score('covar_score'),
        geno_covar = eval_score('geno_covar_score'),
        # Spelled out: `F` is an ordinary variable that can be reassigned.
        stringsAsFactors = FALSE
    )
}


In [9]:
# Evaluate the P+T scores for every phenotype x P-value threshold x split.
#
# For each phenotype the observed values are joined with the genotype score
# (one file per threshold) and the covariate-only score (one file per
# phenotype). The genotype, covariate and combined scores are then evaluated
# separately on the train / val / test rows. The result has one row per
# (phenotype, threshold, split), plus the number of rows in the matching
# coefficient file (n_variables).
eval_df <- c('INI50', 'INI21001', 'HC269', 'HC382') %>%
lapply(function(phe_name){
    # Phenotype IDs whose non-digit part is 'INI' or 'QT_FC' are scored
    # with R^2; everything else is scored with AUC.
    phe_prefix <- str_replace_all(phe_name, '[0-9]', '')
    metric.type <- if(phe_prefix %in% c('INI', 'QT_FC')) 'r2' else 'auc'

    # Phenotype subset: depends only on the phenotype, so it is built once
    # rather than once per threshold. Rows with a missing (NA or -9)
    # phenotype are dropped; filtering on a left-hand column commutes with
    # the left joins below.
    phe_sub_df <- phe_df %>%
    select(all_of(c('FID', 'IID', phe_name, 'split'))) %>%
    rename(phe = all_of(phe_name)) %>%
    drop_na(phe) %>%
    filter(phe != -9)

    # The covariate score file is also per-phenotype: read it once.
    covar_score_df <- read_covar_score(
        file.path(covar_score_d, sprintf('%s.covar.scores.tsv', phe_name))
    )

    c('1e-5', '1e-4', '1e-3') %>%
    lapply(function(p_thr){
        df <- phe_sub_df %>%
        left_join(
            read_PRS(file.path(PRS_d, sprintf('%s.P_%s.sscore.zst', phe_name, p_thr))),
            by=c("FID", "IID")
        ) %>%
        left_join(covar_score_df, by=c("FID", "IID")) %>%
        mutate(geno_covar_score = geno_score + covar_score)

        # Number of rows in the coefficient file for this threshold.
        nvars <- read_BETAs(
            file.path(PRS_d, sprintf('%s.P_%s.plink.tsv', phe_name, p_thr))
        ) %>% nrow()

        c('train', 'val', 'test') %>%
        lapply(function(split_name){
            df %>%
            # `!!` forces split_name to be taken from this function, not
            # from a column of df.
            filter(split == !!split_name) %>%
            eval_line_build(phe_name, metric.type, split_name)
        }) %>%
        bind_rows() %>%
        mutate(
            P_thr = p_thr,
            n_variables = nvars
        )
    }) %>% bind_rows()
}) %>% bind_rows() %>%
select(phe, P_thr, split, geno, covar, geno_covar, n_variables)


In [10]:
# Display the full evaluation table (auto-printed as the cell's value).
eval_df

phe,P_thr,split,geno,covar,geno_covar,n_variables
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<int>
INI50,1e-05,train,0.04450293,0.534437096,0.19710847,4401
INI50,1e-05,val,0.03379383,0.530465444,0.17762235,4401
INI50,1e-05,test,0.03310059,0.533574168,0.17641662,4401
INI50,0.0001,train,0.0551991,0.534437096,0.20310415,6732
INI50,0.0001,val,0.04025643,0.530465444,0.1780345,6732
INI50,0.0001,test,0.03926607,0.533574168,0.17608591,6732
INI50,0.001,train,0.07409714,0.534437096,0.21896185,11829
INI50,0.001,val,0.04921455,0.530465444,0.18081082,11829
INI50,0.001,test,0.04720995,0.533574168,0.1774439,11829
INI21001,1e-05,train,0.03421878,0.010453775,0.0395578,755


In [14]:
# Write the evaluation table to `out_f` as an unquoted, tab-separated file;
# missing values are written as "NA".
eval_df %>%
fwrite(out_f, sep='\t', na = "NA", quote=FALSE)


In [15]:
# Display only the rows evaluated on the validation split.
filter(eval_df, split == 'val')

phe,P_thr,split,geno,covar,geno_covar,n_variables
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<int>
INI50,1e-05,val,0.03379383,0.53046544,0.17762235,4401
INI50,0.0001,val,0.04025643,0.53046544,0.1780345,6732
INI50,0.001,val,0.04921455,0.53046544,0.18081082,11829
INI21001,1e-05,val,0.01978492,0.01036046,0.02410968,755
INI21001,0.0001,val,0.02253438,0.01036046,0.02585041,1655
INI21001,0.001,val,0.0304671,0.01036046,0.03315992,4619
HC269,1e-05,val,0.59264117,0.69592252,0.68365407,207
HC269,0.0001,val,0.59407061,0.69592252,0.67682012,407
HC269,0.001,val,0.58842561,0.69592252,0.65430166,1359
HC382,1e-05,val,0.55261431,0.53538834,0.55358452,451
