In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# output: TSV file that receives the evaluation table
out_f <- 'snpnet-elastic-net.eval.tsv'

# constants
# alpha values that identify the per-fit result directories
alphas <- c(0, 0.1, 0.5, 0.9)
# covariate columns used for the covariate-only score: age, sex, PC1..PC10
covars <- c('age', 'sex', sprintf('PC%d', seq_len(10)))

# input: results directory and the phenotype table stored inside it
data_d <- '/oak/stanford/groups/mrivas/projects/biobank-methods-dev/snpnet-elastic-net'
phe_f  <- file.path(data_d, 'phenotype.phe')


In [3]:
# Read the per-individual genotype score for one phenotype / alpha fit.
#
# The file read is <data_dir>/<GBE_ID>_<alpha>/<GBE_ID>.sscore.zst, streamed
# through `zstdcat` (which must be available on the PATH).
#
# Args:
#   GBE_ID:   phenotype identifier (used in the directory and file name).
#   alpha:    alpha value of the fit (used in the directory name).
#   data_dir: root directory of the results (defaults to `data_d`).
#
# Returns:
#   A table with columns FID, IID (both character) and geno_score
#   (the SCORE1_SUM column of the input file).
read_PRS <- function(GBE_ID, alpha, data_dir=data_d){
    sscore_f <- file.path(
        data_dir,
        sprintf('%s_%s', GBE_ID, alpha),
        sprintf('%s.sscore.zst', GBE_ID)
    )
    fread(
        # shQuote() keeps the path a single shell word even if it contains
        # spaces or shell metacharacters.
        cmd=paste('zstdcat', shQuote(sscore_f)),
        select=c('#FID', 'IID', 'SCORE1_SUM'),
        # IDs are kept as character so they join cleanly with the phenotype table
        colClasses=c('#FID'='character', 'IID'='character')
    ) %>%
    rename('FID'='#FID', 'geno_score'='SCORE1_SUM')
}


In [4]:
# Load 'snpnet.covars.tsv' for one phenotype / alpha fit.
#
# The file lives in <data_dir>/<GBE_ID>_<alpha>/. Its `ID` column is read as
# character and moved into the row names, so the result can be turned into a
# matrix whose rows are labelled by ID.
# NOTE(review): presumably one row per covariate holding its fitted
# coefficient (read_predicted_scores expects a `BETA` column) - confirm.
read_covars <- function(GBE_ID, alpha, data_dir=data_d){
    fit_dir <- file.path(data_dir, sprintf('%s_%s', GBE_ID, alpha))
    covars_f <- file.path(fit_dir, 'snpnet.covars.tsv')
    coef_df <- fread(covars_f, colClasses=c('ID'='character'))
    column_to_rownames(coef_df, 'ID')
}

In [5]:
# Load 'snpnet.tsv' for one phenotype / alpha fit.
#
# The file lives in <data_dir>/<GBE_ID>_<alpha>/ and is returned as read by
# fread(), with the `ID` column forced to character.
read_BETAs <- function(GBE_ID, alpha, data_dir=data_d){
    fit_dir <- file.path(data_dir, sprintf('%s_%s', GBE_ID, alpha))
    betas_f <- file.path(fit_dir, 'snpnet.tsv')
    fread(betas_f, colClasses=c('ID'='character'))
}

In [6]:
# Build the per-individual score table for one phenotype / alpha fit.
#
# Args:
#   phe_df:     phenotype data frame whose row names are '<FID>_<IID>' and
#               which contains FID, IID, split, the covariate columns and the
#               phenotype column named by GBE_ID.
#   GBE_ID:     phenotype identifier (also the phenotype column name).
#   alpha:      alpha value of the fit.
#   covariates: covariate column names (defaults to `covars`).
#
# Returns:
#   A data frame with FID, IID, covar_score, split, the phenotype column,
#   geno_score and geno_covar_score (= geno_score + covar_score).
read_predicted_scores <- function(phe_df, GBE_ID, alpha, covariates=covars){
    covar_df <- read_covars(GBE_ID, alpha)
    # covariate-only score: (individuals x covariates) %*% (covariates x BETA)
    # NOTE(review): assumes the rows of covar_df are in the same order as
    # `covariates` - TODO confirm against the snpnet.covars.tsv layout.
    as.matrix(
        phe_df %>% select(all_of(covariates))
    ) %*% as.matrix(covar_df) %>%
    as.data.frame() %>%
    rownames_to_column('ID') %>%
    # The row names were built as paste(FID, IID, sep='_'), so split on '_'
    # explicitly; the default separator splits on any non-alphanumeric
    # character and would mangle IDs such as '-1_-1'.
    separate(ID, c('FID', 'IID'), sep='_') %>%
    rename('covar_score'='BETA') %>%
    left_join(
        phe_df %>% select(FID, IID, split, all_of(GBE_ID)),
        by=c('FID', 'IID')
    ) %>%
    left_join(
        read_PRS(GBE_ID, alpha),
        by=c('FID', 'IID')
    ) %>%
    mutate(
        geno_covar_score = geno_score + covar_score
    )
}


In [7]:
# Score a predictor against the observed response.
#
# Args:
#   response:    observed phenotype values.
#   pred:        predicted score, one per element of `response`.
#   metric.type: 'r2' for the R-squared of lm(response ~ 1 + pred); any other
#                value computes the AUC with ROCR.
#
# Returns:
#   A single numeric value (R-squared or AUC).
perform_eval <- function(response, pred, metric.type){
    if(metric.type != 'r2'){
        # ROCR is used here; an equivalent would be
        # pROC::auc(pROC::roc(response, pred))
        # NOTE(review): `response - 1` presumably recodes 1/2 labels to 0/1 -
        # confirm the phenotype coding.
        labels <- factor(response - 1)
        rocr_pred <- ROCR::prediction(pred, labels)
        rocr_auc <- ROCR::performance(rocr_pred, measure = 'auc')
        return(rocr_auc@y.values[[1]])
    }
    linear_fit <- lm(response ~ 1 + pred)
    summary(linear_fit)$r.squared
}


In [8]:
# Evaluate one phenotype / alpha fit on one data split.
#
# Args:
#   phe_df:       phenotype data frame (see read_predicted_scores).
#   GBE_ID:       phenotype identifier (also the phenotype column name).
#   alpha:        alpha value of the fit.
#   split_string: value of the `split` column to evaluate on.
#   metric.type:  metric passed to perform_eval ('r2', otherwise AUC).
#
# Returns:
#   A one-row data frame with GBE_ID, alpha, n_variables (number of rows in
#   the snpnet.tsv table) and the metric for the genotype score, the covariate
#   score and their sum.
build_eval_df <- function(phe_df, GBE_ID, alpha, split_string, metric.type){
    score_test_df <- phe_df %>%
    read_predicted_scores(GBE_ID, alpha) %>%
    filter(split == split_string) %>%
    drop_na(all_of(GBE_ID)) %>%
    # GBE_ID holds the column *name*; `.data[[GBE_ID]]` looks up that column.
    # A bare `GBE_ID != -9` would compare the name string itself with -9 and
    # keep every row. (-9 appears to be the missing-phenotype code.)
    filter(.data[[GBE_ID]] != -9)

    response <- score_test_df[[GBE_ID]]

    data.frame(
        GBE_ID     = GBE_ID,
        alpha      = alpha,
        n_variables = read_BETAs(GBE_ID, alpha) %>% nrow(),
        geno       = perform_eval(
            response,
            score_test_df$geno_score,
            metric.type
        ),
        covar      = perform_eval(
            response,
            score_test_df$covar_score,
            metric.type
        ),
        geno_covar = perform_eval(
            response,
            score_test_df$geno_covar_score,
            metric.type
        ),
        stringsAsFactors = FALSE
    )
}


In [9]:
# Evaluate every (phenotype, alpha) combination on one data split.
#
# Args:
#   phe_df:       phenotype data frame (see read_predicted_scores).
#   alphas:       alpha values to evaluate.
#   split_string: value of the `split` column to evaluate on.
#
# Returns:
#   The row-bound results of build_eval_df, ordered by phenotype and then by
#   alpha. Combinations that fail are skipped, with a warning that names the
#   combination and the underlying error.
get_eval_df <- function(phe_df, alphas, split_string){
    # phenotype -> evaluation metric
    metric_types <- c(
        'INI50'    = 'r2',
        'INI21001' = 'r2',
        'HC269'    = 'auc',
        'HC382'    = 'auc'
    )

    # Best-effort evaluation of a single combination: a failure yields NULL
    # (which bind_rows drops) but is reported rather than silently discarded.
    eval_one <- function(GBE_ID, alpha){
        tryCatch({
            phe_df %>% build_eval_df(
                GBE_ID, alpha, split_string, metric_types[[GBE_ID]]
            )
        }, error=function(e){
            warning(sprintf(
                'evaluation failed for %s (alpha = %s, split = %s): %s',
                GBE_ID, alpha, split_string, conditionMessage(e)
            ), call. = FALSE)
            NULL
        })
    }

    names(metric_types) %>%
    lapply(function(GBE_ID){
        alphas %>%
        lapply(function(alpha){ eval_one(GBE_ID, alpha) }) %>%
        bind_rows()
    }) %>%
    bind_rows()
}


## compute the performance metric

In [10]:
# Load the phenotype table with FID/IID kept as character, and label each row
# with '<FID>_<IID>' so that matrix products built from it keep the IDs.
phe_df <- column_to_rownames(
    fread(phe_f, colClasses=c('FID'='character', 'IID'='character')) %>%
        mutate(ID = paste(FID, IID, sep='_')),
    'ID'
)


In [16]:
# Evaluate each data split in turn and stack the results, tagging every row
# with the split it was computed on.
eval_df <- c('train', 'val', 'test') %>%
    lapply(function(split_name){
        phe_df %>%
            get_eval_df(alphas, split_name) %>%
            mutate(split = split_name)
    }) %>%
    bind_rows()


In [17]:
# Display the combined train/val/test evaluation table.
eval_df

GBE_ID,alpha,n_variables,geno,covar,geno_covar,split
<chr>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<chr>
INI50,0.1,52403,0.3060029,0.533768336,0.8316207,train
INI50,0.5,48443,0.30215008,0.533746947,0.8280342,train
INI50,0.9,48256,0.30224197,0.533741721,0.8281844,train
INI21001,0.0,83500,0.47074509,0.010328787,0.4713581,train
INI21001,0.1,29548,0.38784901,0.010277794,0.392291,train
INI21001,0.5,24559,0.36199951,0.010282546,0.366139,train
INI21001,0.9,26240,0.37573247,0.010274256,0.3803066,train
HC269,0.1,5800,0.75049558,0.692706045,0.7854938,train
HC269,0.5,4348,0.73808237,0.692706592,0.7822025,train
HC269,0.9,4277,0.73801523,0.692706234,0.7830365,train


In [12]:
# Save the evaluation table as a tab-separated file, unquoted, with missing
# values written as the literal string NA.
fwrite(eval_df, out_f, sep='\t', na='NA', quote=FALSE)
