In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [34]:
# input: directory with one sub-directory per phenotype, and the phenotype table
data_d <- '/oak/stanford/groups/mrivas/projects/PRS/private_output/20200528-batch'
phe_f  <- '/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/master_phe/master.20200522.phe'

# constants: covariate columns (age, sex, and PC1 through PC10)
covars <- c('age', 'sex', sprintf('PC%d', 1:10))

# output: file name for the evaluation table
# NOTE(review): nothing in the visible cells writes to out_f -- confirm usage
out_f <- 'snpnet-weighted.eval.tsv'

In [19]:
# Read the per-individual genotype score for one phenotype.
#
# Args:
#   GBE_ID:   phenotype ID; names both the sub-directory and the score file.
#   data_dir: directory holding one sub-directory per phenotype.
#
# Returns the FID, IID (both character) and geno_score columns read from
# <data_dir>/<GBE_ID>/<GBE_ID>.sscore.zst, where geno_score is the file's
# SCORE1_SUM column. Stops if the score file does not exist.
read_PRS <- function(GBE_ID, data_dir=data_d){
    sscore_f <- file.path(data_dir, GBE_ID, sprintf('%s.sscore.zst', GBE_ID))

    # report a missing file explicitly before spawning zstdcat
    if(!file.exists(sscore_f)){
        stop(sprintf('PRS score file not found: %s', sscore_f), call.=FALSE)
    }

    fread(
        # the path is interpolated into a shell command line, so quote it to
        # keep spaces and shell metacharacters from breaking the command
        cmd=paste('zstdcat', shQuote(sscore_f)),
        select=c('#FID', 'IID', 'SCORE1_SUM'),
        colClasses=c('#FID'='character', 'IID'='character')
    ) %>%
    rename('FID'='#FID', 'geno_score'='SCORE1_SUM')
}


In [20]:
# Load the covariate table of one phenotype from
# <data_dir>/<GBE_ID>/snpnet.covars.tsv, reading the 'ID' column as character
# and moving it into the row names of the returned data frame.
read_covars <- function(GBE_ID, data_dir=data_d){
    covars_f <- file.path(data_dir, GBE_ID, 'snpnet.covars.tsv')
    covars_dt <- fread(covars_f, colClasses=c('ID'='character'))
    column_to_rownames(covars_dt, 'ID')
}


In [21]:
# Load <data_dir>/<GBE_ID>/snpnet.tsv for one phenotype, reading the 'ID'
# column as character. Returns the table as read by fread.
read_BETAs <- function(GBE_ID, data_dir=data_d){
    betas_f <- file.path(data_dir, GBE_ID, 'snpnet.tsv')
    fread(betas_f, colClasses=c('ID'='character'))
}


In [43]:
# Assemble covariate, genotype and combined scores for every row of phe_df.
#
# Args:
#   phe_df:     data frame whose row names are '<FID>_<IID>' and which holds
#               FID, IID, split, the GBE_ID column and the covariate columns.
#   GBE_ID:     phenotype ID (also the name of the phenotype column).
#   covariates: names of the covariate columns to multiply by the table
#               returned by read_covars().
#
# Returns a data frame with FID, IID, covar_score, split, the phenotype
# column, geno_score and geno_covar_score (= geno_score + covar_score).
read_predicted_scores <- function(phe_df, GBE_ID, covariates=covars){
    covar_df <- read_covars(GBE_ID)

    # The matrix product pairs covariate columns with covar_df rows purely by
    # position, so align the rows with `covariates` by name whenever possible;
    # otherwise a reordered file would silently produce wrong scores.
    if(all(covariates %in% rownames(covar_df))){
        covar_df <- covar_df[covariates, , drop=FALSE]
    }else{
        warning(
            sprintf(
                'covariate names for %s do not all match; falling back to positional matching',
                GBE_ID
            ),
            call.=FALSE
        )
    }

    covar_mat <- as.matrix(phe_df %>% select(all_of(covariates)))
    covar_score_mat <- covar_mat %*% as.matrix(covar_df)

    covar_score_mat %>%
    as.data.frame() %>%
    rownames_to_column('ID') %>%
    # row names were built as '<FID>_<IID>'; split them back into two keys
    separate(ID, c('FID', 'IID'), sep='_') %>%
    rename('covar_score'='BETA') %>%
    left_join(
        phe_df %>% select(FID, IID, split, all_of(GBE_ID)),
        by=c('FID', 'IID')
    ) %>%
    left_join(
        read_PRS(GBE_ID),
        by=c('FID', 'IID')
    ) %>%
    mutate(
        geno_covar_score = geno_score + covar_score
    )
}


In [29]:
# Score a predictor against a response.
#
# Args:
#   response:    observed values.
#   pred:        predicted scores.
#   metric.type: 'r2' returns the R-squared of lm(response ~ 1 + pred);
#                any other value returns the AUC computed with ROCR.
#
# Returns a single numeric value.
perform_eval <- function(response, pred, metric.type){
    if(metric.type != 'r2'){
        # (pROC::auc(pROC::roc(response, pred)) is an alternative not used here)
        # labels are the response shifted down by one
        # NOTE(review): presumably maps a 1/2 coding to 0/1 -- confirm
        labels <- factor(response - 1)
        roc_pred <- ROCR::prediction(pred, labels)
        auc_perf <- ROCR::performance(roc_pred, measure = 'auc')
        return(auc_perf@y.values[[1]])
    }

    fit <- lm(response ~ 1 + pred)
    summary(fit)$r.squared
}

In [57]:
# Evaluate the three scores of one phenotype within a single split.
#
# Args:
#   phe_df:       phenotype data frame passed on to read_predicted_scores().
#   GBE_ID:       phenotype ID (also the name of the phenotype column).
#   split_string: value of the `split` column selecting the rows to evaluate.
#   metric.type:  metric name forwarded to perform_eval().
#
# Returns a one-row data frame with GBE_ID, n_variables (number of rows
# returned by read_BETAs()) and the metric for geno_score, covar_score and
# geno_covar_score.
build_eval_df_line <- function(phe_df, GBE_ID, split_string, metric.type){
    score_test_df <- phe_df %>%
    read_predicted_scores(GBE_ID) %>%
    filter(split == split_string) %>%
    drop_na(all_of(GBE_ID)) %>%
    # GBE_ID holds the column *name*; go through .data[[ ]] so that the
    # phenotype values, not the name string itself, are compared with -9
    filter(.data[[GBE_ID]] != -9)

    response <- score_test_df[[GBE_ID]]
    eval_score <- function(score_col){
        perform_eval(response, score_test_df[[score_col]], metric.type)
    }

    data.frame(
        GBE_ID      = GBE_ID,
        n_variables = read_BETAs(GBE_ID) %>% nrow(),
        geno        = eval_score('geno_score'),
        covar       = eval_score('covar_score'),
        geno_covar  = eval_score('geno_covar_score'),
        stringsAsFactors = FALSE
    )
}


In [65]:
# Evaluate one phenotype across several splits.
#
# Args:
#   phe_df:        phenotype data frame passed on to build_eval_df_line().
#   GBE_ID:        phenotype ID.
#   split_strings: values of the `split` column to evaluate, one row each.
#   metric.type:   metric forwarded to build_eval_df_line(); defaults to 'auc'.
#
# Returns a data frame with GBE_ID, n_variables, split, geno, covar,
# geno_covar and geno_delta (= geno_covar - covar). covar, geno_covar and
# geno_delta are NA for splits whose label does not contain 'white_british'.
# A split whose evaluation fails is skipped with a warning.
build_eval_df <- function(phe_df, GBE_ID, split_strings, metric.type='auc'){
    eval_df <- lapply(split_strings, function(s){tryCatch({
        build_eval_df_line(phe_df, GBE_ID, s, metric.type) %>% mutate(split = s)
    }, error=function(e){
        # keep going with the other splits, but say which one was dropped
        warning(
            sprintf(
                "evaluation of %s failed for split '%s': %s",
                GBE_ID, s, conditionMessage(e)
            ),
            call.=FALSE
        )
        NULL
    })}) %>%
    bind_rows()

    # every split failed (or none was requested): return an empty table with
    # the usual columns instead of failing on the missing `split` column
    if(nrow(eval_df) == 0){
        return(data.frame(
            GBE_ID      = character(),
            n_variables = integer(),
            split       = character(),
            geno        = numeric(),
            covar       = numeric(),
            geno_covar  = numeric(),
            geno_delta  = numeric(),
            stringsAsFactors = FALSE
        ))
    }

    eval_df %>%
    mutate(
        # blank the covariate-based metrics outside 'white_british' splits;
        # NA is assigned directly so that a genuine metric of 0 is preserved
        covar      = if_else(str_detect(split, 'white_british'), covar, NA_real_),
        geno_covar = if_else(str_detect(split, 'white_british'), geno_covar, NA_real_),
        geno_delta = geno_covar - covar
    ) %>%
    select(GBE_ID, n_variables, split, geno, covar, geno_covar, geno_delta)
}


In [12]:
# phenotype ID evaluated below; also the name of the phenotype column
# selected from phe_f
GBE_ID <- 'HC1'

In [16]:
# Load IDs, split/population labels, the phenotype column and the covariates;
# drop rows without a population or whose population contains 'outlier';
# key the rows by '<FID>_<IID>'.
phe_df <- phe_f %>%
fread(
    select=c('#FID','IID','split', 'population', GBE_ID, covars),
    colClasses=c('#FID'='character', 'IID'='character')
) %>%
rename('FID'='#FID') %>%
drop_na(population) %>%
filter(!str_detect(population, 'outlier')) %>%
mutate(
    # rows without a split label use the population name alone;
    # the others become '<population>:<split>'
    split = if_else(is.na(split), population, paste(population, split, sep=':')),
    ID = paste(FID, IID, sep='_')
) %>%
column_to_rownames('ID')


In [17]:
# number of rows per split label
phe_df %>% count(split)

split,n
<chr>,<int>
african,6497
e_asian,1154
non_british_white,24905
others,28467
s_asian,7885
white_british:test,67427
white_british:train,235997
white_british:val,33714


In [59]:
# distinct split labels present in phe_df, in sorted order
split_strings <- sort(unique(phe_df[['split']]))


In [61]:
# display the split labels that will be evaluated
split_strings

In [66]:
# evaluate GBE_ID on every split label
build_eval_df(phe_df, GBE_ID, split_strings)

GBE_ID,n_variables,split,geno,covar,geno_covar,geno_delta
<chr>,<int>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
HC1,1,african,0.4950328,,,
HC1,1,e_asian,0.3308631,,,
HC1,1,non_british_white,0.5076179,,,
HC1,1,s_asian,0.4888557,,,
HC1,1,white_british:test,0.5031645,0.6030854,0.6030854,0.0
HC1,1,white_british:train,0.5146609,0.6145205,0.6145205,0.0
HC1,1,white_british:val,0.4990828,0.6215128,0.6215128,0.0
