In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# input
# Biomarker phenotype metadata table (its `name` and `annotation` columns are used below).
phe_info_f <- '../../common/biomarker.phenotype.info.tsv'
# Phenotype table; the `log.<annotation>` and `residual.<annotation>` columns are read from it.
phe_f <- '@@@@@@/projects/biomarkers/covariate_corrected/outputExtendedNoTDIreduced/phenotypes/residual_and_logtransformed_and_raw_biomarker_phenotypes_20200524.tsv'
# Per-individual population assignment (FID, IID and population columns are read from it).
pop_def_f <- '@@@@@@/ukbb24983/sqc/population_stratification_w24983_20190809/ukb24983_GWAS_covar.20190809.phe'
# sprintf() template: %s is filled with the split name ('train', 'val' or 'test').
split_f <- '@@@@@@/ukbb24983/sqc/population_stratification_w24983_20190809/split/ukb24983_white_british_%s.phe'
# sprintf() template: both %s placeholders are filled with the same trait annotation.
prs_f <- '@@@@@@/projects/biomarkers/snpnet/biomarkers/%s/results/score/%s.sscore'

# output
# Long-format table (one row per individual and trait) written after the join step.
out_long_phe_f <- '@@@@@@/projects/biomarkers/snpnet/biomarkers/biomarkers.eval.long.tsv'
# Per-trait, per-population-split R^2 summary table.
out_f <- 'snpnet_prs_eval.tsv'


## population definition, train/val/test split

In [3]:
# Load the population label of every individual; the ID columns are read as
# character so they match the ID columns of the other tables in the joins.
pop_def_df <- fread(
    input = pop_def_f,
    colClasses = c(FID = 'character', IID = 'character'),
    select = c('FID', 'IID', 'population')
)


In [4]:
# Stack the train/val/test ID lists into one table tagged with the split name,
# then keep only individuals whose FID is labelled 'white_british'.
split_df <- filter(
    bind_rows(lapply(
        c('train', 'val', 'test'),
        function(split_name){
            ids <- fread(
                sprintf(split_f, split_name),
                colClasses='character', col.names=c('FID', 'IID')
            )
            mutate(ids, split = split_name)
        }
    )),
    # %in% treats a missing population as "not white_british", like filter() does
    FID %in% pop_def_df[['FID']][pop_def_df[['population']] %in% 'white_british']
)


In [5]:
# Attach the split label to every individual and derive `pop_split`:
# '<population>:<split>' when a split is assigned, otherwise the bare
# population label. Rows are ordered by numeric FID.
pop_split_df <- arrange(
    mutate(
        left_join(pop_def_df, split_df, by=c('FID', 'IID')),
        pop_split = if_else(
            !is.na(split),
            paste(population, split, sep=':'),
            population
        )
    ),
    as.numeric(FID)
)

In [6]:
# Number of individuals in each population/split group.
count(pop_split_df, pop_split)


pop_split,n
<chr>,<int>
african,6498
e_asian,1154
non_british_white,24909
s_asian,7885
white_british:test,67430
white_british:train,236005
white_british:val,33716
,110780


## phenotype
### phenotype info

In [7]:
# Phenotype metadata; the `name` column is exposed as `Phenotype`.
phe_info_df <- rename(fread(phe_info_f), Phenotype = name)

In [8]:
# List the trait annotations used to build column and file names below.
pull(phe_info_df, annotation)

### phenotype

We read the log-transformed phenotypes and the covariate-corrected (residual) phenotypes.


In [9]:
# Read the ID columns plus, for every annotation, its `log.<annotation>` and
# `residual.<annotation>` columns (all log.* columns first, then all residual.*).
phe_df <- fread(
    phe_f,
    select=c(
        'FID', 'IID',
        unlist(lapply(c('log.', 'residual.'), paste0, phe_info_df[['annotation']]))
    ),
    colClasses=c('FID'='character', 'IID'='character')
)


In [10]:
# Rows x columns of the phenotype table.
phe_df %>% dim()

## PRS

In [12]:
# Read one score file per trait, rename its score column to `PRS.<trait>`,
# and inner-join all traits on (FID, IID) into a single wide table.
sscore_df <- lapply(
    phe_info_df[['annotation']],
    function(phe){
        score <- fread(
            sprintf(prs_f, phe, phe),
            select=c('#FID', 'IID', 'SCORE1_SUM'),
            colClasses=c('#FID'='character', 'IID'='character')
        )
        rename(score, 'FID'='#FID', !! paste0('PRS.', phe) := 'SCORE1_SUM')
    }
) %>%
reduce(function(acc, nxt) inner_join(acc, nxt, by=c('FID', 'IID')))


In [13]:
# Rows x columns of the wide PRS table.
dim(sscore_df)

## join

In [25]:
# Build the long-format table: one row per (FID, IID, trait) holding the PRS,
# the log-transformed phenotype, the residual phenotype and the population
# split label, plus two derived scores.
#
# The prefix-stripping patterns escape the dot ('\\.') so that only the literal
# 'PRS.', 'log.' and 'residual.' prefixes are removed; an unescaped '.' would
# match any character.
# gather() is kept on purpose: it stacks column by column, which fixes the row
# order of the table written to disk.
long_df <- sscore_df %>% 
gather(trait, PRS, -FID, -IID) %>%
mutate(trait = str_replace(trait, '^PRS\\.', '')) %>%
# full joins keep individuals/traits that appear in only one of the sources
full_join(
    phe_df %>%
    select(all_of(c('FID', 'IID', paste0('log.', phe_info_df %>% pull(annotation))))) %>%
    gather(trait, log_phe, -FID, -IID) %>%
    mutate(trait = str_replace(trait, '^log\\.', '')),
    by=c('FID', 'IID', 'trait')
) %>%
full_join(
    phe_df %>%
    select(all_of(c('FID', 'IID', paste0('residual.', phe_info_df %>% pull(annotation))))) %>%
    gather(trait, residual_phe, -FID, -IID) %>%
    mutate(trait = str_replace(trait, '^residual\\.', '')),
    by=c('FID', 'IID', 'trait')
) %>%
left_join(
    pop_split_df %>%
    select(FID, IID, pop_split),
    by=c('FID', 'IID')
) %>%
mutate(
    # difference between the log-transformed and the residual phenotype
    covar_score = log_phe - residual_phe,
    # covariate part plus the polygenic score
    total_score = covar_score + PRS
)


In [26]:
# Write the long table as TSV; the first column is renamed to '#FID' in the
# header. `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
long_df %>%
rename('#FID' = 'FID') %>%
fwrite(out_long_phe_f, sep='\t', na = "NA", quote=FALSE)


In [27]:
# Echo the path the long-format table is written to.
out_long_phe_f

In [46]:
# Count, per population/split group, the distinct individuals that have at
# least one trait with all four quantities non-missing.
count(
    distinct(
        drop_na(long_df, log_phe, residual_phe, covar_score, total_score),
        FID, IID, pop_split
    ),
    pop_split
)

pop_split,n
<chr>,<int>
african,6021
e_asian,1082
non_british_white,23595
s_asian,7341
white_british:test,63818
white_british:train,223327
white_british:val,31929
,104389


## eval

In [28]:
# R^2 of the linear fit `response ~ 1 + pred`.
#
# The predictor is re-scaled by the fitted intercept and slope before it is
# scored. This is not the same quantity as 1 - SSE/SST evaluated on `pred`
# as-is, which is kept here for reference:
#     1 - sum((response - pred)^2) / sum((response - mean(response))^2)
#
# Args:
#   response: numeric vector of observed values.
#   pred:     numeric vector of predictions, same length as `response`.
# Returns:
#   The R^2 reported by summary(lm(...)); NA_real_ when no observation has
#   both values non-missing (lm() would otherwise stop with an error and
#   abort a whole evaluation loop).
r2 <- function(response, pred){
    has_both <- !is.na(response) & !is.na(pred)
    if (!any(has_both)) {
        return(NA_real_)
    }
    summary(lm(response ~ 1 + pred))$r.squared
}


In [31]:
# Evaluate the three predictors of one trait within one population/split group.
#
# Args:
#   long_df: long-format table with columns trait, pop_split, log_phe, PRS,
#            covar_score and total_score.
#   phe:     trait annotation to evaluate (compared against `trait`).
#   p_s:     population/split label to evaluate (compared against `pop_split`).
# Returns:
#   A one-row data.frame with `trait`, `pop_split` and the r2() of `log_phe`
#   against PRS (`geno`), covar_score (`covar`) and total_score (`geno_covar`).
build_eval_df <- function(long_df, phe, p_s){
    # Only rows with both log_phe and PRS present are kept.
    # NOTE(review): covar_score / total_score can still be NA here; with lm()'s
    # default NA handling each fit would drop its own incomplete rows, so the
    # three R^2 values may be based on different subsets -- confirm intended.
    eval_rows <- long_df %>% 
    filter(trait == phe, pop_split == p_s) %>%
    drop_na(log_phe, PRS)
    
    data.frame(
        trait      = phe,
        pop_split  = p_s,
        geno       = r2(eval_rows$log_phe, eval_rows$PRS),
        covar      = r2(eval_rows$log_phe, eval_rows$covar_score),
        geno_covar = r2(eval_rows$log_phe, eval_rows$total_score),
        # spelled out: `F` is an ordinary variable that can be reassigned
        stringsAsFactors = FALSE
    )    
}


In [32]:
# Smoke test: evaluate a single trait in a single population/split group and
# inspect the resulting one-row table before running every combination.
build_eval_df(long_df, phe = 'Testosterone', p_s = 'white_british:train')

trait,pop_split,geno,covar,geno_covar
<chr>,<chr>,<dbl>,<dbl>,<dbl>
Testosterone,white_british:train,0.00926571,0.9034295,0.913574


In [33]:
# Evaluate every trait in every non-missing population/split group and stack
# the one-row results (outer loop: group, inner loop: trait).
r2_eval_df <- bind_rows(lapply(
    unique(pull(drop_na(pop_split_df, pop_split), pop_split)),
    function(p_s){
        bind_rows(lapply(
            pull(phe_info_df, annotation),
            function(phe) build_eval_df(long_df, phe, p_s)
        ))
    }
))


In [34]:
# Rows x columns of the evaluation table.
dim(r2_eval_df)

In [43]:
# Attach the phenotype name to each evaluated annotation and write the summary
# as TSV. The first column is written as '#trait' and holds the `Phenotype`
# value; the original `trait` value is written as `annotation`.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
r2_eval_df %>%
rename('annotation'='trait') %>%
left_join(
    phe_info_df %>%
    select(Phenotype, annotation),
    by='annotation'
) %>%
select('#trait' = Phenotype, annotation, pop_split, geno, covar, geno_covar) %>%
fwrite(out_f, sep='\t', na = "NA", quote=FALSE)


In [44]:
# Echo the path the R^2 summary table is written to.
out_f