In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
    library(gridExtra)
}))


# Note

- This notebook is copied from `PRS_comp.ipynb`

### Step 1. Read the relevant data

In [6]:
# Base locations and the phenotype analysed in this notebook.
lab_repo_dir <- '@@@@@'
out_dir <- file.path('@@@@@', 'sex-div-analysis/06_snpnet/v2.1-basic-covars')
pheno <- 'Testosterone'

# Path of the `<pheno>.sscore.zst` score file for each score label.
# The label -> sub-directory mapping is: male -> onesex, female -> zerosex,
# combined -> combined. The element order (male, female, combined) is kept.
score_files <- lapply(
    c(
        male     = 'onesex_basic-covars',
        female   = 'zerosex_basic-covars',
        combined = 'combined_basic-covars'
    ),
    function(model_dir){
        file.path(out_dir, model_dir, paste0(pheno, '.sscore.zst'))
    }
)


In [7]:
# Read the score file of every entry in `score_files` and reshape the result to
# one row per (FID, IID) with one SCORE1_SUM column per score label.
score_df <- bind_rows(lapply(
    names(score_files), function(score_label){
        fread(
            # shQuote() keeps the path a single shell argument even if it
            # contains spaces or shell metacharacters.
            cmd=paste('zstdcat', shQuote(score_files[[score_label]])),
            sep='\t',
            # Read the identifiers as character so they are not coerced to numbers.
            colClasses=c('#FID'='character', 'IID'='character')
        ) %>%
        rename('FID' = '#FID') %>%
        select(FID, IID, SCORE1_SUM) %>%
        mutate(score_type = score_label)
    }
# NOTE(review): spread() is superseded by tidyr::pivot_wider(); it is kept here
# so the resulting column order stays as in the recorded outputs.
)) %>% spread(score_type, SCORE1_SUM)


In [3]:
# Display the configured results directory as a quick sanity check.
out_dir

In [9]:
# Path to the Testosterone phenotype table (read into `phe_all_df` in a later cell).
phe_file <- '@@@@@/sex-div-analysis/06_snpnet/phe_data/v2.1-basic-covars/Testosterone.phe'


In [12]:
# Load the phenotype table; FID/IID are forced to character so the identifiers
# are not parsed as numbers.
phe_all_df <- fread(phe_file, colClasses=c('FID'='character', 'IID'='character'))


In [46]:
# Mean Testosterone per cohort. A row belongs to a cohort when the cohort's
# split column is not '-' (presumably '-' marks individuals not used by that
# model -- confirm against the phenotype file documentation).
means <- lapply(
    c(male = 'split_onesex', female = 'split_zerosex', combined = 'split_combined'),
    function(split_col){
        # which() drops rows whose split value is NA, as dplyr::filter() does.
        in_cohort <- which(phe_all_df[[split_col]] != '-')
        mean(phe_all_df[['Testosterone']][in_cohort])
    }
)


In [47]:
# Display the per-cohort Testosterone means.
means

In [48]:
# Build a long phenotype table: for every cohort keep the individuals whose
# split column is not '-', tag them with the cohort label, and center
# Testosterone on that cohort's mean (taken from `means`).
# An individual can appear once per cohort they belong to.
cohort_split_cols <- c(
    male     = 'split_onesex',
    female   = 'split_zerosex',
    combined = 'split_combined'
)

phe_df <- bind_rows(lapply(
    names(cohort_split_cols), function(cohort_name){
        split_col <- cohort_split_cols[[cohort_name]]
        phe_all_df %>%
        filter(.data[[split_col]] != '-') %>%
        select(FID, IID, Testosterone) %>%
        mutate(
            cohort = cohort_name,
            Testosterone = Testosterone - means[[cohort_name]]
        )
    }
))


In [49]:
# Read the per-individual `split` column from the covariate file and expose it
# as `keep`; identifiers are read as character to match the other tables.
split_df <- fread(
    '@@@@@/ukb24983_GWAS_covar.20200313.phe',
    select=c('FID', 'IID', 'split'),
    colClasses=c('FID'='character', 'IID'='character')
) %>%
rename(keep = 'split')


In [50]:
# Attach the split label and the three PRS columns to every phenotype row.
all_df <- left_join(
    left_join(phe_df, split_df, by=c('FID', 'IID')),
    score_df,
    by=c('FID', 'IID')
)

# The two row counts should agree: left joins must not duplicate phenotype rows.
print(dim(all_df))
print(dim(phe_df))


[1] 572720      8
[1] 572720      4


In [51]:
# Zero-row preview: shows only the column names and types of `all_df`.
all_df %>% head(0)

FID,IID,Testosterone,cohort,keep,combined,female,male
<chr>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>


### Step 2. Compute statistics for the plots

In [52]:
#' Summarise a phenotype within one percentile bin.
#'
#' Selects the rows of `df` whose percentile lies in the half-open interval
#' (`l_bin`, `u_bin`] and reports the mean of the phenotype with its standard
#' error (sd / sqrt(n)). Rows with a missing percentile are excluded.
#'
#' @param df A data frame (data.frame, tibble or data.table).
#' @param percentile_col Name (string) of the percentile column.
#' @param phe_col Name (string) of the phenotype column.
#' @param l_bin Exclusive lower bound of the bin.
#' @param u_bin Inclusive upper bound of the bin.
#' @return A one-row data.frame with columns `l_bin`, `u_bin`, `mean`,
#'   `std_err`, `l_err` (mean - SE), `u_err` (mean + SE) and the character
#'   column `mean_str` formatted as 'mean (l_err-u_err)'.
compute_mean <- function(df, percentile_col, phe_col, l_bin, u_bin){
    # Fail loudly on a misspelled column instead of silently returning an
    # empty bin (`[[` returns NULL for an unknown name).
    stopifnot(
        percentile_col %in% names(df),
        phe_col %in% names(df)
    )

    # Extract by name with `[[` rather than passing strings to rename().
    percentile <- df[[percentile_col]]
    # which() drops NA comparisons, matching the semantics of dplyr::filter().
    in_bin     <- which(l_bin < percentile & percentile <= u_bin)
    phe_values <- df[[phe_col]][in_bin]

    n_in_bin <- length(phe_values)
    phe_mean <- mean(phe_values)
    phe_sd   <- sd(phe_values)
    std_e    <- phe_sd / sqrt(n_in_bin)
    l_err    <- phe_mean - std_e
    u_err    <- phe_mean + std_e

    data.frame(
        l_bin = l_bin,
        u_bin = u_bin,
        mean   = phe_mean,
        std_err = std_e,
        l_err = l_err,
        u_err = u_err,
        mean_str = sprintf('%.3f (%.3f-%.3f)', phe_mean, l_err, u_err),
        # Keep mean_str a character column on every R version.
        stringsAsFactors = FALSE
    )
}


In [53]:
#' Summarise a phenotype across all percentile bins of one PRS.
#'
#' Calls `compute_mean()` for 22 consecutive bins: (0, 1%], (1%, 5%], the
#' eighteen 5%-wide bins from 5% to 95%, (95%, 99%] and (99%, 100%].
#'
#' @param df A data frame holding the percentile and phenotype columns.
#' @param percentile_col Name (string) of the percentile column.
#' @param phe_col Name (string) of the phenotype column.
#' @param PRS_type Label stored in the `PRS_type` column of every output row.
#' @return The row-bound `compute_mean()` results, one row per bin, with an
#'   added `PRS_type` column.
compute_summary_df <- function(df, percentile_col, phe_col, PRS_type){
    # Consecutive bin edges; each bin is (edge[i], edge[i + 1]].
    bin_edges  <- c(0, .01, (1:19) / 20, .99, 1)
    lower_edge <- head(bin_edges, -1)
    upper_edge <- tail(bin_edges, -1)

    per_bin <- Map(
        function(l_bin, u_bin){
            compute_mean(df, percentile_col, phe_col, l_bin, u_bin)
        },
        lower_edge, upper_edge
    )

    bind_rows(unname(per_bin)) %>%
    mutate(PRS_type = PRS_type)
}


In [54]:
# Number of rows for each combination of cohort and split label (`keep`).
all_df %>% count(cohort, keep)


cohort,keep,n
<chr>,<chr>,<int>
combined,test,57241
combined,train,200476
combined,val,28643
female,test,28640
female,train,99563
female,val,14049
male,test,28601
male,train,100913
male,val,14594


### Split into different PRS models: restrict to the test set and rank each PRS within cohort

In [55]:
# Descending rank of a score divided by the number of values: the highest
# score gets the smallest fraction.
desc_rank_fraction <- function(score){
    rank(-score) / length(score)
}

# Keep the rows labelled 'test' and, separately within each cohort, turn each
# of the three PRS columns into a descending rank fraction.
test_df <- all_df %>%
filter(keep == 'test') %>%
select(-keep) %>%
group_by(cohort) %>%
mutate(
    combined_Percentile = desc_rank_fraction(combined),
    female_Percentile   = desc_rank_fraction(female),
    male_Percentile     = desc_rank_fraction(male)
) %>%
ungroup()


In [56]:
# summary_dfs[[cohort]][[score]] holds the binned Testosterone summary of the
# given test cohort when individuals are ranked by the given PRS.
summary_dfs <- lapply(
    setNames(nm = c('male', 'female')),
    function(cohort_name){
        cohort_df <- test_df %>% filter(cohort == cohort_name)
        lapply(
            setNames(nm = c('combined', 'male', 'female')),
            function(score){
                compute_summary_df(
                    cohort_df,
                    paste0(score, '_Percentile'),
                    'Testosterone',
                    score
                )
            }
        )
    }
)


In [57]:
# Write every cohort x score summary table to tbls/ and print its first two bins.
tbl_dir <- 'tbls'
# fwrite() does not create missing directories, so make sure the target exists.
dir.create(tbl_dir, showWarnings = FALSE, recursive = TRUE)

# The loop variable is not called `c`, so it no longer masks base::c.
for(cohort_name in c('male', 'female')){
    for(score in c('combined', 'male', 'female')){
        summary_df <- summary_dfs[[cohort_name]][[score]]
        fwrite(
            summary_df,
            file.path(tbl_dir, paste0('basic-covars_', cohort_name, '_', score, '.tsv')),
            sep='\t'
        )
        print(paste0(cohort_name, ' ', score))
        summary_df %>% head(2) %>% select(l_bin, u_bin, mean_str) %>% print()
    }
}


[1] "male combined"
  l_bin u_bin            mean_str
1  0.00  0.01 0.081 (0.073-0.088)
2  0.01  0.05 0.057 (0.053-0.061)
[1] "male male"
  l_bin u_bin            mean_str
1  0.00  0.01 0.121 (0.114-0.128)
2  0.01  0.05 0.081 (0.077-0.085)
[1] "male female"
  l_bin u_bin              mean_str
1  0.00  0.01 -0.000 (-0.008-0.007)
2  0.01  0.05   0.006 (0.001-0.010)
[1] "female combined"
  l_bin u_bin            mean_str
1  0.00  0.01 0.068 (0.056-0.079)
2  0.01  0.05 0.039 (0.034-0.045)
[1] "female male"
  l_bin u_bin             mean_str
1  0.00  0.01 0.005 (-0.006-0.017)
2  0.01  0.05 0.001 (-0.005-0.007)
[1] "female female"
  l_bin u_bin            mean_str
1  0.00  0.01 0.095 (0.084-0.106)
2  0.01  0.05 0.072 (0.066-0.078)


In [58]:
# Spearman rank correlation between the combined PRS and the sex-specific PRS,
# computed within the test set of each sex.
for(cohort_name in c('male', 'female')){
    # Filter once per cohort instead of once per argument.
    cohort_scores <- test_df %>% filter(cohort == cohort_name)

    rho_test <- cor.test(
        cohort_scores[['combined']],
        cohort_scores[[cohort_name]],
        method='spearman',
        # With tied scores an exact p-value cannot be computed; asking for the
        # asymptotic test directly avoids the "Cannot compute exact p-value
        # with ties" warning.
        exact=FALSE
    )
    # Replace the deparsed call with a readable label in the printed 'data:' line.
    rho_test$data.name <- sprintf(
        'combined PRS and %s-specific PRS (%s test set)', cohort_name, cohort_name
    )
    print(rho_test)
}


“Cannot compute exact p-value with ties”


	Spearman's rank correlation rho

data:  test_df %>% filter(cohort == "male") %>% select(combined) %>%  and test_df %>% filter(cohort == "male") %>% select(male) %>% pull()    pull() and test_df %>% filter(cohort == "male") %>% select(male) %>% pull()
S = 1.5899e+12, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.5922553 



“Cannot compute exact p-value with ties”


	Spearman's rank correlation rho

data:  test_df %>% filter(cohort == "female") %>% select(combined) %>%  and test_df %>% filter(cohort == "female") %>% select(female) %>%     pull() and     pull()
S = 1.5712e+12, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.5987167 



In [59]:
# Pearson correlation between a PRS and mean-centered Testosterone within a
# test cohort. Each entry is c(<test cohort>, <PRS column>); the order matches
# the order in which the results were originally printed.
pearson_pairs <- list(
    c('male',   'male'),
    c('female', 'female'),
    c('male',   'combined'),
    c('female', 'combined'),
    c('male',   'female'),
    c('female', 'male')
)

for(pair in pearson_pairs){
    cohort_name <- pair[[1]]
    score_name  <- pair[[2]]
    cohort_scores <- test_df %>% filter(cohort == cohort_name)

    r_test <- cor.test(
        cohort_scores[[score_name]],
        cohort_scores[['Testosterone']],
        method='pearson'
    )
    # Replace the deparsed call with a readable label in the printed 'data:' line.
    r_test$data.name <- sprintf(
        '%s PRS and Testosterone (%s test set)', score_name, cohort_name
    )
    print(r_test)
}




	Pearson's product-moment correlation

data:  test_df %>% filter(cohort == "male") %>% select(male) %>% pull() and test_df %>% filter(cohort == "male") %>% select(Testosterone) %>% test_df %>% filter(cohort == "male") %>% select(male) %>% pull() and     pull()
t = 54.245, df = 28599, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.2948897 0.3159064
sample estimates:
      cor 
0.3054353 


	Pearson's product-moment correlation

data:  test_df %>% filter(cohort == "female") %>% select(female) %>%  and test_df %>% filter(cohort == "female") %>% select(Testosterone) %>%     pull() and     pull()
t = 30.811, df = 28638, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.1678923 0.1903122
sample estimates:
      cor 
0.1791255 


	Pearson's product-moment correlation

data:  test_df %>% filter(cohort == "male") %>% select(combined) %>%  and test_df %>% filter(cohort 

### Plot

In [60]:
# library() stops immediately when the package is missing, whereas require()
# only returns FALSE and lets a later TeX() call fail with a less clear error.
library(latex2exp)

In [61]:
# Plot color assigned to each PRS model.
colors <- list(
    combined = 'gray',
    male     = '#8DA0CB',
    female   = '#FC8D62'
)


In [65]:
# Colors keyed by legend label, so the mapping no longer depends on the
# alphabetical order of the PRS_type values.
prs_colors <- c(
    'combined'        = colors[['combined']],
    'female-specific' = colors[['female']],
    'male-specific'   = colors[['male']]
)

# One plot per test cohort: mean (+/- one standard error) of the phenotype in
# every percentile bin, colored by the PRS used for ranking.
# The list name is kept as-is because later cells refer to it.
peformance_plots <- list()
for(cohort_name in c('male', 'female')){
    peformance_plots[[cohort_name]] <- summary_dfs[[cohort_name]] %>% bind_rows() %>%
    mutate(
        x_ticks_labels = paste0('[', 100 * l_bin, '% - ', 100 * u_bin, '%]'),
        # 'female' contains 'male', so this single replacement yields both
        # 'male-specific' and 'female-specific'; 'combined' is unchanged.
        PRS_type = str_replace_all(PRS_type, 'male', 'male-specific')
    ) %>%
    # Bins are ordered by decreasing upper bound along the x axis.
    # NOTE(review): percentiles come from descending ranks, so '[0% - 1%]' is
    # the highest-scoring bin -- confirm the axis reads as intended.
    ggplot(aes(x=reorder(x_ticks_labels, -u_bin), y=mean, color=PRS_type)) +
    geom_point() +
    geom_errorbar(aes(ymin = l_err, ymax = u_err)) +
    geom_hline(yintercept = 0, color='gray')+
    theme_bw() +
    theme(
        legend.position=c(.2, .8),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust=.5)
    ) +
    scale_color_manual(values=prs_colors) +
    labs(
        x = 'The polygenic risk score percentile',
        y = TeX('mean-adjusted log_{10}-transformed Testosterone'),
        color = 'Polygenic risk score'
    )
}


In [66]:
# Place the male and female performance plots side by side as panels (A) and (B).
g <- arrangeGrob(
    grobs=list(
        peformance_plots[['male']]   + labs(title='(A) The snpnet PRS performance (male)'),
        peformance_plots[['female']] + labs(title='(B) The snpnet PRS performance (female)')
    ),
    nrow=1
)


In [67]:
# Save the two-panel performance figure as PNG and as PDF with the same size.
for(fig_path in c(
    'Testosterone.combined.basic-covars.png',
    'Testosterone.combined.basic-covars.pdf'
)){
    ggsave(fig_path, g, width=12, height=6)
}


## Comparison of the sex-specific model and the combined model


In [23]:
# Scatter plot, per test cohort, of the standardized sex-specific PRS (x)
# against the standardized combined PRS (y), with the identity line in red.
comp_plots <- list()
for(cohort_name in c('male', 'female')){
    comp_plots[[cohort_name]] <- test_df %>%
    filter(cohort == cohort_name) %>%
    transmute(
        FID,
        IID,
        # scale() returns a one-column matrix; as.vector() keeps plain numeric columns.
        combined = as.vector(scale(combined, center = TRUE, scale = TRUE)),
        # .data[[...]] selects the cohort's own PRS column by name, avoiding a
        # bare external variable inside rename().
        sex      = as.vector(scale(.data[[cohort_name]], center = TRUE, scale = TRUE))
    ) %>%
    ggplot(aes(x = sex, y=combined)) +
    geom_abline(slope = 1, intercept = 0, color='red', alpha=0.5) +
    geom_point(alpha=0.05) +
    theme_bw() +
    theme(legend.position = "none") +
    labs(
        x = sprintf('%s-specific PRS', cohort_name),
        y = 'combined PRS'
    )
}


In [24]:
# Place the male and female PRS comparison plots side by side as panels (A) and (B).
comp_plot_g <- arrangeGrob(
    grobs=list(
        comp_plots[['male']]   + labs(title='(A) Comparison of PRSs (male)'),
        comp_plots[['female']] + labs(title='(B) Comparison of PRSs (female)')
    ),
    nrow=1
)


In [25]:
# Save the two-panel comparison figure as PNG and as PDF with the same size.
for(fig_path in c('Testosterone.comp.png', 'Testosterone.comp.pdf')){
    ggsave(fig_path, comp_plot_g, width=12, height=6)
}
