In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
    library(gridExtra)
}))


### Step 1. Read the relevant data

In [2]:
# Locations of the snpnet outputs for the phenotype under study.
lab_repo_dir <- '/oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab'
out_dir <- file.path(lab_repo_dir, 'sex-div-analysis/snpnet/out/v2.1')
pheno <- 'Testosterone_residuals'

# One .sscore path per PRS model, keyed by model name. The 'male' entry is
# read from the 'onesex' directory and the 'female' entry from 'zerosex'.
score_files <- lapply(
    list(male = 'onesex', female = 'zerosex', combined = 'combined'),
    function(model_dir){
        file.path(
            out_dir, model_dir,
            pheno, 'results/score', paste0(pheno, '.sscore')
        )
    }
)


In [12]:
# Read SCORE1_SUM from every model's .sscore file, tag each row with the
# model it came from, stack the tables, and reshape so that each model's
# score becomes its own column (one row per FID/IID).
score_df <- names(score_files) %>%
lapply(function(model_name){
    score_files[[model_name]] %>%
    fread(sep='\t', colClasses=c('#FID'='character', 'IID'='character')) %>%
    rename('FID' = '#FID') %>%
    select(FID, IID, SCORE1_SUM) %>%
    mutate(score_type = model_name)
}) %>%
bind_rows() %>%
spread(score_type, SCORE1_SUM)


In [4]:
phe_dir <- file.path(lab_repo_dir, 'sex-div-analysis/snpnet/phe_data/v2')

# One phenotype file per cohort, keyed by cohort name ('male' reads the
# 'onesex' file, 'female' the 'zerosex' file).
phe_files <- lapply(
    list(male = 'onesex', female = 'zerosex', combined = 'combined'),
    function(phe_prefix){
        file.path(phe_dir, paste0(phe_prefix, '.phe'))
    }
)


In [13]:
# Read the phenotype column from every cohort's .phe file, tag each row with
# its cohort name, and stack the tables into one long data frame.
phe_df <- names(phe_files) %>%
lapply(function(cohort_name){
    phe_files[[cohort_name]] %>%
    fread(sep='\t', colClasses=c('FID'='character', 'IID'='character')) %>%
    select(FID, IID, Testosterone_residuals) %>%
    mutate(cohort = cohort_name)
}) %>%
bind_rows()


In [14]:
# Read the per-individual 'split' label from the covariate file and expose it
# under the column name 'keep'.
split_df <- rename(
    fread(
        '/oak/stanford/groups/mrivas/ukbb24983/sqc/population_stratification_w24983_20200313/ukb24983_GWAS_covar.20200313.phe',
        select=c('FID', 'IID', 'split'),
        colClasses=c('FID'='character', 'IID'='character')
    ),
    'keep'='split'
)


In [15]:
# Number of phenotype rows in each cohort.
count(phe_df, cohort)

cohort,n
<chr>,<int>
combined,365682
female,183130
male,182552


In [16]:
# Attach the split label and then the PRS columns to every phenotype row,
# matching on FID and IID.
all_df <- left_join(
    left_join(phe_df, split_df, by=c('FID', 'IID')),
    score_df,
    by=c('FID', 'IID')
)

# Print the dimensions after and before the joins so the row counts can be
# compared.
print(dim(all_df))
print(dim(phe_df))


[1] 731364      8
[1] 731364      4


In [17]:
# Show only the column names and types of the joined table.
head(all_df, 0)

FID,IID,Testosterone_residuals,cohort,keep,combined,female,male
<chr>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>


### Step 2. Compute statistics for the plots

In [18]:
# Summarise a phenotype within one PRS percentile bin.
#
# Args:
#   df:             data frame holding the percentile and phenotype columns.
#   percentile_col: name (string) of the percentile column.
#   phe_col:        name (string) of the phenotype column.
#   l_bin, u_bin:   bin edges; rows with l_bin < percentile <= u_bin are used
#                   (rows with a missing percentile are dropped).
#
# Returns:
#   A one-row data.frame with l_bin, u_bin, mean, std_err, l_err and u_err
#   (the mean minus / plus one standard error), and mean_str, a character
#   string formatted as 'mean (l_err-u_err)'.
compute_mean <- function(df, percentile_col, phe_col, l_bin, u_bin){
    stopifnot(
        percentile_col %in% names(df),
        phe_col %in% names(df)
    )

    # Columns are pulled by name with `[[`, which avoids the tidyselect
    # "external vector" ambiguity of rename(Percentile = percentile_col, ...).
    percentile <- df[[percentile_col]]
    in_bin     <- !is.na(percentile) & l_bin < percentile & percentile <= u_bin
    phe_in_bin <- df[[phe_col]][in_bin]

    # Locals deliberately avoid the names `mean` and `sd` so the base/stats
    # functions are not shadowed inside this function.
    n_in_bin <- length(phe_in_bin)
    phe_mean <- mean(phe_in_bin)
    std_e    <- sd(phe_in_bin) / sqrt(n_in_bin)
    l_err    <- phe_mean - std_e
    u_err    <- phe_mean + std_e

    data.frame(
        l_bin = l_bin,
        u_bin = u_bin,
        mean   = phe_mean,
        std_err = std_e,
        l_err = l_err,
        u_err = u_err,
        mean_str = sprintf('%.3f (%.3f-%.3f)', phe_mean, l_err, u_err),
        # keep mean_str as character regardless of the R version's default
        stringsAsFactors = FALSE
    )
}


In [19]:
# Summarise a phenotype across a fixed set of PRS percentile bins.
#
# The bins are (0, 1%], (1%, 5%], then 5%-wide bins up to 95%, followed by
# (95%, 99%] and (99%, 100%]. compute_mean() is called once per bin.
#
# Args:
#   df:             data frame passed through to compute_mean().
#   percentile_col: name (string) of the percentile column.
#   phe_col:        name (string) of the phenotype column.
#   PRS_type:       label stored in the PRS_type column of every output row.
#
# Returns:
#   A data.frame with one row per bin (the columns returned by
#   compute_mean()) plus a PRS_type column.
compute_summary_df <- function(df, percentile_col, phe_col, PRS_type){
    # Consecutive pairs of edges define the (lower, upper] bins.
    bin_edges <- c(0, .01, (1:19) / 20, .99, 1)

    summary_df <- do.call(rbind, lapply(
        seq_len(length(bin_edges) - 1),
        function(i){
            compute_mean(
                df, percentile_col, phe_col,
                bin_edges[i], bin_edges[i + 1]
            )
        }
    ))
    summary_df$PRS_type <- PRS_type
    summary_df
}


In [20]:
# Number of rows for every combination of cohort and split label.
count(all_df, cohort, keep)


cohort,keep,n
<chr>,<chr>,<int>
combined,test,57241
combined,train,200476
combined,val,28643
combined,,79322
female,test,28640
female,train,99563
female,val,14049
female,,40878
male,test,28601
male,train,100913


### Split into different PRS models

In [21]:
# Keep the rows whose split label is 'test' and, separately within each
# cohort, convert every PRS into a rank-based percentile. Scores are negated
# before ranking, so the highest score receives the smallest percentile.
test_df <- all_df %>%
filter(keep == 'test') %>%
group_by(cohort) %>%
mutate(
    combined_Percentile = rank(-combined) / n(),
    female_Percentile   = rank(-female)   / n(),
    male_Percentile     = rank(-male)     / n()
) %>%
ungroup() %>%
# the split label is constant after the filter, so drop it
select(-keep)


In [22]:
# Per-bin phenotype summaries, indexed as summary_dfs[[cohort]][[PRS model]].
summary_dfs   <- list()

# The loop variable is not called `c`, so base::c() is not shadowed.
for(sex_cohort in c('male', 'female')){
    # The cohort subset does not depend on the score, so filter once per cohort.
    cohort_test_df <- test_df %>% filter(cohort == sex_cohort)

    summary_dfs[[sex_cohort]] <- list()
    for(score in c('combined', 'male', 'female')){
        summary_dfs[[sex_cohort]][[score]] <- compute_summary_df(
            cohort_test_df,
            paste0(score, '_Percentile'),
            'Testosterone_residuals',
            score
        )
    }
}


Note: Using an external vector in selections is ambiguous.
[34mℹ[39m Use `all_of(percentile_col)` instead of `percentile_col` to silence this message.
[34mℹ[39m See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
[90mThis message is displayed once per session.[39m
Note: Using an external vector in selections is ambiguous.
[34mℹ[39m Use `all_of(phe_col)` instead of `phe_col` to silence this message.
[34mℹ[39m See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
[90mThis message is displayed once per session.[39m


In [23]:
# Write every cohort x PRS-model summary table to tbls/<cohort>_<score>.tsv
# and print the first two bins of each as a quick check.

# fwrite() does not create missing directories; make sure 'tbls' exists.
dir.create('tbls', showWarnings = FALSE)

# The loop variable is not called `c`, so base::c() is not shadowed.
for(sex_cohort in c('male', 'female')){
    for(score in c('combined', 'male', 'female')){
        summary_df <- summary_dfs[[sex_cohort]][[score]]
        summary_df %>% fwrite(file.path('tbls', paste0(sex_cohort, '_', score, '.tsv')), sep='\t')
        print(paste0(sex_cohort, ' ', score))
        summary_df %>% head(2) %>% select(l_bin, u_bin, mean_str) %>% print()
    }
}


[1] "male combined"
  l_bin u_bin            mean_str
1  0.00  0.01 0.083 (0.076-0.091)
2  0.01  0.05 0.057 (0.053-0.062)
[1] "male male"
  l_bin u_bin            mean_str
1  0.00  0.01 0.120 (0.112-0.127)
2  0.01  0.05 0.086 (0.082-0.089)
[1] "male female"
  l_bin u_bin              mean_str
1  0.00  0.01 -0.003 (-0.011-0.004)
2  0.01  0.05   0.005 (0.001-0.010)
[1] "female combined"
  l_bin u_bin            mean_str
1  0.00  0.01 0.067 (0.055-0.078)
2  0.01  0.05 0.044 (0.039-0.050)
[1] "female male"
  l_bin u_bin              mean_str
1  0.00  0.01   0.015 (0.003-0.027)
2  0.01  0.05 -0.003 (-0.009-0.003)
[1] "female female"
  l_bin u_bin            mean_str
1  0.00  0.01 0.099 (0.088-0.109)
2  0.01  0.05 0.068 (0.062-0.073)


In [24]:
# Spearman rank correlation between the combined-model PRS and the
# male-specific PRS, among test-set rows of the male cohort.
# NOTE: the piped expressions are passed to cor.test() inline; the printed
# 'data:' line is built from them, so restructuring changes the output.
cor.test(
    test_df %>% filter(cohort == 'male') %>% select(combined) %>% pull(),
    test_df %>% filter(cohort == 'male') %>% select(male) %>% pull(),
    method='spearman'
) %>% print()

# Same comparison for the female cohort: combined-model PRS vs the
# female-specific PRS.
cor.test(
    test_df %>% filter(cohort == 'female') %>% select(combined) %>% pull(),
    test_df %>% filter(cohort == 'female') %>% select(female) %>% pull(),
    method='spearman'
) %>% print()


“Cannot compute exact p-value with ties”


	Spearman's rank correlation rho

data:  test_df %>% filter(cohort == "male") %>% select(combined) %>%  and test_df %>% filter(cohort == "male") %>% select(male) %>% pull()    pull() and test_df %>% filter(cohort == "male") %>% select(male) %>% pull()
S = 1.5878e+12, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.5928003 



“Cannot compute exact p-value with ties”


	Spearman's rank correlation rho

data:  test_df %>% filter(cohort == "female") %>% select(combined) %>%  and test_df %>% filter(cohort == "female") %>% select(female) %>%     pull() and     pull()
S = 1.5572e+12, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.6022742 



In [25]:
# Pearson correlation between each PRS and the phenotype
# (Testosterone_residuals) in the test set, one test per cohort x PRS model.

# Male cohort, male-specific PRS.
cor.test(
    test_df %>% filter(cohort == 'male') %>% select(male) %>% pull(),
    test_df %>% filter(cohort == 'male') %>% select(Testosterone_residuals) %>% pull(),
    method='pearson'
) %>% print()

# Female cohort, female-specific PRS.
cor.test(
    test_df %>% filter(cohort == 'female') %>% select(female) %>% pull(),
    test_df %>% filter(cohort == 'female') %>% select(Testosterone_residuals) %>% pull(),
    method='pearson'
) %>% print()

# Male cohort, combined-model PRS.
cor.test(
    test_df %>% filter(cohort == 'male') %>% select(combined) %>% pull(),
    test_df %>% filter(cohort == 'male') %>% select(Testosterone_residuals) %>% pull(),
    method='pearson'
) %>% print()

# Female cohort, combined-model PRS.
cor.test(
    test_df %>% filter(cohort == 'female') %>% select(combined) %>% pull(),
    test_df %>% filter(cohort == 'female') %>% select(Testosterone_residuals) %>% pull(),
    method='pearson'
) %>% print()

# Male cohort, female-specific PRS (cross-sex application).
cor.test(
    test_df %>% filter(cohort == 'male') %>% select(female) %>% pull(),
    test_df %>% filter(cohort == 'male') %>% select(Testosterone_residuals) %>% pull(),
    method='pearson'
) %>% print()

# Female cohort, male-specific PRS (cross-sex application).
cor.test(
    test_df %>% filter(cohort == 'female') %>% select(male) %>% pull(),
    test_df %>% filter(cohort == 'female') %>% select(Testosterone_residuals) %>% pull(),
    method='pearson'
) %>% print()




	Pearson's product-moment correlation

data:  test_df %>% filter(cohort == "male") %>% select(male) %>% pull() and test_df %>% filter(cohort == "male") %>% select(Testosterone_residuals) %>% test_df %>% filter(cohort == "male") %>% select(male) %>% pull() and     pull()
t = 55.408, df = 28599, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.3008485 0.3217806
sample estimates:
      cor 
0.3113523 


	Pearson's product-moment correlation

data:  test_df %>% filter(cohort == "female") %>% select(female) %>%  and test_df %>% filter(cohort == "female") %>% select(Testosterone_residuals) %>%     pull() and     pull()
t = 31.679, df = 28638, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.1727902 0.1951691
sample estimates:
      cor 
0.1840035 


	Pearson's product-moment correlation

data:  test_df %>% filter(cohort == "male") %>% select(combined) %>%  and test_d

### Plot

In [26]:
require(latex2exp)

Loading required package: latex2exp


In [27]:
# Plot colour for each PRS model.
colors <- list(
    combined = 'gray',
    male     = '#8DA0CB',
    female   = '#FC8D62'
)


In [28]:
# One plot per cohort: mean phenotype (with +/- one standard error bars) in
# every PRS percentile bin, coloured by the PRS model used for the binning.
peformance_plots <- list()
# The loop variable is not called `c`, so base::c() is not shadowed.
for(sex_cohort in c('male', 'female')){
    peformance_plots[[sex_cohort]] <- summary_dfs[[sex_cohort]] %>% bind_rows() %>%
    mutate(
        x_ticks_labels = paste0('[', 100 * l_bin, '% - ', 100 * u_bin, '%]'),
        # 'female' contains 'male', so this one replacement turns both sex
        # labels into '<sex>-specific' and leaves 'combined' untouched.
        PRS_type = str_replace_all(PRS_type, 'male', 'male-specific')
    ) %>%
    ggplot(aes(x=reorder(x_ticks_labels, -u_bin), y=mean, color=PRS_type)) +
    geom_point() + 
    geom_errorbar(aes(ymin = l_err, ymax = u_err)) +
    geom_hline(yintercept = 0, color='gray')+
    theme_bw() + 
    theme(
        legend.position=c(.2, .8),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust=.5)
    ) +
    # Name the values so each label gets its colour explicitly instead of
    # relying on the alphabetical order of the PRS_type levels.
    scale_color_manual(values=c(
        'combined'        = colors[['combined']],
        'female-specific' = colors[['female']],
        'male-specific'   = colors[['male']]
    )) +
    labs(
        x = 'The polygenic risk score percentile',
        y = TeX('Testosterone (log_{10}-transformed residuals after covariate adjustment)'),
        color = 'Polygenic risk score'
    )
}


In [29]:
# peformance_plots[['male']] + 
# scale_y_continuous(label=(function(x){10 ** x}))



In [30]:
# Place the male and female performance plots side by side in one row.
g <- arrangeGrob(
    grobs = list(
        peformance_plots[['male']]   + labs(title='(A) The snpnet PRS performance (male)'),
        peformance_plots[['female']] + labs(title='(B) The snpnet PRS performance (female)')
    ),
    nrow = 1
)


In [31]:
# Save the two-panel performance figure as both PNG and PDF.
ggsave(filename = 'Testosterone.combined.png', plot = g, width = 12, height = 6)
ggsave(filename = 'Testosterone.combined.pdf', plot = g, width = 12, height = 6)


## Comparison of the sex-specific models and the combined model


In [23]:
# One scatter plot per cohort comparing the standardised sex-specific PRS
# (x) with the standardised combined PRS (y); the red line is y = x.
comp_plots <- list()
# The loop variable is not called `c`, so base::c() is not shadowed.
for(sex_cohort in c('male', 'female')){
    comp_plots[[sex_cohort]] <- test_df %>% 
    filter(cohort == sex_cohort) %>%
    # all_of() marks the string as a column name held in an external
    # variable, which is what tidyselect asks for in this situation.
    rename('sex' = tidyselect::all_of(sex_cohort)) %>%    
    select(FID, IID, combined, sex) %>%    
    mutate(
        # scale() returns a one-column matrix; as.numeric() keeps the
        # columns as plain numeric vectors.
        sex      = as.numeric(scale(sex,      center = TRUE, scale = TRUE)),
        combined = as.numeric(scale(combined, center = TRUE, scale = TRUE))
    ) %>%
    ggplot(aes(x = sex, y=combined)) + 
    geom_abline(slope = 1, intercept = 0, color='red', alpha=0.5) +
    geom_point(alpha=0.05) + 
    theme_bw() + 
    theme(legend.position = "none") +
    labs(
        x = sprintf('%s-specific PRS', sex_cohort), 
        y = 'combined PRS'
    )    
}


In [24]:
# Place the male and female PRS-comparison scatter plots side by side.
comp_plot_g <- arrangeGrob(
    grobs = list(
        comp_plots[['male']]   + labs(title='(A) Comparison of PRSs (male)'),
        comp_plots[['female']] + labs(title='(B) Comparison of PRSs (female)')
    ),
    nrow = 1
)


In [25]:
# Save the two-panel PRS comparison figure as both PNG and PDF.
ggsave(filename = 'Testosterone.comp.png', plot = comp_plot_g, width = 12, height = 6)
ggsave(filename = 'Testosterone.comp.pdf', plot = comp_plot_g, width = 12, height = 6)
