In [1]:
suppressPackageStartupMessages(require(tidyverse))
suppressPackageStartupMessages(require(data.table))


### Step 1. Read the relevant data

In [2]:
lab_repo_dir <- '@@@@@'
pheno <- 'Testosterone'

score_files <- list()

score_files[['male']] <- file.path(
    lab_repo_dir, 'sex-div-analysis/snpnet/out', 'onesex',  
    pheno, 'results/score', paste0(pheno, '.sscore')
)

score_files[['female']] <- file.path(
    lab_repo_dir, 'sex-div-analysis/snpnet/out', 'zerosex', 
    pheno, 'results/score', paste0(pheno, '.sscore')
)

score_files[['pre_meno']] <- file.path(
    lab_repo_dir, 'sex-div-analysis/snpnet/out', 'pre_meno',
    pheno, 'results/score', paste0(pheno, '.sscore')
)

score_files[['post_meno']] <- file.path(
    lab_repo_dir, 'sex-div-analysis/snpnet/out', 'post_meno', 
    pheno, 'results/score', paste0(pheno, '.sscore')
)

score_files[['combined']] <- file.path(
    lab_repo_dir, 'public-resources/uk_biobank/biomarkers/snpnet/data',
    pheno, 'results/score', paste0(pheno, '.sscore')
)


In [3]:
score_df <- bind_rows(lapply(
    names(score_files), function(x){    
        fread(score_files[[x]], sep='\t') %>% 
        rename('FID' = '#FID') %>% 
        select(FID, IID, SCORE1_SUM) %>%
        mutate(score_type = x)
    }
)) %>% spread(score_type, SCORE1_SUM)


In [4]:
t_med <- fread('@@@@@/sex_div_gwas/data/t_related.txt')

read_phe <- function(){
    phe_dir <- '@@@@@/biomarkers/covariate_corrected'
    phe_suffix <- 'phenotypes/residual/Testosterone.phe'
    
    bind_rows(lapply(c(
        'outputExtendedBMIreducedMaleWhiteBritish',
        'outputExtendedBMIreducedFemaleWhiteBritish'
    ), function(x){
        fread(file.path(phe_dir, x, phe_suffix))%>%
        filter(! IID %in% (t_med %>% select(IID) %>% pull())) %>%
        drop_na()
    }))
}


In [5]:
read_keep <- function(name){
    keep_dir <- '@@@@@/sex_div_gwas/phefiles'
    df <- fread(file.path(keep_dir, paste0(name, '.keep')))
    colnames(df) <- c('FID')
    df %>% mutate(IID = FID, keep = name)
}


In [6]:
sex_stratified_phe <- read_phe()


In [7]:
keep_df <- bind_rows(lapply(c('onesex', 'zerosex', 'pre_meno', 'post_meno'), function(x){
    read_keep(x) %>% 
    inner_join(sex_stratified_phe, by=c('FID', 'IID'))
}))


combined_phe <- fread(file.path(
    lab_repo_dir, 
    'public-resources/uk_biobank/biomarkers/snpnet/data/biomarkers_covar.phe'
))


split_df <- bind_rows(lapply(c('train', 'val', 'test'), function(x){
    df <- fread(file.path(
        '@@@@@/snpnet/geno/array_combined',
        paste0(x, '.fam')
    )) %>% mutate(keep = x)
    df %>% select(V1, V2, keep) %>% 
    rename('FID' = 'V1', 'IID' = 'V2')
}))


In [8]:
all_df <- keep_df %>%
rename('phe' = 'Testosterone') %>%
left_join(
    combined_phe %>% 
    select(FID, IID, sex, Testosterone) %>%
    rename('phe_combined' = 'Testosterone'),
    by=c('FID', 'IID')
) %>% 
left_join(
    split_df %>% rename('split' = 'keep'), by=c('FID', 'IID')
) %>% 
left_join(
    score_df, by=c('FID', 'IID')
) %>% drop_na()


In [9]:
all_df %>% count(keep, sex)

keep,sex,n
<chr>,<int>,<int>
onesex,1,144108
post_meno,0,77154
pre_meno,0,32087
zerosex,0,142253


### Step 2. Compute statistics for the plots

In [10]:
compute_residual_predictions_naive <- function(df, l_bin, u_bin, middle_df){
    stratified_df <- df %>% 
    filter(l_bin < Percentile, Percentile <= u_bin) %>%
    mutate(PRS_bin = T)
    
    n     <- stratified_df %>% nrow()
    mean  <- stratified_df %>% select(phe) %>% pull() %>% mean()
    sd    <- stratified_df %>% select(phe) %>% pull() %>% sd()
    std_e <- sd / sqrt(n)
    l_err <- mean - std_e
    u_err <- mean + std_e
        
    data.frame(
        l_bin = l_bin,
        u_bin = u_bin,
        mean   = mean,
        std_err = std_e,
        l_err = l_err,
        u_err = u_err,
        mean_str = sprintf('%.3f (%.3f-%.3f)', mean, l_err, u_err)
    ) %>%
    mutate(mean_str = as.character(mean_str))
}

In [11]:
compute_residual_predictions_glm <- function(df, l_bin, u_bin, middle_df){
    stratified_df <- df %>% 
    filter(l_bin < Percentile, Percentile <= u_bin) %>%
    mutate(PRS_bin = T)

    glmfit <- glm (
        phe ~ as.factor(PRS_bin),
        bind_rows(middle_df, stratified_df),
        family='gaussian'
    )
    
    mean    <- summary(glmfit)$coefficients[2,1]
    std_e <- summary(glmfit)$coefficients[2,2]   
    l_err   <- mean - std_e
    u_err   <- mean + std_e
        
    data.frame(
        l_bin = l_bin,
        u_bin = u_bin,
        mean   = mean,
        std_err = std_e,
        l_err = l_err,
        u_err = u_err,
        mean_str = sprintf('%.3f (%.3f-%.3f)', mean, l_err, u_err)
    ) %>%
    mutate(mean_str = as.character(mean_str))
}

In [12]:
compute_summary_df <- function(df, PRS_type, func){
    middle_df <- df %>% 
    filter(0.4 < Percentile, Percentile <= 0.6) %>%
    mutate(PRS_bin = F)

    bind_rows(
        func(df,   0, .01, middle_df),
        func(df, .01, .05, middle_df),
        lapply(2:19, function(x){
            func(df, (x-1)/20, x/20, middle_df)
        }),
        func(df, .95, .99, middle_df),
        func(df, .99, 1, middle_df),
    )  %>%
    mutate(PRS_type = PRS_type)
}


In [13]:
all_df %>% count(split, sex, keep)

split,sex,keep,n
<chr>,<int>,<chr>,<int>
test,0,post_meno,15699
test,0,pre_meno,6362
test,0,zerosex,28640
test,1,onesex,28601
train,0,post_meno,53860
train,0,pre_meno,22635
train,0,zerosex,99564
train,1,onesex,100913
val,0,post_meno,7595
val,0,pre_meno,3090


### split into different PRS models

In [14]:
test_df <- all_df %>% filter(split == 'test') %>% select(-split) %>%
rename('PRS_combined' = 'combined')

test_dfs <- list()


In [15]:
test_dfs[['male_combined']] <- test_df %>% 
filter(keep == 'onesex') %>% 
select(-sex, -keep, -female, -pre_meno, -post_meno) %>%
select(FID, IID, phe_combined, PRS_combined) %>%
rename('phe' = 'phe_combined', 'PRS' = 'PRS_combined') %>% 
mutate(Percentile = rank(-PRS) / n())


In [16]:
test_dfs[['male_sex']] <- test_df %>% 
filter(keep == 'onesex') %>% 
select(-sex, -keep, -female, -pre_meno, -post_meno) %>%
rename('PRS' = 'male') %>%
select(FID, IID, phe, PRS) %>%
mutate(Percentile = rank(-PRS) / n())


In [17]:
test_dfs[['female_combined']] <- test_df %>% 
filter(keep == 'zerosex') %>% 
select(-sex, -keep, -male) %>%
select(FID, IID, phe_combined, PRS_combined) %>%
rename('phe' = 'phe_combined', 'PRS' = 'PRS_combined') %>% 
mutate(Percentile = rank(-PRS) / n())


In [18]:
test_dfs[['female_sex']] <- test_df %>% 
filter(keep == 'zerosex') %>% 
select(-sex, -keep, -male) %>%
select(FID, IID, phe, female) %>%
rename('PRS' = 'female') %>%
mutate(Percentile = rank(-PRS) / n())


In [19]:
test_dfs[['female_pre_meno']] <- test_df %>% 
filter(keep == 'zerosex') %>% 
select(-sex, -keep, -male) %>%
select(FID, IID, phe, pre_meno) %>%
rename('PRS' = 'pre_meno') %>%
mutate(Percentile = rank(-PRS) / n())


In [20]:
test_dfs[['female_post_meno']] <- test_df %>% 
filter(keep == 'zerosex') %>% 
select(-sex, -keep, -male) %>%
select(FID, IID, phe, post_meno) %>%
rename('PRS' = 'post_meno') %>%
mutate(Percentile = rank(-PRS) / n())


In [21]:
summary_glm_dfs   <- list()
summary_naive_dfs <- list()

for(prs in c('combined', 'sex')){
    key <- paste0('male_', prs)
    summary_glm_dfs[[key]]   <- test_dfs[[key]] %>% compute_summary_df(prs, compute_residual_predictions_glm)
    summary_naive_dfs[[key]] <- test_dfs[[key]] %>% compute_summary_df(prs, compute_residual_predictions_naive)
}

for(prs in c('combined', 'sex', 'pre_meno', 'post_meno')){
    key <- paste0('female_', prs)
    summary_glm_dfs[[key]]   <- test_dfs[[key]] %>% compute_summary_df(prs, compute_residual_predictions_glm)
    summary_naive_dfs[[key]] <- test_dfs[[key]] %>% compute_summary_df(prs, compute_residual_predictions_naive)
}


In [22]:
for(t in names(summary_naive_dfs)){
    print(t)
    summary_naive_dfs[[t]] %>% fwrite(paste0(t, '.tsv'), sep='\t')
    summary_naive_dfs[[t]] %>% head(2) %>% select(l_bin, u_bin, mean_str) %>% print()
}


[1] "male_combined"
  l_bin u_bin            mean_str
1  0.00  0.01 0.211 (0.195-0.227)
2  0.01  0.05 0.124 (0.115-0.134)
[1] "male_sex"
  l_bin u_bin            mean_str
1  0.00  0.01 0.279 (0.263-0.295)
2  0.01  0.05 0.198 (0.189-0.206)
[1] "female_combined"
  l_bin u_bin            mean_str
1  0.00  0.01 0.167 (0.142-0.193)
2  0.01  0.05 0.090 (0.077-0.103)
[1] "female_sex"
  l_bin u_bin            mean_str
1  0.00  0.01 0.218 (0.194-0.241)
2  0.01  0.05 0.170 (0.157-0.183)
[1] "female_pre_meno"
  l_bin u_bin            mean_str
1  0.00  0.01 0.168 (0.144-0.192)
2  0.01  0.05 0.091 (0.077-0.104)
[1] "female_post_meno"
  l_bin u_bin            mean_str
1  0.00  0.01 0.209 (0.184-0.234)
2  0.01  0.05 0.128 (0.115-0.141)


In [23]:
for(t in sort(names(test_dfs))){
    print(t)
    cor.test(
        test_dfs[[t]]$phe,
        test_dfs[[t]]$PRS,    
        method = 'pearson'
    ) %>% print()
}

[1] "female_combined"

	Pearson's product-moment correlation

data:  test_dfs[[t]]$phe and test_dfs[[t]]$PRS
t = 23.767, df = 28638, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.1277017 0.1504167
sample estimates:
      cor 
0.1390775 

[1] "female_post_meno"

	Pearson's product-moment correlation

data:  test_dfs[[t]]$phe and test_dfs[[t]]$PRS
t = 25.4, df = 28638, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.1370875 0.1597402
sample estimates:
      cor 
0.1484333 

[1] "female_pre_meno"

	Pearson's product-moment correlation

data:  test_dfs[[t]]$phe and test_dfs[[t]]$PRS
t = 20.931, df = 28638, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.1113253 0.1341393
sample estimates:
      cor 
0.1227485 

[1] "female_sex"

	Pearson's product-moment correlation

data:  test_dfs[[t]]$phe and tes

### plot

In [45]:
p_male <- bind_rows(
    summary_naive_dfs[['male_combined']], summary_naive_dfs[['male_sex']]
) %>%
mutate(
    x_ticks_labels = paste0('[', 100 * l_bin, '% - ', 100 * u_bin, '%]'),
    PRS_type = str_replace_all(PRS_type, 'sex', 'male-specific')
) %>%
ggplot(aes(x=reorder(x_ticks_labels, -u_bin), y=mean, color=PRS_type)) +
geom_point() + 
geom_errorbar(aes(ymin = l_err, ymax = u_err)) +
geom_hline(yintercept = 0, color='gray')+
theme_bw() + 
theme(
    legend.position=c(.15, .9),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust=.5)
) +
scale_color_manual(values=c('gray', '#00BFC4')) +
labs(
    title = sprintf('(A) The snpnet PRS performance (male)'),
    x = 'The polygenic risk score percentile',
    y = 'Testosterone (covariate-adjusted residuals)',
    color = 'Polygenic risk score'
)


In [46]:
p_female <- bind_rows(
    summary_naive_dfs[['female_combined']], summary_naive_dfs[['female_sex']]
#     ,
#     summary_dfs[['female_pre_meno']], summary_dfs[['female_post_meno']]
) %>%
mutate(
    x_ticks_labels = paste0('[', 100 * l_bin, '% - ', 100 * u_bin, '%]'),
    PRS_type = str_replace_all(PRS_type, 'sex', 'female-specific')
) %>%
ggplot(aes(x=reorder(x_ticks_labels, -u_bin), y=mean, color=PRS_type)) +
geom_point() + 
geom_errorbar(aes(ymin = l_err, ymax = u_err)) +
geom_hline(yintercept = 0, color='gray')+
theme_bw() + 
theme(
    legend.position=c(.15, .9),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust=.5)
) +
scale_color_manual(values=c('gray', '#F8766D')) +
labs(
    title = sprintf('(B) The snpnet PRS performance (female)'),
    x = 'The polygenic risk score percentile',
    y = 'Testosterone (covariate-adjusted residuals)',
    color = 'Polygenic risk score'
)


In [47]:
p_female_full <- bind_rows(
    summary_naive_dfs[['female_combined']], summary_naive_dfs[['female_sex']],
    summary_naive_dfs[['female_pre_meno']], summary_naive_dfs[['female_post_meno']]
) %>%
mutate(
    x_ticks_labels = paste0('[', 100 * l_bin, '% - ', 100 * u_bin, '%]')
) %>%
ggplot(aes(x=reorder(x_ticks_labels, -u_bin), y=mean, color=PRS_type)) +
geom_point() + 
geom_errorbar(aes(ymin = l_err, ymax = u_err)) +
geom_hline(yintercept = 0, color='gray')+
theme_bw() + 
theme(
    legend.position=c(.15, .9),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust=.5)
) +
labs(
    title = sprintf('The snpnet PRS performance (female)'),
    x = 'The polygenic risk score percentile',
    y = 'Testosterone (covariate-adjusted residuals)',
    color = 'Polygenic risk score'
)


In [48]:
suppressPackageStartupMessages(require(gridExtra))

In [49]:
g <- arrangeGrob(p_male, p_female, nrow=1)


In [51]:
ggsave('Testosterone.combined.png', g, width=12, height=6)
ggsave('Testosterone.combined.pdf', g, width=12, height=6)

In [2]:
p_male   + ggsave('Testosterone.male.pdf')
p_male   + ggsave('Testosterone.male.png')

p_female + ggsave('Testosterone.female.pdf')
p_female + ggsave('Testosterone.female.png')


In [1]:
p_female_full + ggsave('Testosterone.female.full.pdf')
p_female_full + ggsave('Testosterone.female.full.png')


## comparison of the sex-specific model and the combined model


In [31]:
cor.test(
    test_df %>% filter(sex == 1) %>% select(male) %>% pull(),
    test_df %>% filter(sex == 1) %>% select(PRS_combined) %>% pull(),
    method = 'spearman'
) %>% print()


“Cannot compute exact p-value with ties”


	Spearman's rank correlation rho

data:  test_df %>% filter(sex == 1) %>% select(male) %>% pull() and test_df %>% filter(sex == 1) %>% select(PRS_combined) %>% pull()
S = 1.6234e+12, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.5836842 



In [32]:
p_comp_male <- test_df %>% filter(sex == 1) %>% select(-sex) %>%
rename('sex' = 'male', 'combined' = 'PRS_combined') %>%
mutate(
    sex      = scale(sex,      center = T, scale = T),
    combined = scale(combined, center = T, scale = T),
)%>%
ggplot(aes(
    x = sex, y=combined
)) + 
geom_abline(slope = 1, intercept = 0, color='red', alpha=0.5) +
geom_point(alpha=0.05) + 
theme_bw() + 
theme(legend.position = "none") +
labs(
    title = sprintf('Comparison of PRSs (male)'),
    x = 'Sex-specific PRS', y = 'Combined PRS'
)


In [33]:
cor.test(
    test_df %>% filter(sex == 0) %>% select(female) %>% pull(),
    test_df %>% filter(sex == 0) %>% select(PRS_combined) %>% pull(),
    method = 'spearman'
) %>% print()


“Cannot compute exact p-value with ties”


	Spearman's rank correlation rho

data:  test_df %>% filter(sex == 0) %>% select(female) %>% pull() and test_df %>% filter(sex == 0) %>% select(PRS_combined) %>% pull()
S = 8.587e+12, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.6046834 



In [34]:
p_comp_female <- test_df %>% filter(sex == 0) %>% select(-sex) %>%
rename('sex' = 'female', 'combined' = 'PRS_combined') %>%
mutate(
    sex      = scale(sex,      center = T, scale = T),
    combined = scale(combined, center = T, scale = T),
)%>%
ggplot(aes(
    x = sex, y=combined
)) + 
geom_abline(slope = 1, intercept = 0, color='red', alpha=0.5) +
geom_point(alpha=0.05) + 
theme_bw() + 
theme(legend.position = "none") +
labs(
    title = sprintf('Comparison of PRSs (female)'),
    x = 'Sex-specific PRS', y = 'Combined PRS'
)


In [3]:
p_comp_male   + ggsave('Testosterone.comp.male.pdf')
p_comp_male   + ggsave('Testosterone.comp.male.png')

p_comp_female + ggsave('Testosterone.comp.female.pdf')
p_comp_female + ggsave('Testosterone.comp.female.png')

