In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# ---- Output --------------------------------------------------------------
# Destination of the phenotype table written at the end of this notebook.
out_f <- "/oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/sex-div-analysis/snpnet/phe_data/v2.1-no-covars/Testosterone.phe"

# ---- Inputs --------------------------------------------------------------
# Biomarker phenotype table and covariate table; both live under phe_dir.
phe_dir <- "/oak/stanford/groups/mrivas/projects/biomarkers/covariate_corrected"
phe_f <- file.path(
    phe_dir,
    "phenotypes/biomarkers_with_egfr_fastingglucose_nonalbumin.phe"
)
covar_f <- file.path(
    phe_dir,
    "outputExtendedNoTDIreduced/phenotypes/full.table.combined.phe"
)

# GWAS covariate file (read later for the FID, IID, split, age, sex columns).
gwas_covar_f <- "/oak/stanford/groups/mrivas/ukbb24983/sqc/population_stratification_w24983_20200313/ukb24983_GWAS_covar.20200313.phe"

# File listing the covariate column names to keep.
covars_lst <- "/oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/sex-div-analysis/snpnet/phe_data/v2/covars.sex-indep.txt"

# ---- Population definitions ----------------------------------------------
# The <population>.phe files are looked up in the directory that holds
# covars_lst.
pop_defs <- c("combined", "onesex", "zerosex")
pop_def_d <- dirname(covars_lst)


In [3]:
# Read only the ID, split, age and sex columns of the GWAS covariate file.
# FID and IID are forced to character so they join cleanly against the other
# tables, which also read their IDs as character.
gwas_covar_df <- gwas_covar_f %>%
    fread(
        select = c("FID", "IID", "split", "age", "sex"),
        colClasses = c(FID = "character", IID = "character")
    )


In [4]:
# Load the FID/IID columns of every population definition file
# (<pop_def_d>/<population>.phe) into a list keyed by population name.
pop_def_dfs <- setNames(
    lapply(pop_defs, function(pop_name) {
        fread(
            file.path(pop_def_d, paste0(pop_name, ".phe")),
            select = c("FID", "IID"),
            colClasses = c(FID = "character", IID = "character")
        )
    }),
    pop_defs
)


In [5]:
# Covariate names to keep, read from a header-less text file.
# `header = FALSE` is spelled out in full: the abbreviated `head` relied on
# partial argument matching, and `F` is an ordinary variable that can be
# reassigned, unlike `FALSE`.
# pull() without a column argument extracts the last column; presumably the
# list file has a single column -- TODO confirm.
covars <- fread(covars_lst, header = FALSE) %>% pull()

In [6]:
# Covariate table: read with character IDs, add the ageBin x FastingTime
# interaction term, then keep only the ID columns and the covariates named
# in `covars`.
covar_df <- covar_f %>%
    fread(colClasses = c(FID = "character", IID = "character")) %>%
    mutate(ageBin_FastingTime = ageBin * FastingTime) %>%
    select(all_of(c("FID", "IID", covars)))


In [7]:
# Build the per-individual analysis table:
#   1. read field f.30850.0.0 from the phenotype file as `Testosterone`,
#      set FID to IID and log10-transform the measurement;
#   2. attach the selected covariates and the GWAS covariates (split, age, sex);
#   3. derive the split labels:
#        split2           the split label, or "" when Testosterone is NA;
#        split_<pop>      split2 for individuals listed in the <pop>
#                         population file, "-" for everyone else.
# NOTE(review): membership is tested by matching IID against the FID column
# of each population file -- presumably FID == IID there; confirm.
df <- phe_f %>%
    fread(
        select = c("IID", "f.30850.0.0"),
        colClasses = c(IID = "character")
    ) %>%
    rename(Testosterone = f.30850.0.0) %>%
    mutate(FID = IID, Testosterone = log10(Testosterone)) %>%
    select(FID, IID, Testosterone) %>%
    left_join(covar_df, by = c("FID", "IID")) %>%
    left_join(gwas_covar_df, by = c("FID", "IID")) %>%
    mutate(split2 = if_else(is.na(Testosterone), "", split)) %>%
    mutate(
        split_combined = if_else(
            IID %in% pop_def_dfs[["combined"]][["FID"]], split2, "-"
        ),
        split_onesex = if_else(
            IID %in% pop_def_dfs[["onesex"]][["FID"]], split2, "-"
        ),
        split_zerosex = if_else(
            IID %in% pop_def_dfs[["zerosex"]][["FID"]], split2, "-"
        )
    )


In [8]:
# List the columns of the assembled table.
colnames(df)

In [9]:
# Tabulate individuals by split label across the three population
# definitions, leaving out rows whose split_combined is NA or "".
df %>%
    count(split, split_combined, split_onesex, split_zerosex) %>%
    filter(!is.na(split_combined), split_combined != "")

split,split_combined,split_onesex,split_zerosex,n
<chr>,<chr>,<chr>,<chr>,<int>
test,-,-,-,10189
test,test,-,test,28640
test,test,test,-,28601
train,-,-,-,35527
train,train,-,train,99563
train,train,train,-,100913
val,-,-,-,5071
val,val,-,val,14049
val,val,val,-,14594
,-,-,-,71820


In [11]:
# Same tabulation, restricted to individuals that belong to the combined
# population: rows whose split_combined is NA or "-" are left out.
df %>%
    count(split, split_combined, split_onesex, split_zerosex) %>%
    filter(!is.na(split_combined), split_combined != "-")

split,split_combined,split_onesex,split_zerosex,n
<chr>,<chr>,<chr>,<chr>,<int>
test,test,-,test,28640
test,test,test,-,28601
train,train,-,train,99563
train,train,train,-,100913
val,val,-,val,14049
val,val,val,-,14594


In [13]:
# Write the final phenotype table as tab-separated text.
# Only individuals in the combined population are kept: rows whose
# split_combined is NA or "-" are dropped. Missing values are written as the
# literal string "NA" and fields are never quoted.
# `quote = FALSE` replaces `quote=F`: `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved constant.
df %>%
    drop_na(split_combined) %>%
    filter(split_combined != "-") %>%
    fwrite(out_f, sep = "\t", na = "NA", quote = FALSE)
