In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [233]:
# Evaluate paths.sh as R code in the current session.
# NOTE(review): the *_f path variables used by later cells are not defined
# anywhere in this notebook, so they presumably come from this file -- confirm.
source('paths.sh')


## trait category

In [217]:
# Display-name overrides, keyed by trait ID: rows whose ID matches `trait`
# get `trait_new_name` as their short name when this table is joined in.
trait_rename_df <- data.frame(
    trait_new_name = c(
        'LDL cholesterol',
        'eGFR'
    ),
    trait = c(
        'INI30780',
        'INI30030700'
    ),
    # a real logical, not the string 'F'; keeps both columns as character
    stringsAsFactors = FALSE
)


In [218]:
# Load the category table (category, ID, short name), strip the '#' from
# header names that start with it, and replace the short name with the
# override from trait_rename_df wherever one exists for that GBE_ID.
GBE_category_df <- fread(
    GBE_category_f,
    select = c('#GBE_category', 'GBE_ID', 'GBE_short_name')
) %>%
    rename_with(function(x) str_replace(x, '#', ''), starts_with("#")) %>%
    left_join(trait_rename_df, by = c('GBE_ID' = 'trait')) %>%
    # take the override when present, otherwise keep the original short name
    mutate(GBE_short_name = coalesce(trait_new_name, GBE_short_name)) %>%
    select(-trait_new_name)


In [219]:
# Keep only the 'Biomarkers' rows. The ID from the category table is kept as
# Biomarkers_covariate_adjusted_GBE_ID, and a new GBE_ID is derived from it by
# collapsing a leading 'INI100' / 'INI200' to 'INI'.
biomarkers_df <- GBE_category_df %>%
    filter(GBE_category == 'Biomarkers') %>%
    rename(Biomarkers_covariate_adjusted_GBE_ID = GBE_ID) %>%
    mutate(
        GBE_ID = str_replace(
            Biomarkers_covariate_adjusted_GBE_ID, '^INI[12]00', 'INI'
        )
    )


In [220]:
# Tally how many biomarker IDs were left unchanged by the prefix rewrite.
count(biomarkers_df, GBE_ID == Biomarkers_covariate_adjusted_GBE_ID)


Unnamed: 0_level_0,GBE_ID == Biomarkers_covariate_adjusted_GBE_ID,n
Unnamed: 0_level_1,<lgl>,<int>
1,False,32
2,True,3


In [221]:
# List the biomarker rows whose ID was left unchanged by the prefix rewrite.
filter(biomarkers_df, GBE_ID == Biomarkers_covariate_adjusted_GBE_ID)


GBE_category,Biomarkers_covariate_adjusted_GBE_ID,GBE_short_name,GBE_ID
<chr>,<chr>,<chr>,<chr>
Biomarkers,INI30030650,AST to ALT ratio,INI30030650
Biomarkers,INI30030700,eGFR,INI30030700
Biomarkers,INI30030860,Non-albumin protein,INI30030860


These 3 traits are "derived" traits: they are computed from the covariate-adjusted traits and, as shown above, their GBE_ID is identical to their covariate-adjusted GBE_ID.


## other input files

In [222]:
# Read only the header of the PRS table (nrows = 0), keep the columns selected
# by starts_with('PRS_'), and drop the first 'PRS_' from each column name.
list_of_traits_w_PRS <- str_replace(
    colnames(select(fread(PRS202009_f, nrows = 0), starts_with('PRS_'))),
    'PRS_', ''
)


In [73]:
# Number of traits that have a PRS column.
print(length(list_of_traits_w_PRS))

[1] 1772


In [72]:
# Read the blacklist file as headerless data and keep its last column as a
# plain vector (pull() with no argument takes the last column).
# `header = FALSE` is spelled out in full rather than relying on partial
# argument matching (`head`) and the reassignable shorthand `F`.
mrp_blacklist <- mrp_blacklist_f %>%
    fread(header = FALSE) %>%
    pull()
print(length(mrp_blacklist))


[1] 552


In [76]:
# Read just the GBE_ID column of the icdinfo table as a plain vector.
icdinfo_list <- pull(fread(icdinfo_f, select = 'GBE_ID'))
print(length(icdinfo_list))


[1] 3965


## get the list of traits

In [124]:
# For visualization, some trait categories are collapsed into 'Others'.
# This named list holds, per model family, the categories that keep their
# own label in the plots.
trait_categories_to_keep <- list(
    # binary traits (binomial family)
    binomial = c(
        'Disease_outcome',
        'Lifestyle_and_environment',
        'Health_and_medical_history',
        'Cancer',
        'Psychosocial_factors',
        'Family_history'
    ),
    # quantitative traits (Gaussian family)
    gaussian = c(
        'Blood_assays',
        'Anthropometry',
        'Lifestyle_and_environment',
        'Biomarkers',
        'Bone-densitometry_of_heel'
    )
)


In [125]:
# Assemble the final trait list:
#   1. start from traits that have a PRS column and appear in icdinfo,
#   2. drop everything on the MRP blacklist,
#   3. drop every biomarker ID in either form (GBE_ID or the
#      covariate-adjusted ID),
#   4. add the biomarkers back under their GBE_ID only.
# The two ID columns are concatenated directly instead of being reshaped
# with the superseded tidyr::gather(); setdiff() ignores duplicates, so no
# explicit unique() is needed.
final_list_of_traits <- list_of_traits_w_PRS %>%
    intersect(icdinfo_list) %>%
    setdiff(mrp_blacklist) %>%
    setdiff(
        c(
            biomarkers_df$GBE_ID,
            biomarkers_df$Biomarkers_covariate_adjusted_GBE_ID
        )
    ) %>%
    union(biomarkers_df$GBE_ID)


In [126]:
# Size of the final trait list.
length(final_list_of_traits)


In [210]:
# Build the per-trait annotation table for the final trait list:
#   trait / trait_name / trait_category  renamed from the GBE_* columns
#   family               'gaussian' when the ID with its digits removed is
#                        'INI' or 'QT_FC', otherwise 'binomial'
#   trait_category       capitalized; forced to 'Biomarkers' for every ID in
#                        biomarkers_df$GBE_ID
#   trait_category_plot  the category with '_' shown as ' ' when it is listed
#                        in trait_categories_to_keep for the trait's family,
#                        otherwise 'Others'
trait_list_df <- GBE_category_df %>%
    filter(GBE_ID %in% final_list_of_traits) %>%
    rename(
        trait = GBE_ID,
        trait_category = GBE_category,
        trait_name = GBE_short_name
    ) %>%
    mutate(
        family = if_else(
            str_replace_all(trait, '[0-9]', '') %in% c('INI', 'QT_FC'),
            'gaussian', 'binomial'
        ),
        trait_category = if_else(
            trait %in% biomarkers_df$GBE_ID,
            'Biomarkers', R.utils::capitalize(trait_category)
        ),
        # family is always one of the two values assigned above, so choosing
        # the keep-list by family covers every row
        trait_category_plot = if_else(
            if_else(
                family == 'binomial',
                trait_category %in% trait_categories_to_keep[['binomial']],
                trait_category %in% trait_categories_to_keep[['gaussian']]
            ),
            str_replace_all(trait_category, '_', ' '),
            'Others'
        )
    ) %>%
    arrange(family, trait_category_plot, trait_category, trait) %>%
    select(trait, trait_name, trait_category, trait_category_plot, family)


In [211]:
# Rows x columns of the final trait table.
print(dim(trait_list_df))


[1] 1565    5


In [212]:
# Write the trait table as tab-separated text. The first column is renamed to
# '#trait' for the file header, and missing values are written as "NA".
# `quote = FALSE` uses the real logical instead of the reassignable `F`.
trait_list_df %>%
    rename('#trait' = 'trait') %>%
    fwrite(trait_list_f, sep = '\t', na = "NA", quote = FALSE)


In [213]:
# Spot-check two traits in the final table.
filter(trait_list_df, trait %in% c('BIN4093', 'INI30780'))

trait,trait_name,trait_category,trait_category_plot,family
<chr>,<chr>,<chr>,<chr>,<chr>
BIN4093,Fractured heel (L),Bone-densitometry_of_heel,Others,binomial
INI30780,LDL cholesterol,Biomarkers,Biomarkers,gaussian


## biomarker traits

In [224]:
# Annotation overrides for biomarkers, keyed by ID: rows whose ID matches
# `trait` get `trait_new_name` as their annotation when this table is joined in.
biomarker_rename_df <- data.frame(
    trait_new_name = c(
        'Creatinine_in_urine',
        'Glycated_haemoglobin_HbA1c'
    ),
    trait = c(
        'INI10030510',
        'INI10030750'
    ),
    # a real logical, not the string 'F'; keeps both columns as character
    stringsAsFactors = FALSE
)


In [231]:
# Preview the biomarker annotation file.
fread(biomarkers_f)

#trait,trait_name,Biomarkers_covariate_adjusted_GBE_ID,annotation
<chr>,<chr>,<chr>,<chr>
INI30030650,AST to ALT ratio,INI30030650,AST_ALT_ratio
INI30030700,eGFR,INI30030700,eGFR
INI30030860,Non-albumin protein,INI30030860,Non_albumin_protein
INI30500,Microalbumin in urine,INI10030500,Microalbumin_in_urine
INI30510,Creatinine (enzymatic) in urine,INI10030510,Creatinine_in_urine
INI30520,Potassium in urine,INI10030520,Potassium_in_urine
INI30530,Sodium in urine,INI10030530,Sodium_in_urine
INI30600,Albumin,INI10030600,Albumin
INI30610,Alkaline phosphatase,INI10030610,Alkaline_phosphatase
INI30620,Alanine aminotransferase,INI10030620,Alanine_aminotransferase


In [234]:
# Export the biomarker mapping table:
#   1. take the 'Biomarkers' rows of the trait table,
#   2. attach each trait's covariate-adjusted ID from biomarkers_df,
#   3. attach the columns of biomarkers_f, matched on the trait name,
#   4. replace `annotation` with the override from biomarker_rename_df
#      wherever one exists for the covariate-adjusted ID,
#   5. drop the helper and plotting columns and write tab-separated text with
#      '#trait' as the first header field.
# NOTE(review): step 3 needs a `name` column in biomarkers_f, but the preview
# of that file elsewhere in this notebook shows the columns `#trait`,
# `trait_name`, `Biomarkers_covariate_adjusted_GBE_ID` and `annotation` --
# confirm which file biomarkers_f points to when this cell runs.
trait_list_df %>%
    filter(trait_category == 'Biomarkers') %>%
    left_join(
        biomarkers_df %>%
            select(GBE_ID, Biomarkers_covariate_adjusted_GBE_ID),
        by = c('trait' = 'GBE_ID')
    ) %>%
    left_join(
        biomarkers_f %>% fread(),
        by = c('trait_name' = 'name')
    ) %>%
    left_join(
        biomarker_rename_df,
        by = c('Biomarkers_covariate_adjusted_GBE_ID' = 'trait')
    ) %>%
    mutate(
        annotation = if_else(is.na(trait_new_name), annotation, trait_new_name)
    ) %>%
    select(-trait_new_name, -trait_category, -trait_category_plot, -family) %>%
    rename('#trait' = 'trait') %>%
    # real logical instead of the reassignable shorthand `F`
    fwrite(biomarkers_mapping_f, sep = '\t', na = "NA", quote = FALSE)
