In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
    library(googlesheets)
}))


In [2]:
# Authenticate against Google Sheets with a cached token file, then read the
# 'GBE_names' worksheet into GBE_names_df. Warnings and messages raised by
# these calls are silenced.
suppressWarnings(suppressMessages({
    gs_token <- "/home/users/ytanigaw/.googlesheets_token.rds"
    gs_auth(token = gs_token)

    GBE_names_df <- gs_read(
        gs_url("https://docs.google.com/spreadsheets/d/1gwzS0SVZBSKkkHgsoqB5vHo5JpUeYYz8PK2RWrHEq3A"),
        ws = "GBE_names",
        # Force this column to be parsed as text rather than a guessed type.
        col_types = cols("Units_of_measurement" = col_character())
    )
}))


In [3]:
# Input files (relative paths, resolved against the working directory):
#   phe_info_f - phenotype info table
#   UKB_cat_f  - UKB field / category table
#   metal_N_f  - METAL icdinfo table
phe_info_f <- "../../phenotype_info.tsv"
UKB_cat_f <- "UKB_fields_with_category.20200812.tsv"
metal_N_f <- "../../../18_metal/202006_metal/icdinfo.metal.20200717.txt"

# Output file: the GBE category table.
GBE_cat_f <- "GBE_category.20201024.tsv"


In [4]:
# GBE IDs of the 35 traits whose category is manually set to "Biomarker"
# instead of being derived from the ID prefix or the UKB field category.
GBE_IDs_biomarkers <- c(
    "INI10030620", "INI10030600", "INI10030610", "INI10030630", "INI20030640",
    "INI10030650", "INI30030650", "INI10030710", "INI10030680", "INI20030690",
    "INI10030700", "INI10030510", "INI10030720", "INI10030660", "INI30030700",
    "INI10030730", "INI10030740", "INI10030750", "INI10030760", "INI10030770",
    "INI20030780", "INI10030790", "INI10030500", "INI30030860", "INI10030810",
    "INI10030520", "INI10030830", "INI10030530", "INI10030850", "INI10030840",
    "INI10030860", "INI10030870", "INI10030880", "INI10030670", "INI10030890"
)


In [5]:
# Load the UKB field/category table and drop the leading '#' from the
# '#FieldID' header so the column can be referenced as FieldID.
cat_df <- rename(fread(UKB_cat_f), "FieldID" = "#FieldID")

In [6]:
# Load the METAL info table, keeping only its first two columns under the
# names GBE_ID and N_META.
# NOTE(review): V1/V2 are presumably fread's auto-generated names for a
# header-less file -- confirm against the input file.
metal_N_df <- select(fread(metal_N_f), GBE_ID = V1, N_META = V2)


In [7]:
# Build the per-phenotype category table `df`:
#   1. read the phenotype info table;
#   2. derive a fallback category from the GBE_ID;
#   3. attach the UKB field category (joined on FIELD) and prefer it when present;
#   4. attach N_META (0 when a phenotype is missing from the METAL table);
#   5. attach the short names from the Google Sheet;
#   6. sort by category, then by GBE_ID.
df <- fread(phe_info_f) %>%
    rename("GBE_ID" = "#GBE_ID") %>%
    mutate(
        # Fallback category: 'Biomarkers' for the manually curated IDs,
        # otherwise the GBE_ID with its trailing digits removed; the 'FH' and
        # 'HC' codes are then spelled out.
        RivasLabCategory = if_else(
            GBE_ID %in% GBE_IDs_biomarkers,
            'Biomarkers',
            str_replace_all(GBE_ID, '\\d+$', '')
        ) %>%
            str_replace('FH', 'Family_history') %>%
            str_replace('HC', 'Disease_outcome')
    ) %>%
    left_join(
        # Field ID as character (to match FIELD), category label with
        # whitespace replaced by underscores.
        cat_df %>%
            select(FieldID, Category) %>%
            mutate(
                FieldID = as.character(FieldID),
                Category = str_replace_all(Category, '\\s', '_')
            ),
        by = c('FIELD' = 'FieldID')
    ) %>%
    mutate(
        # Prefer the UKB category; fall back to the ID-derived one.
        GBE_category = coalesce(Category, RivasLabCategory),
        # Remaining bare 'BIN' / 'INI' prefixes are grouped as 'Others'.
        GBE_category = if_else(
            GBE_category %in% c('BIN', 'INI'), 'Others', GBE_category
        )
    ) %>%
    left_join(metal_N_df, by = 'GBE_ID') %>%
    replace_na(list(N_META = 0)) %>%
    select(
        GBE_category, GBE_ID,
        N, N_GBE, N_NBW, N_AFR, N_SAS, N_EAS, N_SMR, N_OTH, N_META,
        GBE_NAME
    ) %>%
    left_join(
        select(GBE_names_df, GBE_ID, GBE_short_name, GBE_short_name_len),
        by = 'GBE_ID'
    ) %>%
    arrange(GBE_category, GBE_ID)

In [8]:
# Write the category table as a tab-separated file. The first column header
# gets a leading '#', missing values are written as "NA", and fields are
# not quoted.
fwrite(
    rename(df, "#GBE_category" = "GBE_category"),
    GBE_cat_f,
    sep = "\t",
    na = "NA",
    quote = FALSE
)
