In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# Input: the UK Biobank Data Dictionary Showcase CSV, referenced by URL.
in_f <- "http://biobank.ctsu.ox.ac.uk/~bbdatan/Data_Dictionary_Showcase.csv"
# Output: file name of the tab-separated table written by this notebook.
out_f <- "UKB_fields_with_category.20200812.tsv"


In [3]:
# Read the data dictionary from `in_f`; fread() accepts a URL as its input.
df <- data.table::fread(input = in_f)


In [9]:
# Clean-up rules for the category tree stored in `Path`.  Each name is a
# regular expression and each value is its replacement; collapse_path()
# applies them one after another in the order listed here.
path_rules <- c(
  # Keep only one level beneath these parent categories.
  "Touchscreen > ([^>]+) > .*" = "Touchscreen > \\1",
  "Online follow-up > ([^>]+) > .*" = "Online follow-up > \\1",
  "Brain MRI > ([^>]+) > .*" = "Brain MRI > \\1",
  "Health-related outcomes > ([^>]+) > .*" = "Health-related outcomes > \\1",
  "Additional exposures > ([^>]+) > .*" = "Additional exposures > \\1",
  "Genomics > ([^>]+) > .*" = "Genomics > \\1",
  "Assay results > ([^>]+) > .*" = "Assay results > \\1",
  "Physical measures > ([^>]+) > .*" = "Physical measures > \\1",
  # Drop everything beneath these categories.
  "^Population characteristics >.*" = "Population characteristics",
  "Sample inventory >.*" = "Sample inventory",
  "Biological sampling >.*" = "Biological sampling",
  "Recruitment >.*" = "Recruitment",
  "Procedural metrics >.*" = "Procedural metrics",
  "Cognitive function >.*" = "Cognitive function",
  # Give the "online" variant the same name as the plain category.
  "Cognitive function online" = "Cognitive function",
  "Abdominal MRI >.*" = "Abdominal MRI",
  "Heart MRI >.*" = "Heart MRI",
  "DXA assessment >.*" = "DXA assessment"
)

# Apply every rule in `rules` to the character vector `path`, in order,
# replacing the first match of each pattern.  Returns the rewritten vector.
collapse_path <- function(path, rules = path_rules) {
  for (i in seq_along(rules)) {
    path <- str_replace(path, names(rules)[[i]], rules[[i]])
  }
  path
}

# Column names for the split path.  Because separate() is called with
# fill = "left", missing levels are padded on the left, so "category1"
# always holds the right-most (deepest retained) level.
level_cols <- paste0("category", 5:1)

# Build a long table with one row per (field, category level):
#   1. keep the untouched path as UKBCategoryTreePath,
#   2. simplify Path with the rules above,
#   3. split Path on " > " into at most five levels (pieces beyond the
#      fifth are discarded by extra = "drop"),
#   4. reshape to long form, dropping levels that are NA,
#   5. sort so that each field's rows run category1, category2, ...
long_df <- df %>%
  select(Path, FieldID, Field) %>%
  mutate(
    UKBCategoryTreePath = Path,
    Path = collapse_path(Path)
  ) %>%
  separate(
    Path, level_cols,
    sep = " > ", remove = FALSE, fill = "left", extra = "drop"
  ) %>%
  pivot_longer(
    cols = all_of(level_cols),
    names_to = "category_level",
    values_to = "category_name",
    values_drop_na = TRUE
  ) %>%
  arrange(FieldID, category_level)


In [10]:
# Collapse long_df to one row per FieldID.  This relies on long_df already
# being ordered by FieldID and then category_level, so first() returns the
# value from the lowest-sorting category_level present for each field.
category_df <- long_df %>%
  group_by(FieldID) %>%
  summarise(
    Field = first(Field),
    Category = first(category_name),
    UKBCategoryTreePath = first(UKBCategoryTreePath)
  ) %>%
  ungroup()


`summarise()` ungrouping output (override with `.groups` argument)



In [11]:
# Write the per-field table to `out_f` as tab-separated text.  The key
# column is renamed to "#FieldID", so the header line starts with "#".
# Missing values are written as the literal string "NA" and no fields are
# quoted.
category_df %>%
  rename(`#FieldID` = FieldID) %>%
  fwrite(out_f, sep = "\t", na = "NA", quote = FALSE)
