In [1]:
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(data.table))


In [15]:
read_keep <- function(file){
    df <- file %>% fread(sep='\t')
    colnames(df) <- c('FID', 'IID')
    df
}

In [16]:
pop_def <- '/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/extras/iop/phe/ukb24983_white_british_noIOP.keep' %>%
read_keep()


In [18]:
icd_keep <- c(
    'H400',
    'H401',
    'H402',
    'H403',
    'H404',
    'H405',
    'H406',
    'H408',
    'H409',
    'H428',
    'Q150'
)

In [6]:
select_cols <- c(
    'f.eid',
    paste0('f.20002.0.', 1:28),
    paste0('f.20002.1.', 1:28),
    paste0('f.20002.2.', 1:28),
    paste0('f.41202.0.', 1:379)
)


In [3]:
tab_file <- '/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/9796/9797/download/ukb9797.tab'


In [22]:
df <- fread(tab_file, select=select_cols)

In [38]:
extracted_df <- df %>% rename('IID' = 'f.eid') %>% 
gather(field, val, -IID) %>% 
mutate(field = str_replace(field, '^f.', '')) %>%
drop_na(val) %>%
separate(
    field, c('UKB_Field_ID', 'UKB_time_idx', 'UKB_array_idx')
) %>%
filter(
    ((UKB_Field_ID == '20002') & (val == '1277')) |
    ((UKB_Field_ID == '41202') & (val %in% icd_keep))
)


In [39]:
extracted_df %>% dim()

In [42]:
extracted_df %>% 
select(-UKB_time_idx, -UKB_array_idx) %>% 
unique() %>%
dim()

In [68]:
extracted_wide_df <- extracted_df %>% 
select(-UKB_time_idx, -UKB_array_idx) %>% 
unique() %>%
mutate(UKB_Field_ID_val = paste0('F', UKB_Field_ID, '_', val)) %>% 
select(-UKB_Field_ID, -val) %>%
mutate(value = TRUE) %>%
spread(UKB_Field_ID_val, value, fill=FALSE)

In [79]:
extracted_wide_df %>%
fwrite('glaucoma_relevant_raw_data.tsv', sep='\t')


## Counts

### 500k people

In [94]:
extracted_wide_df %>% dim()

In [80]:
extracted_wide_df %>% select(-IID) %>%
summary()

 F20002_1277     F41202_H400     F41202_H401     F41202_H402    
 Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:1410      FALSE:4469      FALSE:4275      FALSE:4381     
 TRUE :3480      TRUE :421       TRUE :615       TRUE :509      
 F41202_H403     F41202_H404     F41202_H405     F41202_H406    
 Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:4879      FALSE:4875      FALSE:4799      FALSE:4887     
 TRUE :11        TRUE :15        TRUE :91        TRUE :3        
 F41202_H408     F41202_H409     F41202_H428     F41202_Q150    
 Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:4845      FALSE:4277      FALSE:4889      FALSE:4884     
 TRUE :45        TRUE :613       TRUE :1         TRUE :6        

### White British cohort

In [97]:
WB_pop <- '/oak/stanford/groups/mrivas/ukbb24983/sqc/population_stratification/ukb24983_white_british.phe' %>% read_keep()

In [98]:
length(WB_pop$IID)

In [99]:
extracted_wide_df %>% 
filter(IID %in% WB_pop$IID) %>%
dim()

### White British cohort who does not have IOP measurements

In [95]:
length(pop_def$IID)

In [96]:
extracted_wide_df %>% 
filter(IID %in% pop_def$IID) %>%
dim()

In [82]:
extracted_wide_df %>% 
filter(IID %in% pop_def$IID) %>%
select(-IID) %>%
summary()

 F20002_1277     F41202_H400     F41202_H401     F41202_H402    
 Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:763       FALSE:2176      FALSE:2059      FALSE:2078     
 TRUE :1621      TRUE :208       TRUE :325       TRUE :306      
 F41202_H403     F41202_H404     F41202_H405     F41202_H406    
 Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:2381      FALSE:2374      FALSE:2342      FALSE:2383     
 TRUE :3         TRUE :10        TRUE :42        TRUE :1        
 F41202_H408     F41202_H409     F41202_H428     F41202_Q150    
 Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:2357      FALSE:2074      FALSE:2383      FALSE:2381     
 TRUE :27        TRUE :310       TRUE :1         TRUE :3        

In [88]:
extracted_wide_df %>% 
filter(IID %in% pop_def$IID) %>%
count(
    F20002_1277, 
    F41202_H400, 
    F41202_H401,
    F41202_H402,
    F41202_H403,
    F41202_H404,
    F41202_H405,
    F41202_H406,
    F41202_H408,
    F41202_H409,
    F41202_H428,
    F41202_Q150    
) %>% arrange(-n) %>%
fwrite('glaucoma_def_count_WB_noIOP.tsv', sep='\t')


In [89]:
extracted_wide_df %>% 
filter(IID %in% pop_def$IID) %>%
dim()

In [92]:
extracted_wide_df %>% 
filter(IID %in% pop_def$IID) %>%
fwrite('glaucoma_def_WB_noIOP.tsv', sep='\t')


In [None]:
/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/extras/highconfidenceqc/phe/HC276.phe