In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
    library(googlesheets)
}))


In [2]:
# Authenticate the googlesheets client from a token stored on disk as an .rds
# file, so that the spreadsheet reads below can run without an interactive
# login.
# NOTE(review): the token path is an absolute path inside one user's home
# directory; anyone else running this notebook has to point it at their own
# token file.
gs_token <- "/home/users/ytanigaw/.googlesheets_token.rds"
gs_auth(token = gs_token)


Auto-refreshing stale OAuth token.


In [3]:
# ---- Inputs ----
# Google Sheet holding the supplementary tables (worksheets ST6-ST10 are read
# further down).
ST_sheet <- "https://docs.google.com/spreadsheets/d/1j8q1Y7wnMg9nWUm0iT4wJvFfg_hgIXbrtvxelqWHeH4"

# Phenotype metadata table (one row per phenotype).
phe_info_f <- "biomarker.phenotype.info.tsv"
# Phenotype values, one column per phenotype annotation.
phe_f <- "@@@@@@/projects/biomarkers/covariate_corrected/outputExtendedNoTDIreduced/phenotypes/combined.20190810.phe"
# GWAS covariate file; only FID, IID and population are read from it.
gwas_covar_f <- "@@@@@@/ukbb24983/sqc/population_stratification_w24983_20190809/ukb24983_GWAS_covar.20190809.phe"

# ---- Output ----
# Tab-separated summary table written at the end of the notebook.
out_f <- "number_check.tsv"


## Read the input

In [4]:
# Load the phenotype metadata and expose its `name` column as `Phenotype`,
# the key used when joining against the association tables.
phe_info_df <- rename(fread(phe_info_f), Phenotype = name)

In [5]:
# Read the sample identifiers plus one column per phenotype listed in the
# `annotation` column of the metadata table. FID/IID are forced to character
# so that they match the identifier type used in the covariate table.
phe_df <- fread(
  phe_f,
  select = c("FID", "IID", phe_info_df[["annotation"]]),
  colClasses = c(FID = "character", IID = "character")
)


In [6]:
# Read the population assignment of every sample and keep only the samples
# that have one (rows with a missing `population` are dropped).
gwas_covar_df <- drop_na(
  fread(
    gwas_covar_f,
    select = c("FID", "IID", "population"),
    colClasses = c(FID = "character", IID = "character")
  ),
  population
)


## Count N for each phenotype in each population

In [7]:
# Attach the phenotype columns to every sample that has a population label.
# The join keys are spelled out: an implicit natural join would silently also
# join on any phenotype column that happened to share a name with a covariate
# column (e.g. `population`).
phe_pop_df <- gwas_covar_df %>%
  left_join(phe_df, by = c("FID", "IID"))


Joining, by = c("FID", "IID")


In [8]:
# For every phenotype annotation, count the samples with a non-missing value
# in each population, then reshape to one row per phenotype and one column per
# population.
#
# `phe` is a character scalar holding a column name, so it is wrapped in
# all_of(): a bare `phe` inside select()/drop_na() is ambiguous (it would pick
# a column literally called "phe" if one existed) and triggers the tidyselect
# "external vector" note.
non_NA_phe_cnt <- phe_info_df %>%
  pull(annotation) %>%
  lapply(function(phe) {
    phe_pop_df %>%
      select(population, all_of(phe)) %>%
      drop_na(all_of(phe)) %>%
      count(population) %>%
      mutate(phenotype = phe)
  }) %>%
  bind_rows() %>%
  spread(population, n) %>%
  # Fix the column order; this also requires every listed population to be
  # present in the data.
  select(phenotype, white_british, non_british_white, african, s_asian, e_asian)


Note: Using an external vector in selections is ambiguous.
[34mℹ[39m Use `all_of(phe)` instead of `phe` to silence this message.
[34mℹ[39m See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
[90mThis message is displayed once per session.[39m


## Number of associations


In [22]:
# Read the per-category association tables from worksheets ST6-ST10 of the
# supplementary-table spreadsheet. Each table is tagged with a `var_category`
# label and, where the worksheet calls the column `Trait`, that column is
# renamed to `Phenotype`.
#
# Messages and warnings from the googlesheets client are silenced for the
# whole block.
suppressMessages(suppressWarnings({
  # Register the spreadsheet once and reuse the handle for every worksheet
  # instead of re-registering the same URL for each read.
  ST_gs <- gs_url(ST_sheet)

  PTV_hits <- ST_gs %>%
    gs_read(ws = 'ST6') %>%
    mutate(var_category = 'PTVs') %>%
    rename('Phenotype' = 'Trait')

  PAV_hits <- ST_gs %>%
    gs_read(ws = 'ST7') %>%
    mutate(var_category = 'PAVs') %>%
    rename('Phenotype' = 'Trait')

  non_coding_hits <- ST_gs %>%
    gs_read(ws = 'ST8') %>%
    mutate(var_category = 'non-coding') %>%
    rename('Phenotype' = 'Trait')

  # NOTE(review): ST9 is the only worksheet without a Trait -> Phenotype
  # rename; presumably it already has a `Phenotype` column - confirm.
  HLA_hits <- ST_gs %>%
    gs_read(ws = 'ST9') %>%
    mutate(var_category = 'HLA')

  # CNV rows are split into single-CNV and burden tests based on `Class`.
  CNV_hits <- ST_gs %>%
    gs_read(ws = 'ST10') %>%
    mutate(var_category = if_else(Class == 'Single', 'CNV_single', 'CNV_burden')) %>%
    rename('Phenotype' = 'Trait')
}))


In [36]:
# Stack all association tables into one long table. The HLA and CNV worksheets
# use their own column names for the variant identifier, effect size and
# p-value, so those are mapped onto the ID / BETA / P names used by the
# PTV / PAV / non-coding tables before binding.
#
# NOTE(review): the large-effect table printed further down lists both
# "C reactive protein" and "C-reactive protein", so trait labels do not appear
# to be harmonised across worksheets - confirm before relying on joins by
# `Phenotype`.
hits_combined_df <- bind_rows(
  bind_rows(PTV_hits, PAV_hits, non_coding_hits),
  rename(HLA_hits, ID = `HLA Allele`, BETA = `PLINK Beta`, P = `PLINK P`),
  rename(CNV_hits, ID = MarkerName, BETA = Effect, P = `P-value`)
)


In [38]:
# Number of associations per phenotype, one column per variant category.
# Phenotype/category combinations without any hit are filled with 0.
counts_all_df <- hits_combined_df %>%
  select(ID, Phenotype, BETA, P, var_category) %>%
  count(var_category, Phenotype) %>%
  spread(var_category, n, fill = 0) %>%
  select(Phenotype, PTVs, PAVs, `non-coding`, HLA, CNV_single, CNV_burden) %>%
  arrange(Phenotype)


In [43]:
# Combine the phenotype metadata with the per-population sample counts and the
# per-category association counts, then derive the summary columns:
#   meta_N - samples in white_british, non_british_white, african and s_asian
#   N      - samples across all five populations (meta_N plus e_asian)
#   n_loci - PTV + PAV + non-coding associations
count_full_df <- phe_info_df %>%
  left_join(non_NA_phe_cnt, by = c('annotation' = 'phenotype')) %>%
  left_join(counts_all_df, by = 'Phenotype') %>%
  arrange(Phenotype) %>%
  mutate(
    meta_N = white_british + non_british_white + african + s_asian,
    # Fixed: this used to add s_asian a second time (already part of meta_N)
    # and left e_asian out of the total.
    N = meta_N + e_asian,
    n_loci = PTVs + PAVs + `non-coding`
  )

## save the results to a file

In [46]:
# Write the summary table to `out_f` as a tab-separated file with the columns
# in their final order. Missing values are written as the literal string "NA"
# and fields are not quoted.
count_full_df %>%
  select(all_of(c(
    'Phenotype', 'Abbreviation', 'Units of measurement',
    'UKBB field ID', 'Statin adjustment', 'Trait category', 'GBE ID',
    'N', 'white_british', 'non_british_white', 'african', 's_asian', 'e_asian', 'meta_N',
    'n_loci', 'PTVs', 'PAVs', 'non-coding', 'HLA', 'CNV_single', 'CNV_burden',
    'Color', 'GBE URL'
  ))) %>%
  # FALSE rather than the reassignable shorthand `F`.
  fwrite(out_f, sep = '\t', na = "NA", quote = FALSE)


## total number of loci

In [49]:
# Number of distinct variant IDs in each variant category (a variant that is
# associated with several phenotypes is counted once).
hits_combined_df %>%
  distinct(ID, var_category) %>%
  count(var_category)

var_category,n
<chr>,<int>
CNV_burden,28
CNV_single,10
HLA,31
non-coding,5160
PAVs,594
PTVs,38


In [50]:
# PTVs + PAVs + non-coding, copied from the table printed above.
sum(c(38, 594, 5160))

### Number of large-effect associations (|BETA| > 0.1 SD)

In [52]:
# Per-phenotype number of associations whose absolute effect size exceeds 0.1,
# one column per variant category (0 where a phenotype has none).
# HLA is not selected here: no HLA row passes the filter (see the per-category
# count in the next cell), so spread() does not create an HLA column.
hits_combined_df %>%
  filter(abs(BETA) > 0.1) %>%
  select(ID, Phenotype, BETA, P, var_category) %>%
  count(var_category, Phenotype) %>%
  spread(var_category, n, fill = 0) %>%
  select(Phenotype, PTVs, PAVs, `non-coding`, CNV_single, CNV_burden) %>%
  arrange(Phenotype)

Phenotype,PTVs,PAVs,non-coding,CNV_single,CNV_burden
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Alanine aminotransferase,0,8,2,0,0
Albumin,0,7,6,0,1
Alkaline phosphatase,3,30,37,0,3
Apolipoprotein A,3,27,25,0,1
Apolipoprotein B,1,32,60,0,0
Aspartate aminotransferase,0,7,4,1,0
AST to ALT ratio,0,11,1,0,0
C reactive protein,0,0,0,1,0
C-reactive protein,0,7,33,0,0
Calcium,0,9,4,0,0


In [53]:
# Number of associations with |BETA| > 0.1 in each variant category
# (association rows, not distinct variants).
count(
  filter(hits_combined_df, abs(BETA) > 0.1),
  var_category
)

var_category,n
<chr>,<int>
CNV_burden,32
CNV_single,11
non-coding,648
PAVs,422
PTVs,45


In [54]:
# Total number of large-effect associations, copied from the table above:
# PTVs, PAVs, non-coding, 0 (presumably HLA, which has no row above),
# CNV_single, CNV_burden.
sum(c(45, 422, 648, 0, 11, 32))

In [60]:
# Number of distinct variants with MAF below 1% in each variant category.
# Rows whose MAF is missing are dropped by filter().
hits_combined_df %>%
  filter(MAF < 0.01) %>%
  distinct(ID, var_category) %>%
  count(var_category)

var_category,n
<chr>,<int>
non-coding,75
PAVs,192
PTVs,28


In [63]:
# Dimensions of the table of distinct HLA allele IDs; the first element is the
# number of unique alleles with at least one association.
hits_combined_df %>%
  filter(var_category == "HLA") %>%
  distinct(ID) %>%
  dim()

In [68]:
# Dimensions of the table of distinct single-CNV IDs; the first element is the
# number of unique CNVs with at least one association.
hits_combined_df %>%
  filter(var_category == "CNV_single") %>%
  distinct(ID) %>%
  dim()

In [69]:
# Dimensions of the table of distinct CNV-burden IDs; the first element is the
# number of unique burden-test units with at least one association.
hits_combined_df %>%
  filter(var_category == "CNV_burden") %>%
  distinct(ID) %>%
  dim()