In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [64]:
# Load the Google Sheets client and authenticate with a stored OAuth token
# (the token path is specific to the original author's environment).
library(googlesheets)
gs_token <- "/home/users/ytanigaw/.googlesheets_token.rds"
googlesheets::gs_auth(token = gs_token)


Auto-refreshing stale OAuth token.


In [143]:
# Inputs ----
# Google Sheets workbook holding the supplementary tables.
ST_sheet <- "https://docs.google.com/spreadsheets/d/1j8q1Y7wnMg9nWUm0iT4wJvFfg_hgIXbrtvxelqWHeH4"

# Per-phenotype metadata table (path relative to the working directory).
phe_info_f <- "biomarker.phenotype.info.tsv"
# Combined phenotype file; one column per phenotype plus FID/IID.
phe_f <- "/oak/stanford/groups/mrivas/projects/biomarkers/covariate_corrected/outputExtendedNoTDIreduced/phenotypes/combined.20190810.phe"
# GWAS covariate file; supplies the `population` label per individual.
gwas_covar_f <- "/oak/stanford/groups/mrivas/ukbb24983/sqc/population_stratification_w24983_20190809/ukb24983_GWAS_covar.20190809.phe"

# Output ----
# Tab-separated summary table written at the end of the notebook.
out_f <- "number_check.tsv"


## Read the input

In [3]:
# Phenotype metadata; the `name` column is exposed as `Phenotype`.
phe_info_df <- rename(fread(phe_info_f), Phenotype = name)

In [77]:
# Read only the sample identifiers plus the phenotype columns listed in the
# `annotation` column of phe_info_df; identifiers are kept as character.
phe_df <- fread(
    phe_f,
    select = c("FID", "IID", pull(phe_info_df, annotation)),
    colClasses = c(FID = "character", IID = "character")
)


In [27]:
# Population label per individual; rows without a population label are
# discarded.
gwas_covar_df <- drop_na(
    fread(
        gwas_covar_f,
        select = c("FID", "IID", "population"),
        colClasses = c(FID = "character", IID = "character")
    ),
    population
)


## Count N for each phenotype across populations

In [78]:
# Attach phenotype values to every individual that has a population label.
# The join keys are left implicit, so dplyr joins on all shared columns.
phe_pop_df <- left_join(gwas_covar_df, phe_df)


Joining, by = c("FID", "IID")


In [79]:
# For every phenotype column listed in the `annotation` column of phe_info_df,
# count the individuals with a non-missing value in each population, then
# reshape to one row per phenotype and one column per population.
non_NA_phe_cnt <- phe_info_df %>% pull(annotation) %>% lapply(function(phe){
    phe_pop_df %>%
    # `phe` holds a column name as a string. Looking it up through the .data
    # pronoun makes sure it is never mistaken for a column literally named
    # "phe", which a bare `select(population, phe)` would allow.
    filter(!is.na(.data[[phe]])) %>%
    count(population) %>%
    mutate(phenotype = phe)
}) %>% bind_rows() %>%
# A population without any non-missing value has no row in the long table;
# fill = 0L reports it as a count of zero instead of NA (and keeps the
# columns integer).
spread(population, n, fill = 0L) %>%
select(phenotype, white_british, non_british_white, african, s_asian, e_asian)

## Number of associations


In [87]:
# Read the association tables from worksheets ST7-ST11 of the supplementary
# spreadsheet. Warnings raised while reading are suppressed, as before.
suppressWarnings({
    # Register the spreadsheet once and reuse the handle for every worksheet
    # instead of resolving the same URL five times.
    ST_gs <- gs_url(ST_sheet)

    PTV_hits <- gs_read(ST_gs, ws = 'ST7')

    PAV_hits <- gs_read(ST_gs, ws = 'ST8')

    non_coding_hits <- gs_read(ST_gs, ws = 'ST9')

    HLA_hits <- gs_read(ST_gs, ws = 'ST10')

    CNV_hits <- gs_read(ST_gs, ws = 'ST11')
})


Sheet-identifying info appears to be a browser URL.
googlesheets will attempt to extract sheet key from the URL.
Putative key: 1j8q1Y7wnMg9nWUm0iT4wJvFfg_hgIXbrtvxelqWHeH4
Sheet successfully identified: "UKBB Biomarker Paper Supplementary Tables Revisions"
Accessing worksheet titled 'ST7'.
Parsed with column specification:
cols(
  .default = col_character(),
  CHROM = col_double(),
  POS = col_double(),
  BETA = col_double(),
  SE = col_double(),
  P = col_double(),
  HetISq = col_double(),
  HetChiSq = col_double(),
  HetDf = col_double(),
  HetPVal = col_double(),
  MAF = col_double(),
  is_outside_of_MHC = col_logical(),
  ld_indep = col_logical(),
  Comments = col_logical()
)
See spec(...) for full column specifications.
Sheet-identifying info appears to be a browser URL.
googlesheets will attempt to extract sheet key from the URL.
Putative key: 1j8q1Y7w

In [109]:
# Number of rows (associations) per trait in each variant-class table.
# Every element is keyed by `trait` so the tables can be joined later.
assoc_counts_non_CNV <- list(
    count(PTV_hits, trait, name = "PTV"),
    count(PAV_hits, trait, name = "PAV"),
    count(non_coding_hits, trait, name = "non-coding"),
    # The HLA sheet names its trait column `Phenotype`; align it first.
    HLA_hits %>% rename(trait = Phenotype) %>% count(trait, name = "HLA")
)

# CNV associations are counted separately per `Class` value and keyed by
# `annotation` (taken from the sheet's `Trait` column).
assoc_counts_CNV <- list(
    CNV_hits %>%
        filter(Class == "Single") %>%
        rename(annotation = Trait) %>%
        count(annotation, name = "CNV single"),
    CNV_hits %>%
        filter(Class == "Burden") %>%
        rename(annotation = Trait) %>%
        count(annotation, name = "CNV burden")
)

In [142]:
# Assemble the per-phenotype summary: phenotype metadata, per-population
# sample counts, and the number of associations per variant class.
count_full_df <- phe_info_df %>%
# Sample counts are keyed by the phenotype column name (`annotation`).
left_join(
    non_NA_phe_cnt, by=c('annotation'='phenotype')
) %>%
# PTV / PAV / non-coding / HLA counts are keyed by the trait name.
left_join(
    assoc_counts_non_CNV %>%
    reduce(function(x, y){full_join(x, y, by='trait')}),
    by=c('Phenotype'='trait')
) %>%
# CNV counts are keyed by `annotation`.
left_join(
    assoc_counts_CNV %>%
    reduce(function(x, y){full_join(x, y, by='annotation')}),
    by='annotation'
) %>%
# A phenotype absent from a hit table has zero associations of that class.
# `non-coding` is included so that n_loci below does not turn into NA.
replace_na(list(
    PTV=0, PAV=0, 'non-coding'=0, HLA=0, 'CNV single'=0, 'CNV burden'=0
)) %>%
arrange(Phenotype) %>%
mutate(
    # Sum over four of the five populations (e_asian is not included).
    meta_N = white_british + non_british_white + african + s_asian,
    # Total over all five populations. The previous expression added s_asian
    # a second time and never added e_asian.
    # NOTE(review): assumes N is meant to be the five-population total.
    N = meta_N + e_asian,
    n_loci = PTV + PAV + `non-coding`
)

## Save the results to a file

In [150]:
# Write the summary table as a tab-separated file, with columns in the order
# listed here and missing values written as "NA".
count_full_df %>%
select(
    Phenotype, Abbreviation, 'Units of measurement', 
    'UKBB field ID', 'Statin adjustment', 'Trait category', 'GBE ID', 
    'N', 'white_british', 'non_british_white', 'african', 's_asian', 'e_asian', 'meta_N',
    'n_loci', 'PTV', 'PAV', 'non-coding', 'HLA', 'CNV single', 'CNV burden',
    'Color', 'GBE URL'
) %>%
# FALSE instead of F: `F` is an ordinary variable that can be reassigned.
fwrite(out_f, sep='\t', na = "NA", quote=FALSE)
