In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# Name of the TSV file this notebook is set up to write.
out_f <- "fuji.input.tsv"

# Directory holding the per-consequence `cascade.<type>.tsv` input tables.
in_d <- "@@@@@@/users/ytanigaw/repos/rivas-lab/biomarkers/cascade/out_v3"


In [3]:
# Load one cascade table per variant class into a list keyed by class name.
# The '#CHROM' header column is renamed to 'CHROM' so it can be referenced
# without backticks downstream.
dfs <- list()
for (vtype in c('PTVs', 'PAVs', 'nc')) {
    dfs[[vtype]] <- file.path(in_d, sprintf('cascade.%s.tsv', vtype)) %>%
        fread() %>%
        rename('CHROM' = '#CHROM')
}


In [4]:
# Stack the PTV, PAV and non-coding ('nc') association tables into one long
# table with a common set of columns:
#   CHROM, POS, ID, trait, Gene_symbol, BETA, MAF, Csq, is_outside_of_MHC
# Only rows where `is_outside_of_MHC` is TRUE are kept, and `Csq` records which
# input table each row came from.
df <- bind_rows(
    dfs[['PTVs']] %>% mutate(Csq = 'PTVs'),
    dfs[['PAVs']] %>% mutate(Csq = 'PAVs')
) %>%
    filter(is_outside_of_MHC) %>%
    select(CHROM, POS, ID, trait, Gene_symbol, BETA, MAF, Csq, is_outside_of_MHC) %>%
    bind_rows(
        dfs[['nc']] %>%
            filter(is_outside_of_MHC) %>%
            select(CHROM, POS, ID, trait, BETA, MAF, is_outside_of_MHC) %>%
            # Every non-coding row gets an empty gene symbol; the source
            # table's own Gene_symbol column (if any) is not carried over, so
            # no NA handling of it is needed.
            mutate(Gene_symbol = '', Csq = 'nc')
    )


In [5]:
# Show the column names of the combined association table.
colnames(df)

### the number of associations

In [6]:
# Tally associations per consequence class, split into 'rare' (MAF < 0.01)
# and 'non_rare', then add the row total as `n`.
df %>%
    count(
        is_outside_of_MHC, Csq,
        is_rare = if_else(MAF < 0.01, 'rare', 'non_rare')
    ) %>%
    spread(key = is_rare, value = n, fill = 0) %>%
    mutate(n = rare + non_rare)


is_outside_of_MHC,Csq,non_rare,rare,n
<lgl>,<chr>,<dbl>,<dbl>,<dbl>
True,nc,9001,116,9117
True,PAVs,1017,306,1323
True,PTVs,15,43,58


In [11]:
# Same tally restricted to associations with |BETA| > 0.1, additionally split
# by the sign of BETA ('+' when BETA > 0, '-' otherwise).
df %>%
    filter(abs(BETA) > .1) %>%
    count(
        is_outside_of_MHC, Csq,
        sign = if_else(BETA > 0, '+', '-'),
        is_rare = if_else(MAF < 0.01, 'rare', 'non_rare')
    ) %>%
    spread(key = is_rare, value = n, fill = 0) %>%
    mutate(n = rare + non_rare)


is_outside_of_MHC,Csq,sign,non_rare,rare,n
<lgl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
True,nc,-,298,41,339
True,nc,+,263,46,309
True,PAVs,-,79,161,240
True,PAVs,+,57,125,182
True,PTVs,-,2,17,19
True,PTVs,+,2,24,26


### the number of variants

In [10]:
# Count distinct variants (one row per CHROM/POS/ID/MAF/Csq/MHC-flag
# combination, regardless of trait) per consequence class, split into 'rare'
# (MAF < 0.01) and 'non_rare', with the row total in `n`.
df %>%
    distinct(CHROM, POS, ID, MAF, Csq, is_outside_of_MHC) %>%
    count(
        is_outside_of_MHC, Csq,
        is_rare = if_else(MAF < 0.01, 'rare', 'non_rare')
    ) %>%
    spread(key = is_rare, value = n, fill = 0) %>%
    mutate(n = rare + non_rare)


is_outside_of_MHC,Csq,non_rare,rare,n
<lgl>,<chr>,<dbl>,<dbl>,<dbl>
True,nc,5087,75,5162
True,PAVs,402,192,594
True,PTVs,10,28,38


In [13]:
# Distinct-variant tally restricted to variants that have at least one
# association with |BETA| > 0.1, split into 'rare' (MAF < 0.01) and
# 'non_rare', with the row total in `n`.
df %>%
    filter(abs(BETA) > .1) %>%
    distinct(CHROM, POS, ID, MAF, Csq, is_outside_of_MHC) %>%
    count(
        is_outside_of_MHC, Csq,
        is_rare = if_else(MAF < 0.01, 'rare', 'non_rare')
    ) %>%
    spread(key = is_rare, value = n, fill = 0) %>%
    mutate(n = rare + non_rare)


is_outside_of_MHC,Csq,non_rare,rare,n
<lgl>,<chr>,<dbl>,<dbl>,<dbl>
True,nc,382,59,441
True,PAVs,62,179,241
True,PTVs,4,27,31


In [14]:
# Number of associations recorded for each trait.
count(df, trait)

trait,n
<chr>,<int>
Alanine aminotransferase,189
Albumin,202
Alkaline phosphatase,715
Apolipoprotein A,454
Apolipoprotein B,593
Aspartate aminotransferase,265
AST to ALT ratio,226
C-reactive protein,351
Calcium,208
Cholesterol,596
