In [1]:
library(tidyverse)
library(data.table)


── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.0       ✔ purrr   0.3.1  
✔ tibble  2.0.1       ✔ dplyr   0.8.0.1
✔ tidyr   0.8.3       ✔ stringr 1.4.0  
✔ readr   1.3.1       ✔ forcats 0.4.0  
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, first, last

The following object is masked from ‘package:purrr’:

    transpose



In [2]:
# Location of the gzipped, tab-separated association results for INI5255.
IOP_file <- '/oak/stanford/groups/mrivas/ukbb24983/cal/gwas/10136/21731/white_british/ukb24983_v2_hg19.INI5255.genotyped.PHENO1.glm.linear.gz'

# Stream-decompress the file and drop only the '#' that starts the header
# line, so the first column gets a clean name (downstream cells refer to
# CHROM, POS, ID, ... from this table). The substitution is restricted to
# line 1 and to the line start so a '#' occurring anywhere else in the data
# is left untouched. The path is shell-quoted before being interpolated.
IOP <- fread(
    cmd = paste0('zcat ', shQuote(IOP_file), ' | sed -e "1s/^#//"'),
    sep = '\t', data.table = FALSE
)


In [3]:
# Path to the gzipped, tab-separated variant annotation / filter table that is
# read into `annot.arr` in the next cell.
annot.tbl <- '/oak/stanford/groups/mrivas/private_data/ukbb/variant_filtering/variant_filter_table.tsv.gz'


In [4]:
# Load the variant annotation table and derive per-variant helper columns:
#   MAF               - frequency folded to the minor allele: min(freq, 1 - freq)
#   variant           - 'CHROM:POS:REF:ALT' identifier
#   is_outside_of_MHC - FALSE only for chromosome 6 variants whose POS lies in
#                       [25477797, 36448354]; TRUE for everything else
annot.arr <- fread(
    cmd = paste0('zcat ', shQuote(annot.tbl)),
    sep = '\t', data.table = FALSE
) %>%
mutate(
    MAF = pmin(freq, 1 - freq),
    variant = paste(CHROM, POS, REF, ALT, sep = ':'),
    # CHROM is compared as text: coercing it with as.numeric() would turn
    # non-numeric chromosome codes (e.g. 'X') into NA and leave the flag NA
    # for those variants instead of TRUE.
    is_outside_of_MHC = !(
        as.character(CHROM) == '6' &
        as.numeric(POS) >= 25477797 &
        as.numeric(POS) <= 36448354
    )
)


In [5]:
# Collapse the `Consequence` annotation into three classes stored in `Csq`.
# Each consequence list is written once so the classes cannot drift apart.
ptv_csq <- c(
    "frameshift_variant", "splice_donor_variant", "stop_gained",
    "stop_lost", "start_lost", "splice_acceptor_variant"
)
pav_csq <- c(
    "splice_region_variant", "missense_variant",
    "inframe_insertion", "inframe_deletion"
)

# Anything not listed above (including a missing Consequence) is "non-coding".
annot.arr <- annot.arr %>% mutate(
    Csq = case_when(
        Consequence %in% ptv_csq ~ "protein-truncating",
        Consequence %in% pav_csq ~ "protein-altering",
        TRUE ~ "non-coding"
    )
)



In [24]:
# List the columns available in the annotation table.
colnames(annot.arr)

In [69]:
# Count variants with MAF < 0.01, split by the is_outside_of_MHC flag and by
# whether the variant is non-coding or falls in one of the coding classes.
annot.arr %>%
    filter(MAF < 0.01) %>%
    mutate(
        Csq2 = if_else(Csq == 'non-coding', 'non-coding', 'PTVs+protein-altering')
    ) %>%
    count(is_outside_of_MHC, Csq2)

is_outside_of_MHC,Csq2,n
False,non-coding,1100
False,PTVs+protein-altering,1136
True,non-coding,40286
True,PTVs+protein-altering,89145


## ANGPTL7

In [83]:
# Show every annotated variant whose gene symbol is ANGPTL7.
filter(annot.arr, Gene_symbol == 'ANGPTL7')

CHROM,POS,REF,ALT,ID,Gene,Consequence,HGVSp,LoF,LoF_filter,⋯,mcpi,gnomad_af,mgi,mgi_notes,all_filters,Gene_symbol,MAF,variant,Csq,is_outside_of_MHC
1,11252357,A,G,rs200058074,ENSG00000171819,missense_variant,ENSP00000366015.3:p.Gln136Arg,,,⋯,0,,,,0,ANGPTL7,0.0005355845,1:11252357:A:G,protein-altering,True
1,11252369,G,A,rs28991002,ENSG00000171819,missense_variant,ENSP00000366015.3:p.Arg140His,,,⋯,0,,,,0,ANGPTL7,0.002532551,1:11252369:G:A,protein-altering,True
1,11253684,G,T,rs28991009,ENSG00000171819,missense_variant,ENSP00000366015.3:p.Gln175His,,,⋯,0,,,,0,ANGPTL7,0.008116199,1:11253684:G:T,protein-altering,True
1,11253688,C,T,rs143435072,ENSG00000171819,stop_gained,ENSP00000366015.3:p.Arg177Ter,HC,,⋯,0,PASS,,,0,ANGPTL7,0.0004063244,1:11253688:C:T,protein-truncating,True
1,11255013,GGCAT,G,Affx-89021330,ENSG00000171819,frameshift_variant,ENSP00000366015.3:p.His326AspfsTer11,HC,,⋯,0,,,,0,ANGPTL7,1.483913e-05,1:11255013:GGCAT:G,protein-truncating,True


In [77]:
# ANGPTL7 variants with their association statistics attached.
# The annotation columns are trimmed first, the variant ID is renamed to
# `variant_ID`, and the summary statistics are matched on that ID. Rows are
# ordered by the CHROM/POS columns coming from the summary statistics, after
# which the columns that are no longer needed are dropped.
df <- annot.arr %>%
    filter(Gene_symbol == 'ANGPTL7') %>%
    select(variant, variant_ID = ID, Consequence, HGVSp, freq, MAF, ld_indep) %>%
    left_join(IOP, by = c('variant_ID' = 'ID')) %>%
    arrange(CHROM, POS) %>%
    select(-c(CHROM, POS, REF, ALT, A1, TEST, OBS_CT))

In [82]:
# Display table for the ANGPTL7 variants: MAF expressed in percent and the
# effect size with BETA +/- 1.96 * SE bounds.
# (Optional, currently disabled: keep only rows with P < .05.)
df %>%
    transmute(
        variant,
        variant_ID,
        HGVSp,
        MAF_percent = MAF * 100,
        BETA,
        BETA_CI_l = BETA - 1.96 * SE,
        BETA_CI_u = BETA + 1.96 * SE,
        P
    )

variant,variant_ID,HGVSp,MAF_percent,BETA,BETA_CI_l,BETA_CI_u,P
1:11252357:A:G,rs200058074,ENSP00000366015.3:p.Gln136Arg,0.053558447,-0.0380036,-0.2564534,0.18044624,0.73312
1:11252369:G:A,rs28991002,ENSP00000366015.3:p.Arg140His,0.253255077,-0.155464,-0.2502167,-0.06071133,0.00130115
1:11253684:G:T,rs28991009,ENSP00000366015.3:p.Gln175His,0.811619854,-0.200253,-0.2533586,-0.14714739,1.47284e-13
1:11253688:C:T,rs143435072,ENSP00000366015.3:p.Arg177Ter,0.040632442,-0.262552,-0.5106743,-0.01442972,0.0380835
1:11255013:GGCAT:G,Affx-89021330,ENSP00000366015.3:p.His326AspfsTer11,0.001483913,-0.454726,-1.5823473,0.67289532,0.429302


## Hits

In [80]:
# Associations with P <= 1e-4, annotated and ordered by genomic position.
# Columns not needed from the summary statistics are dropped before the join;
# CHROM/POS are taken from the summary statistics (the annotation copies are
# excluded), used for sorting, and then left out of the final column set.
IOP_hits <- IOP %>%
    filter(as.numeric(P) <= 0.0001) %>%
    select(-c(REF, ALT, A1, TEST, OBS_CT, T_STAT)) %>%
    left_join(select(annot.arr, -c(CHROM, POS)), by = 'ID') %>%
    arrange(as.numeric(CHROM), as.numeric(POS)) %>%
    select(
        variant, variant_ID = ID, BETA, SE, P,
        Csq, Consequence, Gene_symbol, HGVSp, MAF, ld_indep
    )


In [62]:
# Save the hit table as a tab-separated file in the working directory.
fwrite(IOP_hits, file = 'INI5255.hits.tsv', sep = '\t', row.names = FALSE)