In [1]:
source('model_utils.R')

Loading required package: MASS
Loading required package: Matrix
Loading required package: mnormt
Loading required package: qqman

For example usage please run: vignette('qqman')

Citation appreciated but not required:
Turner, S.D. qqman: an R package for visualizing GWAS results using Q-Q and manhattan plots. biorXiv DOI: 10.1101/005165 (2014).

Loading required package: rstan
Loading required package: ggplot2
Loading required package: StanHeaders
rstan (Version 2.15.1, packaged: 2017-04-19 05:03:57 UTC, GitRev: 2e1f913d3ca3)
For execution on a local, multicore CPU with excess RAM we recommend calling
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())


In [2]:
# Input directory holding the variant filter table, and the output directory
# for this analysis.
FILTER_DIR <- "/oak/stanford/groups/mrivas/private_data/ukbb/variant_filtering"
OUT_DIR <- "/scratch/PI/mrivas/users/erflynn/sex_div_gwas/data"

# Load the gzipped, tab-delimited variant filter table.
filt.file <- read.delim(
    sprintf("%s/variant_filter_table.tsv.gz", FILTER_DIR)
)

In [5]:
# Show every column name, then preview the missingness / frequency / HWE /
# MAF / LD-independence columns.
names(filt.file)
head(filt.file[c("f_miss", "freq", "hwe_p", "maf", "ld_indep")])

f_miss,freq,hwe_p,maf,ld_indep
0.8915,0.03614952,4.458999999999999e-22,0.03614952,False
0.09985,0.05360614,0.0,0.05360614,False
0.04482,0.03658726,7.009e-195,0.03658726,False
0.005184,0.15681154,1.554e-266,0.15681154,False
0.0011,0.03652657,4.046e-198,0.03652657,False
0.001874,0.12983398,0.02418,0.12983398,False


In [11]:

# Keep rows whose all_filters flag is 0.
# Original note: all filters imposes MAF of 1%, HWE < 1*10^-7, MCPI pass (if looked at).
rem.snps <- filt.file[filt.file[["all_filters"]] == 0, ] # 655,654 out of 784,256

# Restrict to variants flagged as part of the LD-pruned set, then save them
# so that later steps can re-read the metadata from OUT_DIR.
rem.snps2 <- rem.snps[rem.snps[["ld_indep"]] == "True", ] # 361,424
write.table(rem.snps2, sprintf("%s/snp_filt_metadata.txt", OUT_DIR))

In [13]:
# filterMAF() below reads snp_filt_metadata.txt from this folder; it is the
# same directory the filtered metadata was written to.
DATA.FOLDER <- OUT_DIR
#' Select the IDs of SNPs whose minor allele frequency exceeds a cutoff.
#'
#' Reads `snp_filt_metadata.txt` from `data.folder` on every call, keeps the
#' rows with `maf` strictly greater than `maf.cutoff`, prints a summary of the
#' retained `maf` values, and returns the `ID` column of those rows.
#'
#' Rows with a missing `maf` are dropped. A plain logical subset would turn
#' each NA comparison into an all-NA row and leak NA IDs into the result.
#'
#' @param maf.cutoff Single non-missing numeric value; rows are kept when
#'   `maf > maf.cutoff`.
#' @param data.folder Directory containing `snp_filt_metadata.txt`. Defaults
#'   to the global `DATA.FOLDER`, which is what the function always used.
#' @return The `ID` column of the retained rows (type as produced by
#'   `read.table`).
filterMAF <- function(maf.cutoff, data.folder = DATA.FOLDER){
    stopifnot(is.numeric(maf.cutoff), length(maf.cutoff) == 1L, !is.na(maf.cutoff))

    rem.snps <- read.table(sprintf("%s/snp_filt_metadata.txt", data.folder), header=TRUE)

    # which() discards NA comparisons instead of producing NA-filled rows.
    keep.idx <- which(rem.snps$maf > maf.cutoff)
    filt.snps <- rem.snps[keep.idx, , drop=FALSE]

    print(summary(filt.snps$maf))
    return(filt.snps$ID)
}


# IDs of the saved variants with maf > 0.1; filterMAF() also prints a summary
# of the retained maf values.
res <- filterMAF(0.1)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.1000  0.1359  0.1944  0.2267  0.2961  0.5000 


In [3]:
# Preview columns 1-30 of the filter table, leaving out column 13.
head(filt.file[, (1:30)[-13]])

CHROM,POS,REF,ALT,ID,Gene,Consequence,HGVSp,LoF,LoF_filter,⋯,wcsg_only,bileve_only,filter,missingness,hwe,mcpi,gnomad_af,mgi,mgi_notes,all_filters
1,723307,C,G,rs28659788,ENSG00000237491,intron_variant,,,,⋯,False,True,,1,1,0,,,,2
1,727841,G,A,rs116587930,ENSG00000237491,intron_variant,,,,⋯,False,False,,1,1,0,,,,2
1,729632,C,T,rs116720794,ENSG00000237491,intron_variant,,,,⋯,False,False,,1,1,0,,,,2
1,752721,A,G,rs3131972,ENSG00000240453,intron_variant,,,,⋯,False,False,,0,1,0,,,,1
1,754105,C,T,rs12184325,ENSG00000177757,splice_region_variant,,,,⋯,False,False,,0,1,0,,,,1
1,756604,A,G,rs3131962,ENSG00000240453,upstream_gene_variant,,,,⋯,False,False,,0,0,0,,,,0


In [None]:
#' Filter two summary-statistic tables by standard error and package them for Stan.
#'
#' Calls filterSE() with `cutoff = se.cutoff`, takes the elements named `1`
#' and `2` from its result, and passes them to extractDataStan().
#'
#' @param filt.f,filt.m The two tables handed to filterSE()
#'   (presumably the female and male results -- confirm against model_utils.R).
#' @param trait.type Forwarded unchanged to filterSE().
#' @param se.cutoff Standard-error cutoff, forwarded to filterSE() as `cutoff`.
#' @return Whatever extractDataStan() returns for the filtered pair.
filterDat <- function(filt.f, filt.m, trait.type, se.cutoff) {
    dat.filt <- filterSE(filt.f, filt.m, trait.type, cutoff=se.cutoff)
    filt.f <- dat.filt$`1`
    filt.m <- dat.filt$`2`

    # extract dat in a format for stan input
    dat <- extractDataStan(filt.f, filt.m)
    return(dat)
}


### START OF CODE ###

In [2]:
# Read in the data: one getData() call per value 1..22 (passed as character,
# presumably the autosome numbers) for the chosen trait.
trait <- "21001"
trait.type <- "quant"
all.dat <- lapply(as.character(1:22), getData, trait)




In [None]:
# Vary the MAF cutoff here.
#   - TODO: break up reformatData --> faster
maf.cut <- 0.1

# Reformat data, remove rows not shared, vary MAF; the result is indexed by
# the names "1" and "2".
dat.reform <- reformatData(all.dat, trait.type, maf.cut)
filt.f <- dat.reform[["1"]]
filt.m <- dat.reform[["2"]]






In [None]:
# Filter both tables by standard error; change se.cut to vary the cutoff.
se.cut <- 0.2
dat.filt <- filterSE(filt.f, filt.m, trait.type, se.cut)
filt2.f <- dat.filt[["1"]]
filt2.m <- dat.filt[["2"]]

In [27]:
# Sanity checks: SE range remaining in each filtered table.
range(filt2.m[["SE"]])
range(filt2.f[["SE"]])

In [34]:

# Re-run the reformat step (reformat data, remove rows not shared, vary MAF)
# and show the SE range before filtering.
dat.reform <- reformatData(all.dat, trait.type, maf.cut)
filt.f <- dat.reform[["1"]]
filt.m <- dat.reform[["2"]]
range(filt.m[["SE"]])
range(filt.f[["SE"]])

# Apply the SE filter, then show the SE range, row counts, and first rows of
# the filtered tables.
dat.filt <- filterSE(filt.f, filt.m, trait.type, se.cut)
filt2.f <- dat.filt[["1"]]
filt2.m <- dat.filt[["2"]]
range(filt2.m[["SE"]])
range(filt2.f[["SE"]])
nrow(filt2.f)
nrow(filt2.m)
head(filt2.f)
head(filt2.m)



Unnamed: 0,CHR,BP,SNP,REF,ALT,TEST,OBS_CT,BETA,SE,T_STAT,P
10,1,768448,rs12562034,G,A,ADD,180207,0.00223183,0.00541826,-0.411909,0.680407
12,1,779322,rs4040617,A,G,ADD,180025,0.00283624,0.00497289,-0.570341,0.568447
21,1,838555,rs4970383,C,A,ADD,180155,0.00137271,0.00385814,-0.355795,0.721995
24,1,849998,rs13303222,A,G,ADD,180213,0.00060713,0.00433392,-0.140088,0.888591
64,1,891059,rs13303065,C,T,ADD,180311,0.00400785,0.00349634,-1.1463,0.251673
99,1,909238,rs3829740,G,C,ADD,180290,0.00424681,0.0033513,-1.26721,0.205081


Unnamed: 0,CHR,BP,SNP,REF,ALT,TEST,OBS_CT,BETA,SE,T_STAT,P
10,1,768448,rs12562034,G,A,ADD,155305,0.015228,0.00582732,-2.61321,0.00897047
12,1,779322,rs4040617,A,G,ADD,155196,-0.00368672,0.00539278,0.68364,0.494203
21,1,838555,rs4970383,C,A,ADD,155262,-0.00400314,0.00415949,0.962413,0.335844
24,1,849998,rs13303222,A,G,ADD,155308,-0.0139141,0.00468891,2.96745,0.00300325
64,1,891059,rs13303065,C,T,ADD,155396,-0.000494891,0.00377404,0.13113,0.895672
99,1,909238,rs3829740,G,C,ADD,155351,-0.000612335,0.00361447,0.169412,0.865473


In [None]:
# Run model 1 with optimizing: build the Stan input from the SE-filtered
# tables, compile the model, and time the optimizer call.
dat <- extractDataStan(filt2.f, filt2.m)

# K is set on the Stan data list before fitting
# (NOTE(review): presumably the number of components -- confirm in model1.stan).
dat$dat$K <- 2

m1 <- stan_model(file = "models/model1.stan")
f1 <- timeModel(optimizing(m1, data = dat$dat, hessian = TRUE))
print(f1)




In [None]:
# visualize this shift?