In [1]:
require('rstan')
require('tidyverse')
source("../mixture_model_scripts/model_utils.R")
source("../mixture_model_scripts/snp_utils.R")
source("../mixture_model_scripts/heritability_utils.R")

Loading required package: rstan
“there is no package called ‘rstan’”Loading required package: tidyverse
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 2.2.1     ✔ purrr   0.2.4
✔ tibble  1.4.2     ✔ dplyr   0.7.4
✔ tidyr   0.8.1     ✔ stringr 1.3.0
✔ readr   1.1.1     ✔ forcats 0.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: MASS

Attaching package: ‘MASS’

The following object is masked from ‘package:dplyr’:

    select

Loading required package: Matrix

Attaching package: ‘Matrix’

The following object is masked from ‘package:tidyr’:

    expand

Loading required package: mnormt
Loading required package: qqman

For example usage please run: vignette('qqman')

Citation appreciated but not required:
Turner, S.D. qqman: an R package for visualizing GWAS results using Q-Q and manhattan plots. biorXiv DOI:

### M2 Parameter Estimates + convergence

In [22]:

# Summarise the fitted M2 mixture model for one trait.
#
# Loads "../data/1020/m2/f_m2_<trait>.RData" (which must define `fit2`),
# extracts the mixture proportions with getPi() and the sigmasq estimates from
# the fit summary, and measures how far the Rhat values are from 1. A message
# listing the offending parameters is printed when any |Rhat - 1| exceeds 0.1.
#
# Args:
#   trait: trait name as it appears in the fit's file name.
#
# Returns:
#   A one-row data frame with columns trait, pi[1..4], sigmasq[1..4] and
#   rhat_dev (mean squared deviation of the non-NaN Rhat values from 1).
getM2out <- function(trait){

    load(sprintf("../data/1020/m2/f_m2_%s.RData", trait))
    p <- getPi(fit2)

    # Build the summary table once; it feeds both the sigmasq and Rhat steps.
    fit_summary <- summary(fit2)$summary

    # sigmasq: keep only the sigmasq[k] rows rather than the whole table, which
    # cannot fit the four sigmasq columns named below.
    # NOTE(review): assumes the parameter is called `sigmasq` in the model and
    # that the posterior mean is the estimate wanted -- confirm against getPi().
    sigmasq_rows <- grep("^sigmasq\\[", rownames(fit_summary))
    sigmasq <- fit_summary[sigmasq_rows, "mean"]
    if (length(sigmasq) != 4){
        stop(sprintf("expected 4 sigmasq parameters for %s, found %d",
                     trait, length(sigmasq)), call. = FALSE)
    }

    # Rhat is NaN for some rows; drop those before checking.
    rhat_vals <- fit_summary[, "Rhat"]
    rhat_present <- rhat_vals[!is.nan(rhat_vals)]
    high_val <- which(abs(rhat_present - 1) > 0.1)
    rhat_dev <- mean((rhat_present - 1)^2)
    if (length(high_val) > 0){
        print(sprintf("warning for %s, non-convergent rhat", trait))
        print(rhat_present[high_val])
    }

    df <- data.frame(trait, t(unlist(p)), t(sigmasq), rhat_dev,
                     stringsAsFactors = FALSE)
    colnames(df) <- c("trait", "pi[1]", "pi[2]", "pi[3]", "pi[4]", "sigmasq[1]", "sigmasq[2]", "sigmasq[3]", "sigmasq[4]", "rhat_dev")
    return(df)
}

In [26]:
# Discover the traits that have a saved M2 fit: every "f_m2_<trait>.RData"
# file in BIOMARKER.DIR contributes one <trait> name.
BIOMARKER.DIR <- "../data/1020/m2/"
# list.files() takes a regular expression, not a shell glob, so anchor the
# prefix and escape the dot of the extension; unrelated files are then skipped
# instead of breaking the name parsing below.
biomarker_traits <- list.files(BIOMARKER.DIR, pattern="^f_m2_.*\\.RData$")
# Strip the fixed prefix and the extension; sub() always returns a character
# vector, including when no files are found.
biomarkers <- sub("\\.RData$", "", sub("^f_m2_", "", biomarker_traits))
names(biomarkers) <- NULL
biomarkers

In [27]:
# One single-row summary data frame per trait, in the order of `biomarkers`.
m2.outs <- lapply(X = biomarkers, FUN = getM2out)

In [28]:
# Stack the per-trait rows into a single table.
m2.df <- do.call(what = rbind, args = m2.outs)

In [35]:
# Preview the combined table, then write the parameter estimates (without the
# convergence column) as two files: the traits listed in `fat_dist` and all the
# remaining traits.
m2.df %>% head()
fat_dist <- c("arm_fp", "leg_fp", "whr", "trunk_fp")
m2.df %>%
    dplyr::select(-rhat_dev) %>%
    dplyr::filter(!(trait %in% fat_dist)) %>%
    write_csv("../data/res_1023/m2_summary_biomarker.csv")
m2.df %>%
    dplyr::select(-rhat_dev) %>%
    dplyr::filter(trait %in% fat_dist) %>%
    write_csv("../data/res_1023/m2_summary_anthro.csv")

trait,pi[1],pi[2],pi[3],pi[4],sigmasq[1],sigmasq[2],sigmasq[3],sigmasq[4],rhat_dev
Alanine_aminotransferase,0.9966601,6.657939e-06,5.601664e-06,0.003327633,4.63061,4.08340894,0.00419519,0.004207383,1.024779e-06
Albumin,0.9972253,3.115938e-06,3.033431e-06,0.002768525,7.797013,11.04296564,0.003963794,0.004168399,2.366432e-06
Alkaline_phosphatase,0.9914165,1.68031e-05,1.889383e-05,0.008547766,2.830571,3.50952426,0.002852516,0.003468897,2.431953e-06
Apolipoprotein_A,0.9938319,1.222937e-05,9.754044e-06,0.006146107,2.145637,1.71965805,0.003217621,0.00340123,3.183732e-06
Apolipoprotein_B.adjust.statins,0.9934407,4.560755e-06,3.466295e-06,0.006551241,11.82964,5.11871464,0.005496158,0.004876533,2.826702e-06
arm_fp,0.9832122,0.0166507,0.0001163904,2.066419e-05,0.0009072801,0.05987742,0.365646951,0.379671244,4.582647e-06


In [None]:
### M1 FDR

In [71]:

# Posterior probability that one SNP belongs to the second component of a
# two-component zero-mean Gaussian mixture.
#
# Component 1 has covariance diag(SE); component 2 has covariance
# diag(SE) + Sigma. The result is
#   p[2] * N(B; 0, diag(SE) + Sigma) /
#       (p[1] * N(B; 0, diag(SE)) + p[2] * N(B; 0, diag(SE) + Sigma)).
#
# Args:
#   B:     numeric vector of effect estimates for the SNP.
#   SE:    numeric vector, same length as B, placed on the diagonal of the
#          component-1 covariance.
#          NOTE(review): dmnorm() expects a covariance matrix, so this treats
#          SE as variances -- confirm the inputs are already squared.
#   p:     mixture proportions; p[1] and p[2] are used.
#   Sigma: length(B) x length(B) covariance matrix added for component 2.
#
# Returns:
#   The posterior probability of component 2, a single number in [0, 1].
getPosterior2 <- function(B, SE, p, Sigma){
    zeros <- rep(0, length(SE))
    # diag(SE) alone builds an identity matrix when SE has length 1, so state
    # the dimension explicitly.
    SE_mat <- diag(SE, nrow = length(SE))

    # Work with log densities: for large |B| both densities underflow to 0 in
    # linear space and the ratio becomes NaN.
    log_p_1 <- log(p[1]) + dmnorm(B, zeros, SE_mat, log = TRUE)
    log_p_2 <- log(p[2]) + dmnorm(B, zeros, SE_mat + Sigma, log = TRUE)

    # p_2 / (p_1 + p_2) == 1 / (1 + exp(log_p_1 - log_p_2))
    return(plogis(log_p_2 - log_p_1))
}

# Compute and cache the per-SNP posterior probabilities for one trait.
#
# Loads the M1 fit ("../data/1019/m1/f_<trait>.RData", must define `fit1`) and
# the matching input data ("../data/1019/dat_<trait>.RData", must define
# `dat`), evaluates getPosterior2() on each of the dat$dat$N rows, and saves
# the result as `posteriors` in "../data/tmp_posteriors/post_<trait>.RData".
#
# Args:
#   trait: trait name as it appears in the file names.
#
# Returns:
#   Numeric vector with one posterior per row (empty when N is 0).
getPosteriorVec <- function(trait){
    load(sprintf("../data/1019/m1/f_%s.RData", trait))
    load(sprintf("../data/1019/dat_%s.RData", trait))
    p <- getPi(fit1)
    Sigma <- getSigma(fit1)
    B_dat <- dat$dat$B
    SE_dat <- dat$dat$SE
    N <- dat$dat$N

    # seq_len() yields an empty sequence when N is 0 (1:N would give c(1, 0));
    # vapply() guarantees a numeric vector whatever the input size.
    posteriors <- vapply(
        seq_len(N),
        function(i) getPosterior2(B_dat[i,], SE_dat[i,], p, Sigma),
        numeric(1)
    )
    save(posteriors, file=sprintf("../data/tmp_posteriors/post_%s.RData", trait))
    return(posteriors)
}




In [2]:
# Discover the traits that have M1 input data: every "dat_<trait>.RData" file
# in BIOMARKER.DIR contributes one <trait> name.
BIOMARKER.DIR <- "../data/1019/"
# list.files() takes a regular expression, not a shell glob, so anchor the
# prefix and escape the dot of the extension; unrelated files are then skipped
# instead of breaking the name parsing below.
biomarker_traits <- list.files(BIOMARKER.DIR, pattern="^dat_.*\\.RData$")
# Strip the fixed prefix and the extension; sub() always returns a character
# vector, including when no files are found.
biomarkers <- sub("\\.RData$", "", sub("^dat_", "", biomarker_traits))
names(biomarkers) <- NULL
#biomarkers <- setdiff(biomarkers, c("Apolipoprotein_B","Cholesterol", "LDL_direct"))

In [3]:
# Echo the trait names so the cell output shows which traits will be processed.
biomarkers

In [None]:
# Compute (and cache on disk) the posterior vector of every trait.
post_vec_list <- lapply(X = biomarkers, FUN = getPosteriorVec)

In [7]:
# Estimated false discovery rate for M1 at a posterior cutoff.
#
# SNPs whose posterior exceeds `cutoff` are called non-null; the estimate is
# the mean of (1 - posterior) over those SNPs.
#
# Args:
#   post_vec: numeric vector of per-SNP posterior probabilities.
#   cutoff:   posterior threshold above which a SNP is called non-null.
#
# Returns:
#   The estimated FDR, or NA when fewer than 5 SNPs pass the cutoff.
calcFDRM1 <- function(post_vec, cutoff){
    # for M1
    # Drop missing posteriors explicitly: a plain logical subset keeps NA/NaN
    # entries as NA, which inflates the count and turns the sum into NA.
    non_null <- post_vec[!is.na(post_vec) & post_vec > cutoff]
    if (length(non_null) < 5){
        return(NA_real_)
    }
    return(sum(1-non_null)/length(non_null))
}


In [6]:
# Reload the cached posterior vectors, one list element per trait.
post_lists <- lapply(biomarkers, function(trait) {
    post_file <- sprintf("../data/tmp_posteriors/post_%s.RData", trait)
    # load() defines `posteriors` inside this function's environment.
    load(post_file)
    posteriors
})

In [16]:
# Build the FDR table: one row per element of `post_lists`, one column per
# posterior cutoff in 0.5, 0.6, ..., 0.9. Each column is calcFDRM1() applied to
# every posterior vector at that cutoff.
# The interim column names are derived by data.frame() from the sapply(...)
# expression itself; they are placeholders until proper names are assigned.
fdrs <- do.call(cbind,
        lapply(seq(0.5, 0.9, 0.1), function(cutoff) 
            data.frame(sapply(post_lists, function(x) calcFDRM1(x, cutoff)))))

In [17]:
# Name each FDR column after its cutoff ("post0.5" ... "post0.9") and attach
# the trait names as an extra column.
colnames(fdrs) <- sprintf("post%s", seq(0.5, 0.9, 0.1))
fdrs$trait <- biomarkers


ERROR: Error in select(., trait, everything()): unused arguments (trait, everything())


In [21]:
# Move the trait column to the front and save the per-cutoff FDR table.
write_csv(dplyr::select(fdrs, trait, everything()), "../data/res_1023/m1_fdr.csv")

In [22]:
# Show the traits for which no FDR could be computed at the 0.5 cutoff.
dplyr::filter(dplyr::select(fdrs, trait, everything()), is.na(post0.5))

trait,post0.5,post0.6,post0.7,post0.8,post0.9
Alanine_aminotransferase,,,,,
Aspartate_aminotransferase,,,,,
SHBG,,,,,


In [24]:
# Echo the full list of trait names for reference.
biomarkers