Re-introduce by-chromosome functionality and allow for .db SNP identi…

…fier to be speicified
shugamoe · Oct 19, 2023 · 318a2f2 · 318a2f2
1 parent 47c0e3b
commit 318a2f2
Show file tree

Hide file tree

Showing 5 changed files with 57 additions and 28 deletions.
diff --git a/R/ctwas_impute_expr_z.R b/R/ctwas_impute_expr_z.R
@@ -64,7 +64,9 @@ impute_expr_z <- function (z_snp, weight, ld_pgenfs = NULL, ld_R_dir = NULL, met
                            strand_ambig_action_z = c("drop", "none", "recover"), 
                            strand_ambig_action_wgt = c("drop", "none", "recover"),
                            ncore=1, chrom=1:22,
-                           scale_by_ld_variance=T){
+                           scale_by_ld_variance=T,
+			   db_id="rsid"
+			   ){
   dir.create(outputdir, showWarnings = F)
 
   strand_ambig_action_z <- match.arg(strand_ambig_action_z)
@@ -125,7 +127,8 @@ impute_expr_z <- function (z_snp, weight, ld_pgenfs = NULL, ld_R_dir = NULL, met
                                          ld_Rinfo=ld_Rinfo,
                                          strand_ambig_action=strand_ambig_action_wgt,
                                          ncore=ncore,
-                                         scale_by_ld_variance=scale_by_ld_variance)
+                                         scale_by_ld_variance=scale_by_ld_variance,
+                                         db_id=db_id)
     } else {
       stop("Unrecognized weight format, need to use either FUSION format or predict.db format")
     }

diff --git a/R/ctwas_process_regions.R b/R/ctwas_process_regions.R
@@ -53,7 +53,9 @@ index_regions <- function(regionfile,
                           outname = NULL,
                           outputdir = getwd(),
                           ncore = 1,
-                          reuse_R_gene = F) {
+                          reuse_R_gene = F,
+			  chrom = 1:22,
+			  ) {
 
   if (is.null(pvarfs) & is.null(ld_Rfs)){
     stop("Stopped: missing LD/genotype information.
@@ -82,7 +84,7 @@ index_regions <- function(regionfile,
   }
 
   regionlist <- list()
-  for (b in 1:length(exprvarfs)){
+  for (b in chrom){
 
     if (!is.null(pvarfs)){
       # get snp info (from pvarf file)
@@ -205,7 +207,7 @@ index_regions <- function(regionfile,
 
   if ("z" %in% colnames(select)) {
     # z score is given, trim snps with lower |z|
-    for (b in 1: length(regionlist)){
+    for (b in chrom){
       for (rn in names(regionlist[[b]])) {
         if (length(regionlist[[b]][[rn]][["sid"]]) > maxSNP){
           idx <- match(regionlist[[b]][[rn]][["sid"]], select[, "id"])
@@ -218,7 +220,7 @@ index_regions <- function(regionfile,
     }
   } else{
     # if no z score information, randomly select snps
-    for (b in 1: length(regionlist)){
+    for (b in chrom){
       for (rn in names(regionlist[[b]])) {
         if (length(regionlist[[b]][[rn]][["sid"]]) > maxSNP){
           n.ori <- length(regionlist[[b]][[rn]][["sid"]])
@@ -237,14 +239,14 @@ index_regions <- function(regionfile,
 
     dir.create(file.path(outputdir, paste0(outname, "_LDR")), showWarnings = F)
 
-    wgtall <- lapply(exprvarfs, function(x){load(paste0(strsplit(x, ".exprvar")[[1]], ".exprqc.Rd")); wgtlist})
+    wgtall <- lapply(exprvarfs[chrom], function(x){load(paste0(strsplit(x, ".exprvar")[[1]], ".exprqc.Rd")); wgtlist})
     wgtlistall <- do.call(c, wgtall)
     names(wgtlistall) <- do.call(c, lapply(wgtall, names))
     rm(wgtall)
 
     regionlist_all <- list()
 
-    for (b in 1:22){
+    for (b in chrom){
       regionlist_all[[b]] <- cbind(b, names(regionlist[[b]]))
     }
 
@@ -372,9 +374,9 @@ index_regions <- function(regionfile,
 
 
 #' filter regions based on probability of at most 1 causal effect
-filter_regions <- function(regionlist, group_prior, prob_single = 0.8, zdf){
+filter_regions <- function(regionlist, group_prior, prob_single = 0.8, zdf, chrom=1:22){
   regionlist2 <- regionlist
-  for (b in 1: length(regionlist)){
+  for (b in chrom){
     for (rn in names(regionlist[[b]])) {
       gid <- regionlist[[b]][[rn]][["gid"]]
       sid <- regionlist[[b]][[rn]][["sid"]]
@@ -399,9 +401,9 @@ filter_regions <- function(regionlist, group_prior, prob_single = 0.8, zdf){
 #' regions allocated to given number of cores
 #' regionlist need to contain at least 1 non-empty
 #' 
-region2core <- function(regionlist, ncore = 1){
+region2core <- function(regionlist, ncore = 1, chrom=1:22){
   dflist <- list()
-  for (b in 1:length(regionlist)){
+  for (b in chrom){
     if (length(regionlist[[b]]) > 0){
       dflist[[b]] <- data.frame("b" = b, "rn"= names(regionlist[[b]]), stringsAsFactors = FALSE)
     }

diff --git a/R/ctwas_read_data.R b/R/ctwas_read_data.R
@@ -357,7 +357,9 @@ read_weight_predictdb <- function (weight,
                                    ld_pgenfs=NULL, 
                                    ld_Rinfo=NULL,
                                    scale_by_ld_variance=T, 
-                                   ncore=1){
+                                   ncore=1,
+				   db_id="rsid"
+				   ){
 
   strand_ambig_action <- match.arg(strand_ambig_action)
 
@@ -431,11 +433,11 @@ read_weight_predictdb <- function (weight,
         wgt <- query("select * from weights where gene = ?", params = list(gname))
         wgt.matrix <- as.matrix(wgt[, "weight", drop = F])
 
-        rownames(wgt.matrix) <- wgt$rsid
+        rownames(wgt.matrix) <- wgt[[db_id]]
         chrpos <- do.call(rbind, strsplit(wgt$varID, "_"))
 
 
-        snps <- data.frame(gsub("chr", "", chrpos[, 1]), wgt$rsid,
+        snps <- data.frame(gsub("chr", "", chrpos[, 1]), wgt[[db_id]],
                            "0", chrpos[, 2], wgt$eff_allele, wgt$ref_allele,
                            stringsAsFactors = F)
         colnames(snps) <- c("chrom", "id", "cm", "pos", "alt", "ref")

diff --git a/R/ctwas_rss.R b/R/ctwas_rss.R
@@ -150,19 +150,27 @@ ctwas_rss <- function(
   outname = NULL,
   logfile = NULL,
   merge = TRUE,
-  fine_map = T){
+  fine_map = T,
+  chrom=1:22
+  ){
 
   if (!is.null(logfile)){
     addHandler(writeToFile, file= logfile, level='DEBUG')
   }
 
   loginfo('ctwas started ... ')
 
-  if (length(ld_exprfs) != 22){
-    stop("Not all imputed expression files for 22 chromosomes are provided.")
-  }
+  # if (length(ld_exprfs) != 22){
+  #   stop("Not all imputed expression files for 22 chromosomes are provided.")
+  # }
   ld_exprvarfs <- sapply(ld_exprfs, prep_exprvar)
 
+  if (!all(chrom == 1:22)){ # Try to have either all 22 or 1 chrom at a time
+    dummy_ld_exprvarfs <- rep(NA, 22)
+    dummy_ld_exprvarfs[chrom] <- ld_exprvarfs
+    ld_exprvarfs <- dummy_ld_exprvarfs
+  }
+
   if (is.null(ld_pgenfs) & is.null(ld_R_dir)){
     stop("Error: need to provide either .pgen file or ld_R file")
   } else if (!is.null(ld_pgenfs)){
@@ -214,11 +222,12 @@ ctwas_rss <- function(
                               outname = outname,
                               outputdir = outputdir,
                               merge = merge,
+			      chrom = chrom,
                               ncore = ncore_LDR) # susie_rss can't take 1 var.
 
   saveRDS(regionlist, file=paste0(outputdir, "/", outname, ".regionlist.RDS"))
 
-  temp_regs <- lapply(1:22, function(x) cbind(x,
+  temp_regs <- lapply(chrom, function(x) cbind(x,
                                               unlist(lapply(regionlist[[x]], "[[", "start")),
                                               unlist(lapply(regionlist[[x]], "[[", "stop"))))
 
@@ -254,7 +263,8 @@ ctwas_rss <- function(
                        outputdir = outputdir,
                        outname = paste0(outname, ".s1"),
                        inv_gamma_shape=inv_gamma_shape,
-                       inv_gamma_rate=inv_gamma_rate
+                       inv_gamma_rate=inv_gamma_rate,
+		       chrom = chrom
                    )
 
     group_prior <- pars[["group_prior"]]
@@ -264,7 +274,8 @@ ctwas_rss <- function(
     regionlist2 <- filter_regions(regionlist,
                                   group_prior,
                                   prob_single,
-                                  zdf)
+                                  zdf,
+                                  chrom = chrom)
 
     loginfo("Blocks are filtered: %s blocks left",
             sum(unlist(lapply(regionlist2, length))))
@@ -290,7 +301,8 @@ ctwas_rss <- function(
                        outputdir = outputdir,
                        outname = paste0(outname, ".s2"),
                        inv_gamma_shape=inv_gamma_shape,
-                       inv_gamma_rate=inv_gamma_rate)
+                       inv_gamma_rate=inv_gamma_rate,
+                       chrom = chrom)
 
     group_prior <- pars[["group_prior"]]
     group_prior_var <- pars[["group_prior_var"]]
@@ -319,7 +331,8 @@ ctwas_rss <- function(
                        outname = paste0(outname, ".temp"),
                        inv_gamma_shape=inv_gamma_shape,
                        inv_gamma_rate=inv_gamma_rate,
-                       report_parameters=F)
+                       report_parameters=F,
+                       chrom = chrom)
 
     group_prior["SNP"] <- group_prior["SNP"] * thin # convert snp pi1
 
@@ -337,13 +350,14 @@ ctwas_rss <- function(
                                   outname = outname, outputdir = outputdir,
                                   merge = merge,
                                   ncore = ncore_LDR,
+				  chrom = chrom,
                                   reuse_R_gene = T) # susie_rss can't take 1 var.
 
       res <- data.table::fread(paste0(file.path(outputdir, outname), ".temp.susieIrss.txt"))
 
       # filter out regions based on max gene PIP of the region
       res.keep <- NULL
-      for (b in 1: length(regionlist)){
+      for (b in chrom){
         for (rn in names(regionlist[[b]])){
           gene_PIP <- max(res$susie_pip[res$type != "SNP" & res$region_tag1 == b & res$region_tag2 == rn], 0)
           if (gene_PIP < rerun_gene_PIP) {
@@ -384,7 +398,8 @@ ctwas_rss <- function(
                            outname = paste0(outname, ".s3"),
                            inv_gamma_shape=inv_gamma_shape,
                            inv_gamma_rate=inv_gamma_rate,
-                           report_parameters=F)
+                           report_parameters=F,
+	                   chrom = chrom)
 
         res.rerun <- data.table::fread(paste0(file.path(outputdir, outname), ".s3.susieIrss.txt"))
 

diff --git a/R/ctwas_susieI_rss.R b/R/ctwas_susieI_rss.R
@@ -94,14 +94,21 @@ susieI_rss <- function(zdf,
                        outname = NULL,
                        inv_gamma_shape=1,
                        inv_gamma_rate=0,
-                       report_parameters=T){
+                       report_parameters=T,
+		       chrom = 1:22){
 
   outname <- file.path(outputdir, outname)
 
   group_prior_var_structure <- match.arg(group_prior_var_structure)
 
   ld_exprvarfs <- sapply(ld_exprfs, prep_exprvar)
 
+  if (!all(chrom == 1:22)){ # Try to have either all 22 or 1 chrom at a time
+    dummy_ld_exprvarfs <- rep(NA, 22)
+    dummy_ld_exprvarfs[chrom] <- ld_exprvarfs
+    ld_exprvarfs <- dummy_ld_exprvarfs
+  }
+
   if (is.null(ld_pgenfs) & is.null(ld_Rfs)){
     stop("Error: need to provide either .pgen file or ld_R file")
   }
@@ -146,7 +153,7 @@ susieI_rss <- function(zdf,
     cl <- parallel::makeCluster(ncore, outfile = "")
     doParallel::registerDoParallel(cl)
 
-    corelist <- region2core(regionlist, ncore)
+    corelist <- region2core(regionlist, ncore, chrom=chrom)
 
     outdf <- foreach (core = 1:length(corelist), .combine = "rbind",
                       .packages = "ctwas") %dopar% {