
Commit 0992fa4

Add normalization

isaacvock committed Jan 26, 2024 · 1 parent e9bfcd5

Showing 2 changed files with 78 additions and 70 deletions.
12 changes: 9 additions & 3 deletions workflow/rules/bam2bakr.smk
@@ -87,17 +87,23 @@ if NORMALIZE:

    rule normalize:
        input:
            expand(
                "results/featurecounts_exons/{sample}.featureCounts", sample=SAMP_NAMES
            ),
        output:
            "results/normalization/scale",
        log:
            "logs/normalize/normalize.log",
        threads: 1
        params:
            rscript=workflow.source_path("../scripts/bam2bakR/normalize.R"),
            spikename=config["spikename"],
        conda:
            "../envs/full.yaml"
        shell:
            r"""
            chmod +x {params.rscript}
            {params.rscript} --dirs ./results/featurecounts_exons/ --output {output} --spikename {params.spikename} 1> {log} 2>&1
            """

else:
136 changes: 69 additions & 67 deletions workflow/scripts/normalize.R
@@ -21,6 +21,9 @@ args = commandArgs(trailingOnly = TRUE)
  make_option(c("-s", "--spikename"),
              type = "character",
              default = "",
              help = 'character string common to the gene_names of all spike-ins'),
  make_option(c("-o", "--output"),
              type = "character",
              default = "./results/normalization/scale",
              help = 'file to which the scale factors will be written'),
  make_option(c("-e", "--echocode"),
              type = "logical",
              default = "FALSE",
              help = 'print R code to stdout'))
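The parse_args() call that consumes these options falls outside the displayed hunk. As a minimal sketch (assuming the option definitions above are collected in a variable named option_list, which is not shown in this diff), the flags would typically be read like this:

# Minimal sketch, not part of this commit: typical optparse usage for the
# options defined above. `option_list` is an assumed variable name.
library(optparse)

opt <- parse_args(OptionParser(option_list = option_list))

# Parsed values are then available as opt$spikename, opt$output, and opt$echocode.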
@@ -36,6 +39,7 @@ args = commandArgs(trailingOnly = TRUE)
library(tidyverse)
library(edgeR)
library(MASS)
library(data.table)


# Set date
@@ -48,101 +52,99 @@ args = commandArgs(trailingOnly = TRUE)
master <- tibble()


# Load and clean master data

# Define samples to load
## Martin's old code
# dirs <- opt$dirs %>% str_split(',') %>% unlist()
# Screw it, going to hardcode directory cause I can
dirs <- paste0(getwd(), "/results/featurecounts_exons/")

# Will not match only the count tables, because the CORE output files also end in .featureCounts
samplenames <- list.files(path = dirs,
                          pattern = "\\.featureCounts$",
                          recursive = FALSE,
                          full.names = TRUE)

# Remove the CORE files
samplenames <- samplenames[!grepl("\\.bam\\.featureCounts$", samplenames)]

# Actual sample name wildcards
snames <- gsub("\\.featureCounts$", "", basename(samplenames))
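To see why the extra grepl() filter is needed: featureCounts' CORE read-assignment output also ends in .featureCounts, but with the BAM file name embedded. A small sketch with invented file names:

# Invented file names illustrating the CORE-file filter above:
files <- c("results/featurecounts_exons/WT_1.featureCounts",        # per-sample count table (keep)
           "results/featurecounts_exons/WT_1.s.bam.featureCounts")  # CORE read assignments (drop)

files[!grepl("\\.bam\\.featureCounts$", files)]
# [1] "results/featurecounts_exons/WT_1.featureCounts"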
# Loop through the samples to load them:
for (i in seq_along(samplenames)) {

  df <- fread(samplenames[i],
              sep = "\t")

  colnames(df) <- c("Geneid", "Chr", "Start", "End",
                    "Strand", "Length", "count")

  df$sample <- snames[i]

  master <- rbind(master, df)

}

rm(df)

# glimpse(master)
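For reference, the column names assigned in the loop mirror a featureCounts count table: Geneid, Chr, Start, End, Strand, Length, and one counts column per BAM file. A sketch of the shape of one sample's table after renaming, using invented values:

# Invented data sketching the shape of `df` for one sample after renaming:
df_example <- data.table::data.table(
  Geneid = c("GeneA", "GeneB", "spikein_1"),
  Chr    = c("chr1", "chr2", "spike"),
  Start  = c(11869L, 24050L, 1L),
  End    = c(14409L, 29570L, 1200L),
  Strand = c("+", "-", "+"),
  Length = c(1735L, 1351L, 1200L),
  count  = c(523L, 87L, 1504L),
  sample = "WT_1"
)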

m <- master %>%
  dplyr::select(-Chr, -Start, -End, -Strand, -Length) %>%
  pivot_wider(names_from = sample, values_from = count)

# Filter only for spikeins (the gene identifier must contain the character string opt$spikename)
if (opt$spikename != "") {

  m <- m %>%
    filter(grepl(opt$spikename, Geneid))

  print(paste0("* Calculating normalization factors from spikeins with name *", opt$spikename))
}

# Turn into a count matrix acceptable for edgeR
rnames <- m[,1]
m <- as.matrix(m[,-1])
storage.mode(m) <- "numeric"
rownames(m) <- unlist(rnames)

# Correlation analysis
# cor(m)

### Normalize by edgeR:

# Add dummy IDs (don't need them yet)
ers <- rep("X", times = dim(m)[2])
ers <- factor(ers)

# Filter to only data-containing genes:
keep <- rowSums(m > 0) > 1
erde <- DGEList(m[keep,], group = ers)
erde <- calcNormFactors(erde, method = "upperquartile")
scale <- erde$samples$norm.factors * erde$samples$lib.size
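calcNormFactors(method = "upperquartile") rescales each library by the 75th percentile of its counts (relative to library size), and multiplying norm.factors by lib.size gives an effective library size per sample; mean(scale)/scale then converts those into relative scale factors around 1, so samples with larger effective libraries get factors below 1. A self-contained toy example with invented counts (not pipeline data):

# Toy example, invented counts: effective library sizes and scale factors.
library(edgeR)

toy <- matrix(c(10, 20, 40,
                15, 30, 60,
                 5, 10, 20,
                20, 40, 80),
              nrow = 4, byrow = TRUE,
              dimnames = list(paste0("gene", 1:4), c("A", "B", "C")))

d <- DGEList(toy, group = factor(rep("X", ncol(toy))))
d <- calcNormFactors(d, method = "upperquartile")

eff_lib <- d$samples$norm.factors * d$samples$lib.size  # effective library sizes
mean(eff_lib) / eff_lib                                  # larger libraries get factors < 1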

if (sum(is.finite(scale)) == length(scale)) {

  x <- mean(scale) / scale

  sdf <- tibble(sample = snames,
                scale = x)

  print(sdf)

  write.table(sdf, file = opt$output,
              quote = FALSE,
              row.names = FALSE,
              col.names = FALSE)

} else {

  warning('!!no acceptable scale values!! Reverting to scale factor of 1')

  x <- rep(1, times = length(snames))

  sdf <- tibble(sample = snames,
                scale = x)

  write.table(sdf, file = opt$output,
              quote = FALSE,
              row.names = FALSE,
              col.names = FALSE)
}
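The scale file written above is a headerless, whitespace-delimited table with one row per sample: the sample name followed by its scale factor. A sketch of reading it back downstream (sample names and values invented):

# Sketch: reading the headerless scale-factor table produced above.
scale_df <- read.table("results/normalization/scale",
                       col.names = c("sample", "scale"))
# e.g. (invented values):
#   sample scale
# 1   WT_1  1.04
# 2   WT_2  0.96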
