
Commit 0992fa4

Add normalization

isaacvock committed Jan 26, 2024 · 1 parent e9bfcd5

Showing 2 changed files with 78 additions and 70 deletions.
12 changes: 9 additions & 3 deletions workflow/rules/bam2bakr.smk
@@ -87,17 +87,23 @@ if NORMALIZE:

    rule normalize:
        input:
            expand(
                "results/featurecounts_exons/{sample}.featureCounts", sample=SAMP_NAMES
            ),
        output:
            "results/normalization/scale",
        log:
            "logs/normalize/normalize.log",
        threads: 1
        params:
            rscript=workflow.source_path("../scripts/bam2bakR/normalize.R"),
            spikename=config["spikename"],
        conda:
            "../envs/full.yaml"
        shell:
            r"""
            chmod +x {params.rscript}
            {params.rscript} --dirs ./results/featurecounts_exons/ --output {output} --spikename {params.spikename} 1> {log} 2>&1
            """

else:
136 changes: 69 additions & 67 deletions workflow/scripts/normalize.R
@@ -21,6 +21,9 @@ args = commandArgs(trailingOnly = TRUE)
  make_option(c("-s", "--spikename"),
              type = "character",
              default = "",
              help = 'character string common to the gene_names of all spike-ins'),
  make_option(c("-o", "--output"),
              type = "character",
              default = "./results/normalization/scale",
              help = 'file to which the scale factors will be written'),
  make_option(c("-e", "--echocode"),
              type = "logical",
              default = "FALSE",
              help = 'print R code to stdout'))
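The parse_args() call that consumes these options falls outside the displayed hunk. As a minimal sketch (assuming the option definitions above are collected in a variable named option_list, which is not shown in this diff), the flags would typically be read like this:

# Minimal sketch, not part of this commit: typical optparse usage for the
# options defined above. `option_list` is an assumed variable name.
library(optparse)

opt <- parse_args(OptionParser(option_list = option_list))

# Parsed values are then available as opt$spikename, opt$output, and opt$echocode.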
@@ -36,6 +39,7 @@ args = commandArgs(trailingOnly = TRUE)
library(tidyverse)
library(edgeR)
library(MASS)
library(data.table)


# Set date
@@ -48,101 +52,99 @@ args = commandArgs(trailingOnly = TRUE)
master <- tibble()


# Load and clean master data

# Define samples to load
## Martin's old code
# dirs <- opt$dirs %>% str_split(',') %>% unlist()
# Screw it, going to hardcode directory cause I can
dirs <- paste0(getwd(), "/results/featurecounts_exons/")

# Will not match only the count tables, because the CORE output files also end in .featureCounts
samplenames <- list.files(path = dirs,
                          pattern = "\\.featureCounts$",
                          recursive = FALSE,
                          full.names = TRUE)

# Remove the CORE files
samplenames <- samplenames[!grepl("\\.bam\\.featureCounts$", samplenames)]

# Actual sample name wildcards
snames <- gsub("\\.featureCounts$", "", basename(samplenames))
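To see why the extra grepl() filter is needed: featureCounts' CORE read-assignment output also ends in .featureCounts, but with the BAM file name embedded. A small sketch with invented file names:

# Invented file names illustrating the CORE-file filter above:
files <- c("results/featurecounts_exons/WT_1.featureCounts",        # per-sample count table (keep)
           "results/featurecounts_exons/WT_1.s.bam.featureCounts")  # CORE read assignments (drop)

files[!grepl("\\.bam\\.featureCounts$", files)]
# [1] "results/featurecounts_exons/WT_1.featureCounts"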
# Loop through the samples to load them:
for (i in seq_along(samplenames)) {

  df <- fread(samplenames[i],
              sep = "\t")

  colnames(df) <- c("Geneid", "Chr", "Start", "End",
                    "Strand", "Length", "count")

  df$sample <- snames[i]

  master <- rbind(master, df)

}

rm(df)

# glimpse(master)
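For reference, the column names assigned in the loop mirror a featureCounts count table: Geneid, Chr, Start, End, Strand, Length, and one counts column per BAM file. A sketch of the shape of one sample's table after renaming, using invented values:

# Invented data sketching the shape of `df` for one sample after renaming:
df_example <- data.table::data.table(
  Geneid = c("GeneA", "GeneB", "spikein_1"),
  Chr    = c("chr1", "chr2", "spike"),
  Start  = c(11869L, 24050L, 1L),
  End    = c(14409L, 29570L, 1200L),
  Strand = c("+", "-", "+"),
  Length = c(1735L, 1351L, 1200L),
  count  = c(523L, 87L, 1504L),
  sample = "WT_1"
)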

m <- master %>%
  dplyr::select(-Chr, -Start, -End, -Strand, -Length) %>%
  pivot_wider(names_from = sample, values_from = count)

# Filter only for spikeins (the gene identifier must contain the character string opt$spikename)
if (opt$spikename != "") {

  m <- m %>%
    filter(grepl(opt$spikename, Geneid))

  print(paste0("* Calculating normalization factors from spikeins with name *", opt$spikename))
}

# Turn into a count matrix acceptable for edgeR
rnames <- m[,1]
m <- as.matrix(m[,-1])
storage.mode(m) <- "numeric"
rownames(m) <- unlist(rnames)

# Correlation analysis
# cor(m)

### Normalize by edgeR:

# Add dummy IDs (don't need them yet)
ers <- rep("X", times = dim(m)[2])
ers <- factor(ers)

# Filter to only data-containing genes:
keep <- rowSums(m > 0) > 1
erde <- DGEList(m[keep,], group = ers)
erde <- calcNormFactors(erde, method = "upperquartile")
scale <- erde$samples$norm.factors * erde$samples$lib.size
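calcNormFactors(method = "upperquartile") rescales each library by the 75th percentile of its counts (relative to library size), and multiplying norm.factors by lib.size gives an effective library size per sample; mean(scale)/scale then converts those into relative scale factors around 1, so samples with larger effective libraries get factors below 1. A self-contained toy example with invented counts (not pipeline data):

# Toy example, invented counts: effective library sizes and scale factors.
library(edgeR)

toy <- matrix(c(10, 20, 40,
                15, 30, 60,
                 5, 10, 20,
                20, 40, 80),
              nrow = 4, byrow = TRUE,
              dimnames = list(paste0("gene", 1:4), c("A", "B", "C")))

d <- DGEList(toy, group = factor(rep("X", ncol(toy))))
d <- calcNormFactors(d, method = "upperquartile")

eff_lib <- d$samples$norm.factors * d$samples$lib.size  # effective library sizes
mean(eff_lib) / eff_lib                                  # larger libraries get factors < 1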

if (sum(is.finite(scale)) == length(scale)) {

  x <- mean(scale) / scale

  sdf <- tibble(sample = snames,
                scale = x)

  print(sdf)

  write.table(sdf, file = opt$output,
              quote = FALSE,
              row.names = FALSE,
              col.names = FALSE)

} else {

  warning('!!no acceptable scale values!! Reverting to scale factor of 1')

  x <- rep(1, times = length(snames))

  sdf <- tibble(sample = snames,
                scale = x)

  write.table(sdf, file = opt$output,
              quote = FALSE,
              row.names = FALSE,
              col.names = FALSE)
}
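The scale file written above is a headerless, whitespace-delimited table with one row per sample: the sample name followed by its scale factor. A sketch of reading it back downstream (sample names and values invented):

# Sketch: reading the headerless scale-factor table produced above.
scale_df <- read.table("results/normalization/scale",
                       col.names = c("sample", "scale"))
# e.g. (invented values):
#   sample scale
# 1   WT_1  1.04
# 2   WT_2  0.96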
