/
make_master_peaks.R
128 lines (100 loc) · 3.81 KB
/
make_master_peaks.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#! /usr/bin/env Rscript
# The Parker Lab
# theparkerlab.org
#
# Original author: Peter Orchard
suppressPackageStartupMessages({
library(optparse)
library(dplyr)
library(tidyr)
library(ggplot2)
library(glue)
library(GenomicRanges)
})
option_list <- list(
make_option(c("--peaks-dir"),
action = "store", dest = "peaks_dir", type = "character",
help = "[Required] Path to the directory of peak files ('*_peaks.broadPeak.noblacklist')"
),
make_option(c("--number-samples-threshold"),
action = "store", dest = "number_samples",
default = 1, type = "numeric", help = "[Optional] Number of samples that must show overlap with a master peak"
),
make_option(c("--fdr"),
action = "store", type = "numeric", default = 0.05,
help = "[Optional] FDR to filter initial peak lists by (default: 0.05)"
),
make_option(c("--out"),
action = "store", type = "character",
help = "[Required] Name of the output file (a bed file of master peaks)"
)
)
option_parser <- OptionParser(
usage = "usage: Rscript %prog [options]",
option_list = option_list, add_help_option = T
)
opts <- parse_args(option_parser)
#
# Helper functions
#
parse_file_name <- function(f) {
regex <- "^(.*)_peaks.broadPeak.noblacklist"
lib <- gsub(regex, "\\1", basename(f))
x <- c("library" = lib)
}
read_peak_file <- function(f) {
print(glue("Reading {basename(f)}.."), stderr())
tmp <- data.table::fread(f, header = F, stringsAsFactors = F, sep = "\t")
if (ncol(tmp) != 9) {
print("Pleae provide 9-column broadPeak file(s) as <FILE>.broadPeak.noblacklist.", stderr())
exit(1)
}
colnames(tmp) <- c("chrom", "start", "end", "peak_name", "V5", "V6", "V7", "V8", "neg_log_10_q")
tmp <- tmp %>%
dplyr::filter(neg_log_10_q >= -log10(opts$fdr)) %>%
dplyr::select(chrom:end, peak_name)
tmp$library <- parse_file_name(f)["library"]
tmp
}
#
# main
#
PEAKS <- list.files(opts$peaks_dir, pattern = "*.broadPeak.noblacklist", full.names = T)
results <- bind_rows(lapply(PEAKS, read_peak_file))
print(
glue("Merging peaks from {length(unique(results$library))} libraries: {libraries}",
libraries = paste(unique(results$library), collapse = ", ")
),
stderr()
)
results.granges <- makeGRangesFromDataFrame(results, keep.extra.columns = T)
master_peaks <- GenomicRanges::reduce(results.granges)
print(glue("There are {n} master peaks", n = length(master_peaks)))
print(glue("Counting overlap with all libraries..", n = length(master_peaks)))
# Determine the overlap between master peaks and the peak calls from each library
overlaps <- findOverlaps(results.granges, master_peaks)
overlaps.master_peak <- master_peaks[subjectHits(overlaps)]
overlaps.library <- results$library[queryHits(overlaps)]
overlap_counts <- cbind(as.data.frame(overlaps.master_peak)[, 1:3],
as.character(overlaps.library))
colnames(overlap_counts) <- c("chrom", "start", "end", "library")
o_master_peaks <- overlap_counts %>%
unique() %>%
group_by(chrom, start, end) %>%
summarize(number_libraries_with_overlap = n()) %>%
unique()
pdf(glue("{tools::file_path_sans_ext(basename(opts$out))}.pdf"), height = 6, width = 6)
ggplot(o_master_peaks) + geom_bar(aes(number_libraries_with_overlap))
dev.off()
# ggplot(o_master_peaks) + geom_bar(aes(number_libraries_with_overlap, cumsum(..count..)))
o_master_peaks <- o_master_peaks %>%
filter(number_libraries_with_overlap >= opts$number_samples_threshold) %>%
select(chrom, start, end)
o_master_peaks$chrom <- as.character(o_master_peaks$chrom)
o_master_peaks <- o_master_peaks[order(o_master_peaks$chrom, o_master_peaks$start), ]
print(glue("There are {n} master peaks", n = nrow(o_master_peaks)))
write.table(o_master_peaks,
file = opts$out, append = F,
quote = F, sep = "\t", row.names = F, col.names = F
)
print(glue("Overlap counts output to {out}", out = opts$out))