Permalink
Fetching contributors…
Cannot retrieve contributors at this time
193 lines (167 sloc) 4.64 KB
---
title: "valr Benchmarks"
date: '`r format(Sys.Date(), "%B %d %Y")`'
author: "Jay Hesselberth"
output:
rmarkdown::html_vignette:
toc: true
toc_depth: 2
vignette: >
%\VignetteIndexEntry{valr-benchmarks}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r setup, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.height = 4,
fig.align = "center",
fig.width = 8
)
```
## valr Functions
Many valr functions are written in Rcpp/C++ to maximize speed. Here are benchmarks for 1 million random `x` and `y` intervals.
```{r valr_benchmark, echo = FALSE, message = FALSE, warning = FALSE}
library(valr)
library(dplyr)
library(ggplot2)
library(tibble)
library(scales)
library(GenomicRanges)
library(microbenchmark)
genome <- read_genome(valr_example('hg19.chrom.sizes.gz'))
# number of intervals
n <- 1e6
# number of timing reps
nrep <- 10
seed_x <- 1010486
x <- bed_random(genome, n = n, seed = seed_x)
seed_y <- 9283019
y <- bed_random(genome, n = n, seed = seed_y)
res <- microbenchmark(
# randomizing functions
bed_random(genome, n = n, seed = seed_x),
bed_shuffle(x, genome, seed = seed_x),
# # single tbl functions
bed_slop(x, genome, both = 1000),
bed_flank(x, genome, both = 1000),
bed_shift(x, genome),
bed_merge(x),
bed_partition(x),
bed_cluster(x),
bed_complement(x, genome),
# multi tbl functions
bed_closest(x, y),
bed_intersect(x, y),
bed_map(x, y, .n = n()),
bed_subtract(x, y),
bed_window(x, y, genome),
# stats
bed_absdist(x, y, genome),
bed_reldist(x, y),
bed_jaccard(x, y),
bed_fisher(x, y, genome),
bed_projection(x, y, genome),
# utilities
bed_makewindows(x, win_size = 100),
times = nrep,
unit = 's')
# covert nanoseconds to seconds
res <- res %>%
as_tibble() %>%
mutate(time = time / 1e9) %>%
arrange(time)
# futz with the x-axis
maxs <- res %>%
group_by(expr) %>%
summarize(max.time = max(boxplot.stats(time)$stats))
# filter out outliers
res <- res %>%
left_join(maxs) %>%
filter(time <= max.time * 1.05)
ggplot(res, aes(x=reorder(expr, time), y=time)) +
geom_boxplot(fill = 'red', outlier.shape = NA, alpha = 0.5) +
coord_flip() +
theme_bw() +
labs(
y='execution time (seconds)',
x='',
title="valr benchmarks",
subtitle=paste(comma(n), "random x/y intervals,", comma(nrep), "repetitions"))
```
## Comparison to GenomicRanges
Several functions in valr are comparable in speed to the equivalent function in GenomicRanges.
```{r valr_comparison, echo = FALSE, message = FALSE, warning = FALSE}
# set up test for valr vs GenomicRanges
seed_x <- 1010486
x <- bed_random(genome, n = n, seed = seed_x)
seed_y <- 9283019
y <- bed_random(genome, n = n, seed = seed_y)
seed_z <- 1234567
z <- bed_random(genome, n = n, seed = seed_z, sort_by = NULL)
# convert valr tibbles into GR objects
x_gr <- GRanges(
seqnames = Rle(x$chrom),
ranges = IRanges(x$start + 1, end = x$end))
y_gr <- GRanges(
seqnames = Rle(y$chrom),
ranges = IRanges(y$start + 1, end = y$end))
z_gr <- GRanges(
seqnames = Rle(z$chrom),
ranges = IRanges(z$start + 1, end = z$end))
res2 <- microbenchmark(
# valr
bed_flank(x, genome, both = 1000),
bed_shift(x, genome),
bed_merge(x),
bed_complement(x, genome),
bed_closest(x, y),
bed_intersect(x, y),
bed_subtract(x, y),
bed_makewindows(x, win_size = 100),
bed_sort(z),
# GRanges
flank(x_gr, 1000),
shift(x_gr,0),
reduce(x_gr),
gaps(x_gr),
nearest(x_gr),
intersect(x_gr, y_gr),
setdiff(x_gr, y_gr),
tile(x_gr, width = 100),
sort(z_gr),
times = nrep,
unit = 's')
# label experiment pair and software
comp <- as.tibble(summary(res2)) %>%
mutate(pair = as.numeric(row_number()) %% (nrow(res2) / nrep / 2)) %>%
mutate(software = rep(c("valr", "GR"), each = (nrow(res2) / nrep / 2))) %>%
select(expr, pair, software)
# covert nanoseconds to seconds
res2 <- res2 %>%
as_tibble() %>%
left_join(comp) %>%
mutate(time = time / 1e9) %>%
arrange(time)
# futz with the x-axis
maxs2 <- res2 %>%
group_by(expr) %>%
summarize(max.time = max(boxplot.stats(time)$stats))
# filter out outliers
res2 <- res2 %>%
left_join(maxs2) %>%
filter(time <= max.time * 1.05)
#label
ggplot(res2, aes(x=reorder(expr, pair), y=time)) +
geom_boxplot(aes(fill = software), outlier.shape = NA, alpha = 0.5) +
coord_flip() +
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
geom_vline(xintercept=seq(1,nrow(res2),2)-.5, color = 'grey', alpha = 0.23) +
labs(
y='execution time (seconds)',
x='',
title="valr vs GenomicRanges benchmarks",
subtitle=paste(comma(n), "random x/y intervals,", comma(nrep), "repetitions"))
```