New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Performance regressions with dplyr #29
Comments
Thanks for your work on this package! I have a 1.3 GB file that gives me inconsistent results when I try to import it using:

temp_csv <- tempfile(fileext = ".csv")
# Reprex: import the ~1.3 GB NHS prescribing CSV, then compare
# readr::read_csv() vs vroom::vroom() for both import time and
# downstream dplyr aggregation time.
download.file("https://files.digital.nhs.uk/ED/D3D2E5/T201811PDPI%20BNFT.CSV", destfile = temp_csv)
library(readr)
library(dplyr)
library(stringr)
library(magrittr)
library(vroom)

# Column names supplied explicitly; "x99" is the trailing junk column
# produced by the file's trailing delimiter.
col_names <-
  c("sha",
    "pct",
    "practice",
    "bnf_code",
    "bnf_name",
    "items",
    "nic",
    "act_cost",
    "quantity",
    "period",
    "x99"
  )
# Compact col_types spec: c = character, i = integer, d = double.
col_types <- "ccccciddiic"

# use read_csv() ----------------------------------------------------------
system.time({
  d1 <- readr::read_csv(temp_csv, col_names = col_names, col_types = col_types)
})
## ~25s
system.time({
  d1 %>%
    group_by(period, bnf_name) %>%
    # NOTE(review): summarise_if() is superseded in modern dplyr
    # (use summarise(across(where(is.numeric), sum))); kept here because
    # this reprex benchmarks the historical behavior.
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
})
## ~3s
rm(d1); gc()

# use vroom() -------------------------------------------------------------
system.time({
  d2 <- vroom::vroom(temp_csv, delim = ",", col_names = col_names, col_types = col_types) %>%
    select(-x99)
})
## ~3s
system.time({
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
})
## ~56s
rm(d2); gc()
Thank you for the dataset; it has proved useful in benchmarking this! I have made some improvements to reduce copies when materializing vectors, and now using vroom and immediately materializing all the vectors is roughly on par with `data.table::fread()`. However, naively using vroom still has a strange performance regression with dplyr: it should be at worst equivalent to using the explicitly materialized version, but it is actually ~2x as slow. I will continue to hunt down the cause.

pkgload::load_all("~/p/vroom")
# Maintainer benchmark (reprex transcript): fread() as the low baseline,
# vroom with fast explicit materialization, vroom with per-column forced
# materialization, naive vroom, and read_delim() as the high baseline.
# Lines beginning with "#>" are the captured console output.
#> Loading vroom
temp_csv <- "~/p/vroom/andrie.csv"
col_names <-
  c("sha",
    "pct",
    "practice",
    "bnf_code",
    "bnf_name",
    "items",
    "nic",
    "act_cost",
    "quantity",
    "period",
    "x99"
  )
col_types <- "ccccciddiic"
library(dplyr)
#> Warning: package 'dplyr' was built under R version 3.5.2
#>
#> Attaching package: 'dplyr'
#> The following object is masked from 'package:testthat':
#>
#>     matches
#> The following objects are masked from 'package:stats':
#>
#>     filter, lag
#> The following objects are masked from 'package:base':
#>
#>     intersect, setdiff, setequal, union

# use fread() (for low baseline) ---------------------------------------------------------------
print(system.time({
  d2 <- data.table::fread(temp_csv, col.names = col_names) %>%
    select(-x99)
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> user system elapsed
#> 34.123 2.741 8.288
rm(d2); invisible(gc(FALSE))

# use vroom() fast explicit materialize ------------------------------------------------------
print(system.time({
  d2 <- vroom::vroom(temp_csv, delim = ",", col_names = col_names, col_types = col_types, escape_double = FALSE) %>%
    select(-x99)
  # Materialize all lazy ALTREP columns up front (internal helper).
  system.time(vroom:::vroom_materialize(d2))
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> user system elapsed
#> 22.243 1.211 9.643
rm(d2); invisible(gc(FALSE))

# use vroom() explicit materialize ------------------------------------------------------
print(system.time({
  d2 <- vroom::vroom(temp_csv, delim = ",", col_names = col_names, col_types = col_types, escape_double = FALSE) %>%
    select(-x99)
  # Force materialization column by column (internal helper).
  system.time(for (i in seq_along(d2)) vroom:::force_materialization(d2[[i]]))
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> user system elapsed
#> 22.052 0.726 10.587
rm(d2); invisible(gc(FALSE))

# use vroom() naively ------------------------------------------------------
print(system.time({
  d2 <- vroom::vroom(temp_csv, delim = ",", col_names = col_names, col_types = col_types, escape_double = FALSE) %>%
    select(-x99)
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> user system elapsed
#> 30.049 0.735 18.933
rm(d2); invisible(gc(FALSE))

# read_csv() for high baseline ---------------------------------------------
print(system.time({
  d2 <- readr::read_delim(temp_csv, delim = ",", col_names = col_names, col_types = col_types, escape_double = FALSE) %>%
    select(-x99)
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> Warning: 5 parsing failures.
#> row col expected actual file
#> 1 items an integer ITEMS '~/p/vroom/andrie.csv'
#> 1 nic a double NIC '~/p/vroom/andrie.csv'
#> 1 act_cost a double ACT COST '~/p/vroom/andrie.csv'
#> 1 quantity an integer QUANTITY '~/p/vroom/andrie.csv'
#> 1 period an integer PERIOD '~/p/vroom/andrie.csv'
#> user system elapsed
#> 16.484 0.792 17.316
rm(d2); invisible(gc(FALSE))
# Created on 2019-02-28 by the reprex package (v0.2.1)
So after tidyverse/dplyr#4314 and using the current version of vroom, we now recover all the performance regressions you saw and are actually a bit faster in this example than data.table. I will close this for now; if you run into similar issues, please let me know and open a new issue!

pkgload::load_all("~/p/vroom")
# Maintainer benchmark rerun after the dplyr fix (tidyverse/dplyr#4314):
# fread() low baseline, naive vroom, and read_delim() high baseline.
# Lines beginning with "#>" are the captured console output.
#> Loading vroom
temp_csv <- "~/data/td/andrie.csv"
col_names <-
  c("sha",
    "pct",
    "practice",
    "bnf_code",
    "bnf_name",
    "items",
    "nic",
    "act_cost",
    "quantity",
    "period",
    "x99"
  )
col_types <- "ccccciddiic"
library(dplyr)

# use fread() (for low baseline) ---------------------------------------------------------------
print(system.time({
  d2 <- data.table::fread(temp_csv, col.names = col_names) %>%
    select(-x99)
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> user system elapsed
#> 34.532 2.580 8.255
rm(d2); invisible(gc(FALSE))

# use vroom() ----------------------------------------------------------------------------------
print(system.time({
  d2 <- vroom::vroom(temp_csv, delim = ",", col_names = col_names, col_types = col_types, escape_double = FALSE) %>%
    select(-x99)
  # NOTE(review): this run groups by bnf_name only, unlike the other two
  # sections which group by (period, bnf_name) — kept as in the transcript.
  d3 <- d2 %>%
    group_by(bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> user system elapsed
#> 15.454 0.987 6.530
rm(d2); invisible(gc(FALSE))

# read_csv() for high baseline -----------------------------------------------------------------
print(system.time({
  d2 <- readr::read_delim(temp_csv, delim = ",", col_names = col_names, col_types = col_types, escape_double = FALSE) %>%
    select(-x99)
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> user system elapsed
#> 16.634 0.764 17.443
rm(d2); invisible(gc(FALSE))
# Created on 2019-04-04 by the reprex package (v0.2.1)
This issue is now tracking the dplyr performance regressions.
The text was updated successfully, but these errors were encountered: