New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Performance regressions with dplyr #29
Comments
Thanks for your work on this package! I have a 1.3 GB file that gives me inconsistent results when I try to import it using:

temp_csv <- tempfile(fileext = ".csv")
# Reprex: import the ~1.3 GB NHS prescribing CSV, then compare
# readr::read_csv() vs vroom::vroom() for both import time and
# downstream dplyr aggregation time.
download.file("https://files.digital.nhs.uk/ED/D3D2E5/T201811PDPI%20BNFT.CSV", destfile = temp_csv)
library(readr)
library(dplyr)
library(stringr)
library(magrittr)
library(vroom)

# Column names supplied explicitly; "x99" is the trailing junk column
# produced by the file's trailing delimiter.
col_names <-
  c("sha",
    "pct",
    "practice",
    "bnf_code",
    "bnf_name",
    "items",
    "nic",
    "act_cost",
    "quantity",
    "period",
    "x99"
  )
# Compact col_types spec: c = character, i = integer, d = double.
col_types <- "ccccciddiic"

# use read_csv() ----------------------------------------------------------
system.time({
  d1 <- readr::read_csv(temp_csv, col_names = col_names, col_types = col_types)
})
## ~25s
system.time({
  d1 %>%
    group_by(period, bnf_name) %>%
    # NOTE(review): summarise_if() is superseded in modern dplyr
    # (use summarise(across(where(is.numeric), sum))); kept here because
    # this reprex benchmarks the historical behavior.
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
})
## ~3s
rm(d1); gc()

# use vroom() -------------------------------------------------------------
system.time({
  d2 <- vroom::vroom(temp_csv, delim = ",", col_names = col_names, col_types = col_types) %>%
    select(-x99)
})
## ~3s
system.time({
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
})
## ~56s
rm(d2); gc()
Thank you for the dataset; it has proved useful in benchmarking this! I have made some improvements to reduce copies when materializing vectors, and now using vroom and immediately materializing all the vectors is roughly on par with `data.table::fread()`. However, naively using vroom still has a strange performance regression with dplyr: it should be at worst equivalent to using the explicitly materialized version, but it is actually ~2x as slow. I will continue to hunt down the cause.

pkgload::load_all("~/p/vroom")
# Maintainer benchmark (reprex transcript): fread() as the low baseline,
# vroom with fast explicit materialization, vroom with per-column forced
# materialization, naive vroom, and read_delim() as the high baseline.
# Lines beginning with "#>" are the captured console output.
#> Loading vroom
temp_csv <- "~/p/vroom/andrie.csv"
col_names <-
  c("sha",
    "pct",
    "practice",
    "bnf_code",
    "bnf_name",
    "items",
    "nic",
    "act_cost",
    "quantity",
    "period",
    "x99"
  )
col_types <- "ccccciddiic"
library(dplyr)
#> Warning: package 'dplyr' was built under R version 3.5.2
#>
#> Attaching package: 'dplyr'
#> The following object is masked from 'package:testthat':
#>
#>     matches
#> The following objects are masked from 'package:stats':
#>
#>     filter, lag
#> The following objects are masked from 'package:base':
#>
#>     intersect, setdiff, setequal, union

# use fread() (for low baseline) ---------------------------------------------------------------
print(system.time({
  d2 <- data.table::fread(temp_csv, col.names = col_names) %>%
    select(-x99)
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> user system elapsed
#> 34.123 2.741 8.288
rm(d2); invisible(gc(FALSE))

# use vroom() fast explicit materialize ------------------------------------------------------
print(system.time({
  d2 <- vroom::vroom(temp_csv, delim = ",", col_names = col_names, col_types = col_types, escape_double = FALSE) %>%
    select(-x99)
  # Materialize all lazy ALTREP columns up front (internal helper).
  system.time(vroom:::vroom_materialize(d2))
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> user system elapsed
#> 22.243 1.211 9.643
rm(d2); invisible(gc(FALSE))

# use vroom() explicit materialize ------------------------------------------------------
print(system.time({
  d2 <- vroom::vroom(temp_csv, delim = ",", col_names = col_names, col_types = col_types, escape_double = FALSE) %>%
    select(-x99)
  # Force materialization column by column (internal helper).
  system.time(for (i in seq_along(d2)) vroom:::force_materialization(d2[[i]]))
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> user system elapsed
#> 22.052 0.726 10.587
rm(d2); invisible(gc(FALSE))

# use vroom() naively ------------------------------------------------------
print(system.time({
  d2 <- vroom::vroom(temp_csv, delim = ",", col_names = col_names, col_types = col_types, escape_double = FALSE) %>%
    select(-x99)
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> user system elapsed
#> 30.049 0.735 18.933
rm(d2); invisible(gc(FALSE))

# read_csv() for high baseline ---------------------------------------------
print(system.time({
  d2 <- readr::read_delim(temp_csv, delim = ",", col_names = col_names, col_types = col_types, escape_double = FALSE) %>%
    select(-x99)
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> Warning: 5 parsing failures.
#> row col expected actual file
#> 1 items an integer ITEMS '~/p/vroom/andrie.csv'
#> 1 nic a double NIC '~/p/vroom/andrie.csv'
#> 1 act_cost a double ACT COST '~/p/vroom/andrie.csv'
#> 1 quantity an integer QUANTITY '~/p/vroom/andrie.csv'
#> 1 period an integer PERIOD '~/p/vroom/andrie.csv'
#> user system elapsed
#> 16.484 0.792 17.316
rm(d2); invisible(gc(FALSE))
# Created on 2019-02-28 by the reprex package (v0.2.1)
So after tidyverse/dplyr#4314 and using the current version of vroom, we now recover all the performance regressions you saw and are actually a bit faster in this example than data.table. I will close this for now; if you run into similar issues, please let me know and open a new issue!

pkgload::load_all("~/p/vroom")
# Maintainer benchmark rerun after the dplyr fix (tidyverse/dplyr#4314):
# fread() low baseline, naive vroom, and read_delim() high baseline.
# Lines beginning with "#>" are the captured console output.
#> Loading vroom
temp_csv <- "~/data/td/andrie.csv"
col_names <-
  c("sha",
    "pct",
    "practice",
    "bnf_code",
    "bnf_name",
    "items",
    "nic",
    "act_cost",
    "quantity",
    "period",
    "x99"
  )
col_types <- "ccccciddiic"
library(dplyr)

# use fread() (for low baseline) ---------------------------------------------------------------
print(system.time({
  d2 <- data.table::fread(temp_csv, col.names = col_names) %>%
    select(-x99)
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> user system elapsed
#> 34.532 2.580 8.255
rm(d2); invisible(gc(FALSE))

# use vroom() ----------------------------------------------------------------------------------
print(system.time({
  d2 <- vroom::vroom(temp_csv, delim = ",", col_names = col_names, col_types = col_types, escape_double = FALSE) %>%
    select(-x99)
  # NOTE(review): this run groups by bnf_name only, unlike the other two
  # sections which group by (period, bnf_name) — kept as in the transcript.
  d3 <- d2 %>%
    group_by(bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> user system elapsed
#> 15.454 0.987 6.530
rm(d2); invisible(gc(FALSE))

# read_csv() for high baseline -----------------------------------------------------------------
print(system.time({
  d2 <- readr::read_delim(temp_csv, delim = ",", col_names = col_names, col_types = col_types, escape_double = FALSE) %>%
    select(-x99)
  d2 %>%
    group_by(period, bnf_name) %>%
    summarise_if(is.numeric, sum) %>%
    arrange(desc(act_cost))
}))
#> user system elapsed
#> 16.634 0.764 17.443
rm(d2); invisible(gc(FALSE))
# Created on 2019-04-04 by the reprex package (v0.2.1)
This issue is now tracking the dplyr performance regressions.
The text was updated successfully, but these errors were encountered: