From 65cd895479b1401e14b6f18ca418f9bc9af672c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sat, 6 Feb 2016 00:26:15 +0100 Subject: [PATCH 01/17] table data iter gains new member all() --- R/tabledata.r | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/R/tabledata.r b/R/tabledata.r index 7d4f82ab..71d51bd5 100644 --- a/R/tabledata.r +++ b/R/tabledata.r @@ -45,6 +45,8 @@ list_tabledata <- function(project, dataset, table, page_size = 1e4, do.call("rbind", rows) } +DEFAULT_PAGE_SIZE <- 1e4 + #' @description #' \code{list_tabledata_callback} calls the supplied callback with each page #' of data. @@ -52,7 +54,7 @@ list_tabledata <- function(project, dataset, table, page_size = 1e4, #' @export list_tabledata_callback <- function(project, dataset, table, callback, table_info = NULL, - page_size = 1e4, max_pages = 10, + page_size = DEFAULT_PAGE_SIZE, max_pages = 10, warn = TRUE, quiet = getOption("bigquery.quiet")) { assert_that(is.string(project), is.string(dataset), is.string(table)) @@ -131,7 +133,16 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) { !is.null(last_response) && rows_fetched >= as.integer(last_response$totalRows) } - list(next_ = next_, is_complete = is_complete) + all <- function(page_size = DEFAULT_PAGE_SIZE) { + ret <- list() + while (!is_complete()) { + chunk <- next_(page_size) + ret <- c(ret, list(chunk)) + } + do.call(rbind, ret) + } + + list(next_ = next_, all = all, is_complete = is_complete) } #Types can be loaded into R, record is not supported yet. 
From 714597cd82f2e3e7c350b67c68601e8b3c43efe6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sat, 6 Feb 2016 00:33:27 +0100 Subject: [PATCH 02/17] document --- DESCRIPTION | 2 +- man/list_tabledata.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 28d3a184..4c6eb917 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,4 +18,4 @@ Suggests: testthat License: GPL-3 LazyData: true -RoxygenNote: 5.0.1.9000 +RoxygenNote: 5.0.1 diff --git a/man/list_tabledata.Rd b/man/list_tabledata.Rd index 1dd01b29..dc4ffbc2 100644 --- a/man/list_tabledata.Rd +++ b/man/list_tabledata.Rd @@ -11,7 +11,7 @@ list_tabledata(project, dataset, table, page_size = 10000, quiet = getOption("bigquery.quiet")) list_tabledata_callback(project, dataset, table, callback, table_info = NULL, - page_size = 10000, max_pages = 10, warn = TRUE, + page_size = DEFAULT_PAGE_SIZE, max_pages = 10, warn = TRUE, quiet = getOption("bigquery.quiet")) list_tabledata_iter(project, dataset, table, table_info = NULL) From 8e9b7da71d6e9264cc5a23012d1af0d05cfc5388 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sat, 6 Feb 2016 00:34:02 +0100 Subject: [PATCH 03/17] NEWS --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 67ae3a09..22063814 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,6 @@ * New `format_dataset()` and `format_table()`. (#81, @krlmlr) -* New `list_tabledata_iter()` that allows fetching a table in chunks of varying size. (#77, @krlmlr) +* New `list_tabledata_iter()` that allows fetching a table in chunks of varying size. (#77, #87, @krlmlr) * Add support for API keys via the `BIGRQUERY_API_KEY` environment variable. 
(#49) From 8b2f45a4dc0afafa6f4ae17c64af4213c1102a0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sat, 6 Feb 2016 01:01:23 +0100 Subject: [PATCH 04/17] use function instead of constant for default page size --- R/tabledata.r | 7 ++++--- man/list_tabledata.Rd | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/R/tabledata.r b/R/tabledata.r index 71d51bd5..19f21510 100644 --- a/R/tabledata.r +++ b/R/tabledata.r @@ -45,7 +45,7 @@ list_tabledata <- function(project, dataset, table, page_size = 1e4, do.call("rbind", rows) } -DEFAULT_PAGE_SIZE <- 1e4 +default_page_size() <- function() 1e4 #' @description #' \code{list_tabledata_callback} calls the supplied callback with each page @@ -54,7 +54,8 @@ DEFAULT_PAGE_SIZE <- 1e4 #' @export list_tabledata_callback <- function(project, dataset, table, callback, table_info = NULL, - page_size = DEFAULT_PAGE_SIZE, max_pages = 10, + page_size = default_page_size(), + max_pages = 10, warn = TRUE, quiet = getOption("bigquery.quiet")) { assert_that(is.string(project), is.string(dataset), is.string(table)) @@ -133,7 +134,7 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) { !is.null(last_response) && rows_fetched >= as.integer(last_response$totalRows) } - all <- function(page_size = DEFAULT_PAGE_SIZE) { + all <- function(page_size = default_page_size()) { ret <- list() while (!is_complete()) { chunk <- next_(page_size) diff --git a/man/list_tabledata.Rd b/man/list_tabledata.Rd index dc4ffbc2..367e30e5 100644 --- a/man/list_tabledata.Rd +++ b/man/list_tabledata.Rd @@ -11,7 +11,7 @@ list_tabledata(project, dataset, table, page_size = 10000, quiet = getOption("bigquery.quiet")) list_tabledata_callback(project, dataset, table, callback, table_info = NULL, - page_size = DEFAULT_PAGE_SIZE, max_pages = 10, warn = TRUE, + page_size = default_page_size(), max_pages = 10, warn = TRUE, quiet = getOption("bigquery.quiet")) list_tabledata_iter(project, dataset, table, table_info = 
NULL) From 345965ee3de844ace869033d88a047896e4a22e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Mon, 8 Feb 2016 11:38:00 +0100 Subject: [PATCH 05/17] oops --- R/tabledata.r | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/tabledata.r b/R/tabledata.r index 19f21510..52ac5f60 100644 --- a/R/tabledata.r +++ b/R/tabledata.r @@ -45,7 +45,7 @@ list_tabledata <- function(project, dataset, table, page_size = 1e4, do.call("rbind", rows) } -default_page_size() <- function() 1e4 +default_page_size <- function() 1e4 #' @description #' \code{list_tabledata_callback} calls the supplied callback with each page From 15b141c11cd40d9b2b6c6d49f7e166b2b19d5ebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Mon, 8 Feb 2016 11:51:27 +0100 Subject: [PATCH 06/17] new member get_rows_fetched() --- R/tabledata.r | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/tabledata.r b/R/tabledata.r index a71a2282..7089a090 100644 --- a/R/tabledata.r +++ b/R/tabledata.r @@ -134,6 +134,10 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) { !is.null(last_response) && rows_fetched >= as.integer(last_response$totalRows) } + get_rows_fetched <- function() { + rows_fetched + } + all <- function(page_size = default_page_size()) { ret <- list() while (!is_complete()) { @@ -143,7 +147,8 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) { do.call(rbind, ret) } - list(next_ = next_, all = all, is_complete = is_complete) + list(next_ = next_, is_complete = is_complete, + get_rows_fetched = get_rows_fetched, all = all) } #Types can be loaded into R, record is not supported yet. 
From 7b3a799f08cc6ae515d41d0edd41b65b7ae7a637 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Mon, 8 Feb 2016 11:52:23 +0100 Subject: [PATCH 07/17] new member get_schema() --- R/tabledata.r | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/R/tabledata.r b/R/tabledata.r index 7089a090..24d64794 100644 --- a/R/tabledata.r +++ b/R/tabledata.r @@ -134,10 +134,6 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) { !is.null(last_response) && rows_fetched >= as.integer(last_response$totalRows) } - get_rows_fetched <- function() { - rows_fetched - } - all <- function(page_size = default_page_size()) { ret <- list() while (!is_complete()) { @@ -147,8 +143,16 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) { do.call(rbind, ret) } - list(next_ = next_, is_complete = is_complete, - get_rows_fetched = get_rows_fetched, all = all) + get_schema <- function() { + schema + } + + get_rows_fetched <- function() { + rows_fetched + } + + list(next_ = next_, is_complete = is_complete, all = all, + get_schema = get_schema, get_rows_fetched = get_rows_fetched) } #Types can be loaded into R, record is not supported yet. From 11683dd14ed3cbebb7943e6075de4b8969f4e452 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 11 Feb 2016 09:46:13 +0100 Subject: [PATCH 08/17] use option instead of default_page_size() function --- R/tabledata.r | 9 +++++---- R/zzz.r | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/R/tabledata.r b/R/tabledata.r index 24d64794..5e2d40b6 100644 --- a/R/tabledata.r +++ b/R/tabledata.r @@ -45,8 +45,6 @@ list_tabledata <- function(project, dataset, table, page_size = 1e4, do.call("rbind", rows) } -default_page_size <- function() 1e4 - #' @description #' \code{list_tabledata_callback} calls the supplied callback with each page #' of data. 
@@ -54,7 +52,7 @@ default_page_size <- function() 1e4 #' @export list_tabledata_callback <- function(project, dataset, table, callback, table_info = NULL, - page_size = default_page_size(), + page_size = getOption("bigrquery.page.size"), max_pages = 10, warn = TRUE, quiet = getOption("bigrquery.quiet")) { @@ -134,10 +132,13 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) { !is.null(last_response) && rows_fetched >= as.integer(last_response$totalRows) } - all <- function(page_size = default_page_size()) { + all <- function(page_size = getOption("bigrquery.page.size")) { ret <- list() while (!is_complete()) { chunk <- next_(page_size) + + # This has O(n^2) aggregated run time, but fetching large data from + # BigQuery will be slow for other reasons ret <- c(ret, list(chunk)) } do.call(rbind, ret) diff --git a/R/zzz.r b/R/zzz.r index 3dbb12d7..4dad3e2b 100644 --- a/R/zzz.r +++ b/R/zzz.r @@ -1,7 +1,8 @@ .onLoad <- function(libname, pkgname) { op <- options() defaults <- list( - bigrquery.quiet = NA + bigrquery.quiet = NA, + bigrquery.page.size = 1e4 ) toset <- !(names(defaults) %in% names(op)) if (any(toset)) options(defaults[toset]) From ec8fb8bf8be8af3df297f7926882b655c8b0a8b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 11 Feb 2016 10:09:55 +0100 Subject: [PATCH 09/17] provide next_paged() instead of all() to avoid fetching too much data at once when using DBI --- R/tabledata.r | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/R/tabledata.r b/R/tabledata.r index 9375ca99..3d04911e 100644 --- a/R/tabledata.r +++ b/R/tabledata.r @@ -132,10 +132,13 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) { !is.null(last_response) && rows_fetched >= as.integer(last_response$totalRows) } - all <- function(page_size = getOption("bigrquery.page.size")) { + next_paged <- function(n, page_size = getOption("bigrquery.page.size")) { + target_rows_fetched <- 
rows_fetched + n + ret <- list() - while (!is_complete()) { - chunk <- next_(page_size) + while (!is_complete() && rows_fetched < target_rows_fetched) { + next_n <- min(page_size, target_rows_fetched - rows_fetched) + chunk <- next_(next_n) # This has O(n^2) aggregated run time, but fetching large data from # BigQuery will be slow for other reasons @@ -152,7 +155,7 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) { rows_fetched } - list(next_ = next_, is_complete = is_complete, all = all, + list(next_ = next_, next_paged = next_paged, is_complete = is_complete, get_schema = get_schema, get_rows_fetched = get_rows_fetched) } From ee915e148c6da4b2ce51a6ea52af39ad64bac9c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 11 Feb 2016 10:32:31 +0100 Subject: [PATCH 10/17] extract_data() now always returns data frame --- R/tabledata.r | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/R/tabledata.r b/R/tabledata.r index 3d04911e..6b25a9e4 100644 --- a/R/tabledata.r +++ b/R/tabledata.r @@ -118,9 +118,7 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) { response <- bq_get(url, query = query) data <- extract_data(response$rows, schema) - if (!is.null(data)) { - rows_fetched <<- rows_fetched + nrow(data) - } + rows_fetched <<- rows_fetched + nrow(data) # Record only page token and total number of rows to reduce memory consumption last_response <<- response[c("pageToken", "totalRows")] From 3f7c6c0234f01c86f1e362a189826205fd1d8b03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 11 Feb 2016 10:32:49 +0100 Subject: [PATCH 11/17] use repeat loop to support corner case n = 0 --- R/tabledata.r | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/tabledata.r b/R/tabledata.r index 6b25a9e4..8bb30256 100644 --- a/R/tabledata.r +++ b/R/tabledata.r @@ -134,13 +134,17 @@ list_tabledata_iter <- function(project, dataset, table, table_info = 
NULL) { target_rows_fetched <- rows_fetched + n ret <- list() - while (!is_complete() && rows_fetched < target_rows_fetched) { + repeat { next_n <- min(page_size, target_rows_fetched - rows_fetched) chunk <- next_(next_n) # This has O(n^2) aggregated run time, but fetching large data from # BigQuery will be slow for other reasons ret <- c(ret, list(chunk)) + + if (is_complete() || rows_fetched >= target_rows_fetched) { + break + } } do.call(rbind, ret) } From 3db0dfc4499c2dc0f5b92419dcc1bdc3a1a5e70e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 11 Feb 2016 10:35:36 +0100 Subject: [PATCH 12/17] document --- man/list_tabledata.Rd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/list_tabledata.Rd b/man/list_tabledata.Rd index 0d9d216d..e804b752 100644 --- a/man/list_tabledata.Rd +++ b/man/list_tabledata.Rd @@ -11,8 +11,8 @@ list_tabledata(project, dataset, table, page_size = 10000, quiet = getOption("bigrquery.quiet")) list_tabledata_callback(project, dataset, table, callback, table_info = NULL, - page_size = default_page_size(), max_pages = 10, warn = TRUE, - quiet = getOption("bigrquery.quiet")) + page_size = getOption("bigrquery.page.size"), max_pages = 10, + warn = TRUE, quiet = getOption("bigrquery.quiet")) list_tabledata_iter(project, dataset, table, table_info = NULL) } From 9f2322de358763c6a4ff49cab15ab326a1fae77d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 11 Feb 2016 10:35:48 +0100 Subject: [PATCH 13/17] document package options --- R/bigrquery.r | 11 +++++++++++ man/bigrquery.Rd | 12 ++++++++++++ 2 files changed, 23 insertions(+) diff --git a/R/bigrquery.r b/R/bigrquery.r index 0b77f823..7330ef0c 100644 --- a/R/bigrquery.r +++ b/R/bigrquery.r @@ -4,6 +4,17 @@ #' please see the project development page: \url{github.com/rstats-db/bigrquery}. #' The most important method to get started with is \code{\link{query_exec}}. 
#' +#' @section Package options: +#' \describe{ +#' \item{\code{bigrquery.quiet}}{Verbose output during processing? The default +#' value, \code{NA}, turns on verbose output for queries that run longer than +#' two seconds. Use \code{TRUE} for immediate verbose output, \code{FALSE} +#' for quiet operation.} +#' +#' \item{\code{bigrquery.page.size}}{Default page size for fetching data, +#' defaults to 1e4.} +#' } +#' #' @name bigrquery #' @aliases bigquery #' @docType package diff --git a/man/bigrquery.Rd b/man/bigrquery.Rd index cb200fd4..c2127a2f 100644 --- a/man/bigrquery.Rd +++ b/man/bigrquery.Rd @@ -11,4 +11,16 @@ For more information about how bigrquery works, and how to get started, please see the project development page: \url{github.com/rstats-db/bigrquery}. The most important method to get started with is \code{\link{query_exec}}. } +\section{Package options}{ + +\describe{ + \item{\code{bigrquery.quiet}}{Verbose output during processing? The default + value, \code{NA}, turns on verbose output for queries that run longer than + two seconds. Use \code{TRUE} for immediate verbose output, \code{FALSE} + for quiet operation.} + + \item{\code{bigrquery.page.size}}{Default page size for fetching data, + defaults to 1e4.} +} +} From 7917abac1a924cf6829b42f2e3dfed2685bdc8dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 11 Feb 2016 12:18:44 +0100 Subject: [PATCH 14/17] add old NEWS --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 067f9566..fe2be476 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # Version 0.1.0.9000 +* Computation of the SQL data type that corresponds to a given R object is now more robust against unknown classes. (#95, @krlmlr) + * A data frame with full schema information is returned for zero-row results. (#88, @krlmlr) * New `exists_table()`. 
(#91, @krlmlr) From 8fad20690f75b0a85a8d9de932e38d3b6a0cee5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 11 Feb 2016 12:18:57 +0100 Subject: [PATCH 15/17] document all members in-place --- R/tabledata.r | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/tabledata.r b/R/tabledata.r index 8bb30256..1f198c01 100644 --- a/R/tabledata.r +++ b/R/tabledata.r @@ -94,10 +94,6 @@ list_tabledata_callback <- function(project, dataset, table, callback, invisible(TRUE) } -#' @description -#' \code{list_tabledata_iter} returns a named list with components \code{next_} -#' (a function that fetches rows) and \code{is_complete} (a function that checks -#' if all rows have been fetched). #' @rdname list_tabledata #' @export list_tabledata_iter <- function(project, dataset, table, table_info = NULL) { @@ -157,6 +153,12 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) { rows_fetched } + #' @description + #' \code{list_tabledata_iter} returns a named list with functions \code{next_} + #' (fetches one chunk of rows), \code{next_paged} (fetches arbitrarily many + #' rows using a specified page size), \code{is_complete} (checks if all rows + #' have been fetched), \code{get_schema} (returns the schema of the table), + #' and \code{get_rows_fetched} (returns the number of rows already fetched). 
list(next_ = next_, next_paged = next_paged, is_complete = is_complete, get_schema = get_schema, get_rows_fetched = get_rows_fetched) } From 482f670dbc674af21b20c61a8e0a8d26ec61e944 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 11 Feb 2016 12:20:08 +0100 Subject: [PATCH 16/17] use numeric for counting number of rows; integers have 32 bits only, which might not be enough --- R/tabledata.r | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/tabledata.r b/R/tabledata.r index 1f198c01..843e66b3 100644 --- a/R/tabledata.r +++ b/R/tabledata.r @@ -105,7 +105,7 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) { table) last_response <- NULL - rows_fetched <- 0L + rows_fetched <- 0 next_ <- function(n) { query <- list(maxResults = n) @@ -123,7 +123,7 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) { } is_complete <- function() { - !is.null(last_response) && rows_fetched >= as.integer(last_response$totalRows) + !is.null(last_response) && rows_fetched >= as.numeric(last_response$totalRows) } next_paged <- function(n, page_size = getOption("bigrquery.page.size")) { From 45ee37cf8110f41000ace638ed875b5e0cbc9c6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 11 Feb 2016 16:23:25 +0100 Subject: [PATCH 17/17] document --- man/list_tabledata.Rd | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/man/list_tabledata.Rd b/man/list_tabledata.Rd index e804b752..661cf3a1 100644 --- a/man/list_tabledata.Rd +++ b/man/list_tabledata.Rd @@ -45,9 +45,11 @@ current page of data} \code{list_tabledata_callback} calls the supplied callback with each page of data. -\code{list_tabledata_iter} returns a named list with components \code{next_} -(a function that fetches rows) and \code{is_complete} (a function that checks -if all rows have been fetched).
+\code{list_tabledata_iter} returns a named list with functions \code{next_} +(fetches one chunk of rows), \code{next_paged} (fetches arbitrarily many +rows using a specified page size), \code{is_complete} (checks if all rows +have been fetched), \code{get_schema} (returns the schema of the table), +and \code{get_rows_fetched} (returns the number of rows already fetched). } \examples{ \dontrun{