
Commit 2866909
Merge pull request #87 from krlmlr/feature/iter-all
- table data iterator gains new members `next_paged()`, `get_schema()` and `get_rows_fetched()` (#87, @krlmlr).
krlmlr committed Feb 24, 2016
2 parents 94e5c3b + 45ee37c commit 2866909
Showing 7 changed files with 76 additions and 19 deletions.
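
Taken together, the diff below gives the table data iterator five members. A minimal usage sketch (not part of the commit; the project, dataset and table names are placeholders for a table you can read):

```r
library(bigrquery)

# Placeholders: substitute a project, dataset and table you have access to.
iter <- list_tabledata_iter("my-project", "my_dataset", "my_table")

iter$get_schema()                # schema of the table, known before any fetch
chunk <- iter$next_(100)         # one chunk of up to 100 rows
more  <- iter$next_paged(25000)  # 25000 rows, fetched in pages behind the scenes

iter$get_rows_fetched()          # rows retrieved so far, across both calls
iter$is_complete()               # TRUE once the whole table has been read
```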
DESCRIPTION (2 changes: 1 addition & 1 deletion)
@@ -18,4 +18,4 @@ Suggests:
     testthat
 License: GPL-3
 LazyData: true
-RoxygenNote: 5.0.1.9000
+RoxygenNote: 5.0.1
NEWS.md (4 changes: 3 additions & 1 deletion)
@@ -1,5 +1,7 @@
 # Version 0.1.0.9000

+* Computation of the SQL data type that corresponds to a given R object is now more robust against unknown classes. (#95, @krlmlr)
+
 * A data frame with full schema information is returned for zero-row results. (#88, @krlmlr)

 * New `exists_table()`. (#91, @krlmlr)
@@ -10,6 +12,6 @@

 * New `format_dataset()` and `format_table()`. (#81, @krlmlr)

-* New `list_tabledata_iter()` that allows fetching a table in chunks of varying size. (#77, @krlmlr)
+* New `list_tabledata_iter()` that allows fetching a table in chunks of varying size. (#77, #87, @krlmlr)

 * Add support for API keys via the `BIGRQUERY_API_KEY` environment variable. (#49)
R/bigrquery.r (11 changes: 11 additions & 0 deletions)
@@ -4,6 +4,17 @@
 #' please see the project development page: \url{github.com/rstats-db/bigrquery}.
 #' The most important method to get started with is \code{\link{query_exec}}.
 #'
+#' @section Package options:
+#' \describe{
+#' \item{\code{bigrquery.quiet}}{Verbose output during processing? The default
+#'   value, \code{NA}, turns on verbose output for queries that run longer than
+#'   two seconds. Use \code{TRUE} for immediate verbose output, \code{FALSE}
+#'   for quiet operation.}
+#'
+#' \item{\code{bigrquery.page.size}}{Default page size for fetching data,
+#'   defaults to 1e4.}
+#' }
+#'
 #' @name bigrquery
 #' @aliases bigquery
 #' @docType package
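
As a quick illustration of the two options documented above (my example, not part of the commit), both can be set per session before any fetch:

```r
# Immediate progress output and a smaller page size; the documented
# defaults are NA and 1e4, respectively.
options(bigrquery.quiet = FALSE, bigrquery.page.size = 1000)

getOption("bigrquery.page.size")  # picked up by the fetching functions below
```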
R/tabledata.r (51 changes: 40 additions & 11 deletions)
@@ -52,7 +52,8 @@ list_tabledata <- function(project, dataset, table, page_size = 1e4,
 #' @export
 list_tabledata_callback <- function(project, dataset, table, callback,
                                     table_info = NULL,
-                                    page_size = 1e4, max_pages = 10,
+                                    page_size = getOption("bigrquery.page.size"),
+                                    max_pages = 10,
                                     warn = TRUE,
                                     quiet = getOption("bigrquery.quiet")) {
   assert_that(is.string(project), is.string(dataset), is.string(table))
@@ -93,10 +94,6 @@ list_tabledata_callback <- function(project, dataset, table, callback,
   invisible(TRUE)
 }

-#' @description
-#' \code{list_tabledata_iter} returns a named list with components \code{next_}
-#' (a function that fetches rows) and \code{is_complete} (a function that checks
-#' if all rows have been fetched).
 #' @rdname list_tabledata
 #' @export
 list_tabledata_iter <- function(project, dataset, table, table_info = NULL) {
@@ -108,7 +105,7 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) {
                  table)

   last_response <- NULL
-  rows_fetched <- 0L
+  rows_fetched <- 0

   next_ <- function(n) {
     query <- list(maxResults = n)
@@ -117,9 +114,7 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) {
     response <- bq_get(url, query = query)

     data <- extract_data(response$rows, schema)
-    if (!is.null(data)) {
-      rows_fetched <<- rows_fetched + nrow(data)
-    }
+    rows_fetched <<- rows_fetched + nrow(data)

     # Record only page token and total number of rows to reduce memory consumption
     last_response <<- response[c("pageToken", "totalRows")]
@@ -128,10 +123,44 @@ list_tabledata_iter <- function(project, dataset, table, table_info = NULL) {
   }

   is_complete <- function() {
-    !is.null(last_response) && rows_fetched >= as.integer(last_response$totalRows)
+    !is.null(last_response) && rows_fetched >= as.numeric(last_response$totalRows)
   }

+  next_paged <- function(n, page_size = getOption("bigrquery.page.size")) {
+    target_rows_fetched <- rows_fetched + n
+
+    ret <- list()
+    repeat {
+      next_n <- min(page_size, target_rows_fetched - rows_fetched)
+      chunk <- next_(next_n)
+
+      # This has O(n^2) aggregated run time, but fetching large data from
+      # BigQuery will be slow for other reasons
+      ret <- c(ret, list(chunk))
+
+      if (is_complete() || rows_fetched >= target_rows_fetched) {
+        break
+      }
+    }
+    do.call(rbind, ret)
+  }
+
+  get_schema <- function() {
+    schema
+  }
+
+  get_rows_fetched <- function() {
+    rows_fetched
+  }
+
-  list(next_ = next_, is_complete = is_complete)
+  #' @description
+  #' \code{list_tabledata_iter} returns a named list with functions \code{next_}
+  #' (fetches one chunk of rows), \code{next_paged} (fetches arbitrarily many
+  #' rows using a specified page size), \code{is_complete} (checks if all rows
+  #' have been fetched), \code{get_schema} (returns the schema of the table),
+  #' and \code{get_rows_fetched} (returns the number of rows already fetched).
+  list(next_ = next_, next_paged = next_paged, is_complete = is_complete,
+       get_schema = get_schema, get_rows_fetched = get_rows_fetched)
 }

 #Types can be loaded into R, record is not supported yet.
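
The paging arithmetic in `next_paged()` can be checked in isolation. The sketch below (not from the commit; `fake_next_`, `next_paged_demo` and `total_rows` are made-up names) replays the same loop against a local stand-in for `next_`:

```r
# Stand-in state for one iterator: 7 rows available, none fetched yet.
rows_fetched <- 0
total_rows <- 7

fake_next_ <- function(n) {
  # Mimics next_(): serves up to n rows from a local "table".
  n <- min(n, total_rows - rows_fetched)
  chunk <- data.frame(x = rows_fetched + seq_len(n))
  rows_fetched <<- rows_fetched + n
  chunk
}

next_paged_demo <- function(n, page_size = 3) {
  # Same loop as next_paged(): request pages until n rows or end of table.
  target_rows_fetched <- rows_fetched + n
  ret <- list()
  repeat {
    next_n <- min(page_size, target_rows_fetched - rows_fetched)
    ret <- c(ret, list(fake_next_(next_n)))
    if (rows_fetched >= total_rows || rows_fetched >= target_rows_fetched) {
      break
    }
  }
  do.call(rbind, ret)
}

nrow(next_paged_demo(7))  # 7: fetched as pages of 3, 3 and 1 rows
```

With `n = 7` and `page_size = 3` the loop issues requests for 3, 3 and 1 rows; the repeated `c(ret, list(chunk))` copy is what the in-diff comment flags as O(n^2) in the aggregate, a deliberate trade-off since API latency dominates anyway.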
R/zzz.r (3 changes: 2 additions & 1 deletion)
@@ -1,7 +1,8 @@
 .onLoad <- function(libname, pkgname) {
   op <- options()
   defaults <- list(
-    bigrquery.quiet = NA
+    bigrquery.quiet = NA,
+    bigrquery.page.size = 1e4
   )
   toset <- !(names(defaults) %in% names(op))
   if (any(toset)) options(defaults[toset])
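
For reference, the `toset` idiom above fills in only those options the user has not already set, so a value set before the package loads wins. A small sketch of the same pattern in a plain session (my example, not part of the commit):

```r
# Pretend the user set a page size before the package was loaded.
options(bigrquery.page.size = 500)

# The .onLoad() pattern: apply defaults without clobbering user settings.
op <- options()
defaults <- list(bigrquery.quiet = NA, bigrquery.page.size = 1e4)
toset <- !(names(defaults) %in% names(op))
if (any(toset)) options(defaults[toset])

getOption("bigrquery.page.size")  # 500: the user's value is kept
getOption("bigrquery.quiet")      # NA: the missing default was filled in
```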
man/bigrquery.Rd (12 changes: 12 additions & 0 deletions)

Generated file; diff not rendered by default.

man/list_tabledata.Rd (12 changes: 7 additions & 5 deletions)

Generated file; diff not rendered by default.
