From 8d313c1c10898a67c02ab455f44481302f9d4ba4 Mon Sep 17 00:00:00 2001 From: "dyfan.jones" Date: Wed, 17 Apr 2024 12:26:20 +0100 Subject: [PATCH 01/11] cache tempfile for each url --- R/io_csv.R | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/R/io_csv.R b/R/io_csv.R index dfadc407c..1846be963 100644 --- a/R/io_csv.R +++ b/R/io_csv.R @@ -187,6 +187,7 @@ pl_read_csv = function( unwrap("in pl$read_csv():") } +cache_temp_file <- new.env(parent = new.env()) check_is_link = function(path, reuse_downloaded, raise_error = FALSE) { # do nothing let path fail on rust side if (is.na(path)) { @@ -213,13 +214,14 @@ check_is_link = function(path, reuse_downloaded, raise_error = FALSE) { # try download file if valid url if (!is.null(con)) { close(con) - tmp_file = paste0(tempdir(), "/", make.names(actual_url)) - if (isFALSE(reuse_downloaded) || isFALSE(file.exists(tmp_file))) { - download.file(url = actual_url, destfile = tmp_file) - message(paste("tmp file placed in \n", tmp_file)) + if (is.null(cache_temp_file[[actual_url]])) + cache_temp_file[[actual_url]] <- tempfile() + if (isFALSE(reuse_downloaded) || isFALSE(file.exists(cache_temp_file[[actual_url]]))) { + download.file(url = actual_url, destfile = cache_temp_file[[actual_url]]) + message(paste("tmp file placed in \n", cache_temp_file[[actual_url]])) } - path = tmp_file # redirect path to tmp downloaded file + path = cache_temp_file[[actual_url]] # redirect path to tmp downloaded file } else { if (raise_error) { stop("failed to locate file at path/url: ", path) From 0f3241f62c09379e2c7d44ed618b4c21e59cc880 Mon Sep 17 00:00:00 2001 From: "dyfan.jones" Date: Wed, 17 Apr 2024 12:38:40 +0100 Subject: [PATCH 02/11] replace stop with Err_plain --- R/io_csv.R | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/R/io_csv.R b/R/io_csv.R index 1846be963..775fe0076 100644 --- a/R/io_csv.R +++ b/R/io_csv.R @@ -134,7 +134,9 @@ pl_scan_csv = function( return(RPolarsRNullValues$new_named(unlist(null_values))) } - stop("null_values arg must be a string OR unamed char vec OR named char vec") + Err_plain( + "null_values arg must be a string OR unamed char vec OR named char vec" + ) |> unwrap() })() args$null_values = RNullValues @@ -224,7 +226,7 @@ check_is_link = function(path, reuse_downloaded, raise_error = FALSE) { path = cache_temp_file[[actual_url]] # redirect path to tmp downloaded file } else { if (raise_error) { - stop("failed to locate file at path/url: ", path) + Err_plain(paste("failed to locate file at path/url:", path)) |> unwrap() } # do nothing let path fail on rust side path = NULL @@ -237,7 +239,8 @@ check_is_link = function(path, reuse_downloaded, raise_error = FALSE) { list_to_datatype_vector = function(x) { if (!is.list(x) || !is_named(x)) { - stop("could not interpret dtypes, must be a named list of DataTypes") + Err_plain("could not interpret dtypes, must be a named list of DataTypes") |> + unwrap() } datatype_vector = RPolarsDataTypeVector$new() # mutable mapply( @@ -249,7 +252,8 @@ list_to_datatype_vector = function(x) { type = DataType$new(type) } if (!inherits(type, "RPolarsDataType")) { - stop("arg dtypes must be a named list of dtypes or dtype names") + Err_plain("arg dtypes must be a named list of dtypes or dtype names") |> + unwrap() } datatype_vector$push(name, type) } From f1f8f1e30087308966fda11ba203747d5de0023d Mon Sep 17 00:00:00 2001 From: "dyfan.jones" Date: Wed, 17 Apr 2024 13:00:51 +0100 Subject: [PATCH 03/11] formatting --- R/io_csv.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/io_csv.R b/R/io_csv.R index 775fe0076..e091084c4 100644 --- a/R/io_csv.R +++ b/R/io_csv.R @@ -189,7 +189,7 @@ pl_read_csv = function( unwrap("in pl$read_csv():") } -cache_temp_file <- new.env(parent = new.env()) +cache_temp_file = new.env(parent = new.env()) check_is_link = function(path, reuse_downloaded, raise_error = FALSE) { # do nothing let path fail on rust side if (is.na(path)) { From 0335604e5595d281a0b2e7ac92346c8dce9cf0bd Mon Sep 17 00:00:00 2001 From: "dyfan.jones" Date: Wed, 17 Apr 2024 13:01:16 +0100 Subject: [PATCH 04/11] check if url is cached correctly --- tests/testthat/test-csv-read.R | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/testthat/test-csv-read.R b/tests/testthat/test-csv-read.R index 9a4d874cd..81f49935d 100644 --- a/tests/testthat/test-csv-read.R +++ b/tests/testthat/test-csv-read.R @@ -181,3 +181,18 @@ test_that("bad paths", { "failed to locate file" ) }) + +test_that("cache url tempfile", { + url <- "https://vincentarelbundock.github.io/Rdatasets/csv/AER/BenderlyZwick.csv" + local_mocked_bindings( + download.file = function(...) invisible(NULL), + ) + check_is_link(url, reuse_downloaded = T) + attempt_1 = cache_temp_file[[url]] + + check_is_link(url, reuse_downloaded = T) + attempt_2 = cache_temp_file[[url]] + + expect_true(!is.null(cache_temp_file[[url]])) + expect_equal(attempt_1, attempt_2) +}) From ede7e3bdf485656ac9b90713f18ff5ac2b79db31 Mon Sep 17 00:00:00 2001 From: "dyfan.jones" Date: Thu, 18 Apr 2024 11:04:24 +0100 Subject: [PATCH 05/11] skip test if offline --- tests/testthat/test-csv-read.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-csv-read.R b/tests/testthat/test-csv-read.R index 81f49935d..b1c5d27d6 100644 --- a/tests/testthat/test-csv-read.R +++ b/tests/testthat/test-csv-read.R @@ -183,6 +183,7 @@ test_that("bad paths", { }) test_that("cache url tempfile", { + skip_if_offline() url <- "https://vincentarelbundock.github.io/Rdatasets/csv/AER/BenderlyZwick.csv" local_mocked_bindings( download.file = function(...) invisible(NULL), From b997014a8013a247359bcf071b503f9fb23778f5 Mon Sep 17 00:00:00 2001 From: "dyfan.jones" Date: Fri, 19 Apr 2024 10:32:55 +0100 Subject: [PATCH 06/11] formatting --- tests/testthat/test-csv-read.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-csv-read.R b/tests/testthat/test-csv-read.R index b1c5d27d6..bc2fa5cc8 100644 --- a/tests/testthat/test-csv-read.R +++ b/tests/testthat/test-csv-read.R @@ -188,10 +188,10 @@ test_that("cache url tempfile", { local_mocked_bindings( download.file = function(...) invisible(NULL), ) - check_is_link(url, reuse_downloaded = T) + check_is_link(url, reuse_downloaded = TRUE) attempt_1 = cache_temp_file[[url]] - check_is_link(url, reuse_downloaded = T) + check_is_link(url, reuse_downloaded = TRUE) attempt_2 = cache_temp_file[[url]] expect_true(!is.null(cache_temp_file[[url]])) From ddfa320ffcd1576de43111eb748bcfa8757978cb Mon Sep 17 00:00:00 2001 From: "dyfan.jones" Date: Tue, 23 Apr 2024 10:22:19 +0100 Subject: [PATCH 07/11] cache temporary file name --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 155ea985c..945b3b27b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,7 @@ ### Bug fixes * `$len()` now correctly includes `null` values in the count (#1044). +* cache temporary file name for url file download (#1046) ## Polars R Package 0.16.0 From 8e279096212e6bf22359e996d2d1fbaaf445e035 Mon Sep 17 00:00:00 2001 From: "dyfan.jones" Date: Tue, 23 Apr 2024 11:36:16 +0100 Subject: [PATCH 08/11] add context argument --- R/io_csv.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/io_csv.R b/R/io_csv.R index e091084c4..280356142 100644 --- a/R/io_csv.R +++ b/R/io_csv.R @@ -136,7 +136,7 @@ pl_scan_csv = function( Err_plain( "null_values arg must be a string OR unamed char vec OR named char vec" - ) |> unwrap() + ) |> unwrap(" in pl$scan_csv():") })() args$null_values = RNullValues From 83bf47060e40270b95b11b2b81638b6ad55dddd0 Mon Sep 17 00:00:00 2001 From: Larefly Date: Tue, 23 Apr 2024 12:05:58 +0100 Subject: [PATCH 09/11] remove white space Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> --- R/io_csv.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/io_csv.R b/R/io_csv.R index 280356142..4002eb6fc 100644 --- a/R/io_csv.R +++ b/R/io_csv.R @@ -136,7 +136,7 @@ pl_scan_csv = function( Err_plain( "null_values arg must be a string OR unamed char vec OR named char vec" - ) |> unwrap(" in pl$scan_csv():") + ) |> unwrap("in pl$scan_csv():") })() args$null_values = RNullValues From 908882e6b092f4e70d482b4300e27eaa8456549d Mon Sep 17 00:00:00 2001 From: "dyfan.jones" Date: Tue, 23 Apr 2024 12:28:39 +0100 Subject: [PATCH 10/11] add problem and not solution employed --- NEWS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 945b3b27b..0a82648fc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,7 +5,8 @@ ### Bug fixes * `$len()` now correctly includes `null` values in the count (#1044). -* cache temporary file name for url file download (#1046) +* In some read/scan functions, downloading files could fail if the URL was too + long. This is now fixed (#1049). ## Polars R Package 0.16.0 From 827aefb07ac632adc52f68b265e27473ddfab1b4 Mon Sep 17 00:00:00 2001 From: "dyfan.jones" Date: Tue, 23 Apr 2024 12:31:54 +0100 Subject: [PATCH 11/11] add contributor name --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 0a82648fc..a0ea8f73e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,7 +6,7 @@ * `$len()` now correctly includes `null` values in the count (#1044). * In some read/scan functions, downloading files could fail if the URL was too - long. This is now fixed (#1049). + long. This is now fixed (#1049, @DyfanJones). ## Polars R Package 0.16.0