Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

fix: store URLs in environment to avoid issues with long filenames #1049

Merged
merged 12 commits into from
Apr 23, 2024
4 changes: 3 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ Also, there are some updates.

### Bug fixes

- `$len()` now correctly includes `null` values in the count (#1044).
* `$len()` now correctly includes `null` values in the count (#1044).
* In some read/scan functions, downloading files could fail if the URL was too
long. This is now fixed (#1049, @DyfanJones).

### Other improvements

Expand Down
24 changes: 15 additions & 9 deletions R/io_csv.R
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,9 @@ pl_scan_csv = function(
return(RPolarsRNullValues$new_named(unlist(null_values)))
}

stop("null_values arg must be a string OR unamed char vec OR named char vec")
Err_plain(
"null_values arg must be a string OR unamed char vec OR named char vec"
) |> unwrap("in pl$scan_csv():")
})()

args$null_values = RNullValues
Expand Down Expand Up @@ -187,6 +189,7 @@ pl_read_csv = function(
unwrap("in pl$read_csv():")
}

# Package-local cache mapping a URL to its downloaded tempfile path.
# Storing paths in an environment (rather than deriving a filename from the
# URL via make.names()) avoids OS filename-length limits for long URLs (#1049).
# parent = emptyenv() makes this a pure hash map: lookups can never fall
# through to names in enclosing scopes.
cache_temp_file = new.env(parent = emptyenv())
check_is_link = function(path, reuse_downloaded, raise_error = FALSE) {
# do nothing let path fail on rust side
if (is.na(path)) {
Expand All @@ -213,16 +216,17 @@ check_is_link = function(path, reuse_downloaded, raise_error = FALSE) {
# try download file if valid url
if (!is.null(con)) {
close(con)
tmp_file = paste0(tempdir(), "/", make.names(actual_url))
if (isFALSE(reuse_downloaded) || isFALSE(file.exists(tmp_file))) {
download.file(url = actual_url, destfile = tmp_file)
message(paste("tmp file placed in \n", tmp_file))
if (is.null(cache_temp_file[[actual_url]]))
cache_temp_file[[actual_url]] <- tempfile()
if (isFALSE(reuse_downloaded) || isFALSE(file.exists(cache_temp_file[[actual_url]]))) {
download.file(url = actual_url, destfile = cache_temp_file[[actual_url]])
message(paste("tmp file placed in \n", cache_temp_file[[actual_url]]))
}

path = tmp_file # redirect path to tmp downloaded file
path = cache_temp_file[[actual_url]] # redirect path to tmp downloaded file
} else {
if (raise_error) {
stop("failed to locate file at path/url: ", path)
Err_plain(paste("failed to locate file at path/url:", path)) |> unwrap()
}
# do nothing let path fail on rust side
path = NULL
Expand All @@ -235,7 +239,8 @@ check_is_link = function(path, reuse_downloaded, raise_error = FALSE) {

list_to_datatype_vector = function(x) {
if (!is.list(x) || !is_named(x)) {
stop("could not interpret dtypes, must be a named list of DataTypes")
Err_plain("could not interpret dtypes, must be a named list of DataTypes") |>
unwrap()
}
datatype_vector = RPolarsDataTypeVector$new() # mutable
mapply(
Expand All @@ -247,7 +252,8 @@ list_to_datatype_vector = function(x) {
type = DataType$new(type)
}
if (!inherits(type, "RPolarsDataType")) {
stop("arg dtypes must be a named list of dtypes or dtype names")
Err_plain("arg dtypes must be a named list of dtypes or dtype names") |>
unwrap()
}
datatype_vector$push(name, type)
}
Expand Down
16 changes: 16 additions & 0 deletions tests/testthat/test-csv-read.R
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,19 @@ test_that("bad paths", {
"failed to locate file"
)
})

test_that("cache url tempfile", {
  skip_if_offline()
  url <- "https://vincentarelbundock.github.io/Rdatasets/csv/AER/BenderlyZwick.csv"
  # Stub out download.file so the test never touches the network.
  # (Trailing comma removed: `f(x,)` leaves an empty argument, which R
  # rejects at evaluation time.)
  local_mocked_bindings(
    download.file = function(...) invisible(NULL)
  )
  check_is_link(url, reuse_downloaded = TRUE)
  attempt_1 <- cache_temp_file[[url]]

  # A second call must reuse the cached tempfile path, not allocate a new one.
  check_is_link(url, reuse_downloaded = TRUE)
  attempt_2 <- cache_temp_file[[url]]

  expect_true(!is.null(cache_temp_file[[url]]))
  expect_equal(attempt_1, attempt_2)
})
Loading