Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

fix: store URLs in environment to avoid issues with long filenames #1049

Merged
merged 12 commits into from
Apr 23, 2024
4 changes: 3 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ Also, there are some updates.

### Bug fixes

- `$len()` now correctly includes `null` values in the count (#1044).
* `$len()` now correctly includes `null` values in the count (#1044).
* In some read/scan functions, downloading files could fail if the URL was too
long. This is now fixed (#1049, @DyfanJones).

### Other improvements

Expand Down
24 changes: 15 additions & 9 deletions R/io_csv.R
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,9 @@ pl_scan_csv = function(
return(RPolarsRNullValues$new_named(unlist(null_values)))
}

stop("null_values arg must be a string OR unamed char vec OR named char vec")
Err_plain(
"null_values arg must be a string OR unamed char vec OR named char vec"
) |> unwrap("in pl$scan_csv():")
})()

args$null_values = RNullValues
Expand Down Expand Up @@ -187,6 +189,7 @@ pl_read_csv = function(
unwrap("in pl$read_csv():")
}

# Package-local cache mapping a URL to its downloaded tempfile path.
# Storing paths in an environment (rather than deriving a filename from the
# URL via make.names()) avoids OS filename-length limits for long URLs (#1049).
# parent = emptyenv() makes this a pure hash map: lookups can never fall
# through to names in enclosing scopes.
cache_temp_file = new.env(parent = emptyenv())
check_is_link = function(path, reuse_downloaded, raise_error = FALSE) {
# do nothing let path fail on rust side
if (is.na(path)) {
Expand All @@ -213,16 +216,17 @@ check_is_link = function(path, reuse_downloaded, raise_error = FALSE) {
# try download file if valid url
if (!is.null(con)) {
close(con)
tmp_file = paste0(tempdir(), "/", make.names(actual_url))
if (isFALSE(reuse_downloaded) || isFALSE(file.exists(tmp_file))) {
download.file(url = actual_url, destfile = tmp_file)
message(paste("tmp file placed in \n", tmp_file))
if (is.null(cache_temp_file[[actual_url]]))
cache_temp_file[[actual_url]] <- tempfile()
if (isFALSE(reuse_downloaded) || isFALSE(file.exists(cache_temp_file[[actual_url]]))) {
download.file(url = actual_url, destfile = cache_temp_file[[actual_url]])
message(paste("tmp file placed in \n", cache_temp_file[[actual_url]]))
}

path = tmp_file # redirect path to tmp downloaded file
path = cache_temp_file[[actual_url]] # redirect path to tmp downloaded file
} else {
if (raise_error) {
stop("failed to locate file at path/url: ", path)
Err_plain(paste("failed to locate file at path/url:", path)) |> unwrap()
}
# do nothing let path fail on rust side
path = NULL
Expand All @@ -235,7 +239,8 @@ check_is_link = function(path, reuse_downloaded, raise_error = FALSE) {

list_to_datatype_vector = function(x) {
if (!is.list(x) || !is_named(x)) {
stop("could not interpret dtypes, must be a named list of DataTypes")
Err_plain("could not interpret dtypes, must be a named list of DataTypes") |>
unwrap()
}
datatype_vector = RPolarsDataTypeVector$new() # mutable
mapply(
Expand All @@ -247,7 +252,8 @@ list_to_datatype_vector = function(x) {
type = DataType$new(type)
}
if (!inherits(type, "RPolarsDataType")) {
stop("arg dtypes must be a named list of dtypes or dtype names")
Err_plain("arg dtypes must be a named list of dtypes or dtype names") |>
unwrap()
}
datatype_vector$push(name, type)
}
Expand Down
16 changes: 16 additions & 0 deletions tests/testthat/test-csv-read.R
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,19 @@ test_that("bad paths", {
"failed to locate file"
)
})

test_that("cache url tempfile", {
  skip_if_offline()
  url <- "https://vincentarelbundock.github.io/Rdatasets/csv/AER/BenderlyZwick.csv"
  # Stub out download.file so the test never touches the network.
  # (Trailing comma removed: `f(x,)` leaves an empty argument, which R
  # rejects at evaluation time.)
  local_mocked_bindings(
    download.file = function(...) invisible(NULL)
  )
  check_is_link(url, reuse_downloaded = TRUE)
  attempt_1 <- cache_temp_file[[url]]

  # A second call must reuse the cached tempfile path, not allocate a new one.
  check_is_link(url, reuse_downloaded = TRUE)
  attempt_2 <- cache_temp_file[[url]]

  expect_true(!is.null(cache_temp_file[[url]]))
  expect_equal(attempt_1, attempt_2)
})
Loading