Skip to content

Commit

Permalink
Support reading files that do not end in a newline
Browse files Browse the repository at this point in the history
Fixes #40
  • Loading branch information
jimhester committed Jun 6, 2019
1 parent 505f36e commit 4b18c91
Show file tree
Hide file tree
Showing 9 changed files with 107 additions and 14 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# vroom (development)

* `vroom()` and `vroom_lines()` now support reading files which do not end in
newlines by using a file connection (#40).

* Fix additional UBSAN issue in the mio project reported by CRAN (#97)

* Fix indexing into connections with quoted fields (#119)
Expand Down
4 changes: 4 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ vroom_ <- function(inputs, delim, quote, trim_ws, escape_double, escape_backslas
.Call(`_vroom_vroom_`, inputs, delim, quote, trim_ws, escape_double, escape_backslash, comment, skip, n_max, progress, col_names, col_types, col_select, id, na, locale, guess_max, num_threads, altrep_opts)
}

has_trailing_newline <- function(filename) {
.Call(`_vroom_has_trailing_newline`, filename)
}

vroom_fwf_ <- function(inputs, col_starts, col_ends, trim_ws, col_names, col_types, col_select, skip, comment, n_max, id, na, locale, guess_max, num_threads, altrep_opts, progress) {
.Call(`_vroom_vroom_fwf_`, inputs, col_starts, col_ends, trim_ws, col_names, col_types, col_select, skip, comment, n_max, id, na, locale, guess_max, num_threads, altrep_opts, progress)
}
Expand Down
6 changes: 5 additions & 1 deletion R/path.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,11 @@ standardise_one_path <- function (path, check = TRUE) {
bz2 = bzfile(path, ""),
xz = xzfile(path, ""),
zip = zipfile(path, ""),
path
if (!has_trailing_newline(path)) {
file(path)
} else {
path
}
)
}

Expand Down
12 changes: 12 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,17 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// has_trailing_newline
bool has_trailing_newline(std::string filename);
RcppExport SEXP _vroom_has_trailing_newline(SEXP filenameSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type filename(filenameSEXP);
rcpp_result_gen = Rcpp::wrap(has_trailing_newline(filename));
return rcpp_result_gen;
END_RCPP
}
// vroom_fwf_
List vroom_fwf_(List inputs, std::vector<int> col_starts, std::vector<int> col_ends, bool trim_ws, RObject col_names, RObject col_types, RObject col_select, size_t skip, const char comment, ptrdiff_t n_max, SEXP id, CharacterVector na, List locale, ptrdiff_t guess_max, size_t num_threads, size_t altrep_opts, bool progress);
RcppExport SEXP _vroom_vroom_fwf_(SEXP inputsSEXP, SEXP col_startsSEXP, SEXP col_endsSEXP, SEXP trim_wsSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP col_selectSEXP, SEXP skipSEXP, SEXP commentSEXP, SEXP n_maxSEXP, SEXP idSEXP, SEXP naSEXP, SEXP localeSEXP, SEXP guess_maxSEXP, SEXP num_threadsSEXP, SEXP altrep_optsSEXP, SEXP progressSEXP) {
Expand Down Expand Up @@ -198,6 +209,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_vroom_gen_character_", (DL_FUNC) &_vroom_gen_character_, 6},
{"_vroom_guess_type_", (DL_FUNC) &_vroom_guess_type_, 3},
{"_vroom_vroom_", (DL_FUNC) &_vroom_vroom_, 19},
{"_vroom_has_trailing_newline", (DL_FUNC) &_vroom_has_trailing_newline, 1},
{"_vroom_vroom_fwf_", (DL_FUNC) &_vroom_vroom_fwf_, 17},
{"_vroom_whitespace_columns_", (DL_FUNC) &_vroom_whitespace_columns_, 4},
{"_vroom_vroom_write_", (DL_FUNC) &_vroom_vroom_write_, 10},
Expand Down
12 changes: 8 additions & 4 deletions src/delimited_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -261,14 +261,18 @@ class delimited_index : public index,
}
}

else if (c == quote) {
in_quote = !in_quote;
}

else if (escape_backslash_ && c == '\\') {
++pos;
}

else if (c == '\0') {
break;
}

else if (c == quote) {
in_quote = !in_quote;
}

++pos;
}

Expand Down
42 changes: 33 additions & 9 deletions src/delimited_index_connection.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,13 @@ delimited_index_connection::delimited_index_connection(
size_t sz = R_ReadConnection(con, buf[i].data(), chunk_size - 1);
buf[i][sz] = '\0';

if (sz == 0) {
if (should_close) {
Rcpp::as<Rcpp::Function>(Rcpp::Environment::base_env()["close"])(in);
}
return;
}

// Parse header
size_t start = find_first_line(buf[i], skip_, comment_);

Expand All @@ -85,16 +92,22 @@ delimited_index_connection::delimited_index_connection(
// This first newline must not have fit in the buffer, throw error
// suggesting a larger buffer size.

if (should_close) {
Rcpp::as<Rcpp::Function>(Rcpp::Environment::base_env()["close"])(in);
// Try reading again, if size is 0 we are at the end of the file, so should
// just go on.
size_t next_sz = R_ReadConnection(con, buf[i].data(), chunk_size - 1);
if (!(next_sz == 0)) {
if (should_close) {
Rcpp::as<Rcpp::Function>(Rcpp::Environment::base_env()["close"])(in);
}
std::stringstream ss;

ss << "The size of the connection buffer (" << chunk_size
<< ") was not large enough\nto fit a complete line:\n * Increase it "
"by "
"setting `Sys.setenv(\"VROOM_CONNECTION_SIZE\")`";

throw Rcpp::exception(ss.str().c_str(), false);
}
std::stringstream ss;

ss << "The size of the connection buffer (" << chunk_size
<< ") was not large enough\nto fit a complete line:\n * Increase it by "
"setting `Sys.setenv(\"VROOM_CONNECTION_SIZE\")`";

throw Rcpp::exception(ss.str().c_str(), false);
}

// Check for windows newlines
Expand Down Expand Up @@ -210,6 +223,17 @@ delimited_index_connection::delimited_index_connection(
throw Rcpp::exception(error.message().c_str(), false);
}

size_t file_size = mmap_.size();

if (mmap_[file_size - 1] != '\n') {
if (columns_ == 0) {
++columns_;
idx_[0].push_back(file_size);
} else {
idx_[1].push_back(file_size);
}
}

size_t total_size = std::accumulate(
idx_.begin(), idx_.end(), 0, [](size_t sum, const idx_t& v) {
sum += v.size() > 0 ? v.size() - 1 : 0;
Expand Down
13 changes: 13 additions & 0 deletions src/vroom.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,16 @@ SEXP vroom_(
guess_max,
num_threads);
}

// [[Rcpp::export]]
bool has_trailing_newline(std::string filename) {
std::FILE* f = std::fopen(filename.c_str(), "rb");

if (!f) {
return true;
}

fseek(f, -1, SEEK_END);
char c = fgetc(f);
return c == '\n';
}
14 changes: 14 additions & 0 deletions tests/testthat/test-vroom.R
Original file line number Diff line number Diff line change
Expand Up @@ -374,3 +374,17 @@ test_that("Files with windows newlines and missing fields work", {
equals = tibble::tibble(a = c("m", NA), b = c(NA, NA), c = c(NA, NA), d = c(NA, NA))
)
})

test_that("vroom can read files with no trailing newline", {
f <- tempfile()
on.exit(unlink(f))

writeBin(charToRaw("foo\nbar"), f)
expect_equal(vroom(f, col_names = FALSE)[[1]], c("foo", "bar"))

f2 <- tempfile()
on.exit(unlink(f2), add = TRUE)

writeBin(charToRaw("foo,bar\n1,2"), f2)
expect_equal(vroom(f2), tibble::tibble(foo = 1, bar = 2))
})
15 changes: 15 additions & 0 deletions tests/testthat/test-vroom_lines.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,18 @@ test_that("vroom_lines works with connections files", {

expect_equal(actual, expected)
})


test_that("vroom_lines works with files with no trailing newline", {
f <- tempfile()
on.exit(unlink(f))

writeBin(charToRaw("foo"), f)
expect_equal(vroom_lines(f), "foo")

f2 <- tempfile()
on.exit(unlink(f2), add = TRUE)

writeBin(charToRaw("foo\nbar"), f2)
expect_equal(vroom_lines(f2), c("foo", "bar"))
})

0 comments on commit 4b18c91

Please sign in to comment.