Skip to content
Permalink
Browse files

Ensure column names are UTF-8 Encoded

The encodings were inadvertently dropped because there was a roundtrip
to std::string

Fixes #113
Fixes #115
  • Loading branch information...
jimhester committed May 20, 2019
1 parent 7c0dc5d commit 74627dcb5e8214c6224595374cf8d7aa8d616065
Showing with 20 additions and 10 deletions.
  1. +2 −0 NEWS.md
  2. +4 −5 src/collectors.h
  3. +9 −5 src/columns.h
  4. +5 −0 tests/testthat/test-vroom.R
@@ -1,5 +1,7 @@
# vroom (development)

* Fix encoding of column names (#113, #115)

* Fix non-deterministic crash when `vroom_write()` was used on Altrep vectors.

# vroom 1.0.1
@@ -6,7 +6,7 @@ using namespace Rcpp;

class collector {
const Rcpp::List data_;
const std::string name_;
const SEXP name_;
const column_type type_;
const size_t altrep_opts_;

@@ -44,14 +44,14 @@ class collector {
}

public:
collector(Rcpp::List data, std::string name, size_t altrep_opts)
collector(Rcpp::List data, SEXP name, size_t altrep_opts)
: data_(data),
name_(name),
type_(derive_type(Rcpp::as<std::string>(
Rcpp::as<Rcpp::CharacterVector>(data_.attr("class"))[0]))),
altrep_opts_(altrep_opts) {}
column_type type() const { return type_; }
std::string name() const { return name_; }
SEXP name() const { return name_; }
SEXP operator[](const char* nme) { return data_[nme]; }
bool use_altrep() {
switch (type()) {
@@ -91,8 +91,7 @@ class collectors {
altrep_opts_(altrep_opts) {}
collector operator[](int i) {
return {collectors_[i],
Rcpp::as<std::string>(
Rcpp::as<Rcpp::CharacterVector>(collectors_.attr("names"))[i]),
Rcpp::as<Rcpp::CharacterVector>(collectors_.attr("names"))[i],
altrep_opts_};
}
Rcpp::List spec() { return spec_; }
@@ -76,18 +76,19 @@ inline List create_columns(

auto locale_info = std::make_shared<LocaleInfo>(locale);

std::vector<std::string> res_nms;

size_t i = 0;

bool add_filename = !Rf_isNull(id);

List res(num_cols + add_filename);

CharacterVector res_nms(num_cols + add_filename);

if (add_filename) {
res[i++] =
res[i] =
generate_filename_column(filenames, idx->row_sizes(), idx->num_rows());
res_nms.push_back(Rcpp::as<std::string>(id));
res_nms[i] = Rcpp::as<Rcpp::CharacterVector>(id)[0];
++i;
}

auto my_collectors = resolve_collectors(
@@ -124,7 +125,7 @@ inline List create_columns(
locale_info,
std::string()};

res_nms.push_back(collector.name());
res_nms[i] = collector.name();

switch (collector.type()) {
case column_type::Dbl:
@@ -232,6 +233,9 @@ inline List create_columns(
// Resize the list appropriately
SETLENGTH(res, i);
SET_TRUELENGTH(res, i);

SETLENGTH(res_nms, i);
SET_TRUELENGTH(res_nms, i);
}

res.attr("names") = res_nms;
@@ -363,3 +363,8 @@ test_that("guess_type works with long strings (#74)", {
test_that("vroom errors if unnamed column types do not match the number of columns", {
expect_error(vroom("a,b\n1,2\n", col_types = "i"), "must have the same length", class = "Rcpp::eval_error")
})

test_that("column names are properly encoded", {
nms <- vroom::vroom("f\U00F6\U00F6\nbar\n")
expect_equal(Encoding(colnames(nms)), "UTF-8")
})

1 comment on commit 74627dc

@karsfri

This comment has been minimized.

Copy link

commented on 74627dc May 20, 2019

Wow, awesome! Thank you!

Please sign in to comment.
You can’t perform that action at this time.