Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Fix N/A detection for empty fields in CSV reader #6922

Merged
merged 15 commits into from
Dec 9, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

## Bug Fixes

- PR #6922 Fix N/A detection for empty fields in CSV reader
- PR #6912 Fix rmm_mode=managed parameter for gtests


Expand Down
9 changes: 6 additions & 3 deletions cpp/include/cudf/detail/utilities/trie.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ inline thrust::host_vector<SerialTrieNode> createSerializedTrie(
// Serialize the tree trie
std::deque<IndexedTrieNode> to_visit;
thrust::host_vector<SerialTrieNode> nodes;
// suport for matching empty input
nodes.push_back(SerialTrieNode(trie_terminating_character, tree_trie.is_end_of_word));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this mean all tries match empty input? Difficult to follow the logic how this coordinates with the code below.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not always. Here we create a node that will have is_end_of_word = true if the root node of the tree_trie is also end of a word (empty word, since it's the root). Hence, passing tree_trie.is_end_of_word as the second parameter.
I'll expand the comment above to make this clearer. 👍

// Add root node to queue. this node is not included to the serialized trie
to_visit.emplace_back(&tree_trie, -1);
while (!to_visit.empty()) {
Expand All @@ -112,7 +114,7 @@ inline thrust::host_vector<SerialTrieNode> createSerializedTrie(
has_children = true;
}
}
// Only add the terminating character is there any nodes were added
// Only add the terminating character any nodes were added
if (has_children) { nodes.push_back(SerialTrieNode(trie_terminating_character)); }
}
return nodes;
Expand All @@ -133,8 +135,9 @@ __host__ __device__ inline bool serialized_trie_contains(device_span<SerialTrieN
char const *key,
size_t key_len)
{
if (trie.data() == nullptr) return false;
int curr_node = 0;
if (trie.data() == nullptr || trie.empty()) return false;
if (key_len == 0) return trie[0].is_leaf;
int curr_node = 1;
for (size_t i = 0; i < key_len; ++i) {
// Don't jump away from root node
if (i != 0) { curr_node += trie[curr_node].children_offset; }
Expand Down
25 changes: 2 additions & 23 deletions cpp/include/cudf/io/csv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,24 +116,7 @@ class csv_reader_options {
// Additional values to recognize as boolean false values
std::vector<std::string> _false_values{"False", "FALSE", "false"};
// Additional values to recognize as null values
std::vector<std::string> _na_values{"#N/A",
"#N/A N/A",
"#NA",
"-1.#IND",
"-1.#QNAN",
"-NaN",
"-nan",
"1.#IND",
"1.#QNAN",
"<NA>",
"N/A",
"NA",
"NULL",
"NaN",
"n/a",
"nan",
"null"};

std::vector<std::string> _na_values;
// Whether to keep the built-in default NA values
bool _keep_default_na = true;
// Whether to disable null filter; disabling can improve performance
Expand Down Expand Up @@ -613,11 +596,7 @@ class csv_reader_options {
CUDF_FAIL("Can't set na_values when na_filtering is disabled");
}

if (_keep_default_na) {
_na_values.insert(_na_values.end(), vals.begin(), vals.end());
} else {
_na_values = std::move(vals);
}
_na_values = std::move(vals);
}

/**
Expand Down
16 changes: 6 additions & 10 deletions cpp/src/io/csv/csv_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
long tempPos = pos - 1;
long field_len = pos - start;

if (field_len <= 0 || serialized_trie_contains(opts.trie_na, raw_csv + start, field_len)) {
if (field_len < 0 || serialized_trie_contains(opts.trie_na, raw_csv + start, field_len)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess field_len < 0 is because we're using inclusive offsets (i.e, [0, 0] includes index 0)... Do we have a bug open for changing inclusive offsets -> exclusive offsets throughout cuio?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might want to add a comment to this so it's not thought to be a mistake. Or use <= -1 ?

Copy link
Contributor Author

@vuule vuule Dec 9, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Took another look and it seems like field_len can never be negative. Since long field_len = pos - start;, and pos is initialized to start and can only advance.
I left the condition as a sanity check, but it actually makes no sense in the context. Will remove the condition, thanks!

atomicAdd(&d_columnData[actual_col].null_count, 1);
} else if (serialized_trie_contains(opts.trie_true, raw_csv + start, field_len) ||
serialized_trie_contains(opts.trie_false, raw_csv + start, field_len)) {
Expand Down Expand Up @@ -273,10 +273,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
--int_req_number_cnt;
}

if (field_len == 0) {
// Ignoring whitespace and quotes can result in empty fields
atomicAdd(&d_columnData[actual_col].null_count, 1);
} else if (column_flags[col] & column_parse::as_datetime) {
if (column_flags[col] & column_parse::as_datetime) {
// PANDAS uses `object` dtype if the date is unparseable
if (is_datetime(countString, countDecimal, countColon, countDash, countSlash)) {
atomicAdd(&d_columnData[actual_col].datetime_count, 1);
Expand Down Expand Up @@ -592,16 +589,15 @@ __global__ void __launch_bounds__(csvparse_block_dim)

if (column_flags[col] & column_parse::enabled) {
// check if the entire field is a NaN string - consistent with pandas
const bool is_na = serialized_trie_contains(options.trie_na, raw_csv + start, pos - start);
auto const is_valid =
!serialized_trie_contains(options.trie_na, raw_csv + start, pos - start);

// Modify start & end to ignore whitespace and quotechars
long tempPos = pos - 1;
if (!is_na && dtypes[actual_col].id() != cudf::type_id::STRING) {
if (is_valid && dtypes[actual_col].id() != cudf::type_id::STRING) {
trim_field_start_end(raw_csv, &start, &tempPos, options.quotechar);
}

if (!is_na && start <= (tempPos)) { // Empty fields are not legal values

if (is_valid) {
// Type dispatcher does not handle STRING
if (dtypes[actual_col].id() == cudf::type_id::STRING) {
long end = pos;
Expand Down
45 changes: 42 additions & 3 deletions cpp/src/io/csv/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,47 @@ std::vector<column_buffer> reader::impl::decode_data(std::vector<data_type> cons
return out_buffers;
}

/**
* @brief Create a serialized trie for N/A value matching, based on the options.
*/
thrust::host_vector<SerialTrieNode> create_na_trie(char quotechar,
csv_reader_options const &reader_opts)
{
// Default values to recognize as null values
static std::vector<std::string> const default_na_values{"",
"#N/A",
"#N/A N/A",
"#NA",
"-1.#IND",
"-1.#QNAN",
"-NaN",
"-nan",
"1.#IND",
"1.#QNAN",
"<NA>",
"N/A",
"NA",
"NULL",
"NaN",
"n/a",
"nan",
"null"};

if (!reader_opts.is_enabled_na_filter()) { return {}; }

std::vector<std::string> na_values = reader_opts.get_na_values();
if (reader_opts.is_enabled_keep_default_na()) {
na_values.insert(na_values.end(), default_na_values.begin(), default_na_values.end());
}

// Pandas treats empty strings as N/A if empty fields are treated as N/A
if (std::find(na_values.begin(), na_values.end(), "") != na_values.end()) {
na_values.push_back(std::string(2, quotechar));
}

return createSerializedTrie(na_values);
}

parse_options make_parse_options(csv_reader_options const &reader_opts)
{
auto parse_opts = parse_options{};
Expand Down Expand Up @@ -747,9 +788,7 @@ parse_options make_parse_options(csv_reader_options const &reader_opts)
}

// Handle user-defined N/A values, whereby field data is treated as null
if (reader_opts.get_na_values().size() != 0) {
parse_opts.trie_na = createSerializedTrie(reader_opts.get_na_values());
}
parse_opts.trie_na = create_na_trie(parse_opts.quotechar, reader_opts);

return parse_opts;
}
Expand Down
33 changes: 20 additions & 13 deletions cpp/tests/io/csv_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -928,7 +928,7 @@ TEST_F(CsvReaderTest, nullHandling)
const auto filepath = temp_env->get_temp_dir() + "NullValues.csv";
{
std::ofstream outfile(filepath, std::ofstream::out);
outfile << "NULL\nnull\nn/a\nNull\nNA\nnan";
outfile << "NULL\n\nnull\nn/a\nNull\nNA\nnan";
}

// Test disabling na_filter
Expand All @@ -937,11 +937,12 @@ TEST_F(CsvReaderTest, nullHandling)
cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
.na_filter(false)
.dtypes({"str"})
.header(-1);
.header(-1)
.skip_blank_lines(false);
const auto result = cudf_io::read_csv(in_opts);
const auto view = result.tbl->view();
auto expect = cudf::test::strings_column_wrapper({"NULL", "null", "n/a", "Null", "NA", "nan"});

auto expect =
cudf::test::strings_column_wrapper({"NULL", "", "null", "n/a", "Null", "NA", "nan"});
expect_columns_equal(expect, view.column(0));
}

Expand All @@ -950,11 +951,13 @@ TEST_F(CsvReaderTest, nullHandling)
cudf_io::csv_reader_options in_opts =
cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
.dtypes({"str"})
.header(-1);
.header(-1)
.skip_blank_lines(false);
const auto result = cudf_io::read_csv(in_opts);
const auto view = result.tbl->view();
auto expect = cudf::test::strings_column_wrapper({"NULL", "null", "n/a", "Null", "NA", "nan"},
{false, false, false, true, false, false});
auto expect =
cudf::test::strings_column_wrapper({"NULL", "", "null", "n/a", "Null", "NA", "nan"},
{false, false, false, false, true, false, false});

expect_columns_equal(expect, view.column(0));
}
Expand All @@ -965,11 +968,13 @@ TEST_F(CsvReaderTest, nullHandling)
cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
.na_values({"Null"})
.dtypes({"str"})
.header(-1);
.header(-1)
.skip_blank_lines(false);
const auto result = cudf_io::read_csv(in_opts);
const auto view = result.tbl->view();
auto expect = cudf::test::strings_column_wrapper({"NULL", "null", "n/a", "Null", "NA", "nan"},
{false, false, false, false, false, false});
auto expect =
cudf::test::strings_column_wrapper({"NULL", "", "null", "n/a", "Null", "NA", "nan"},
{false, false, false, false, false, false, false});

expect_columns_equal(expect, view.column(0));
}
Expand All @@ -981,11 +986,13 @@ TEST_F(CsvReaderTest, nullHandling)
.keep_default_na(false)
.na_values({"Null"})
.dtypes({"str"})
.header(-1);
.header(-1)
.skip_blank_lines(false);
const auto result = cudf_io::read_csv(in_opts);
const auto view = result.tbl->view();
auto expect = cudf::test::strings_column_wrapper({"NULL", "null", "n/a", "Null", "NA", "nan"},
{true, true, true, false, true, true});
auto expect =
cudf::test::strings_column_wrapper({"NULL", "", "null", "n/a", "Null", "NA", "nan"},
{true, true, true, true, false, true, true, true});

expect_columns_equal(expect, view.column(0));
}
Expand Down
8 changes: 0 additions & 8 deletions python/cudf/cudf/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,6 @@ def read_csv(
if na_values is not None and is_scalar(na_values):
na_values = [na_values]

if keep_default_na is False:
# TODO: Remove this error once the following issue is fixed:
# https://github.com/rapidsai/cudf/issues/6680
raise NotImplementedError(
"keep_default_na=False is currently not supported, please refer "
"to: https://github.com/rapidsai/cudf/issues/6680"
)

return libcudf.csv.read_csv(
filepath_or_buffer,
lineterminator=lineterminator,
Expand Down
Loading