rapidsai · rapids-bot · Dec 9, 2020 · Nov 23, 2020 · Nov 30, 2020 · Dec 1, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@
 
 ## Bug Fixes
 
+- PR #6922 Fix N/A detection for empty fields in CSV reader
 - PR #6912 Fix rmm_mode=managed parameter for gtests
 
 

@@ -89,6 +89,8 @@ inline thrust::host_vector<SerialTrieNode> createSerializedTrie(
   // Serialize the tree trie
   std::deque<IndexedTrieNode> to_visit;
   thrust::host_vector<SerialTrieNode> nodes;
+  // Support for matching empty input: node 0 records whether the root ends a word
+  nodes.push_back(SerialTrieNode(trie_terminating_character, tree_trie.is_end_of_word));
   // Add root node to queue. this node is not included to the serialized trie
   to_visit.emplace_back(&tree_trie, -1);
   while (!to_visit.empty()) {
@@ -112,7 +114,7 @@ inline thrust::host_vector<SerialTrieNode> createSerializedTrie(
         has_children = true;
       }
     }
-    // Only add the terminating character is there any nodes were added
+    // Only add the terminating character if any nodes were added
     if (has_children) { nodes.push_back(SerialTrieNode(trie_terminating_character)); }
   }
   return nodes;
@@ -133,8 +135,9 @@ __host__ __device__ inline bool serialized_trie_contains(device_span<SerialTrieN
                                                          char const *key,
                                                          size_t key_len)
 {
-  if (trie.data() == nullptr) return false;
-  int curr_node = 0;
+  if (trie.data() == nullptr || trie.empty()) return false;
+  if (key_len == 0) return trie[0].is_leaf;
+  int curr_node = 1;
   for (size_t i = 0; i < key_len; ++i) {
     // Don't jump away from root node
     if (i != 0) { curr_node += trie[curr_node].children_offset; }

@@ -116,24 +116,7 @@ class csv_reader_options {
   // Additional values to recognize as boolean false values
   std::vector<std::string> _false_values{"False", "FALSE", "false"};
   // Additional values to recognize as null values
-  std::vector<std::string> _na_values{"#N/A",
-                                      "#N/A N/A",
-                                      "#NA",
-                                      "-1.#IND",
-                                      "-1.#QNAN",
-                                      "-NaN",
-                                      "-nan",
-                                      "1.#IND",
-                                      "1.#QNAN",
-                                      "<NA>",
-                                      "N/A",
-                                      "NA",
-                                      "NULL",
-                                      "NaN",
-                                      "n/a",
-                                      "nan",
-                                      "null"};
-
+  std::vector<std::string> _na_values;
   // Whether to keep the built-in default NA values
   bool _keep_default_na = true;
   // Whether to disable null filter; disabling can improve performance
@@ -613,11 +596,7 @@ class csv_reader_options {
       CUDF_FAIL("Can't set na_values when na_filtering is disabled");
     }
 
-    if (_keep_default_na) {
-      _na_values.insert(_na_values.end(), vals.begin(), vals.end());
-    } else {
-      _na_values = std::move(vals);
-    }
+    _na_values = std::move(vals);
   }
 
   /**

@@ -224,7 +224,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
       long tempPos   = pos - 1;
       long field_len = pos - start;
 
-      if (field_len <= 0 || serialized_trie_contains(opts.trie_na, raw_csv + start, field_len)) {
+      if (field_len < 0 || serialized_trie_contains(opts.trie_na, raw_csv + start, field_len)) {
         atomicAdd(&d_columnData[actual_col].null_count, 1);
       } else if (serialized_trie_contains(opts.trie_true, raw_csv + start, field_len) ||
                  serialized_trie_contains(opts.trie_false, raw_csv + start, field_len)) {
@@ -273,10 +273,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
           --int_req_number_cnt;
         }
 
-        if (field_len == 0) {
-          // Ignoring whitespace and quotes can result in empty fields
-          atomicAdd(&d_columnData[actual_col].null_count, 1);
-        } else if (column_flags[col] & column_parse::as_datetime) {
+        if (column_flags[col] & column_parse::as_datetime) {
           // PANDAS uses `object` dtype if the date is unparseable
           if (is_datetime(countString, countDecimal, countColon, countDash, countSlash)) {
             atomicAdd(&d_columnData[actual_col].datetime_count, 1);
@@ -592,16 +589,15 @@ __global__ void __launch_bounds__(csvparse_block_dim)
 
     if (column_flags[col] & column_parse::enabled) {
       // check if the entire field is a NaN string - consistent with pandas
-      const bool is_na = serialized_trie_contains(options.trie_na, raw_csv + start, pos - start);
+      auto const is_valid =
+        !serialized_trie_contains(options.trie_na, raw_csv + start, pos - start);
 
       // Modify start & end to ignore whitespace and quotechars
       long tempPos = pos - 1;
-      if (!is_na && dtypes[actual_col].id() != cudf::type_id::STRING) {
+      if (is_valid && dtypes[actual_col].id() != cudf::type_id::STRING) {
         trim_field_start_end(raw_csv, &start, &tempPos, options.quotechar);
       }
-
-      if (!is_na && start <= (tempPos)) {  // Empty fields are not legal values
-
+      if (is_valid) {
         // Type dispatcher does not handle STRING
         if (dtypes[actual_col].id() == cudf::type_id::STRING) {
           long end = pos;

@@ -699,6 +699,47 @@ std::vector<column_buffer> reader::impl::decode_data(std::vector<data_type> cons
   return out_buffers;
 }
 
+/**
+ * @brief Build the serialized trie of N/A strings: user values plus, if kept, the defaults.
+ */
+thrust::host_vector<SerialTrieNode> create_na_trie(char quotechar,
+                                                   csv_reader_options const &reader_opts)
+{
+  // Built-in null spellings; the leading "" entry makes an empty field match as N/A
+  static std::vector<std::string> const default_na_values{"",
+                                                          "#N/A",
+                                                          "#N/A N/A",
+                                                          "#NA",
+                                                          "-1.#IND",
+                                                          "-1.#QNAN",
+                                                          "-NaN",
+                                                          "-nan",
+                                                          "1.#IND",
+                                                          "1.#QNAN",
+                                                          "<NA>",
+                                                          "N/A",
+                                                          "NA",
+                                                          "NULL",
+                                                          "NaN",
+                                                          "n/a",
+                                                          "nan",
+                                                          "null"};
+
+  if (!reader_opts.is_enabled_na_filter()) { return {}; }  // NA filter off: empty trie
+
+  std::vector<std::string> na_values = reader_opts.get_na_values();  // local, mutable copy
+  if (reader_opts.is_enabled_keep_default_na()) {
+    na_values.insert(na_values.end(), default_na_values.begin(), default_na_values.end());
+  }
+
+  // Like pandas: if empty fields are N/A, a quoted empty field (two quotechars) is N/A too
+  if (std::find(na_values.begin(), na_values.end(), "") != na_values.end()) {
+    na_values.push_back(std::string(2, quotechar));
+  }
+
+  return createSerializedTrie(na_values);
+}
+
 parse_options make_parse_options(csv_reader_options const &reader_opts)
 {
   auto parse_opts = parse_options{};
@@ -747,9 +788,7 @@ parse_options make_parse_options(csv_reader_options const &reader_opts)
   }
 
   // Handle user-defined N/A values, whereby field data is treated as null
-  if (reader_opts.get_na_values().size() != 0) {
-    parse_opts.trie_na = createSerializedTrie(reader_opts.get_na_values());
-  }
+  parse_opts.trie_na = create_na_trie(parse_opts.quotechar, reader_opts);
 
   return parse_opts;
 }

@@ -928,7 +928,7 @@ TEST_F(CsvReaderTest, nullHandling)
   const auto filepath = temp_env->get_temp_dir() + "NullValues.csv";
   {
     std::ofstream outfile(filepath, std::ofstream::out);
-    outfile << "NULL\nnull\nn/a\nNull\nNA\nnan";
+    outfile << "NULL\n\nnull\nn/a\nNull\nNA\nnan";
   }
 
   // Test disabling na_filter
@@ -937,11 +937,12 @@ TEST_F(CsvReaderTest, nullHandling)
       cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
         .na_filter(false)
         .dtypes({"str"})
-        .header(-1);
+        .header(-1)
+        .skip_blank_lines(false);
     const auto result = cudf_io::read_csv(in_opts);
     const auto view   = result.tbl->view();
-    auto expect = cudf::test::strings_column_wrapper({"NULL", "null", "n/a", "Null", "NA", "nan"});
-
+    auto expect =
+      cudf::test::strings_column_wrapper({"NULL", "", "null", "n/a", "Null", "NA", "nan"});
     expect_columns_equal(expect, view.column(0));
   }
 
@@ -950,11 +951,13 @@ TEST_F(CsvReaderTest, nullHandling)
     cudf_io::csv_reader_options in_opts =
       cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
         .dtypes({"str"})
-        .header(-1);
+        .header(-1)
+        .skip_blank_lines(false);
     const auto result = cudf_io::read_csv(in_opts);
     const auto view   = result.tbl->view();
-    auto expect = cudf::test::strings_column_wrapper({"NULL", "null", "n/a", "Null", "NA", "nan"},
-                                                     {false, false, false, true, false, false});
+    auto expect =
+      cudf::test::strings_column_wrapper({"NULL", "", "null", "n/a", "Null", "NA", "nan"},
+                                         {false, false, false, false, true, false, false});
 
     expect_columns_equal(expect, view.column(0));
   }
@@ -965,11 +968,13 @@ TEST_F(CsvReaderTest, nullHandling)
       cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
         .na_values({"Null"})
         .dtypes({"str"})
-        .header(-1);
+        .header(-1)
+        .skip_blank_lines(false);
     const auto result = cudf_io::read_csv(in_opts);
     const auto view   = result.tbl->view();
-    auto expect = cudf::test::strings_column_wrapper({"NULL", "null", "n/a", "Null", "NA", "nan"},
-                                                     {false, false, false, false, false, false});
+    auto expect =
+      cudf::test::strings_column_wrapper({"NULL", "", "null", "n/a", "Null", "NA", "nan"},
+                                         {false, false, false, false, false, false, false});
 
     expect_columns_equal(expect, view.column(0));
   }
@@ -981,11 +986,13 @@ TEST_F(CsvReaderTest, nullHandling)
         .keep_default_na(false)
         .na_values({"Null"})
         .dtypes({"str"})
-        .header(-1);
+        .header(-1)
+        .skip_blank_lines(false);
     const auto result = cudf_io::read_csv(in_opts);
     const auto view   = result.tbl->view();
-    auto expect = cudf::test::strings_column_wrapper({"NULL", "null", "n/a", "Null", "NA", "nan"},
-                                                     {true, true, true, false, true, true});
+    auto expect =  // defaults are disabled, so only the user-provided "Null" is N/A
+      cudf::test::strings_column_wrapper({"NULL", "", "null", "n/a", "Null", "NA", "nan"},
+                                         {true, true, true, true, false, true, true});
 
     expect_columns_equal(expect, view.column(0));
   }

@@ -67,14 +67,6 @@ def read_csv(
     if na_values is not None and is_scalar(na_values):
         na_values = [na_values]
 
-    if keep_default_na is False:
-        # TODO: Remove this error once the following issue is fixed:
-        # https://github.com/rapidsai/cudf/issues/6680
-        raise NotImplementedError(
-            "keep_default_na=False is currently not supported, please refer "
-            "to: https://github.com/rapidsai/cudf/issues/6680"
-        )
-
     return libcudf.csv.read_csv(
         filepath_or_buffer,
         lineterminator=lineterminator,