From 71a5292dba61b516f70feb099c37d326bcd49ea1 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Mon, 1 Aug 2022 16:44:03 -0700
Subject: [PATCH] Return schema info from JSON reader (#11419)

Populate the `schema_info` structure (in addition to `column_names`) to match the behavior of a (future) JSON reader that supports nested columns.
Use the `schema_info` in Cython to set the struct columns' field names (unused until nested type support is added).

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/11419
---
 cpp/src/io/json/reader_impl.cu | 10 +++++++++-
 python/cudf/cudf/_lib/json.pyx | 18 ++++++++++++------
 python/cudf/cudf/io/json.py    |  6 ++----
 3 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index 7e6be190acb..6b12b462dd9 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -550,13 +550,21 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts,
     }
   }
 
+  std::vector<column_name_info> column_infos;
+  column_infos.reserve(column_names.size());
+  std::transform(column_names.cbegin(),
+                 column_names.cend(),
+                 std::back_inserter(column_infos),
+                 [](auto const& col_name) { return column_name_info{col_name}; });
+
   // This is to ensure the stream-ordered make_stream_column calls above complete before
   // the temporary std::vectors are destroyed on exit from this function.
   stream.synchronize();
 
   CUDF_EXPECTS(!out_columns.empty(), "No columns created from json input");
 
-  return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {column_names}};
+  return table_with_metadata{std::make_unique<table>(std::move(out_columns)),
+                             {column_names, column_infos}};
 }
 
 /**
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index 263d70afe26..9c820a56104 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -22,7 +22,7 @@ from cudf._lib.cpp.io.json cimport (
     read_json as libcudf_read_json,
 )
 from cudf._lib.cpp.types cimport data_type, size_type, type_id
-from cudf._lib.io.utils cimport make_source_info
+from cudf._lib.io.utils cimport make_source_info, update_struct_field_names
 from cudf._lib.types cimport dtype_to_data_type
 from cudf._lib.utils cimport data_from_unique_ptr
 
@@ -106,14 +106,20 @@ cpdef read_json(object filepaths_or_buffers,
         opts.set_dtypes(c_dtypes_map)
 
     # Read JSON
-    cdef cudf_io_types.table_with_metadata c_out_table
+    cdef cudf_io_types.table_with_metadata c_result
 
     with nogil:
-        c_out_table = move(libcudf_read_json(opts))
+        c_result = move(libcudf_read_json(opts))
 
-    column_names = [x.decode() for x in c_out_table.metadata.column_names]
-    return data_from_unique_ptr(move(c_out_table.tbl),
-                                column_names=column_names)
+    meta_names = [name.decode() for name in c_result.metadata.column_names]
+    df = cudf.DataFrame._from_data(*data_from_unique_ptr(
+        move(c_result.tbl),
+        column_names=meta_names
+    ))
+
+    update_struct_field_names(df, c_result.metadata.schema_info)
+
+    return df
 
 cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
     if cudf.api.types.is_categorical_dtype(dtype):
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index 489256564e0..85f024e2420 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -57,10 +57,8 @@ def read_json(
             else:
                 filepaths_or_buffers.append(tmp_source)
 
-        df = cudf.DataFrame._from_data(
-            *libjson.read_json(
-                filepaths_or_buffers, dtype, lines, compression, byte_range
-            )
+        df = libjson.read_json(
+            filepaths_or_buffers, dtype, lines, compression, byte_range
         )
     else:
         warnings.warn(