Skip to content

Commit

Permalink
Return schema info from JSON reader (#11419)
Browse files (browse the repository at this point in the history)
Populate the `schema_info` structure (in addition to `column_names`) to match the behavior of a (future) JSON reader that supports nested columns.
Use the `schema_info` in Cython to set the struct columns' field names (unused until nested type support is added).

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mark Harris (https://github.com/harrism)

URL: #11419
  • Loading branch information
vuule committed Aug 1, 2022
1 parent f92ba2b commit 71a5292
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 11 deletions.
10 changes: 9 additions & 1 deletion cpp/src/io/json/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -550,13 +550,21 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts,
}
}

std::vector<column_name_info> column_infos;
column_infos.reserve(column_names.size());
std::transform(column_names.cbegin(),
column_names.cend(),
std::back_inserter(column_infos),
[](auto const& col_name) { return column_name_info{col_name}; });

// This is to ensure the stream-ordered make_stream_column calls above complete before
// the temporary std::vectors are destroyed on exit from this function.
stream.synchronize();

CUDF_EXPECTS(!out_columns.empty(), "No columns created from json input");

return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {column_names}};
return table_with_metadata{std::make_unique<table>(std::move(out_columns)),
{column_names, column_infos}};
}

/**
Expand Down
18 changes: 12 additions & 6 deletions python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ from cudf._lib.cpp.io.json cimport (
read_json as libcudf_read_json,
)
from cudf._lib.cpp.types cimport data_type, size_type, type_id
from cudf._lib.io.utils cimport make_source_info
from cudf._lib.io.utils cimport make_source_info, update_struct_field_names
from cudf._lib.types cimport dtype_to_data_type
from cudf._lib.utils cimport data_from_unique_ptr

Expand Down Expand Up @@ -106,14 +106,20 @@ cpdef read_json(object filepaths_or_buffers,
opts.set_dtypes(c_dtypes_map)

# Read JSON
cdef cudf_io_types.table_with_metadata c_out_table
cdef cudf_io_types.table_with_metadata c_result

with nogil:
c_out_table = move(libcudf_read_json(opts))
c_result = move(libcudf_read_json(opts))

column_names = [x.decode() for x in c_out_table.metadata.column_names]
return data_from_unique_ptr(move(c_out_table.tbl),
column_names=column_names)
meta_names = [name.decode() for name in c_result.metadata.column_names]
df = cudf.DataFrame._from_data(*data_from_unique_ptr(
move(c_result.tbl),
column_names=meta_names
))

update_struct_field_names(df, c_result.metadata.schema_info)

return df

cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
if cudf.api.types.is_categorical_dtype(dtype):
Expand Down
6 changes: 2 additions & 4 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,8 @@ def read_json(
else:
filepaths_or_buffers.append(tmp_source)

df = cudf.DataFrame._from_data(
*libjson.read_json(
filepaths_or_buffers, dtype, lines, compression, byte_range
)
df = libjson.read_json(
filepaths_or_buffers, dtype, lines, compression, byte_range
)
else:
warnings.warn(
Expand Down

0 comments on commit 71a5292

Please sign in to comment.