Return schema info from JSON reader #11419

Merged (4 commits, Aug 1, 2022)
cpp/src/io/json/reader_impl.cu (9 additions, 1 deletion)
@@ -550,13 +550,21 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts,
     }
   }

+  std::vector<column_name_info> column_infos;
+  column_infos.reserve(column_names.size());
+  std::transform(column_names.cbegin(),
+                 column_names.cend(),
+                 std::back_inserter(column_infos),
+                 [](auto const& col_name) { return column_name_info{col_name}; });
+
   // This is to ensure the stream-ordered make_stream_column calls above complete before
   // the temporary std::vectors are destroyed on exit from this function.
   stream.synchronize();

   CUDF_EXPECTS(!out_columns.empty(), "No columns created from json input");

-  return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {column_names}};
+  return table_with_metadata{std::make_unique<table>(std::move(out_columns)),
+                             {column_names, column_infos}};
 }

 /**
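For orientation, `column_name_info` is a small struct carrying a column name plus a list of child infos, so the reader's metadata can describe nested (struct/list) columns rather than only the flat `column_names` vector. Below is a purely illustrative Python sketch of that shape and of what the hunk above builds for flat output; the column names and nesting are hypothetical and this is not libcudf code.

# Illustrative sketch only: mirrors the shape of cudf::io::column_name_info
# (a name plus child infos). Column names and nesting below are made up.
from dataclasses import dataclass, field
from typing import List


@dataclass
class ColumnNameInfo:
    name: str
    children: List["ColumnNameInfo"] = field(default_factory=list)


# What the C++ change does for flat output: wrap each top-level column name
# in a schema-info node with no children.
column_names = ["id", "payload"]
column_infos = [ColumnNameInfo(name) for name in column_names]

# The same structure can also carry nested field names, e.g. a struct column
# "payload" with fields "a" and "b".
nested = ColumnNameInfo("payload", [ColumnNameInfo("a"), ColumnNameInfo("b")])
print(column_infos)
print(nested)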
python/cudf/cudf/_lib/json.pyx (12 additions, 6 deletions)
@@ -22,7 +22,7 @@ from cudf._lib.cpp.io.json cimport (
     read_json as libcudf_read_json,
 )
 from cudf._lib.cpp.types cimport data_type, size_type, type_id
-from cudf._lib.io.utils cimport make_source_info
+from cudf._lib.io.utils cimport make_source_info, update_struct_field_names
 from cudf._lib.types cimport dtype_to_data_type
 from cudf._lib.utils cimport data_from_unique_ptr

@@ -106,14 +106,20 @@ cpdef read_json(object filepaths_or_buffers,
         opts.set_dtypes(c_dtypes_map)

     # Read JSON
-    cdef cudf_io_types.table_with_metadata c_out_table
+    cdef cudf_io_types.table_with_metadata c_result

     with nogil:
-        c_out_table = move(libcudf_read_json(opts))
+        c_result = move(libcudf_read_json(opts))

-    column_names = [x.decode() for x in c_out_table.metadata.column_names]
-    return data_from_unique_ptr(move(c_out_table.tbl),
-                                column_names=column_names)
+    meta_names = [name.decode() for name in c_result.metadata.column_names]
+    df = cudf.DataFrame._from_data(*data_from_unique_ptr(

Contributor commented on the line above:

I guess you moved the _from_data call into the Cython because update_struct_field_names has to happen in Cython? In the long term I think that probably indicates some level of restructuring is required, but we can deal with that when we get around to cuIO/Cython refactoring more broadly.

+        move(c_result.tbl),
+        column_names=meta_names
+    ))
+
+    update_struct_field_names(df, c_result.metadata.schema_info)
+
+    return df

 cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
     if cudf.api.types.is_categorical_dtype(dtype):
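The reviewer's question above gets at the key constraint: update_struct_field_names needs the C++ schema_info (a tree of column_name_info) while it is still in scope, so the DataFrame has to be built and patched in Cython. Below is a rough, purely Python illustration of that renaming idea over a hypothetical (name, children) tree; the real helper is Cython code operating on libcudf metadata and cudf columns.

# Illustrative only: a (name, children) tuple stands in for column_name_info,
# and a nested dict stands in for a struct column whose field names need fixing.
from typing import List, Tuple

NameInfo = Tuple[str, List["NameInfo"]]


def apply_field_names(fields: dict, children: List[NameInfo]) -> dict:
    """Replace placeholder field names with names from the schema info,
    recursing into nested 'struct' values."""
    renamed = {}
    for (_, value), (new_name, grandchildren) in zip(
        fields.items(), children
    ):
        if isinstance(value, dict):
            value = apply_field_names(value, grandchildren)
        renamed[new_name] = value
    return renamed


schema: NameInfo = ("rec", [("a", []), ("b", [])])
fields = {"f0": 1, "f1": "x"}  # hypothetical placeholder field names
print(apply_field_names(fields, schema[1]))  # {'a': 1, 'b': 'x'}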
python/cudf/cudf/io/json.py (2 additions, 4 deletions)
@@ -57,10 +57,8 @@ def read_json(
             else:
                 filepaths_or_buffers.append(tmp_source)

-        df = cudf.DataFrame._from_data(
-            *libjson.read_json(
-                filepaths_or_buffers, dtype, lines, compression, byte_range
-            )
+        df = libjson.read_json(
+            filepaths_or_buffers, dtype, lines, compression, byte_range
         )
     else:
         warnings.warn(
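On the Python side the user-facing behavior for flat data is unchanged; read_json simply hands back the DataFrame already assembled in the Cython layer. A minimal usage sketch, assuming a cudf build where the cudf engine accepts in-memory buffers; the inline JSON-lines data is hypothetical.

# Minimal usage sketch; input data and column names are made up.
from io import StringIO

import cudf

buf = StringIO('{"a": 1, "b": "x"}\n{"a": 2, "b": "y"}\n')
df = cudf.read_json(buf, lines=True, engine="cudf")
print(df)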