Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Fix columns ordering issue in parquet reader #10066

Merged
merged 4 commits into from Jan 19, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 10 additions & 3 deletions python/cudf/cudf/_lib/parquet.pyx
Expand Up @@ -200,12 +200,19 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,

update_struct_field_names(df, c_out_table.metadata.schema_info)

# update the decimal precision of each column
if meta is not None:
for col, col_meta in zip(column_names, meta["columns"]):
# Keep a per-column record of the metadata, because the order
# of `meta["columns"]` and `column_names` is not guaranteed
# to be deterministic or to always match.
meta_data_per_column = {}
rgsl888prabhu marked this conversation as resolved.
Show resolved Hide resolved
for col_meta in meta["columns"]:
meta_data_per_column[col_meta['name']] = col_meta

# update the decimal precision of each column
for col in column_names:
if is_decimal_dtype(df._data[col].dtype):
df._data[col].dtype.precision = (
col_meta["metadata"]["precision"]
meta_data_per_column[col]["metadata"]["precision"]
)

# Set the index column
Expand Down
18 changes: 18 additions & 0 deletions python/cudf/cudf/tests/test_parquet.py
Expand Up @@ -2373,3 +2373,21 @@ def test_parquet_writer_row_group_size(
math.ceil(num_rows / size_rows), math.ceil(8 * num_rows / size_bytes)
)
assert expected_num_rows == row_groups


def test_parquet_reader_decimal_columns():
    """Check reading a column subset that includes a decimal column.

    A frame with one ``Decimal64Dtype(10, 2)`` column and three other
    columns is written to an in-memory parquet buffer. A subset of the
    columns is then requested in an order different from the order they
    were written in, and the cudf result is compared against what pandas
    reads from the same buffer.
    """
    # Requested order is deliberately not the order the frame was built in.
    selected = ("col3", "col2", "col1")

    decimal_values = cudf.Series(
        [1, 2, 3], dtype=cudf.Decimal64Dtype(10, 2)
    )
    gdf = cudf.DataFrame(
        {
            "col1": decimal_values,
            "col2": [10, 11, 12],
            "col3": [12, 13, 14],
            "col4": ["a", "b", "c"],
        }
    )

    parquet_bytes = BytesIO()
    gdf.to_parquet(parquet_bytes)

    # Each reader gets its own fresh list, as in a literal argument.
    got = cudf.read_parquet(parquet_bytes, columns=list(selected))
    want = pd.read_parquet(parquet_bytes, columns=list(selected))

    assert_eq(got, want)