Fix columns ordering issue in parquet reader (#10066)

Fixes: #10062 This PR fixes an issue where the order of `columns` and the parquet metadata columns (i.e., `meta['columns']`) can differ; the two are not guaranteed to always be in the same order. This PR removes the code that relied on that assumption and instead builds a dict of per-column metadata, keyed by column name, which is later used to update the column metadata in the dataframe. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - Devavret Makkar (https://github.com/devavret) URL: #10066
rapidsai · Jan 19, 2022 · 8e88adc · 8e88adc
1 parent 3aecce2
commit 8e88adc
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 3 deletions.
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
@@ -200,12 +200,19 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
 
     update_struct_field_names(df, c_out_table.metadata.schema_info)
 
-    # update the decimal precision of each column
     if meta is not None:
-        for col, col_meta in zip(column_names, meta["columns"]):
+        # Index each column's metadata by name, because the
+        # order of `meta["columns"]` and `column_names` is not
+        # guaranteed to be the same.
+        meta_data_per_column = {
+            col_meta['name']: col_meta for col_meta in meta["columns"]
+        }
+
+        # update the decimal precision of each column
+        for col in column_names:
             if is_decimal_dtype(df._data[col].dtype):
                 df._data[col].dtype.precision = (
-                    col_meta["metadata"]["precision"]
+                    meta_data_per_column[col]["metadata"]["precision"]
                 )
 
     # Set the index column

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
@@ -2368,3 +2368,21 @@ def test_parquet_writer_row_group_size(
         math.ceil(num_rows / size_rows), math.ceil(8 * num_rows / size_bytes)
     )
     assert expected_num_rows == row_groups
+
+
+def test_parquet_reader_decimal_columns():
+    df = cudf.DataFrame(
+        {
+            "col1": cudf.Series([1, 2, 3], dtype=cudf.Decimal64Dtype(10, 2)),
+            "col2": [10, 11, 12],
+            "col3": [12, 13, 14],
+            "col4": ["a", "b", "c"],
+        }
+    )
+    buffer = BytesIO()
+    df.to_parquet(buffer)
+
+    actual = cudf.read_parquet(buffer, columns=["col3", "col2", "col1"])
+    expected = pd.read_parquet(buffer, columns=["col3", "col2", "col1"])
+
+    assert_eq(actual, expected)