From 71a5292dba61b516f70feb099c37d326bcd49ea1 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Mon, 1 Aug 2022 16:44:03 -0700
Subject: [PATCH] Return schema info from JSON reader (#11419)

Populate the `schema_info` structure (in addition to `column_names`) in the JSON reader's output metadata, to match the behavior of a (future) JSON reader that supports nested columns.
In Cython, use `schema_info` to set the field names of struct columns (the field names are unused until nested type support is added).

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/11419
---
 cpp/src/io/json/reader_impl.cu | 10 +++++++++-
 python/cudf/cudf/_lib/json.pyx | 18 ++++++++++++------
 python/cudf/cudf/io/json.py    |  6 ++----
 3 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index 7e6be190acb..6b12b462dd9 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -550,13 +550,21 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts,
     }
   }
 
+  std::vector<column_name_info> column_infos;
+  column_infos.reserve(column_names.size());
+  std::transform(column_names.cbegin(),
+                 column_names.cend(),
+                 std::back_inserter(column_infos),
+                 [](auto const& col_name) { return column_name_info{col_name}; });
+
   // This is to ensure the stream-ordered make_stream_column calls above complete before
   // the temporary std::vectors are destroyed on exit from this function.
   stream.synchronize();
 
   CUDF_EXPECTS(!out_columns.empty(), "No columns created from json input");
 
-  return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {column_names}};
+  return table_with_metadata{std::make_unique<table>(std::move(out_columns)),
+                             {column_names, column_infos}};
 }
 
 /**
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index 263d70afe26..9c820a56104 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -22,7 +22,7 @@ from cudf._lib.cpp.io.json cimport (
     read_json as libcudf_read_json,
 )
 from cudf._lib.cpp.types cimport data_type, size_type, type_id
-from cudf._lib.io.utils cimport make_source_info
+from cudf._lib.io.utils cimport make_source_info, update_struct_field_names
 from cudf._lib.types cimport dtype_to_data_type
 from cudf._lib.utils cimport data_from_unique_ptr
 
@@ -106,14 +106,20 @@ cpdef read_json(object filepaths_or_buffers,
         opts.set_dtypes(c_dtypes_map)
 
     # Read JSON
-    cdef cudf_io_types.table_with_metadata c_out_table
+    cdef cudf_io_types.table_with_metadata c_result
 
     with nogil:
-        c_out_table = move(libcudf_read_json(opts))
+        c_result = move(libcudf_read_json(opts))
 
-    column_names = [x.decode() for x in c_out_table.metadata.column_names]
-    return data_from_unique_ptr(move(c_out_table.tbl),
-                                column_names=column_names)
+    meta_names = [name.decode() for name in c_result.metadata.column_names]
+    df = cudf.DataFrame._from_data(*data_from_unique_ptr(
+        move(c_result.tbl),
+        column_names=meta_names
+    ))
+
+    update_struct_field_names(df, c_result.metadata.schema_info)
+
+    return df
 
 cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
     if cudf.api.types.is_categorical_dtype(dtype):
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index 489256564e0..85f024e2420 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -57,10 +57,8 @@ def read_json(
             else:
                 filepaths_or_buffers.append(tmp_source)
 
-        df = cudf.DataFrame._from_data(
-            *libjson.read_json(
-                filepaths_or_buffers, dtype, lines, compression, byte_range
-            )
+        df = libjson.read_json(
+            filepaths_or_buffers, dtype, lines, compression, byte_range
         )
     else:
         warnings.warn(