From a5f4e47df6eb80708fda61a9b9e3ae2258062e41 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Wed, 27 Apr 2022 11:25:19 +0100 Subject: [PATCH] Fix Socrata previews returning null It looks like `row_to_json(t.*)` makes Multicorn think we're requesting no columns from the FDW (this could be related to our Multicorn changes, but it doesn't happen with the LQFDW): ```sql explain SELECT row_to_json(p.*) FROM sg_tmp_67cecbc993a0738c2f815e80169dafd3.some_table p LIMIT 10 Limit (cost=20.00..20.02 rows=10 width=32) -> Foreign Scan on some_table p (cost=20.00..237.38 rows=94954 width=32) Multicorn: Socrata query to data.cityofnewyork.us Multicorn: Socrata dataset ID: 8wbx-tsch Multicorn: Query: Multicorn: Columns: Multicorn: Order: :id ``` This query returns JSON objects full of NULLs. Rewrite the preview query to first get the first 10 rows as a materialized CTE, then use `row_to_json` on the result, which seems to fix the issue: ```sql explain SELECT t.* FROM sg_tmp_67cecbc993a0738c2f815e80169dafd3.some_table t LIMIT 10 Limit (cost=20.00..33620.00 rows=10 width=3360) -> Foreign Scan on some_table t (cost=20.00..319045440.00 rows=94954 width=3360) Multicorn: Socrata query to data.cityofnewyork.us Multicorn: Socrata dataset ID: 8wbx-tsch Multicorn: Query: Multicorn: Columns: `last_date_updated`,`license_type`,`last_time_updated`,`vehicle_license_number`,`wheelchair_accessible`,`vehicle_year`,`permit_license_number`,`order_date`,`certification_date`,`base_address`,`base_name`,`reason`,`hack_up_date`,`expiration_date`,`vehicle_vin_number`,`base_number`,`base_telephone_number`,`dmv_license_plate_number`,`website`,`active`,`base_type`,`veh`,`name` Multicorn: Order: :id explain with p as materialized (SELECT * FROM sg_tmp_67cecbc993a0738c2f815e80169dafd3.some_table LIMIT 10) select row_to_json(p.*) from p CTE Scan on p (cost=33620.00..33620.22 rows=10 width=32) CTE p -> Limit (cost=20.00..33620.00 rows=10 width=3360) -> Foreign Scan on some_table (cost=20.00..319045440.00 rows=94954 width=3360) Multicorn: Socrata query to data.cityofnewyork.us Multicorn: Socrata dataset ID: 8wbx-tsch Multicorn: Query: Multicorn: Columns: `last_date_updated`,`license_type`,`last_time_updated`,`vehicle_license_number`,`wheelchair_accessible`,`vehicle_year`,`permit_license_number`,`order_date`,`certification_date`,`base_address`,`base_name`,`reason`,`hack_up_date`,`expiration_date`,`vehicle_vin_number`,`base_number`,`base_telephone_number`,`dmv_license_plate_number`,`website`,`active`,`base_type`,`veh`,`name` Multicorn: Order: :id ``` --- splitgraph/hooks/data_source/fdw.py | 7 ++++--- test/splitgraph/ingestion/test_socrata.py | 4 ++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/splitgraph/hooks/data_source/fdw.py b/splitgraph/hooks/data_source/fdw.py index e42c8117..3b91baeb 100644 --- a/splitgraph/hooks/data_source/fdw.py +++ b/splitgraph/hooks/data_source/fdw.py @@ -235,9 +235,10 @@ def _preview_table(self, schema: str, table: str, limit: int = 10) -> List[Dict[ result_json = cast( List[Dict[str, Any]], self.engine.run_sql( - SQL("SELECT row_to_json(t.*) FROM {}.{} t LIMIT %s").format( - Identifier(schema), Identifier(table) - ), + SQL( + "WITH p AS MATERIALIZED(SELECT * FROM {}.{} LIMIT %s) " + "SELECT row_to_json(p.*) FROM p" + ).format(Identifier(schema), Identifier(table)), (limit,), return_shape=ResultShape.MANY_ONE, ), diff --git a/test/splitgraph/ingestion/test_socrata.py b/test/splitgraph/ingestion/test_socrata.py index 473d3ba7..449a533b 100644 --- a/test/splitgraph/ingestion/test_socrata.py +++ b/test/splitgraph/ingestion/test_socrata.py @@ -423,3 +423,7 @@ def test_socrata_data_source_introspection_smoke(local_engine_empty): schema, params = result["some_table"] assert len(schema) > 1 assert params == {"socrata_id": "8wbx-tsch"} + + preview_result = data_source.preview(tables=result) + assert len(preview_result["some_table"]) == 10 + assert preview_result["some_table"][0][":id"] is not None