## Imports and variables configuration

As we have already updated the compute with the Hudi-Spark bundle jar, it is automatically added to the classpath. So you can start reading and writing Hudi tables as you would in any other Spark environment.

In [0]:
# pyspark
from pyspark.sql.functions import lit, col

# Local-filesystem URI under which the Hudi table's files are stored.
basePath = "file:///tmp/trips_table"
# Name passed to the Hudi writer as the table name.
tableName = "trips_table"


### Creating dataframe

In [0]:
# Column names, in the same positional order as the fields of each tuple in `data`.
columns = [
    "ts",
    "uuid",
    "rider",
    "driver",
    "fare",
    "city",
]

# Sample trip records: one tuple per trip, fields ordered as listed in `columns`.
data = [
    (1695159649087, "334e26e9-8355-45cc-97c6-c31daf0df330", "rider-A", "driver-K", 19.10, "san_francisco"),
    (1695091554788, "e96c4396-3fad-413a-a942-4cb36106d721", "rider-C", "driver-M", 27.70, "san_francisco"),
    (1695046462179, "9909a8b1-2d15-4d3d-8ec9-efc48c536a00", "rider-D", "driver-L", 33.90, "san_francisco"),
    (1695516137016, "e3cf430c-889d-4015-bc98-59bdce1e530c", "rider-F", "driver-P", 34.15, "sao_paulo"),
    (1695115999911, "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa", "rider-J", "driver-T", 17.85, "chennai"),
]
# Build the DataFrame from the sample tuples, naming the columns from `columns`;
# column types are inferred from the data.
inserts = spark.createDataFrame(data, schema=columns)

In [0]:
# Print the DataFrame as a text table to sanity-check the rows before they are written.
inserts.show()

+-------------+--------------------+-------+--------+-----+-------------+
|           ts|                uuid|  rider|  driver| fare|         city|
+-------------+--------------------+-------+--------+-----+-------------+
|1695159649087|334e26e9-8355-45c...|rider-A|driver-K| 19.1|san_francisco|
|1695091554788|e96c4396-3fad-413...|rider-C|driver-M| 27.7|san_francisco|
|1695046462179|9909a8b1-2d15-4d3...|rider-D|driver-L| 33.9|san_francisco|
|1695516137016|e3cf430c-889d-401...|rider-F|driver-P|34.15|    sao_paulo|
|1695115999911|c8abbe79-8d89-47e...|rider-J|driver-T|17.85|      chennai|
+-------------+--------------------+-------+--------+-----+-------------+



### Saving the table as a Hudi table

In [0]:
# Options handed to the Hudi writer; only the table name is configured here.
hudi_options = {"hoodie.table.name": tableName}

In [0]:
# Write the DataFrame to `basePath` in Hudi format using the options above,
# in "overwrite" save mode.
(
    inserts.write.format("hudi")
    .options(**hudi_options)
    .mode("overwrite")
    .save(basePath)
)

### Reading the table

Note: Pay close attention to `hoodie.file.index.enable` being set to `false`. This option controls the use of the Spark file index implementation for Hudi, which speeds up listing of large tables; setting it to `false` turns that implementation off, which is mandatory if you're using Databricks to read Hudi tables.

In [0]:
# Load the Hudi table stored at `basePath`, with `hoodie.file.index.enable`
# set to "false" for this read.
tripsDF = (
    spark.read.format("hudi")
    .option("hoodie.file.index.enable", "false")
    .load(basePath)
)

In [0]:
# Print the rows read back from the Hudi table; the output includes the
# `_hoodie_*` metadata columns alongside the original trip columns.
tripsDF.show()

+-------------------+--------------------+--------------------+----------------------+--------------------+-------------+--------------------+-------+--------+-----+-------------+
|_hoodie_commit_time|_hoodie_commit_seqno|  _hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|           ts|                uuid|  rider|  driver| fare|         city|
+-------------------+--------------------+--------------------+----------------------+--------------------+-------------+--------------------+-------+--------+-----+-------------+
|  20240514185003982|20240514185003982...|20240514185003982...|                      |08452ccd-a94a-4c7...|1695046462179|9909a8b1-2d15-4d3...|rider-D|driver-L| 33.9|san_francisco|
|  20240514185003982|20240514185003982...|20240514185003982...|                      |11aaa299-e866-4ce...|1695091554788|e96c4396-3fad-413...|rider-C|driver-M| 27.7|san_francisco|
|  20240514185003982|20240514185003982...|20240514185003982...|                      |f230c189-9295-

In [0]:
# Register the DataFrame as a temporary view so it can be queried by name from SQL.
# The view name must match the table name referenced in the %sql cell's FROM clause.
tripsDF.createOrReplaceTempView("trips_table")

In [0]:
%sql
-- Trips from the trips_table view whose fare is greater than 20.0.
SELECT
  uuid,
  fare,
  ts,
  rider,
  driver,
  city
FROM trips_table
WHERE fare > 20.0

uuid,fare,ts,rider,driver,city
9909a8b1-2d15-4d3d-8ec9-efc48c536a00,33.9,1695046462179,rider-D,driver-L,san_francisco
e96c4396-3fad-413a-a942-4cb36106d721,27.7,1695091554788,rider-C,driver-M,san_francisco
e3cf430c-889d-4015-bc98-59bdce1e530c,34.15,1695516137016,rider-F,driver-P,sao_paulo
