In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count
from pyspark.sql import Row

# Step 2.1: Initialize Spark Session
spark = SparkSession.builder \
    .appName("Hudi Example") \
    .config("spark.jars.packages", "org.apache.hudi:hudi-spark3.5-bundle_2.12:0.15.0") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()


24/11/27 12:01:48 WARN Utils: Your hostname, MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.33 instead (on interface en0)
24/11/27 12:01:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/bhland/.ivy2/cache
The jars for the packages stored in: /Users/bhland/.ivy2/jars
org.apache.hudi#hudi-spark3.5-bundle_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a679abae-25f9-4de9-a22d-6621dbc7602d;1.0
	confs: [default]
	found org.apache.hudi#hudi-spark3.5-bundle_2.12;0.15.0 in central
	found org.apache.hive#hive-storage-api;2.8.1 in central


:: loading settings :: url = jar:file:/Users/bhland/miniforge3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.slf4j#slf4j-api;1.7.36 in local-m2-cache
:: resolution report :: resolve 77ms :: artifacts dl 2ms
	:: modules in use:
	org.apache.hive#hive-storage-api;2.8.1 from central in [default]
	org.apache.hudi#hudi-spark3.5-bundle_2.12;0.15.0 from central in [default]
	org.slf4j#slf4j-api;1.7.36 from local-m2-cache in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0   ||   3   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-a679abae-25f9-4de9-a22d-6621dbc7602d
	confs: [default]
	0 artifacts copied, 3 already retrieved (0kB/3ms)
24/11/27 12:01:48 WARN NativeCodeLoader: Unable to load native-hadoop library for 

In [2]:
# Define sample data
sample_data = [
    Row(id=1, category="Electronics", price=299.99),
    Row(id=2, category="Books", price=15.99),
    Row(id=3, category="Clothing", price=49.99),
    Row(id=4, category="Furniture", price=89.99),
]

# Create DataFrame
schema_df = spark.createDataFrame(sample_data)
schema_df.show()


# Hudi options
hudi_options_insert = {
    'hoodie.table.name': 'hudi_streaming_table',
    'hoodie.datasource.write.recordkey.field': 'id',  # Primary key
    'hoodie.datasource.write.partitionpath.field': '',  # Non-partitioned table
    'hoodie.datasource.write.hive_style_partitioning': 'false',
    'hoodie.datasource.write.precombine.field': 'price',  # Used for deduplication
    'hoodie.datasource.write.operation': 'insert',  # Initial insert
    'hoodie.insert.shuffle.parallelism': 2,
    'hoodie.upsert.shuffle.parallelism': 2,
    'hoodie.write.handle.insert.skip.null.record': 'true'
}

# Output path for Hudi table
output_path = "file:///Users/bhland/Big_Data_Project/RealTimePOC/output/hudi_streaming_table" 

# Write the DataFrame to Hudi
schema_df.write.format("hudi").options(**hudi_options_insert).mode("overwrite").save(output_path)

print(f"Hudi table created at: {output_path}")

                                                                                

+---+-----------+------+
| id|   category| price|
+---+-----------+------+
|  1|Electronics|299.99|
|  2|      Books| 15.99|
|  3|   Clothing| 49.99|
|  4|  Furniture| 89.99|
+---+-----------+------+



24/11/27 11:37:16 WARN DFSPropertiesConfiguration: Cannot find HUDI_CONF_DIR, please set it as the dir of hudi-defaults.conf
24/11/27 11:37:16 WARN DFSPropertiesConfiguration: Properties file file:/etc/hudi/conf/hudi-defaults.conf not found. Ignoring to load props file
24/11/27 11:37:17 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-hbase.properties,hadoop-metrics2.properties


Hudi table created at: file:///Volumes/BH_Land/BigData Project/Dataset/RealTimePOC/output/hudi_streaming_table


24/11/27 11:37:19 WARN HoodieSparkSqlWriterInternal: Closing write client


In [3]:
hudi_df = spark.read.format("hudi").load(output_path)
hudi_df.show()

+-------------------+--------------------+------------------+----------------------+--------------------+---+-----------+------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name| id|   category| price|
+-------------------+--------------------+------------------+----------------------+--------------------+---+-----------+------+
|  20241127113716384|20241127113716384...|                 1|                      |07ec6f27-a6ff-46c...|  1|Electronics|299.99|
|  20241127113716384|20241127113716384...|                 2|                      |07ec6f27-a6ff-46c...|  2|      Books| 15.99|
|  20241127113716384|20241127113716384...|                 3|                      |07ec6f27-a6ff-46c...|  3|   Clothing| 49.99|
|  20241127113716384|20241127113716384...|                 4|                      |07ec6f27-a6ff-46c...|  4|  Furniture| 89.99|
+-------------------+--------------------+------------------+----------------------+-------------

In [3]:
# Step 2.2: Read JSON File into a Spark DataFrame
input_path = "file:///Users/bhland/Big_Data_Project/RealTimePOC/input"  # Update this path

# Step 2.3: Read JSON Files as a Streaming Source
json_stream_df = spark.readStream \
    .format("json") \
    .schema("id INT, category STRING, price DOUBLE") \
    .load(input_path)

json_stream_df = json_stream_df.na.drop(how="any")

'''
print_query = json_stream_df.writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .start()

print_query.awaitTermination()
'''

# Step 2.4: Apply Aggregation Transformation
agg_stream_df = json_stream_df.groupBy("category").agg(
    count("id").alias("count"),
    avg("price").alias("avg_price")
)

# Step 2.4: Define Hudi Options
hudi_options_stream = {
    'hoodie.table.name': 'hudi_streaming_table',
    'hoodie.datasource.write.recordkey.field': 'category',
    'hoodie.datasource.write.precombine.field': 'avg_price',
    'hoodie.datasource.write.table.name': 'hudi_streaming_table',
    'hoodie.datasource.write.operation': 'upsert',
    'hoodie.datasource.write.hive_style_partitioning': 'true',
    'hoodie.upsert.shuffle.parallelism': 2,
    'hoodie.insert.shuffle.parallelism': 2,
    'hoodie.write.handle.insert.skip.null.record': 'true'
}

# Output path for Hudi table
output_path = "file:///Users/bhland/Big_Data_Project/RealTimePOC/output/hudi_streaming_table" 

# Step 2.6: Write to Hudi Table Using Structured Streaming
hudi_query = agg_stream_df.writeStream \
    .format("hudi") \
    .options(**hudi_options_stream) \
    .outputMode("update") \
    .option("checkpointLocation", "file:///Users/bhland/Big_Data_Project/RealTimePOC/output/checkpoint_dir") \
    .start(output_path)


# Await termination to keep the stream running
hudi_query.awaitTermination()



24/11/27 12:02:44 WARN HoodieStreamingSink: Ignore TableNotFoundException as it is first microbatch.
24/11/27 12:02:44 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/11/27 12:02:54 WARN DFSPropertiesConfiguration: Cannot find HUDI_CONF_DIR, please set it as the dir of hudi-defaults.conf
24/11/27 12:02:54 WARN DFSPropertiesConfiguration: Properties file file:/etc/hudi/conf/hudi-defaults.conf not found. Ignoring to load props file
24/11/27 12:02:54 WARN HoodieWriteConfig: Embedded timeline server is disabled, fallback to use direct marker type for spark
24/11/27 12:02:55 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-hbase.properties,hadoop-metrics2.properties
                                                                                



24/11/27 12:02:59 WARN HoodieSparkSqlWriterInternal: Closing write client
ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/Users/bhland/miniforge3/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/bhland/miniforge3/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/bhland/miniforge3/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [4]:
hudi_df = spark.read.format("hudi").load(output_path)
hudi_df.show()

+-------------------+--------------------+------------------+----------------------+--------------------+-----------+-----+-----------------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|   category|count|        avg_price|
+-------------------+--------------------+------------------+----------------------+--------------------+-----------+-----+-----------------+
|  20241127120254388|20241127120254388...|       electronics|                      |081f6589-f80b-4f6...|electronics|    3|328.3333333333333|
|  20241127120254388|20241127120254388...|        appliances|                      |081f6589-f80b-4f6...| appliances|    1|            850.0|
|  20241127120254388|20241127120254388...|         furniture|                      |081f6589-f80b-4f6...|  furniture|    2|            550.0|
+-------------------+--------------------+------------------+----------------------+--------------------+-----------+-----+-----------------+

