In [2]:
from pyspark.sql import SparkSession



import os
from getpass import getpass

# Local Spark session (master "local[1]") used for the JDBC read below.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("read-postgres-jdbc")
    .getOrCreate()
)

# PostgreSQL connection details.
jdbc_url = "jdbc:postgresql://plaid-assessment:5432/bookings"
table_name = "bookable_flights"

# Credentials are taken from the environment so the secret is never stored in
# the notebook. POSTGRES_USER falls back to the previous default user; when
# POSTGRES_PASSWORD is unset (or empty) the password is prompted for instead.
username = os.environ.get("POSTGRES_USER", "postgres")
password = os.environ.get("POSTGRES_PASSWORD") or getpass("PostgreSQL password: ")

# Connection properties passed to the JDBC reader.
connection_properties = {
    "user": username,
    "password": password,
    "driver": "org.postgresql.Driver",
}

# Read the table from PostgreSQL into a Spark DataFrame.
df = (
    spark.read.format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", table_name)
    .options(**connection_properties)
    .load()
)

# Show the DataFrame schema and a preview of the rows. The former extra
# df.show(1) call was dropped: it only repeated the first row of this preview.
df.printSchema()
df.show()

root
 |-- departure_airport: string (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- arrival_airport: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- transit_airport: string (nullable = true)
 |-- transit_airport_arrival_time: string (nullable = true)
 |-- transit_airport_departure_time: string (nullable = true)
 |-- flights: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-----------------+--------------+---------------+-------------+---------------+----------------------------+------------------------------+--------+
|departure_airport|departure_time|arrival_airport| arrival_time|transit_airport|transit_airport_arrival_time|transit_airport_departure_time| flights|
+-----------------+--------------+---------------+-------------+---------------+----------------------------+------------------------------+--------+
|              UIK| Fri  12:15:00|            SGC|Fri  14:35:00|           NULL|                       

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp

# Initialize Spark session
# Build (or reuse) a Spark session carrying the Hudi-related settings.
# NOTE(review): getOrCreate() returns an already-running session when one
# exists in this kernel; the builder settings below may then not take effect —
# confirm this cell is run on a fresh kernel.
spark = (
    SparkSession.builder
    .appName("Hudi Event Time Example")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
    .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
    .config("spark.kryo.registrator", "org.apache.spark.HoodieSparkKryoRegistrar")
    .getOrCreate()
)

# Name of the Hudi table and the location it is written to / read from.
tableName = "iot_sensor_data"
basePath = "file:///tmp/iot_sensor_data"

# Two sample sensor readings; event_time is not part of the raw rows.
columns = ["sensor_id", "temperature", "city"]
data = [
    ("sensor_1", 34, "Patiala"),
    ("sensor_2", 31, "Gurugram"),
]
inserts = spark.createDataFrame(data).toDF(*columns)

# Add the current timestamp as event_time and move that column to the front.
inserts_with_event_time = (
    inserts
    .withColumn("event_time", current_timestamp())
    .select("event_time", "sensor_id", "temperature", "city")
)

# Hudi write options: sensor_id is the record key, city the partition path,
# event_time the precombine field; the table type is COPY_ON_WRITE.
hudi_options = {
    "hoodie.table.name": tableName,
    "hoodie.datasource.write.recordkey.field": "sensor_id",
    "hoodie.datasource.write.partitionpath.field": "city",
    "hoodie.datasource.write.precombine.field": "event_time",
    "hoodie.datasource.write.table.type": "COPY_ON_WRITE",
}

# Write the rows to basePath in "overwrite" mode.
(
    inserts_with_event_time.write
    .format("hudi")
    .options(**hudi_options)
    .mode("overwrite")
    .save(basePath)
)

# Read the table back from the same path and display it to verify the write.
tripsDF = spark.read.load(basePath, format="hudi")
tripsDF.show()

+-------------------+--------------------+------------------+----------------------+--------------------+--------------------+---------+-----------+--------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|          event_time|sensor_id|temperature|    city|
+-------------------+--------------------+------------------+----------------------+--------------------+--------------------+---------+-----------+--------+
|  20251116025356321|20251116025356321...|          sensor_2|              Gurugram|5c8ef5e8-01a6-426...|2025-11-16 02:53:...| sensor_2|         31|Gurugram|
|  20251116025356321|20251116025356321...|          sensor_1|               Patiala|1281b525-2f04-4e1...|2025-11-16 02:53:...| sensor_1|         34| Patiala|
+-------------------+--------------------+------------------+----------------------+--------------------+--------------------+---------+-----------+--------+



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp

# Initialize Spark session
# Build (or reuse) a Spark session carrying the Hudi-related settings.
spark = (
    SparkSession.builder
    .appName("Hudi Event Time Example")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
    .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
    .config("spark.kryo.registrator", "org.apache.spark.HoodieSparkKryoRegistrar")
    .getOrCreate()
)

# Table name (not used further in this cell) and the path the table is read from.
tableName = "stock_ticks_cow"
basePath = "file:///user/hive/warehouse/stock_ticks_cow"

# Load the Hudi table at basePath and display it.
# NOTE(review): the recorded run of this cell ended with
# "IllegalArgumentException: None". Presumably basePath does not point at a
# Hudi table reachable through the file:// scheme — verify the location.
tripsDF = spark.read.load(basePath, format="hudi")
tripsDF.show()

IllegalArgumentException: None

In [5]:
import os

In [6]:
# Set test_var to 'a' for this process, then read it back as the cell result.
os.environ.update({'test_var': 'a'})
os.getenv('test_var')

'a'

In [8]:
# Read test_var from the process environment (None when it is not set).
os.getenv('test_var')

'a'

In [9]:
# Overwrite test_var with 'b' in the process environment.
os.environ.update({'test_var': 'b'})

In [10]:
# Read test_var again to display its current value (None when it is not set).
os.getenv('test_var')

'b'