In [0]:
# Fleet Data Ingestion (Batch)
# Read the raw fleet table from the catalog and preview its first rows.
FLEET_SOURCE_TABLE = "enterprise_modernization.default.fleet_data"

fleet_df = spark.table(FLEET_SOURCE_TABLE)
fleet_df.show(n=5)

# Save to Bronze table (currently disabled; uncommenting would replace the
# bronze_fleet_data table with a full copy of fleet_df)
#fleet_df.write.mode("overwrite").saveAsTable("enterprise_modernization.default.bronze_fleet_data")

In [0]:
import time

from pyspark.sql import Window
from pyspark.sql.functions import monotonically_increasing_id, row_number

# Simulate a stream by replaying fleet_df into the bronze table in fixed-size
# batches, pausing between batches.
BRONZE_TABLE = "enterprise_modernization.bronze.bronze_fleet"
batch_size = 100

# monotonically_increasing_id() is unique and increasing but NOT consecutive
# (the partition index is encoded in the upper bits), so its values cannot be
# sliced into [start, start + batch_size) ranges directly. row_number() over
# that ordering yields a dense 1-based index; subtracting 1 makes it 0-based,
# and the cast keeps row_id a long column.
# NOTE: a window without partitionBy moves every row into a single partition,
# so this is only suitable when the dataset is not huge.
row_order = Window.orderBy(monotonically_increasing_id())
fleet_df = fleet_df.withColumn(
    "row_id", (row_number().over(row_order) - 1).cast("long")
)

# Process in batches for simulation
total_rows = fleet_df.count()
for start in range(0, total_rows, batch_size):
    end = start + batch_size
    batch = fleet_df.filter((fleet_df.row_id >= start) & (fleet_df.row_id < end))
    print(f"Processing simulated stream batch: {start} - {end}")

    # The first batch replaces the table so re-running the cell starts clean;
    # later batches append. Overwriting on every iteration would leave only
    # the final batch in the table.
    write_mode = "overwrite" if start == 0 else "append"
    batch.write.mode(write_mode).saveAsTable(BRONZE_TABLE)

    batch.show(3)
    time.sleep(1)  # Simulate 1 second delay per batch (adjust as needed)


In [0]:
# DEMO ONLY: Example of reading from Kafka (won't run in CE)
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StringType, StructType, StructField

# Subscribe to the fleet topic as a streaming DataFrame.
df_stream = (
    spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "broker:9092")
        .option("subscribe", "fleet-topic")
        .load()
)

# Transform the raw values column (assuming JSON payload)
# Every field is read as a string here, including the timestamp.
schema = StructType([
    StructField("vehicle_id", StringType()),
    StructField("timestamp", StringType()),
    StructField("location", StringType()),
    # ... add other fields as needed
])

# Cast the Kafka value to a string, parse it with the schema above, and
# flatten the parsed struct into top-level columns.
fleet_json_df = df_stream.select(
    from_json(col("value").cast("string"), schema).alias("data")
).select("data.*")

# Write to bronze table
# DataStreamWriter.toTable() starts the streaming query and returns its
# handle; keep it so the stream can be stopped with fleet_stream_query.stop().
# NOTE(review): this targets the `default` schema, while the batch simulation
# and the inspection cell use `enterprise_modernization.bronze.bronze_fleet`
# -- confirm which table is intended.
# NOTE(review): the checkpoint lives under /tmp -- confirm that location is
# durable enough for the stream to resume after a restart.
fleet_stream_query = (
    fleet_json_df.writeStream
        .format("delta")
        .outputMode("append")
        .option("checkpointLocation", "/tmp/checkpoints/fleet")
        .toTable("enterprise_modernization.default.bronze_fleet")
)


In [0]:
# Inspect the bronze table: column names, total row count, and a short preview.
bronze_table_name = "enterprise_modernization.bronze.bronze_fleet"
df = spark.table(bronze_table_name)

# Column names first, then the number of rows.
print(df.columns)
print(df.count())

# First five rows.
df.show(n=5)