In [0]:
# DEMO ONLY: Example of reading from Kafka and landing the records in a bronze Delta table.
# `spark` is the SparkSession provided by the notebook runtime.
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StringType, StructType, StructField

df_stream = (
    spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "broker:9092")
        .option("subscribe", "fleet-topic")
        .load()
)

# Schema used to parse the Kafka `value` column (assuming a JSON payload).
# Every field is read as a string here; no type conversion happens in this cell.
schema = StructType([
    StructField("vehicle_id", StringType()),
    StructField("timestamp", StringType()),
    StructField("location", StringType()),
    # ... add other fields as needed
])

# Kafka delivers `value` as binary: cast to string, parse as JSON, then
# flatten the parsed struct into top-level columns.
fleet_json_df = df_stream.select(
    from_json(col("value").cast("string"), schema).alias("data")
).select("data.*")

# Write to bronze table.
# `toTable` is the documented PySpark API for starting a streaming write into
# a table. The returned StreamingQuery handle is kept so the stream can be
# stopped later with `bronze_fleet_query.stop()`.
bronze_fleet_query = (
    fleet_json_df.writeStream
        .format("delta")
        .outputMode("append")
        .option("checkpointLocation", "/tmp/checkpoints/fleet")
        .toTable("enterprise_modernization.default.bronze_fleet")
)


In [0]:
import requests
import pandas as pd

# Try the public demo API first. `api_df` stays None when the call fails, so
# later code can test for it instead of hitting a NameError.
api_url = "https://fakestoreapi.com/products"
api_df = None
try:
    # The timeout keeps the cell from hanging indefinitely. Network-level
    # failures (DNS, refused connection, timeout) raise an exception rather
    # than returning a non-OK response, so they are caught here as well.
    response = requests.get(api_url, timeout=10)
    response.raise_for_status()
    data = response.json()
    api_df = pd.DataFrame(data)
except (requests.RequestException, ValueError) as exc:
    # ValueError covers a body that is not valid JSON.
    print("Request failed due to network restrictions in CE. Using uploaded table as a stand-in.")
    print(f"Reason: {exc}")

# Always load the uploaded fleet table: the batching cell that follows reads
# `fleet_df` regardless of whether the API call succeeded.
fleet_df = spark.table("enterprise_modernization.default.fleet_car_data_final")
fleet_df.show(5)



In [0]:
import time
from pyspark.sql import Window
from pyspark.sql.functions import monotonically_increasing_id, row_number

# Simulate a stream by appending `fleet_df` to the bronze table in fixed-size batches.
#
# monotonically_increasing_id() is unique but NOT consecutive (the partition
# index is encoded in the upper bits), so filtering it against
# range(0, total_rows, batch_size) would silently skip every row outside the
# first partition. row_number() over a window gives a dense index instead;
# subtracting 1 makes it 0-based to match `range`, and the cast keeps the
# column a long like the id it replaces.
# An un-partitioned window pulls all rows into a single partition, so this is
# only suitable when the dataset is not huge.
# NOTE(review): the index is recomputed for every batch; this assumes the
# source read returns rows in a stable order between actions - confirm.
row_order = Window.orderBy(monotonically_increasing_id())
fleet_df = fleet_df.withColumn(
    "row_id", (row_number().over(row_order) - 1).cast("long")
)

batch_size = 500
total_rows = fleet_df.count()

for start in range(0, total_rows, batch_size):
    # Half-open interval [start, start + batch_size) so batches never overlap.
    batch = fleet_df.filter(
        (fleet_df.row_id >= start) & (fleet_df.row_id < start + batch_size)
    )
    print(f"Processing simulated stream batch: {start} - {start + batch_size}")
    # mode("append") adds rows on every run, so re-running this cell writes
    # the same data to the bronze table again.
    batch.write.mode("append").saveAsTable("enterprise_modernization.bronze.bronze_fleet_1")
    batch.show(3)
    time.sleep(1)  # pause between batches to mimic data arriving over time



In [0]:
# Inspect the bronze layer: column names, row count, then a 5-row preview.
bronze_table = "enterprise_modernization.bronze.bronze_fleet_1"
df = spark.table(bronze_table)

column_names = df.columns
print(column_names)

row_total = df.count()
print(row_total)

df.show(n=5)