- **Name:** 20.2_streaming_bad_data
- **Author:** Shamas Imran
- **Description:** Handling schema mismatch in streaming data
- **Date:** 19-Aug-2025
<!--
REVISION HISTORY
Version          Date        Author           Description
01           19-Aug-2025   Shamas Imran       Demonstrated bad-data handling                                                  
-->

In [0]:
# Reads CSV files from a folder as a stream with an explicit schema, drops rows
# whose id or score is NULL, and prints the remaining rows to the console in a
# single run. `spark` is expected to be supplied by the notebook environment.

from pyspark.sql.types import (
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

# ------------------------------
# 1) Paths (your folders)
# ------------------------------
inputPath = "/Volumes/datapurcatalog/default/datapurvolume/spark-streaming/csv_input"
checkpointPath = "/Volumes/datapurcatalog/default/datapurvolume/spark-streaming/checkpoints/csv_query"
badRecordsPath = "/Volumes/datapurcatalog/default/datapurvolume/spark-streaming/bad_records"

# ------------------------------
# 2) Define expected schema
# ------------------------------
# Every column is declared nullable, so a missing value never fails the read;
# required fields are enforced by the filter in step 4 instead.
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("score", IntegerType(), True),
    StructField("event_time", TimestampType(), True),
])

# ------------------------------
# 3) Create streaming DataFrame
# ------------------------------
# NOTE(review): `badRecordsPath` is a Databricks file-source option; records
# that cannot be parsed against the schema are presumably written there rather
# than surfacing as rows — confirm against the runtime in use.
df_stream = (
    spark.readStream
         .option("header", "true")                 # CSV header
         .option("badRecordsPath", badRecordsPath) # Capture corrupt records
         .schema(schema)                           # Enforce schema
         .csv(inputPath)                           # Source folder
)

# ------------------------------
# 4) Optional: Validate data
# ------------------------------
# Example: filter out rows missing critical fields.
# `name` and `event_time` are deliberately not checked here.
df_valid = df_stream.filter("id IS NOT NULL AND score IS NOT NULL")

# ------------------------------
# 5) Write to console sink
# ------------------------------
# `availableNow=True` replaces `once=True`: Trigger.Once is deprecated since
# Spark 3.4, and AvailableNow keeps the same "process what is there, then stop"
# behaviour.
query = (
    df_valid.writeStream
            .format("console")                     # Print rows to console
            .option("checkpointLocation", checkpointPath)
            .outputMode("append")                  # Append-only (no aggregations yet)
            .trigger(availableNow=True)            # Process all available data, then stop
            .start()
)

# ------------------------------
# 6) Wait for completion
# ------------------------------
# Blocks until the run-to-completion trigger above has finished and the query stops.
query.awaitTermination()

In [0]:
id,name,score,event_time
1,John,85,2025-08-18 10:00:00
2,Jane,92,2025-08-18 10:05:00
3,Bob,78,2025-08-18 10:10:00
,MissingID,88,2025-08-18 10:15:00           # invalid: id is NULL
4,Alice,abc,2025-08-18 10:20:00              # invalid: score is not integer
5,Charlie,95,not_a_timestamp                  # invalid: event_time malformed
6,,88,2025-08-18 10:30:00                     # valid: name is NULL, but name is optional so the row is accepted
7,David,82,2025-08-18 10:35:00
8,Eva,90,2025-08-18 10:40:00