- **Name:** 20.3_streaming_bad_data
- **Author:** Shamas Imran
- **Description:** Handling bad or corrupt data in streaming pipelines
- **Date:** 19-Aug-2025
<!--
REVISION HISTORY
Version          Date        Author           Description
01           19-Aug-2025   Shamas Imran       Simulated bad records in streaming input  
                                              handled schema mismatch  
-->

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType

In [0]:
# ------------------------------------------------------------
# 1) Spark Session
# ------------------------------------------------------------
# Name the application first, then obtain the session via getOrCreate().
_session_builder = SparkSession.builder.appName("CSV_Streaming_Error_Handling")
spark = _session_builder.getOrCreate()

In [0]:
# ------------------------------------------------------------
# 2) Folder Paths (your provided Unity Catalog paths)
# ------------------------------------------------------------
# Every folder used by this pipeline sits under the same volume directory,
# so the shared root is declared once; relocating the pipeline then means
# editing a single line instead of four.
basePath        = "/Volumes/datapurcatalog/default/datapurvolume/spark-streaming"

inputPath       = basePath + "/csv_input"               # folder watched for incoming CSV files
checkpointPath  = basePath + "/checkpoints/csv_query"   # parent folder for per-query checkpoints
outputPath      = basePath + "/csv_output"              # destination for valid rows
badRecordsPath  = basePath + "/csv_bad_records"         # quarantine folder for bad rows

In [0]:
# ------------------------------------------------------------
# 3) Define Schema (expected columns)
# ------------------------------------------------------------
# Column name -> Spark type, in file order. Every field is nullable.
# `_corrupt_record` is an extra string column added on top of the data
# columns so that invalid rows can be captured in it.
_column_types = [
    ("id", IntegerType()),                # should be integer
    ("name", StringType()),               # string field
    ("score", IntegerType()),             # should be integer
    ("event_time", TimestampType()),      # event time (for future)
    ("_corrupt_record", StringType()),    # keeps the bad row
]

schema = StructType(
    [StructField(column, dtype, True) for column, dtype in _column_types]
)

In [0]:
# ------------------------------------------------------------
# 4) Create Streaming DataFrame with Error Handling
# ------------------------------------------------------------
# Values accepted by option("mode", ...):
#   PERMISSIVE (default) -> keeps all rows; bad rows go into the
#                           _corrupt_record column.
#   DROPMALFORMED        -> skips bad rows completely (they are dropped).
#   FAILFAST             -> stops the job immediately when a bad row is found.
df_stream = (
    spark.readStream
         .option("header", "true")                           # CSV has header
         .schema(schema)                                     # enforce schema
         .option("mode", "PERMISSIVE")                       # keep bad rows
         .option("columnNameOfCorruptRecord", "_corrupt_record")
         .csv(inputPath)                                     # folder to watch
)

# ------------------------------------------------------------
# 5) Separate Good vs Bad Records
# ------------------------------------------------------------
# Valid rows are those with nothing captured in _corrupt_record. For these
# rows the helper column is always null, so it is dropped here; otherwise the
# "clean" output would carry a fifth, empty column that is not part of the
# expected id/name/score/event_time layout.
valid_rows = (
    df_stream.filter(df_stream["_corrupt_record"].isNull())
             .drop("_corrupt_record")
)

# Bad rows keep every column, including the raw text of the offending line,
# so they can be inspected or replayed from the quarantine folder.
bad_rows = df_stream.filter(df_stream["_corrupt_record"].isNotNull())

In [0]:
# ------------------------------------------------------------
# 6) + 7) Write Valid Records to Output Folder and
#         Bad Records to Quarantine Folder
# ------------------------------------------------------------
def _start_csv_sink(frame, target_dir, checkpoint_dir):
    """Start an append-mode streaming query writing `frame` as CSV to `target_dir`.

    `checkpoint_dir` is passed as the query's checkpointLocation. Returns the
    object produced by `start()`.
    """
    writer = frame.writeStream.format("csv")
    writer = writer.option("path", target_dir)
    writer = writer.option("checkpointLocation", checkpoint_dir)
    return writer.outputMode("append").start()


# Each query gets its own checkpoint sub-folder under checkpointPath.
valid_query = _start_csv_sink(valid_rows, outputPath, checkpointPath + "/valid")
bad_query = _start_csv_sink(bad_rows, badRecordsPath, checkpointPath + "/bad")

# ------------------------------------------------------------
# 8) Wait for Streams to Finish
# ------------------------------------------------------------
# Block here so the job keeps running until it is stopped manually.
spark.streams.awaitAnyTermination()

In [0]:
Valid_data.csv
id,name,score,event_time
1,Ali,85,2025-08-18 10:00:00
2,Sara,90,2025-08-18 10:05:00
3,Omar,75,2025-08-18 10:10:00

invalid_data.csv
id,name,score,event_time,extra
4,Ayesha,88,2025-08-18 10:15:00,unexpected