In [0]:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# --------------------------------------------------
# Source 1: Custom Schema with Corrupt Record Capture
# --------------------------------------------------
# Every field is declared nullable. The trailing `_corrupt_record` string
# column has to be part of the schema so that PERMISSIVE mode has a place
# to put rows that fail to parse.
# --------------------------------------------------

custom_schema = (
    StructType()
    .add("shipment_id", IntegerType(), True)
    .add("first_name", StringType(), True)
    .add("last_name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("role", StringType(), True)
    .add("_corrupt_record", StringType(), True)  # required for corrupt-row capture
)

# PERMISSIVE is already the default parse mode; it is spelled out on purpose.
df_s1 = (
    spark.read
    .schema(custom_schema)
    .options(
        header="true",
        mode="PERMISSIVE",
        columnNameOfCorruptRecord="_corrupt_record",
    )
    .csv("/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source1")
)

display(df_s1)






In [0]:
# --------------------------------------------------
# Source 2: Custom Schema with Corrupt Record Capture
# --------------------------------------------------
# Rebinds `custom_schema` to a schema that also declares `hub_location` and
# `vehicle_type`. The `_corrupt_record` string column has to be part of the
# schema so that PERMISSIVE mode has a place to put rows that fail to parse.
# --------------------------------------------------

custom_schema = (
    StructType()
    .add("shipment_id", IntegerType(), True)
    .add("first_name", StringType(), True)
    .add("last_name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("role", StringType(), True)
    .add("hub_location", StringType(), True)
    .add("vehicle_type", StringType(), True)
    .add("_corrupt_record", StringType(), True)  # required for corrupt-row capture
)

# PERMISSIVE is already the default parse mode; it is spelled out on purpose.
df_s2 = (
    spark.read
    .schema(custom_schema)
    .options(
        header="true",
        mode="PERMISSIVE",
        columnNameOfCorruptRecord="_corrupt_record",
    )
    .csv("/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source2")
)

# Show the number of rows read from source 2.
display(df_s2.count())

In [0]:
# --------------------------------------------------
# Combine Source DataFrames
# --------------------------------------------------
# Rows from df_s1 and df_s2 are stacked with columns matched by name, not
# by position. allowMissingColumns=True lets the two sources carry
# different column sets.
# --------------------------------------------------

df_combine = df_s1.unionByName(df_s2, allowMissingColumns=True)

# --------------------------------------------------
# Persist Corrupt Records
# --------------------------------------------------
# Keep only the rows whose _corrupt_record is populated and write them,
# with a header row, to the quarantine location for analysis. Any output
# already at that location is overwritten.
# --------------------------------------------------

(
    df_combine
    .where("_corrupt_record IS NOT NULL")
    .write
    .csv(
        "/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_corrupt",
        mode="overwrite",
        header="true",
    )
)
