# Objective:
# Perform passive data munging (data profiling) on logistics_source1 and logistics_source2.
# No data cleansing or modification is performed in this notebook.
#
# We only IDENTIFY and COUNT data quality issues such as:
#  - Non-numeric shipment_id
#  - Non-integer age
#  - Schema drift (fewer / more columns than expected)
#  - Common shipment_ids across both datasets



In [0]:
# Databricks Volume paths (raw source data).
# These two locations are passed unchanged to the spark.read.csv calls that
# build master_v1 and master_v2.
source_path_v1 = "/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source1"
source_path_v2 = "/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source2"


2. Read Raw Data in PERMISSIVE Mode

In [0]:
# Reader options shared by both sources, so the two reads cannot drift apart.
# - header=True: the first line of each file supplies the column names.
# - mode="PERMISSIVE": malformed rows are kept rather than failing the read.
# NOTE(review): no explicit schema is passed here. Per the Spark CSV reader
# docs, the corrupt-record column is only retained when it is part of a
# user-supplied schema -- confirm that `corrupt_record` really shows up in the
# output below before relying on it for profiling.
raw_csv_options = {
    "header": True,
    "mode": "PERMISSIVE",
    "columnNameOfCorruptRecord": "corrupt_record",
}

master_v1 = spark.read.csv(source_path_v1, **raw_csv_options)
master_v2 = spark.read.csv(source_path_v2, **raw_csv_options)

# Render both raw frames for visual inspection (read-only; nothing is modified).
display(master_v1)
display(master_v2)

3. Shipment IDs Appearing in BOTH Datasets <BR>
Goal:
Identify shipment_id values that exist in both logistics_source1 (master_v1) and logistics_source2 (master_v2).

In [0]:

# Find the shipment_id values present in both master_v1 and master_v2.
# Passive analysis only: the ids are compared exactly as read, with no casting.

# De-duplicate each side first so every shared id appears once in the result.
v1_shipment_ids = master_v1.select("shipment_id").distinct()
v2_shipment_ids = master_v2.select("shipment_id").distinct()

# An inner join keeps only the ids that occur on both sides.
common_shipment_ids = v1_shipment_ids.join(
    v2_shipment_ids, on="shipment_id", how="inner"
)

display(common_shipment_ids)
print("Common shipment_id count:", common_shipment_ids.count())
