# Objective:
# Perform passive data munging (data profiling) on logistics_source1 and logistics_source2.
# No data cleansing or modification is performed in this notebook.
#
# We only IDENTIFY and COUNT data quality issues such as:
#  - Non-numeric shipment_id
#  - Non-integer age
#  - Schema drift (fewer / more columns than expected)
#  - Common shipment_ids across both datasets



In [0]:
# Raw logistics sources, stored on a Databricks Volume.
# Both locations are only read from in this notebook.
source_path_v1, source_path_v2 = (
    "/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source1",
    "/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source2",
)


2. Read Raw Data in PERMISSIVE Mode

In [0]:
# Load both raw sources in PERMISSIVE mode so malformed rows do not abort the read.
# The first line of each file is treated as the header; no explicit schema is
# supplied and schema inference is not requested.
# NOTE(review): per the Spark CSV reader documentation, the corrupt-record field
# is only kept when a user-supplied schema contains a string column with this
# name - confirm that "corrupt_record" actually appears in master_v1 / master_v2.
permissive_csv_options = {
    "header": True,
    "mode": "PERMISSIVE",
    "columnNameOfCorruptRecord": "corrupt_record",
}

master_v1 = spark.read.csv(source_path_v1, **permissive_csv_options)
master_v2 = spark.read.csv(source_path_v2, **permissive_csv_options)

# Preview the rows exactly as they were read (nothing is modified).
display(master_v1)
display(master_v2)

3. Shipment IDs Appearing in BOTH Datasets <BR>
Goal:
Identify shipment_id values that exist in both logistics_source1 and logistics_source2.

In [0]:

# Find the shipment_id values present in both master_v1 and master_v2.
# Values are compared exactly as read - no casting (passive analysis only).

# De-duplicate each side first so every shared id is reported once.
shipment_ids_v1 = master_v1.select("shipment_id").distinct()
shipment_ids_v2 = master_v2.select("shipment_id").distinct()

# An inner join keeps only the ids that occur on both sides.
common_shipment_ids = shipment_ids_v1.join(
    shipment_ids_v2, on="shipment_id", how="inner"
)

display(common_shipment_ids)
print("Common shipment_id count:", common_shipment_ids.count())


4. Records where shipment_id is NON-NUMERIC

In [0]:
from pyspark.sql.functions import col

# A well-formed shipment_id consists of one or more ASCII digits and nothing else.
# The predicate below selects rows that do NOT match that pattern.
# Note: for a NULL shipment_id the negated rlike evaluates to NULL, so such rows
# are not selected by filter() and are not part of these counts.
shipment_id_not_all_digits = ~col("shipment_id").rlike("^[0-9]+$")

non_numeric_shipment_v1 = master_v1.filter(shipment_id_not_all_digits)
non_numeric_shipment_v2 = master_v2.filter(shipment_id_not_all_digits)

print("Non-numeric shipment_id count (v1):", non_numeric_shipment_v1.count())
print("Non-numeric shipment_id count (v2):", non_numeric_shipment_v2.count())

# Uncomment to inspect the offending v1 values:
#display(non_numeric_shipment_v1.select("shipment_id"))



5. Records where age is NOT an Integer

In [0]:
# age is expected to be a non-negative integer written with digits only.
# Values like 'ten', NULL, or empty string are considered invalid.
#
# The NULL check has to be explicit: rlike() returns NULL for a NULL input, the
# negation of NULL is still NULL, and filter() only keeps rows where the
# predicate is TRUE. Without isNull() every missing age would be silently
# left out of the invalid count.
# `col` comes from the pyspark.sql.functions import in the shipment_id cell.
age_is_invalid = col("age").isNull() | ~col("age").rlike("^[0-9]+$")

non_integer_age_v1 = master_v1.filter(age_is_invalid)
non_integer_age_v2 = master_v2.filter(age_is_invalid)

print("Invalid age count (v1):", non_integer_age_v1.count())
print("Invalid age count (v2):", non_integer_age_v2.count())

# Show the offending values next to their shipment_id for inspection.
display(non_integer_age_v1.select("shipment_id", "age"))
display(non_integer_age_v2.select("shipment_id", "age"))



Rows with MORE Columns than Expected (Schema Drift)

🔹 6. Rows with MORE Columns than Expected (Schema Drift)

In [0]:
# Rows that do not fit the expected layout are reported through the reader's
# corrupt-record column.
# Expected columns:
#  - master_v1 : 5 columns
#  - master_v2 : 7 columns

from pyspark.sql.functions import col

# Must match the columnNameOfCorruptRecord value passed to spark.read.csv when
# master_v1 / master_v2 were loaded ("corrupt_record", not Spark's default
# "_corrupt_record").
corrupt_record_column = "corrupt_record"


def _rows_with_corrupt_record(df):
    """Return the rows of df whose corrupt-record column is populated.

    Returns None when df has no such column, so the caller can report that the
    check could not be performed instead of failing on an unresolved column.
    """
    if corrupt_record_column not in df.columns:
        return None
    return df.filter(col(corrupt_record_column).isNotNull())


def _report_extra_columns(label, flagged_rows):
    """Print the flagged-row count for one dataset, or why it is unavailable."""
    if flagged_rows is None:
        print(
            f"Rows with MORE columns ({label}): not available -",
            f"column '{corrupt_record_column}' is not present in the DataFrame",
        )
    else:
        print(f"Rows with MORE columns ({label}):", flagged_rows.count())


# NOTE(review): the Spark CSV reader only keeps the corrupt-record field when the
# read schema contains it; with a header-derived schema the column is presumably
# absent - confirm, and add it to an explicit schema if this check is required.
# NOTE(review): Spark may also reject queries on a raw CSV read that reference
# only the corrupt-record column; caching the DataFrame first is the documented
# workaround - verify on this runtime.
extra_columns_v1 = _rows_with_corrupt_record(master_v1)
extra_columns_v2 = _rows_with_corrupt_record(master_v2)

_report_extra_columns("v1", extra_columns_v1)
_report_extra_columns("v2", extra_columns_v2)

if extra_columns_v1 is not None:
    display(extra_columns_v1.select(corrupt_record_column))

