In [0]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession if one exists; otherwise start one under this app name.
spark = SparkSession.builder.appName("Logistics_Data_Engineering").getOrCreate()



In [0]:
# Raw logistics extracts: one Databricks Volume location per source system.
source_path_v1, source_path_v2 = (
    "/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source1",
    "/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source2",
)


1. Combining Data + Schema Merging (Structuring) <br>
Read both files without enforcing schema <br>
Align them into a single canonical schema: shipment_id, first_name, last_name, age, role, hub_location, vehicle_type, data_source <br>
Add data_source column with values as: system1, system2 in the respective dataframes

1. Read both source files (NO schema enforcement)

In [0]:
# --------------------------------------------------
# Step 1: Load both raw extracts with no schema applied
# --------------------------------------------------
# Only the header option is set (no explicit schema, no inference), so every
# column arrives as a string and nothing is lost to an early cast.

df_system1 = spark.read.csv(source_path_v1, header=True)
df_system2 = spark.read.csv(source_path_v2, header=True)

display(df_system1)
display(df_system2)




🔹 2. Add data_source column

In [0]:
from pyspark.sql.functions import lit

# --------------------------------------------------
# Step 2: Tag each frame with its originating system
# --------------------------------------------------
# A constant data_source column is attached to both frames in one pass;
# the right-hand side is fully evaluated before the names are rebound.

df_system1, df_system2 = (
    frame.withColumn("data_source", lit(tag))
    for frame, tag in ((df_system1, "system1"), (df_system2, "system2"))
)

display(df_system1)
display(df_system2)


In [0]:
from pyspark.sql.functions import lit

# --------------------------------------------------
# Step 3a: Align system1 schema
# --------------------------------------------------
# Add the canonical columns that system1 is missing (hub_location,
# vehicle_type) as NULL placeholders.
#
# A bare lit(None) yields a NullType (void) column. The placeholders are cast
# to string so they carry the same type as the columns read from CSV, instead
# of relying on implicit type widening when the frames are unioned later.

df_system1_aligned = (
    df_system1
    .withColumn("hub_location", lit(None).cast("string"))
    .withColumn("vehicle_type", lit(None).cast("string"))
)

display(df_system1_aligned)

In [0]:
# --------------------------------------------------
# Step 3b: Align system2 schema
# --------------------------------------------------
# Project system2 onto the canonical column list, in canonical order.

df_system2_aligned = df_system2.select(
    [
        "shipment_id",
        "first_name",
        "last_name",
        "age",
        "role",
        "hub_location",
        "vehicle_type",
        "data_source",
    ]
)


🔹 4. Union both datasets into ONE canonical DataFrame

In [0]:
# --------------------------------------------------
# Step 4: Stack the two aligned frames into one dataset
# --------------------------------------------------
# Columns are matched by name rather than position. Missing columns are not
# tolerated (the default), so both inputs must expose the same column set.

combined_df = df_system1_aligned.unionByName(
    df_system2_aligned,
    allowMissingColumns=False,
)

display(combined_df)
combined_df.printSchema()


##### 2. Cleansing, Scrubbing:
Cleansing (removal of unwanted datasets)<br>
1. Mandatory Column Check - Drop any record where any of the following columns is NULL: shipment_id, role<br>
2. Name Completeness Rule - Drop records where both of the following columns are NULL: first_name, last_name<br>
3. Join Readiness Rule - Drop records where the join key is null: shipment_id<br>

Scrubbing (convert raw to tidy)<br>
4. Age Defaulting Rule - Fill NULL values in the age column with: -1<br>
5. Vehicle Type Default Rule - Fill NULL values in the vehicle_type column with: UNKNOWN<br>
6. Invalid Age Replacement - Replace the following values in age:
"ten" to -1
"" to -1<br>
7. Vehicle Type Normalization - Replace inconsistent vehicle types: 
truck to LMV
bike to TwoWheeler

Mandatory Column Check - Drop any record where any of the following columns is NULL: shipment_id, role

In [0]:
from pyspark.sql.functions import col

# --------------------------------------------------
# Mandatory Column Check
# A record must carry both shipment_id and role
# --------------------------------------------------

# Rejected: not (both mandatory fields present), i.e. at least one is NULL
rejected_mandatory_df = combined_df.where(
    ~(col("shipment_id").isNotNull() & col("role").isNotNull())
)

# Retained: rows where neither mandatory field is NULL
valid_mandatory_df = combined_df.na.drop(subset=["shipment_id", "role"])

# Audit counts (each count() triggers a Spark job)
print("Total records before check:", combined_df.count())
print("Records rejected (mandatory fields missing):", rejected_mandatory_df.count())
print("Records retained after check:", valid_mandatory_df.count())

# Show both sides of the split for validation
display(rejected_mandatory_df)
display(valid_mandatory_df)



Name Completeness Rule - Drop records where both of the following columns are NULL: first_name, last_name

In [0]:

from pyspark.sql.functions import col

# --------------------------------------------------
# Name Completeness Rule
# A record is dropped only when first_name AND last_name are both NULL
# --------------------------------------------------

# Rejected: neither name is present
rejected_name_df = valid_mandatory_df.where(
    ~(col("first_name").isNotNull() | col("last_name").isNotNull())
)

# Retained: at least one of the two names is present
valid_name_df = valid_mandatory_df.where(
    col("first_name").isNotNull() | col("last_name").isNotNull()
)

print("Records before name completeness check:", valid_mandatory_df.count())
print("Records rejected (both names missing):", rejected_name_df.count())
print("Records retained after name completeness check:", valid_name_df.count())

display(rejected_name_df)
display(valid_name_df)


🔹 Join Readiness Rule

In [0]:
from pyspark.sql.functions import col

# --------------------------------------------------
# Join Readiness Rule
# The join key shipment_id must not be NULL
# --------------------------------------------------

# Rejected: rows whose join key is missing
rejected_join_df = valid_name_df.where(~col("shipment_id").isNotNull())

# Retained: rows whose join key is populated
join_ready_df = valid_name_df.na.drop(subset=["shipment_id"])

print("Records before join readiness check:", valid_name_df.count())
print("Records rejected (shipment_id is NULL):", rejected_join_df.count())
print("Records ready for join:", join_ready_df.count())

display(rejected_join_df)
display(join_ready_df)


In [0]:
# Print the schema of the join-ready dataset. The call parentheses are required:
# a bare `printSchema` reference only evaluates to the bound method and prints nothing.
join_ready_df.printSchema()

In [0]:
from pyspark.sql.functions import coalesce, expr, lit

# --------------------------------------------------
# Scrubbing Rule 4: Age Defaulting
# --------------------------------------------------
# Business Rule:
# - A NULL or non-numeric `age` becomes -1
# - Values that parse as a number are kept
# --------------------------------------------------

# try_cast yields NULL instead of raising when the value cannot be parsed,
# so unparseable ages fall through to the -1 default below.
parsed_age = expr("try_cast(age as bigint)")

join_ready_df = join_ready_df.withColumn("age", coalesce(parsed_age, lit(-1)))

# Verify the result
display(join_ready_df)
