In [0]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark entry point: getOrCreate() hands back the
# already-active session when one exists, otherwise it starts a new one
# under the application name given here.
session_builder = SparkSession.builder.appName("Logistics_Data_Engineering")
spark = session_builder.getOrCreate()



In [0]:
# Raw logistics inputs stored on a Databricks Volume, one location per
# upstream system (source1 -> v1, source2 -> v2).
source_path_v1, source_path_v2 = (
    "/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source1",
    "/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source2",
)


1. Combining Data + Schema Merging (Structuring) <br>
Read both files without enforcing a schema. <br>
Align them into a single canonical schema: shipment_id, first_name, last_name, age, role, hub_location, vehicle_type, data_source. <br>
Add a data_source column to each DataFrame, set to system1 for the first source and system2 for the second.

1. Read both source files (NO schema enforcement)

In [0]:
# --------------------------------------------------
# Step 1: Load both raw CSV sources (no schema supplied)
# --------------------------------------------------
# Only the header option is set: the first line provides the column names.
# No schema / inferSchema is given, so the CSV reader keeps every column as
# a string and nothing is coerced or dropped at ingestion time.

df_system1 = spark.read.csv(source_path_v1, header=True)
df_system2 = spark.read.csv(source_path_v2, header=True)

# Preview each raw frame before any alignment is applied.
display(df_system1)
display(df_system2)




🔹 2. Add data_source column

In [0]:
from pyspark.sql.functions import lit

# --------------------------------------------------
# Step 2: Tag every row with the system it came from
# --------------------------------------------------

def _with_data_source(frame, system_name):
    """Return `frame` with a constant `data_source` column equal to `system_name`."""
    return frame.withColumn("data_source", lit(system_name))


df_system1 = _with_data_source(df_system1, "system1")
df_system2 = _with_data_source(df_system2, "system2")

# Show both tagged frames.
display(df_system1)
display(df_system2)


In [0]:
from pyspark.sql.functions import lit

# --------------------------------------------------
# Step 3a: Align system1 schema
# --------------------------------------------------
# Add the canonical columns hub_location and vehicle_type to system1 as
# NULL placeholders.
# A bare lit(None) produces a NullType (void) column; casting it to string
# gives each placeholder a concrete type that matches the string columns
# read from the CSV sources, so the aligned frame can be unioned or written
# out without carrying untyped columns.
# NOTE(review): withColumn replaces a same-named column, so if system1 ever
# ships hub_location / vehicle_type these lines would blank them - confirm
# against the source file.

df_system1_aligned = (
    df_system1
    .withColumn("hub_location", lit(None).cast("string"))
    .withColumn("vehicle_type", lit(None).cast("string"))
)

display(df_system1_aligned)

In [0]:
# --------------------------------------------------
# Step 3b: Project system2 onto the canonical columns
# --------------------------------------------------
# Only the canonical columns are kept, in canonical order; any other column
# present in df_system2 is dropped by this select.

canonical_columns = [
    "shipment_id",
    "first_name",
    "last_name",
    "age",
    "role",
    "hub_location",
    "vehicle_type",
    "data_source",
]

df_system2_aligned = df_system2.select(*canonical_columns)


🔹 4. Union both datasets into ONE canonical DataFrame

In [0]:
# --------------------------------------------------
# Step 4: Stack the two aligned frames into one dataset
# --------------------------------------------------
# unionByName matches columns by name rather than by position, so the two
# inputs do not need to list their columns in the same order.

combined_df = (
    df_system1_aligned
    .unionByName(df_system2_aligned)
)

# Inspect the merged rows, then the resulting schema.
display(combined_df)
combined_df.printSchema()
