In [0]:
#imports

from pyspark.sql.functions import (
    col, when, current_date, lit
)
from pyspark.sql.types import DateType
from delta.tables import DeltaTable

In [0]:
#PATHS
# DBFS locations of the Delta tables this notebook reads and writes.

# Silver layer: inputs loaded with spark.read by the cells below.
silver_flights_path = "dbfs:/FileStore/tables/sahan_project/silver/flights"
silver_airlines_path = "dbfs:/FileStore/tables/sahan_project/silver/airlines"
silver_airports_path = "dbfs:/FileStore/tables/sahan_project/silver/airports"

# Gold layer: outputs written (merge or overwrite) by the cells below.
gold_flight_fact_path = "dbfs:/FileStore/tables/sahan_project/gold/flight_fact"
gold_airline_dim_scd_path = "dbfs:/FileStore/tables/sahan_project/gold/airline_dim_scd"
gold_airport_dim_path = "dbfs:/FileStore/tables/sahan_project/gold/airport_dim"

In [0]:
#GOLD FACT – FLIGHT FACT (IDEMPOTENT MERGE)

# Business key of one fact row. The same list drives both the source
# de-duplication and the MERGE condition so the two cannot drift apart.
flight_fact_keys = [
    "FLIGHT_DATE",
    "AIRLINE_CODE",
    "FLIGHT_NUMBER",
    "ORIGIN_AIRPORT",
    "DESTINATION_AIRPORT",
]

silver_flights = spark.read.format("delta").load(silver_flights_path)

gold_incremental = (
    silver_flights.select(
        col("FLIGHT_DATE"),
        col("AIRLINE").alias("AIRLINE_CODE"),
        col("FLIGHT_NUMBER"),
        col("ORIGIN_AIRPORT"),
        col("DESTINATION_AIRPORT"),
        col("DEPARTURE_DELAY"),
        col("ARRIVAL_DELAY"),
        col("AIR_TIME"),
        col("ELAPSED_TIME"),
        col("CANCELLED"),
        col("DIVERTED"),

        # Business KPIs (a NULL delay falls through to 0 in every flag).
        when(col("ARRIVAL_DELAY") <= 0, 1).otherwise(0).alias("ON_TIME_FLAG"),
        # "15 plus" includes a delay of exactly 15 minutes, hence >= rather than >.
        when(col("ARRIVAL_DELAY") >= 15, 1).otherwise(0).alias("DELAY_15_PLUS_FLAG"),
        # CANCELLED / DIVERTED are strings in the gold schema; comparing them
        # with 1 relies on Spark's implicit string-to-number cast.
        when(col("CANCELLED") == 1, 1).otherwise(0).alias("CANCEL_FLAG"),
        when(col("DIVERTED") == 1, 1).otherwise(0).alias("DIVERT_FLAG")
    )
    # Delta MERGE aborts when several source rows match the same target row,
    # so keep one row per business key. Which duplicate survives is arbitrary;
    # if silver can legitimately hold several flights per key, extend
    # flight_fact_keys instead of relying on this.
    .dropDuplicates(flight_fact_keys)
)

if DeltaTable.isDeltaTable(spark, gold_flight_fact_path):
    gold_fact = DeltaTable.forPath(spark, gold_flight_fact_path)

    # Null-safe equality (<=>): with plain "=" a row whose key contains NULL
    # never matches and would be inserted again on every run.
    merge_condition = " AND ".join(
        f"t.{key} <=> s.{key}" for key in flight_fact_keys
    )

    (
        gold_fact.alias("t")
        .merge(gold_incremental.alias("s"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    # First run: the path is not a Delta table yet, so create it.
    (
        gold_incremental.write
        .format("delta")
        .mode("overwrite")
        .partitionBy("FLIGHT_DATE")
        .save(gold_flight_fact_path)
    )

In [0]:
#GOLD DIMENSION – AIRLINE (SCD TYPE 2)

# Latest airline attributes from silver, one row per AIRLINE_CODE.
# Duplicated codes would make the MERGE below fail, because Delta rejects a
# merge in which several source rows match the same target row.
airline_source = (
    spark.read.format("delta").load(silver_airlines_path)
    .select(
        col("IATA_CODE").alias("AIRLINE_CODE"),
        col("AIRLINE").alias("AIRLINE_NAME")
    )
    .dropDuplicates(["AIRLINE_CODE"])
)

# First-time table creation (schema-safe)
if not DeltaTable.isDeltaTable(spark, gold_airline_dim_scd_path):
    (
        airline_source
        .withColumn("effective_start_date", current_date())
        # Typed NULL so the column is created as DATE rather than NullType.
        .withColumn("effective_end_date", lit(None).cast(DateType()))
        .withColumn("is_current", lit(True))
        .write.format("delta")
        .mode("overwrite")
        .save(gold_airline_dim_scd_path)
    )
else:
    airline_target = DeltaTable.forPath(spark, gold_airline_dim_scd_path)

    # Airlines whose name differs from the current dimension row. Each needs
    # TWO actions in one merge: close the old version and insert the new one.
    changed_airlines = (
        airline_source.alias("s")
        .join(
            airline_target.toDF().alias("t"),
            (col("s.AIRLINE_CODE") == col("t.AIRLINE_CODE"))
            & (col("t.is_current") == lit(True)),
        )
        .where(col("s.AIRLINE_NAME") != col("t.AIRLINE_NAME"))
        .select(col("s.AIRLINE_CODE"), col("s.AIRLINE_NAME"))
    )

    # Staged source for the merge:
    #   - merge_key = NULL          -> never matches, so the new version of a
    #                                  changed airline is inserted;
    #   - merge_key = AIRLINE_CODE  -> matches the current row (closed when the
    #                                  name changed) or inserts a new airline.
    # Without the NULL-keyed rows a changed airline would only be closed and
    # would have no current row until the next run.
    staged_updates = (
        changed_airlines.select(
            lit(None).cast("string").alias("merge_key"),
            "AIRLINE_CODE",
            "AIRLINE_NAME",
        )
        .unionByName(
            airline_source.select(
                col("AIRLINE_CODE").alias("merge_key"),
                "AIRLINE_CODE",
                "AIRLINE_NAME",
            )
        )
    )

    (
        airline_target.alias("t")
        .merge(
            staged_updates.alias("s"),
            "t.AIRLINE_CODE = s.merge_key AND t.is_current = true"
        )
        .whenMatchedUpdate(
            condition="t.AIRLINE_NAME <> s.AIRLINE_NAME",
            set={
                "effective_end_date": "current_date()",
                "is_current": "false"
            }
        )
        .whenNotMatchedInsert(
            values={
                "AIRLINE_CODE": "s.AIRLINE_CODE",
                "AIRLINE_NAME": "s.AIRLINE_NAME",
                "effective_start_date": "current_date()",
                "effective_end_date": "CAST(NULL AS DATE)",
                "is_current": "true"
            }
        )
        .execute()
    )

In [0]:
#GOLD DIMENSION – AIRPORT

# Columns carried into the dimension; IATA_CODE is exposed as AIRPORT_CODE.
airport_columns = [
    col("IATA_CODE").alias("AIRPORT_CODE"),
    "AIRPORT",
    "CITY",
    "STATE",
    "COUNTRY",
    "LATITUDE",
    "LONGITUDE",
]

# Load silver airports, project the dimension columns and keep one row per code.
airports_silver = spark.read.format("delta").load(silver_airports_path)
airport_dim = airports_silver.select(*airport_columns).dropDuplicates(["AIRPORT_CODE"])

# The dimension is fully rebuilt on every run.
(
    airport_dim.write
    .format("delta")
    .mode("overwrite")
    .save(gold_airport_dim_path)
)

In [0]:
#PERFORMANCE OPTIMIZATION (CORRECT Z-ORDER)

# Build the OPTIMIZE statement first, then run it; the flight fact table is
# addressed by path and Z-ordered on airline and airport columns.
zorder_statement = f"""
OPTIMIZE delta.`{gold_flight_fact_path}`
ZORDER BY (AIRLINE_CODE, ORIGIN_AIRPORT, DESTINATION_AIRPORT)
"""

# Left as the last expression so the cell still displays the returned DataFrame.
spark.sql(zorder_statement)

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,

In [0]:
#QUICK VALIDATION

# Print a heading followed by the stored schema for each gold table checked.
for heading, table_path in (
    ("GOLD FLIGHT FACT SCHEMA", gold_flight_fact_path),
    ("GOLD AIRLINE SCD SCHEMA", gold_airline_dim_scd_path),
):
    print(heading)
    spark.read.format("delta").load(table_path).printSchema()

✅ GOLD FLIGHT FACT SCHEMA
root
 |-- FLIGHT_DATE: date (nullable = true)
 |-- AIRLINE_CODE: string (nullable = true)
 |-- FLIGHT_NUMBER: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- DEPARTURE_DELAY: integer (nullable = true)
 |-- ARRIVAL_DELAY: integer (nullable = true)
 |-- AIR_TIME: integer (nullable = true)
 |-- ELAPSED_TIME: integer (nullable = true)
 |-- CANCELLED: string (nullable = true)
 |-- DIVERTED: string (nullable = true)
 |-- ON_TIME_FLAG: integer (nullable = true)
 |-- DELAY_15_PLUS_FLAG: integer (nullable = true)
 |-- CANCEL_FLAG: integer (nullable = true)
 |-- DIVERT_FLAG: integer (nullable = true)

✅ GOLD AIRLINE SCD SCHEMA
root
 |-- AIRLINE_CODE: string (nullable = true)
 |-- AIRLINE_NAME: string (nullable = true)
 |-- effective_start_date: date (nullable = true)
 |-- effective_end_date: date (nullable = true)
 |-- is_current: boolean (nullable = true)

