In [1]:
from utils.spark_utils import create_spark_session
import os
import shutil
from pyspark.sql import functions as F

In [3]:
# Obtain the Spark session from the project helper.
spark = create_spark_session()

# Read the flight fact table: the first row supplies the column names and
# Spark infers each column's type from the data.
fact_flight = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../output/FactFlight.csv")
)
fact_flight.printSchema()

root
 |-- flight_id: string (nullable = true)
 |-- date_id: string (nullable = true)
 |-- departure_time_id: string (nullable = true)
 |-- arrival_time_id: string (nullable = true)
 |-- origin_airport_id: string (nullable = true)
 |-- destination_airport_id: string (nullable = true)
 |-- marketing_airline_id: string (nullable = true)
 |-- operating_airline_id: string (nullable = true)
 |-- tail_number: string (nullable = true)
 |-- dep_delay_minutes: double (nullable = true)
 |-- arr_delay_minutes: double (nullable = true)
 |-- crs_elapsed_time: double (nullable = true)
 |-- actual_elapsed_time: double (nullable = true)
 |-- distance: double (nullable = true)
 |-- status: string (nullable = true)
 |-- flight_complexity_score: double (nullable = true)



In [4]:
# One aggregate per column: isNull() cast to int is 1 for a null and 0
# otherwise, so the sum is that column's null count.
null_count_exprs = [
    F.sum(F.col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in fact_flight.columns
]
missing_counts = fact_flight.select(*null_count_exprs)
missing_counts.show()

+---------+-------+-----------------+---------------+-----------------+----------------------+--------------------+--------------------+-----------+-----------------+-----------------+----------------+-------------------+--------+------+-----------------------+
|flight_id|date_id|departure_time_id|arrival_time_id|origin_airport_id|destination_airport_id|marketing_airline_id|operating_airline_id|tail_number|dep_delay_minutes|arr_delay_minutes|crs_elapsed_time|actual_elapsed_time|distance|status|flight_complexity_score|
+---------+-------+-----------------+---------------+-----------------+----------------------+--------------------+--------------------+-----------+-----------------+-----------------+----------------+-------------------+--------+------+-----------------------+
|        0|      0|          8873627|        6249737|                0|                     0|                   0|                   0|     268217|           763745|           846884|              22|             

In [5]:
# Row count of the raw fact table; used as the denominator below.
total_rows = fact_flight.count()

# Per-column null share, expressed as a percentage of total_rows.
null_percent_exprs = [
    (F.sum(F.col(column_name).isNull().cast("int")) / total_rows * 100).alias(column_name)
    for column_name in fact_flight.columns
]
missing_percent = fact_flight.select(*null_percent_exprs)

missing_percent.show()

+---------+-------+-----------------+------------------+-----------------+----------------------+--------------------+--------------------+------------------+------------------+-----------------+--------------------+-------------------+--------+------+-----------------------+
|flight_id|date_id|departure_time_id|   arrival_time_id|origin_airport_id|destination_airport_id|marketing_airline_id|operating_airline_id|       tail_number| dep_delay_minutes|arr_delay_minutes|    crs_elapsed_time|actual_elapsed_time|distance|status|flight_complexity_score|
+---------+-------+-----------------+------------------+-----------------+----------------------+--------------------+--------------------+------------------+------------------+-----------------+--------------------+-------------------+--------+------+-----------------------+
|      0.0|    0.0|30.38459356227013|21.400011361316114|              0.0|                   0.0|                 0.0|                 0.0|0.9184141424348137|2.615174314

In [6]:
# Keep only rows where every required column is present (successive
# where() calls are ANDed together), then label missing tail numbers
# instead of dropping those rows.
clean_flight = (
    fact_flight
    .where(F.col("departure_time_id").isNotNull())
    .where(F.col("arrival_time_id").isNotNull())
    .where(F.col("dep_delay_minutes").isNotNull())
    .where(F.col("arr_delay_minutes").isNotNull())
    .where(F.col("actual_elapsed_time").isNotNull())
    .where(F.col("crs_elapsed_time").isNotNull())
    .where(F.col("flight_complexity_score").isNotNull())
    .fillna({"tail_number": "UNKNOWN"})
)

In [7]:
# Re-run the null-percentage check on the cleaned table to confirm the
# filter and fill removed the gaps.
total_rows = clean_flight.count()

clean_null_percent_exprs = [
    (F.sum(F.col(column_name).isNull().cast("int")) / total_rows * 100).alias(column_name)
    for column_name in clean_flight.columns
]
missing_percent = clean_flight.select(*clean_null_percent_exprs)

missing_percent.show()

+---------+-------+-----------------+---------------+-----------------+----------------------+--------------------+--------------------+-----------+-----------------+-----------------+----------------+-------------------+--------+------+-----------------------+
|flight_id|date_id|departure_time_id|arrival_time_id|origin_airport_id|destination_airport_id|marketing_airline_id|operating_airline_id|tail_number|dep_delay_minutes|arr_delay_minutes|crs_elapsed_time|actual_elapsed_time|distance|status|flight_complexity_score|
+---------+-------+-----------------+---------------+-----------------+----------------------+--------------------+--------------------+-----------+-----------------+-----------------+----------------+-------------------+--------+------+-----------------------+
|      0.0|    0.0|              0.0|            0.0|              0.0|                   0.0|                 0.0|                 0.0|        0.0|              0.0|              0.0|             0.0|             

In [8]:
# Dimension table name -> CSV path, in the order the tables are reported.
dimension_files = dict([
    ("Dim_Airport", "../output/Dim_Airport.csv"),
    ("Dim_Date", "../output/Dim_Date.csv"),
    ("Dim_Marketing_Airline", "../output/Dim_Marketing_Airline.csv"),
    ("Dim_Operating_Airline", "../output/Dim_Operating_Airline.csv"),
    ("Dim_Time", "../output/Dim_Time.csv"),
])

def analyze_missing(df, name):
    """Print a banner for ``name`` and show the null count of every column in ``df``.

    Args:
        df: DataFrame to inspect; each of its columns gets one count.
        name: Label printed in the banner above the table.
    """
    print(f"\n==== {name} ====")
    # isNull() cast to int is 1 for a null and 0 otherwise; summing counts the nulls.
    null_count_exprs = [
        F.sum(F.col(column_name).isNull().cast("int")).alias(column_name)
        for column_name in df.columns
    ]
    null_counts = df.select(null_count_exprs)
    null_counts.show(truncate=False)

# Load each dimension table (header row + inferred types) and report its
# per-column null counts.
for name, path in dimension_files.items():
    dim_df = (
        spark.read
        .option("header", True)
        .option("inferSchema", True)
        .csv(path)
    )
    analyze_missing(dim_df, name)



==== Dim_Airport ====
+----------+------------+---------+----------+-------+
|airport_id|airport_code|city_name|state_name|country|
+----------+------------+---------+----------+-------+
|0         |0           |0        |0         |0      |
+----------+------------+---------+----------+-------+


==== Dim_Date ====
+-------+-----------+----+-------+-----+------------+------------+----------+
|date_id|flight_date|year|quarter|month|day_of_month|week_of_year|is_weekend|
+-------+-----------+----+-------+-----+------------+------------+----------+
|0      |0          |0   |0      |0    |0           |0           |0         |
+-------+-----------+----+-------+-----+------------+------------+----------+


==== Dim_Marketing_Airline ====
+--------------------+---------+------------+
|marketing_airline_id|iata_code|airline_name|
+--------------------+---------+------------+
|0                   |0        |0           |
+--------------------+---------+------------+


==== Dim_Operating_Airlin

In [18]:
# Export the cleaned fact table as ONE CSV file at output_path.
output_dir = "../output"
output_path = os.path.join(output_dir, "CleanFactFlight.csv")
temp_path = "temp_CleanFactFlight"

os.makedirs(output_dir, exist_ok=True)

# Spark writes one part-*.csv per partition into a directory. Without
# coalesce(1) the data is spread over several part files, and moving just one
# of them (then deleting the directory) would silently drop the rest.
# mode="overwrite" already replaces any stale temp directory from a prior run.
clean_flight.coalesce(1).write.csv(temp_path, header=True, mode="overwrite")

try:
    part_files = [
        file for file in os.listdir(temp_path)
        if file.startswith("part-") and file.endswith(".csv")
    ]
    # Fail loudly rather than leave a missing or partial export behind.
    if len(part_files) != 1:
        raise RuntimeError(
            f"Expected exactly one part file in {temp_path!r}, found {len(part_files)}"
        )
    part_file = os.path.join(temp_path, part_files[0])
    shutil.move(part_file, output_path)
finally:
    # Always remove the scratch directory, even if the move failed.
    shutil.rmtree(temp_path, ignore_errors=True)