## Imports

- Imports Spark + ML libraries
- Enable automatic ML tracking
- Ensure MLFlow experiment exists

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


import random

import mlflow
print(mlflow.__version__)

import os

# Enable Databricks' automatic MLflow tracking for MLlib.
spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

# Notebook-wide random seed constant.
RANDOM_SEED = 0

# Workspace path of the shared MLflow experiment.
EXPERIMENT_NAME = "/Shared/team_2_2/mlflow-baseline"

# Select the shared experiment, creating it first when it does not exist yet.
# Any failure is reported and the run falls back to a per-user experiment.
try:
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is not None:
        print(f"Using existing experiment: {experiment.name}")
    else:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Created new experiment with ID: {experiment_id}")
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as e:
    print(f"Error with experiment setup: {e}")
    # Fallback: "default" experiment under the current user's workspace folder.
    notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    current_user = notebook_context.userName().get()
    mlflow.set_experiment(f"/Users/{current_user}/default")



## Helper Functions


- Save Spark DataFrame to DBFS as Parquet file
- Build base folder path

In [0]:
def checkpoint_dataset(dataset, file_path):
    """Write `dataset` as Parquet under the group's DBFS folder.

    `file_path` is relative to the group folder and is given without the
    ".parquet" suffix. It may contain sub-directories; the directory part is
    created before writing. Data already at the target path is overwritten.
    """
    section, number = "2", "2"
    base_folder = f"dbfs:/student-groups/Group_{section}_{number}"
    full_path = f"{base_folder}/{file_path}.parquet"
    # Everything before the last "/" is the directory that has to exist.
    subfolder = full_path.rsplit("/", 1)[0]

    for folder in (base_folder, subfolder):
        dbutils.fs.mkdirs(folder)

    writer = dataset.write.mode("overwrite")
    writer.parquet(full_path)
    print(f"Checkpointed {file_path}")

In [0]:

def checkpoint_dataset(dataset, file_path):
    """Write `dataset` as Parquet under the group's DBFS folder.

    `file_path` is relative to the group folder and is given without the
    ".parquet" suffix; it may contain sub-directories.

    This definition is the one in effect after the notebook runs top to
    bottom, so it overwrites an existing checkpoint (re-running a cell must
    not fail because the target already exists) and creates the directory
    part of `file_path` before writing.
    """
    section = "2"
    number = "2"
    folder_path = f"dbfs:/student-groups/Group_{section}_{number}"
    dbutils.fs.mkdirs(folder_path)

    target_path = f"{folder_path}/{file_path}.parquet"
    # Directory part of the target (equals folder_path for a plain file name).
    dbutils.fs.mkdirs(target_path.rsplit("/", 1)[0])

    # "overwrite" replaces a previous checkpoint instead of raising.
    dataset.write.mode("overwrite").parquet(target_path)
    print(f"Checkpointed {file_path}")

## Datasets

- Load airline dataset
- Count total rows
- Display DataFrame

In [0]:
# Airline data: 3-month Parquet extract on the shared mount.
flights_source = "dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_airlines_data_3m/"
df_flights = spark.read.parquet(flights_source)

# Row count first, then an interactive preview.
print(df_flights.count())
display(df_flights)

## Column analysis - Can remove later
- OP_UNIQUE_CARRIER == OP_CARRIER_AIRLINE_ID == OP_CARRIER == OP_CARRIER_FL_NUM
- TAIL_NUMBER --> License Plate Number
- ORIGIN_AIRPORT_SEQ_ID == ORIGIN_CITY_MARKET_ID == ORIGIN == ORIGIN_CITY_NAME == ORIGIN_STATE_ABR == ORIGIN_STATE_FIPS (code for state) == ORIGIN_STATE_NM == ORIGIN_WAC (origin airport, world area code)
- DEST_AIRPORT_ID == DEST_AIRPORT_SEQ_ID == DEST_CITY_MARKET_ID == DEST == DEST_CITY_NAME == DEST_STATE_ABR == DEST_STATE_FIPS == DEST_STATE_NAME == DEST_WAC
- CRS_DEP_TIME -> Scheduled departure time in the computer reservation system
---
- DEP_TIME -> Actual Dept time
- DEP_DELAY -> difference in minutes between scheduled and actual departure time
- DEP_DELAY_NEW -> early flights are 0
- DEP_DEL15
- DEP_DELAY_GROUP
- DEP_TIME_BLK
- TAXI_OUT -> Taxi out time in minutes
- WHEELS OFF
- WHEELS ON - Time at landing (local time)
- TAXI_IN
- CRS ARR TIME - scheduled arrival time
- ARR_TIME - actual arrival time
- ARR_DELAY
- ARR_DELAY_NEW
- ARR_DEL15
- ARR_DELAY_GROUP
- ARR_TIME_BLK
- CANCELLED, CANCELLATION_CODE, DIVERTED
- CRS_ELAPSED_TIME - scheduled flight time
- ACTUAL_ELAPSED_TIME
- AIR_TIME = flight time in minutes
- FLIGHTS = Number of flights (Idk what this means)
- DISTANCE
- DISTANCE_GROUP (every 250 miles)
- CARRIER_DELAY
- WEATHER_DELAY
- NAS_DELAY (National air system delay)
- SECURITY_DELAY
- LATE_AIRCRAFT_DELAY
- FIRST_DEP_TIME - first gate departure time at origin airport (nulls see what to do with them)
- TOTAL_ADD_GTIME - total ground time away from gate for gate return or cancelled flight
- LONGEST_ADD_GTIME - longest time away from gate for gate return or cancelled flight
- A BUNCH OF DIVERTED AIRPORT COLUMNS (do some eda, they seem empty)

# Things to keep in mind
- Predict two hours before
- Remove all the delay columns
- Are we only predicting departure delays or arrival delays also? For example, the pilot misses the landing, and has to circle back for 20 minutes. Should we solve for that? I don't think we should.

## Preprocessing / Cleanup

- Cache the dataset
- Create a combined timestamp column
- Impute missing TAIL_NUM values

In [0]:
# Cache the flights DataFrame so the transformations below reuse it.
# Note: cache() marks the DataFrame for persistence; it does not create a copy.
df_flights = df_flights.cache()

In [0]:
# Combine the flight date and the scheduled departure time into a timestamp.
# CRS_DEP_TIME is zero-padded to four characters so it parses as "HHmm".
# NOTE(review): the column is called "utc_timestamp" but no time-zone
# conversion is applied here — confirm the inputs are already UTC or rename.
crs_dep_hhmm = F.lpad(F.col("CRS_DEP_TIME").cast("string"), 4, "0")
scheduled_departure_text = F.concat(F.col("FL_DATE"), F.lit(" "), crs_dep_hhmm)

df_flights = df_flights.withColumn(
    "utc_timestamp",
    F.to_timestamp(scheduled_departure_text, "yyyy-MM-dd HHmm"),
)

In [0]:
# Impute null tail numbers.
#   - every replacement is "X" followed by a random 5-digit number
#   - rand() is in [0, 1), so floor(rand * 90000) + 10000 covers the whole
#     10000..99999 range (a factor of 89999 could never produce 99999)
#   - the generator is seeded with RANDOM_SEED so that re-running over the
#     same data layout draws the same placeholders
placeholder_tail_num = F.concat(
    F.lit("X"),
    (F.floor(F.rand(seed=RANDOM_SEED) * 90000) + 10000).cast("string"),
)

# Keep the real tail number when present, otherwise use the placeholder.
df_flights = df_flights.withColumn(
    "TAIL_NUM",
    F.coalesce(F.col("TAIL_NUM"), placeholder_tail_num),
)

## Model Iterations

- Load pre-saved training and validation DataFrames previously checkpointed into DBFS
- 

In [0]:
# Location of the previously checkpointed train/validation splits.
checkpoint_path = "dbfs:/student-groups/Group_2_2"
dataset_path = f"{checkpoint_path}/1_year_custom_joined/raw_data/training_splits"

# Read both splits back from the checkpoint.
train_df, validation_df = (
    spark.read.parquet(f"{dataset_path}/{split_name}.parquet")
    for split_name in ("train", "validation")
)

In [0]:
# Feature Engineering

## CRS_DEP_TIME is local time so we can use this feature
## But in order to use it, we have to convert it to minutes since midnight
## Otherwise the timing will be off b/c it's not true UTC

# (An earlier version of this cell also dropped CRS_DEP_TIME and CRS_ARR_TIME;
# both columns are now kept.)

# CRS_DEP_TIME is read as an hhmm number: whole hours * 60 plus the minutes.
crs_dep_hours = F.floor(F.col("CRS_DEP_TIME") / 100)
crs_dep_minute_part = F.col("CRS_DEP_TIME") % 100
crs_dep_minutes = crs_dep_hours * 60 + crs_dep_minute_part

train_df = train_df.withColumn("CRS_DEP_MINUTES", crs_dep_minutes)
validation_df = validation_df.withColumn("CRS_DEP_MINUTES", crs_dep_minutes)


### Feature Eng.

#### Was the previous flight delayed? And by how much was the previous flight delayed?

In [0]:
# Cache the training split; the cells below derive many window features from it.
train_df = train_df.cache()

In [0]:
# train_df.columns


In [0]:
# Previous flight of the same aircraft: order each tail number's flights by
# timestamp and look one row back.
_prev_flight_window = Window.partitionBy("TAIL_NUM").orderBy("utc_timestamp")


def _with_prev_flight_delay(flights):
    """Append the two previous-flight delay columns to `flights`.

    - prev_flight_delay_in_minutes: DEP_DELAY_NEW of the aircraft's previous
      row, or -1 when that value is null (e.g. first flight of a tail number).
    - prev_flight_delay: 1 when that delay exceeds 15 minutes, else 0.
    """
    lagged = flights.withColumn(
        "prev_flight_delay_in_minutes",
        F.lag("DEP_DELAY_NEW", 1).over(_prev_flight_window),
    )
    filled = lagged.withColumn(
        "prev_flight_delay_in_minutes",
        F.when(F.col("prev_flight_delay_in_minutes").isNull(), -1)
        .otherwise(F.col("prev_flight_delay_in_minutes")),
    )
    return filled.withColumn(
        "prev_flight_delay",
        F.when(F.col("prev_flight_delay_in_minutes") > 15, 1).otherwise(F.lit(0)),
    )


train_df = _with_prev_flight_delay(train_df)
validation_df = _with_prev_flight_delay(validation_df)


### [Feature] Delay time for flights at departure locations over the past 7 days

In [0]:
# Trailing window per origin airport: rows whose timestamp lies between
# 7 days (604800 s) and 4 hours (14400 s) before the current row.
window_7d_origin = (
    Window.partitionBy("ORIGIN_AIRPORT_SEQ_ID")
    .orderBy(F.col("utc_timestamp").cast("long"))
    .rangeBetween(-604800, -14400)
)

# Sum of DEP_DELAY_NEW inside the window; a null sum is replaced by 0.
delay_origin_7d = F.coalesce(
    F.sum("DEP_DELAY_NEW").over(window_7d_origin),
    F.lit(0),
)

train_df = train_df.withColumn("delay_origin_7d", delay_origin_7d)
validation_df = validation_df.withColumn("delay_origin_7d", delay_origin_7d)

### [Feature] Total departure-delay minutes by origin airport and carrier over the last 7 days

In [0]:
# Trailing window per (origin airport, carrier): rows whose timestamp lies
# between 7 days (604800 s) and 4 hours (14400 s) before the current row.
window_7d_origin_carrier = (
    Window.partitionBy("ORIGIN_AIRPORT_SEQ_ID", "OP_UNIQUE_CARRIER")
    .orderBy(F.col("utc_timestamp").cast("long"))
    .rangeBetween(-604800, -14400)
)

# Sum of DEP_DELAY_NEW inside the window; a null sum is replaced by 0.
delay_origin_carrier_7d = F.coalesce(
    F.sum("DEP_DELAY_NEW").over(window_7d_origin_carrier),
    F.lit(0),
)

train_df = train_df.withColumn("delay_origin_carrier_7d", delay_origin_carrier_7d)
validation_df = validation_df.withColumn("delay_origin_carrier_7d", delay_origin_carrier_7d)

### [Feature] Total departure-delay minutes on the route over the last 7 days

In [0]:
# Route label "<ORIGIN>-<DEST>", added to both splits.
route_label = F.concat(F.col("ORIGIN"), F.lit("-"), F.col("DEST"))

train_df = train_df.withColumn("route", route_label)
validation_df = validation_df.withColumn("route", route_label)

In [0]:
### Save this for future records
# Keep references to both splits at this point, while the descriptive
# origin/destination columns are still present; the error analysis later
# reads airport names from train_origin_dest.
train_origin_dest = train_df
val_origin_dest = validation_df

In [0]:
# Trailing window per route: rows whose timestamp lies between 7 days
# (604800 s) and 4 hours (14400 s) before the current row.
window_7d_route = (
    Window.partitionBy("route")
    .orderBy(F.col("utc_timestamp").cast("long"))
    .rangeBetween(-604800, -14400)
)

# Sum of DEP_DELAY_NEW inside the window; a null sum is replaced by 0.
delay_route_7d = F.coalesce(
    F.sum("DEP_DELAY_NEW").over(window_7d_route),
    F.lit(0),
)

train_df = train_df.withColumn("delay_route_7d", delay_route_7d)
validation_df = validation_df.withColumn("delay_route_7d", delay_route_7d)

### [Feature] - number of flights per day for one plane

In [0]:
# One partition per aircraft and flight date, ordered by timestamp.
# No explicit frame is given, so Spark's default for an ordered window
# applies (partition start up to the current row): the count is the
# aircraft's running flight number for that date, not the day's total.
window_flights_24h = Window.partitionBy("TAIL_NUM", "FL_DATE").orderBy(
    F.col("utc_timestamp").cast("long")
)

flight_count_expr = F.count("*").over(window_flights_24h)

train_df = train_df.withColumn("flight_count_24h", flight_count_expr)
validation_df = validation_df.withColumn("flight_count_24h", flight_count_expr)

### [Feature] time between landed and scheduled flight

In [0]:
# [Feature] minutes between the aircraft's previous landing and the current
# flight's scheduled departure (train and validation built the same way).
#
# - Flights are ordered by timestamp within each tail number and the
#   *previous* row supplies the landing time, so the feature never reads the
#   WHEELS_ON of the flight whose delay is being predicted.
# - WHEELS_ON and CRS_DEP_TIME are clock values (hhmm); both are converted to
#   minutes since midnight before subtracting, so the result really is in
#   minutes.
#   NOTE(review): assumes WHEELS_ON is stored as an hhmm value like
#   CRS_DEP_TIME — confirm against the checkpointed schema.
# - A negative difference means the previous landing was on an earlier day,
#   so one day (1440 min) is added. Gaps longer than 24 h are therefore
#   folded into the 0..1439 range.
# - Rows without a usable previous landing keep the -999 sentinel.
# NOTE(review): the previous landing may occur less than two hours before
# departure; confirm this is acceptable for the "predict two hours before"
# requirement.
window_turnaround = Window.partitionBy("TAIL_NUM").orderBy("utc_timestamp")


def _hhmm_to_minutes(hhmm):
    """Convert an hhmm clock-time Column (e.g. 1530) to minutes since midnight."""
    hhmm = hhmm.cast("int")
    return F.floor(hhmm / 100) * 60 + (hhmm % 100)


def _with_landing_time_diff(flights):
    """Append LANDING_TIME_DIFF_MINUTES (double, -999 when unknown) to `flights`."""
    with_prev_landing = flights.withColumn(
        "_prev_wheels_on",
        F.lag("WHEELS_ON", 1).over(window_turnaround),
    )
    gap = _hhmm_to_minutes(F.col("CRS_DEP_TIME")) - _hhmm_to_minutes(
        F.col("_prev_wheels_on")
    )
    # Wrap overnight turnarounds into a positive number of minutes.
    wrapped_gap = F.when(gap < 0, gap + 1440).otherwise(gap)
    return with_prev_landing.withColumn(
        "LANDING_TIME_DIFF_MINUTES",
        F.coalesce(wrapped_gap.cast("double"), F.lit(-999.0)),
    ).drop("_prev_wheels_on")


train_df = _with_landing_time_diff(train_df)
validation_df = _with_landing_time_diff(validation_df)

# Quick visual check of the new feature on the training split.
train_df.select(
    "TAIL_NUM", "utc_timestamp", "WHEELS_ON", "CRS_DEP_TIME", "LANDING_TIME_DIFF_MINUTES"
).orderBy("TAIL_NUM", "utc_timestamp").show(5)

### [Feature] Average Delay time by airport
- by origin airport and by destination

In [0]:
def _mean_arr_delay_by_dest(flights):
    """Return one row per destination airport with its mean ARR_DELAY."""
    per_destination = flights.groupBy("DEST_AIRPORT_SEQ_ID")
    return per_destination.agg(F.avg("ARR_DELAY").alias("AVG_ARR_DELAY"))


# Split-level averages (each split is aggregated on its own rows).
avg_delay_by_airport_train = _mean_arr_delay_by_dest(train_df)
avg_delay_by_airport_val = _mean_arr_delay_by_dest(validation_df)

In [0]:
# train_df.select("DEST", "ARR_DELAY", "AVG_ARR_DELAY").show(20, False)

In [0]:

# Trailing window per origin airport.
window_7d_origin = (
    Window.partitionBy("ORIGIN_AIRPORT_SEQ_ID")
    .orderBy(F.col("utc_timestamp").cast("long"))
    .rangeBetween(-604800, -14400)  # -7 days (604800s) to -4 hours (14400s)
)

# Mean ARR_DELAY inside the window; a null mean (empty window) becomes 0.
avg_arr_delay_origin = F.coalesce(
    F.avg("ARR_DELAY").over(window_7d_origin),
    F.lit(0),
)

train_df = train_df.withColumn("AVG_ARR_DELAY_ORIGIN", avg_arr_delay_origin)
validation_df = validation_df.withColumn("AVG_ARR_DELAY_ORIGIN", avg_arr_delay_origin)

### [Feature] Average taxi-out time by airport

In [0]:
# Mean TAXI_OUT over the per-origin trailing window (window_7d_origin,
# defined in an earlier cell); a null mean (empty window) becomes 0.
avg_taxi_out_origin = F.coalesce(
    F.avg("TAXI_OUT").over(window_7d_origin),
    F.lit(0),
)

train_df = train_df.withColumn("AVG_TAXI_OUT_ORIGIN", avg_taxi_out_origin)
validation_df = validation_df.withColumn("AVG_TAXI_OUT_ORIGIN", avg_taxi_out_origin)

### checkpoint features

In [0]:
# One output column per validation column, holding its number of nulls
# (count() only counts rows where the when() expression is non-null).
null_count_exprs = [
    F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in validation_df.columns
]
null_counts = validation_df.select(null_count_exprs)
display(null_counts)

### Number of delayed departures at the origin airport between 4 and 2 hours before departure

In [0]:
# Columns of interest after feature engineering: calendar fields, carrier and
# airport identifiers, schedule fields, the DEP_DELAY_NEW label, the timestamp
# and the engineered features.
# NOTE(review): this list is not referenced again in the visible cells;
# `baselines_columns` further down repeats the same entries.
useful_columns = [
    "QUARTER",
    "MONTH",
    "YEAR",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",
    "OP_CARRIER",
    # "TAIL_NUM",
    "ORIGIN_AIRPORT_SEQ_ID",
    "DEST_AIRPORT_SEQ_ID",
    "CRS_DEP_MINUTES",
    "CRS_ELAPSED_TIME",
    "DISTANCE",
    "DEP_DELAY_NEW",
    "utc_timestamp",
    "prev_flight_delay_in_minutes",
    "prev_flight_delay",
    "origin_delays_4h",
    "delay_origin_7d",
    "delay_origin_carrier_7d",
    "delay_route_7d",
    "flight_count_24h",
    "LANDING_TIME_DIFF_MINUTES",
    "AVG_ARR_DELAY_ORIGIN",
    "AVG_TAXI_OUT_ORIGIN"

]

In [0]:
# Per-origin window covering 4 hours (14400 s) to 2 hours (7200 s) before
# the current row's timestamp.
window_4h = (
    Window.partitionBy("ORIGIN_AIRPORT_SEQ_ID")
    .orderBy(F.col("utc_timestamp").cast("long"))
    .rangeBetween(-14400, -7200)
)

# when() yields 1 for delays above 15 minutes and null otherwise, so count()
# tallies only the delayed flights inside the window.
delayed_marker = F.when(F.col("DEP_DELAY_NEW") > 15, 1)
origin_delays_4h = F.count(delayed_marker).over(window_4h)

train_df = train_df.withColumn("origin_delays_4h", origin_delays_4h)
validation_df = validation_df.withColumn("origin_delays_4h", origin_delays_4h)

In [0]:
# train_df.columns

In [0]:
# Columns carried into modelling (TAIL_NUM is intentionally left out).
baselines_columns = [
    # calendar
    "QUARTER", "MONTH", "YEAR", "DAY_OF_MONTH", "DAY_OF_WEEK",
    # carrier and airports
    "OP_CARRIER", "ORIGIN_AIRPORT_SEQ_ID", "DEST_AIRPORT_SEQ_ID",
    # schedule
    "CRS_DEP_MINUTES", "CRS_ELAPSED_TIME", "DISTANCE",
    # label and timestamp
    "DEP_DELAY_NEW", "utc_timestamp",
    # engineered features
    "prev_flight_delay_in_minutes", "prev_flight_delay",
    "origin_delays_4h",
    "delay_origin_7d", "delay_origin_carrier_7d", "delay_route_7d",
    "flight_count_24h",
    "LANDING_TIME_DIFF_MINUTES",
    "AVG_ARR_DELAY_ORIGIN", "AVG_TAXI_OUT_ORIGIN",
]

# Drop rows without a label, then keep only the modelling columns.
has_label = F.col("DEP_DELAY_NEW").isNotNull()
train_df = train_df.where(has_label).select(*baselines_columns)
validation_df = validation_df.where(has_label).select(*baselines_columns)

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor

# Categorical encoding
carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="carrier_idx", handleInvalid="keep")
origin_indexer = StringIndexer(inputCol="ORIGIN_AIRPORT_SEQ_ID", outputCol="origin_idx", handleInvalid="keep")
dest_indexer = StringIndexer(inputCol="DEST_AIRPORT_SEQ_ID", outputCol="dest_idx", handleInvalid="keep")
tail_num_indexer = StringIndexer(inputCol="TAIL_NUM", outputCol="tail_num_idx", handleInvalid="keep")

carrier_encoder = OneHotEncoder(inputCol="carrier_idx", outputCol="carrier_vec")
origin_encoder = OneHotEncoder(inputCol="origin_idx", outputCol="origin_vec")
dest_encoder = OneHotEncoder(inputCol="dest_idx", outputCol="dest_vec")
tail_num_encoder = OneHotEncoder(inputCol="tail_num_idx", outputCol="tail_num_vec")



In [0]:
# Assemble all features into a single "features" vector.
# The groups are concatenated in this order, which fixes the vector layout.
calendar_inputs = ["QUARTER", "MONTH", "YEAR", "DAY_OF_MONTH", "DAY_OF_WEEK"]
encoded_inputs = ["carrier_vec", "origin_vec", "dest_vec"]  # "tail_num_vec" left out
schedule_inputs = ["CRS_DEP_MINUTES", "CRS_ELAPSED_TIME", "DISTANCE"]
engineered_inputs = [
    "prev_flight_delay_in_minutes",
    "prev_flight_delay",
    "origin_delays_4h",
    "delay_origin_7d",
    "delay_origin_carrier_7d",
    "delay_route_7d",
    "flight_count_24h",
    "LANDING_TIME_DIFF_MINUTES",
    "AVG_ARR_DELAY_ORIGIN",
    "AVG_TAXI_OUT_ORIGIN",
]

assembler = VectorAssembler(
    inputCols=calendar_inputs + encoded_inputs + schedule_inputs + engineered_inputs,
    outputCol="features",
)

In [0]:
# Random forest baseline on all engineered features.
# (An earlier iteration of this cell trained a LinearRegression with
# maxIter=10, regParam=0.3 in place of the random forest.)
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from mlflow.models import infer_signature

mlflow.spark.autolog()
with mlflow.start_run(run_name="RF-1y-all ag, mk, so feats"):
    MODEL_NAME = "RF_1y_BASELINE_ALL_FEATS"

    # The forest is seeded with the notebook-wide RANDOM_SEED so that repeated
    # runs on the same data are comparable.
    rf = RandomForestRegressor(
        featuresCol="features",
        labelCol="DEP_DELAY_NEW",
        numTrees=20,
        maxDepth=10,
        seed=RANDOM_SEED,
    )

    # Index + one-hot encode the categoricals, assemble the feature vector,
    # then fit the regressor. TAIL_NUM stages are deliberately not included.
    pipeline = Pipeline(stages=[
        carrier_indexer, origin_indexer, dest_indexer,
        carrier_encoder, origin_encoder, dest_encoder,
        assembler,
        rf,
    ])

    model = pipeline.fit(train_df)
    training_predictions = model.transform(train_df)
    validation_predictions = model.transform(validation_df)

    mae_evaluator = RegressionEvaluator(
        labelCol="DEP_DELAY_NEW",
        predictionCol="prediction",
        metricName="mae",
    )

    rmse_evaluator = RegressionEvaluator(
        labelCol="DEP_DELAY_NEW",
        predictionCol="prediction",
        metricName="rmse",
    )

    # Calculate MAE
    mae_t = mae_evaluator.evaluate(training_predictions)
    mae_v = mae_evaluator.evaluate(validation_predictions)
    # Calculate RMSE
    rmse_t = rmse_evaluator.evaluate(training_predictions)
    rmse_v = rmse_evaluator.evaluate(validation_predictions)

    # NOTE(review): the signature and input example are taken from train_df,
    # which still contains the DEP_DELAY_NEW label and utc_timestamp —
    # confirm that is the intended model input schema.
    signature = infer_signature(train_df, training_predictions)

    mlflow.spark.log_model(
        model,
        MODEL_NAME,
        input_example=train_df.limit(1).toPandas(),
        signature=signature,
        registered_model_name="flight_delay_prediction_baseline",
    )

    mlflow.log_metric("train_mae", mae_t)
    mlflow.log_metric("validation_mae", mae_v)
    mlflow.log_metric("train_rmse", rmse_t)
    mlflow.log_metric("validation_rmse", rmse_v)


## (MK) Error Analysis

- Which airlines does the model predict best / worst for?
- Which airports show the highest prediction error?
- Are errors worse at certain times of day, days of week, seasons?
- Does the model systematically underpredict severe delays?

#### Which airlines does the model predict best / worst for?
Interpretation Guidance:
- High MAE: unpredictable airline operations
- Positive bias: model predicts delays higher than actual
- Negative bias: model underpredicts delays (dangerous)

In [0]:
# Signed error (prediction minus actual delay) and its absolute value,
# one pair per validation row.
signed_error = F.col("prediction") - F.col("DEP_DELAY_NEW")

val = validation_predictions.withColumn("error", signed_error)
val = val.withColumn("abs_error", F.abs(F.col("error")))

In [0]:
############################
##### Error by Airline #####
############################

# Per-carrier MAE, mean signed error (bias) and row count, worst MAE first.
carrier_metrics = [
    F.avg("abs_error").alias("mae"),
    F.avg("error").alias("bias"),
    F.count("*").alias("count"),
]
carrier_errors = (
    val.groupBy("OP_CARRIER")
    .agg(*carrier_metrics)
    .orderBy(F.desc("mae"))
)

display(carrier_errors)


Databricks visualization. Run in Databricks to view.

In [0]:
# Collect the data into Pandas for plotting
# (carrier_errors holds one aggregated row per OP_CARRIER).
carrier_errors_pd = carrier_errors.toPandas()

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

# Bar chart of MAE per airline, drawn on a fresh 12x6 figure.
plt.figure(figsize=(12, 6))
carrier_ax = sns.barplot(
    data=carrier_errors_pd,
    x="OP_CARRIER",
    y="mae",
    palette="flare",
)

carrier_ax.set_title("Mean Absolute Error by Airline")
carrier_ax.set_xlabel("Airline (OP_CARRIER)")
carrier_ax.set_ylabel("Mean Absolute Error (minutes)")
plt.xticks(rotation=45)  # tilt the carrier codes so they stay readable
plt.show()


In [0]:
# Print the column names and types of the training DataFrame for inspection.
train_df.printSchema()

In [0]:
#################################################
##### Error by Origin / Destination Airport #####
#################################################

from pyspark.sql import functions as F

def _airport_lookup(prefix):
    """Distinct (seq id, code, city, state) rows for "ORIGIN" or "DEST" airports."""
    lookup_columns = [
        f"{prefix}_AIRPORT_SEQ_ID",
        prefix,
        f"{prefix}_CITY_NAME",
        f"{prefix}_STATE_ABR",
    ]
    return train_origin_dest.select(*lookup_columns).distinct()


# For origin airports
origin_map = _airport_lookup("ORIGIN")

# For destination airports
dest_map = _airport_lookup("DEST")


In [0]:
# Per-airport validation error (MAE and row count), worst MAE first.
# These aggregates are computed here because the joins below and the later
# filtering cells need `origin_errors` / `dest_errors`, and no other
# uncommented cell assigns them.
origin_errors = (
    val.groupBy("ORIGIN_AIRPORT_SEQ_ID")
    .agg(
        F.avg("abs_error").alias("mae"),
        F.count("*").alias("count"),
    )
    .orderBy(F.desc("mae"))
)

dest_errors = (
    val.groupBy("DEST_AIRPORT_SEQ_ID")
    .agg(
        F.avg("abs_error").alias("mae"),
        F.count("*").alias("count"),
    )
    .orderBy(F.desc("mae"))
)

# Attach airport code / city / state; a left join keeps airports that have
# no entry in the lookup table.
origin_errors_named = origin_errors.join(
    origin_map,
    on="ORIGIN_AIRPORT_SEQ_ID",
    how="left"
)

display(origin_errors_named)

dest_errors_named = dest_errors.join(
    dest_map,
    on="DEST_AIRPORT_SEQ_ID",
    how="left"
)

display(dest_errors_named)

In [0]:
# Keep only airports with more than 50 validation flights.
enough_flights = F.col("count") > 50

origin_errors_named_filtered = origin_errors_named.where(enough_flights)
display(origin_errors_named_filtered)

dest_errors_named_filtered = dest_errors_named.where(enough_flights)
display(dest_errors_named_filtered)

In [0]:
def _top_20_by_mae(errors):
    """Return the 20 rows with the highest MAE as a pandas DataFrame."""
    return errors.orderBy(F.desc("mae")).limit(20).toPandas()


# Only take top 20 origin airports by MAE
top_origin_errors = _top_20_by_mae(origin_errors_named_filtered)

# Only take top 20 destination airports by MAE
top_dest_errors = _top_20_by_mae(dest_errors_named_filtered)


In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")


def _plot_top_airports(frame, airport_col, palette, title, airport_label):
    """Horizontal bar chart of MAE per airport on a fresh 12x6 figure."""
    plt.figure(figsize=(12, 6))
    sns.barplot(x="mae", y=airport_col, data=frame, palette=palette)
    plt.title(title)
    plt.xlabel("Mean Absolute Error")
    plt.ylabel(airport_label)
    plt.show()


# Origin airports
_plot_top_airports(
    top_origin_errors,
    "ORIGIN",
    "Reds_r",
    "Top 20 Origin Airports by MAE (Prediction Error)",
    "Origin Airport",
)

# Destination airports
_plot_top_airports(
    top_dest_errors,
    "DEST",
    "Blues_r",
    "Top 20 Destination Airports by MAE (Prediction Error)",
    "Destination Airport",
)


In [0]:
# #################################################
# ##### Error by Origin / Destination Airport #####
# #################################################

# origin_errors = val.groupBy("ORIGIN_AIRPORT_SEQ_ID") \
#     .agg(F.avg("abs_error").alias("mae"),
#          F.count("*").alias("count")) \
#     .orderBy(F.desc("mae"))

# dest_errors = val.groupBy("DEST_AIRPORT_SEQ_ID") \
#     .agg(F.avg("abs_error").alias("mae"),
#          F.count("*").alias("count")) \
#     .orderBy(F.desc("mae"))

# display(origin_errors)
# display(dest_errors)


In [0]:
### Error for origin airports with flight vol > 50 ###
# Requires `origin_errors` (per-origin "mae" and "count") from an earlier cell.
origin_errors_filtered = origin_errors.filter(F.col("count") > 50)
display(origin_errors_filtered)


In [0]:
### Error for destination airports with flight vol > 50 ###
# Requires `dest_errors` (per-destination "mae" and "count") from an earlier cell.
dest_errors_filtered = dest_errors.filter(F.col("count") > 50)
display(dest_errors_filtered)

In [0]:
##########################
##### Error by Route #####
##########################

# One row per (origin, destination) pair: MAE and row count, worst MAE first.
route_keys = ["ORIGIN_AIRPORT_SEQ_ID", "DEST_AIRPORT_SEQ_ID"]
route_errors = (
    val.groupBy(*route_keys)
    .agg(
        F.avg("abs_error").alias("mae"),
        F.count("*").alias("count"),
    )
    .orderBy(F.desc("mae"))
)

display(route_errors)

In [0]:
################################
##### Error by Time of Day #####
################################

# Scheduled departure hour, derived from minutes since midnight.
departure_hour = (F.col("CRS_DEP_MINUTES") / 60).cast("int")
val = val.withColumn("CRS_DEP_HOUR", departure_hour)

# MAE and row count per departure hour, in hour order.
timeofday_errors = (
    val.groupBy("CRS_DEP_HOUR")
    .agg(
        F.avg("abs_error").alias("mae"),
        F.count("*").alias("count"),
    )
    .orderBy("CRS_DEP_HOUR")
)

display(timeofday_errors)


In [0]:
########################################
##### Error by Day of Week / Month #####
########################################

def _mae_by(column_name):
    """MAE of `val` grouped by `column_name`, sorted by that column."""
    grouped = val.groupBy(column_name)
    return grouped.agg(F.avg("abs_error").alias("mae")).orderBy(column_name)


dow_errors = _mae_by("DAY_OF_WEEK")
month_errors = _mae_by("MONTH")

display(dow_errors)
display(month_errors)


In [0]:
# Print the column names and types of the validation error DataFrame.
val.printSchema()

In [0]:
#########################################
##### Absolute Error vs Actual Delay ####
#########################################

# Collects two columns for every validation row into a pandas DataFrame.
# NOTE(review): no sampling is applied before toPandas() — confirm the
# validation split fits in driver memory, or sample first.
error_vs_actual = val.select("DEP_DELAY_NEW", "abs_error").toPandas()

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

plt.figure(figsize=(10, 6))

# Both layers plot the same columns of the same frame.
shared_axes = dict(x="DEP_DELAY_NEW", y="abs_error", data=error_vs_actual)

# Raw points, semi-transparent so dense regions stay visible.
sns.scatterplot(alpha=0.4, **shared_axes)

# Optional: smooth trendline (LOWESS) drawn over the points.
sns.regplot(scatter=False, color="red", lowess=True, **shared_axes)

plt.title("Absolute Error vs Actual Delay")
plt.xlabel("Actual Delay (minutes)")
plt.ylabel("Absolute Error (minutes)")
plt.show()


## Checkpoint results to MLflow?

## Develop and Run Pipeline

## Cross Validation

## Save metrics, pipeline, any other steps

## Previous Feature Engineering Ideas
- Previous flight delay in minutes for the aircraft [DONE] - this added value to the linear regression model!
- Number of delayed flights from 4 hours (DONE)
- Number of delays in the route in the last 30 days
- Time between landing and scheduled current flight
- Airport + utc time type of delay - Ohare at 6PM is always late
- Number of delayed flights in departure and arrival location (total or 4 hours before, 6 hours before, etc.)
- Average delay time by airport
- Average taxi out time by airport/flight