## Imports

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


import random

import mlflow
print(mlflow.__version__)

import os


## Helper Functions


In [0]:
def checkpoint_dataset(dataset, file_path):
    """Write ``dataset`` as parquet under the group's DBFS folder.

    The target is ``dbfs:/student-groups/Group_2_2/<file_path>.parquet``.
    The group folder and the target's parent folder are created first, and
    the write uses mode ``"overwrite"``.

    Args:
        dataset: Object exposing ``write.mode(...).parquet(...)``.
        file_path: Path relative to the group folder, without the
            ``.parquet`` suffix; may contain sub-directories.
    """
    section, number = "2", "2"
    group_root = f"dbfs:/student-groups/Group_{section}_{number}"
    target = f"{group_root}/{file_path}.parquet"
    # Everything before the last "/" is the folder that will hold the file.
    parent_dir = target.rsplit("/", 1)[0]

    # Create the group folder first, then the target's parent folder.
    for folder in (group_root, parent_dir):
        dbutils.fs.mkdirs(folder)

    dataset.write.mode("overwrite").parquet(target)
    print(f"Checkpointed {file_path}")

In [0]:
# Turn on the Databricks `trackMLlib` MLflow setting for this Spark session.
spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

RANDOM_SEED = 0
# Workspace path of the team's shared MLflow experiment.
EXPERIMENT_NAME = "/Shared/team_2_2/mlflow-baseline"

# Reuse the experiment when it already exists, otherwise create it, then make
# it the active experiment.
try:
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is not None:
        print(f"Using existing experiment: {experiment.name}")
    else:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Created new experiment with ID: {experiment_id}")
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as e:
    # Best effort: report the problem and fall back to a per-user experiment
    # so that the rest of the notebook can still log runs.
    print(f"Error with experiment setup: {e}")
    _current_user = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()
    mlflow.set_experiment(f"/Users/{_current_user}/default")

## Datasets

### Custom Join Dataset - 1 year

In [0]:
%fs ls dbfs:/mnt/mids-w261/daniel_costa@berkeley.edu/Custom_Joins/V3/

In [0]:
# Load the 1-year custom-joined dataset and keep only flights whose CANCELLED
# flag is not 1.
custom_joined_path = 'dbfs:/mnt/mids-w261/daniel_costa@berkeley.edu/Custom_Joins/V3/custom_join_v3_1y.parquet'

df = spark.read.parquet(custom_joined_path).filter(F.col("CANCELLED") != 1)

# Row count after the filter, followed by a ten-row preview.
print(df.count())
display(df.limit(10))

# Things to keep in mind
- Predict two hours before
- Remove all the delay columns
- Are we only predicting departure delays or arrival delays also? For example, the pilot misses the landing, and has to circle back for 20 minutes. Should we solve for that? I don't think we should.

## Preprocessing / Cleanup

In [0]:
# Cache the filtered joined frame; the cells below reuse `df` several times
# (timestamp column, row numbering, count, three split filters).
df = df.cache() # cache joined dataset

In [0]:
# Combine the flight date and the scheduled departure time into one timestamp.
# CRS_DEP_TIME is left-padded to four digits so that e.g. 905 parses as "0905".
# NOTE(review): the column is named `utc_timestamp`, but a later cell in this
# notebook describes CRS_DEP_TIME as local time — confirm the intended zone.
_padded_dep_time = F.lpad(F.col("CRS_DEP_TIME").cast("string"), 4, "0")
_date_and_time = F.concat(F.col("FL_DATE"), F.lit(" "), _padded_dep_time)

df = df.withColumn(
    "utc_timestamp",
    F.to_timestamp(_date_and_time, "yyyy-MM-dd HHmm"),
)

### Split the 1-year joined dataset (chronological 70% train / 10% validation / 20% test)

In [0]:
from pyspark.sql import Window
# Fractions of the time-ordered rows that go to train and validation; the
# remainder becomes the test split.
TRAIN_SIZE = 0.70
VALIDATION_SIZE = 0.10

# REMOVE ALL CANCELLED FLIGHTS, then order chronologically.
df = df.filter(F.col("CANCELLED") != 1).sort('utc_timestamp')

# Number the rows in timestamp order.
# NOTE(review): the window has no partitionBy and rows sharing a timestamp
# have no tie-breaker, so the exact boundary rows may vary between runs.
window = Window.orderBy('utc_timestamp')
df = df.withColumn("row_num", F.row_number().over(window))

total_rows = df.count()

# Last row number of the train split and of the validation split.
train_end = int(total_rows * TRAIN_SIZE)
validation_end = int(total_rows * (TRAIN_SIZE + VALIDATION_SIZE))  # 70% + 10%

# Cut by row number and discard the helper column in the same step.
_row_num = F.col("row_num")
train_df = df.filter(_row_num <= train_end).drop("row_num")
validation_df = df.filter((_row_num > train_end) & (_row_num <= validation_end)).drop("row_num")
test_df = df.filter(_row_num > validation_end).drop("row_num")

In [0]:
# Latest timestamp present in the training split.
last_flight_ts = train_df.select(F.max("utc_timestamp").alias("last_ts")).first()["last_ts"]

# Two hours after the last training flight.
gap_ts = F.timestamp_add("HOUR", F.lit(2), F.lit(last_flight_ts))

# Keep only validation flights strictly after the gap.
# NOTE(review): no equivalent gap is applied between validation_df and
# test_df — confirm that is intended.
validation_df = validation_df.filter(F.col("utc_timestamp") > gap_ts)

In [0]:
%fs ls dbfs:/mnt/mids-w261/daniel_costa@berkeley.edu/Custom_Joins/V3/

In [0]:
# Writing overwrites existing checkpoints, so ask for explicit confirmation.
_confirmation = input("CAREFUL: You're about to write to DBFS. Type 'y' to continue.")
if _confirmation == "y":
    for _split_df, _target in (
        (train_df, "1_year_custom_joined/raw_data/training_splits/train"),
        (validation_df, "1_year_custom_joined/raw_data/training_splits/validation"),
        (test_df, "1_year_custom_joined/raw_data/training_splits/test"),
    ):
        checkpoint_dataset(_split_df, _target)

#### check checkpoint files

In [0]:
%fs ls dbfs:/student-groups/Group_2_2/1_year_custom_joined/raw_data/training_splits

## Model Iterations

In [0]:
# Root of the group's checkpoints and the dataset variant to load.
checkpoint_path = "dbfs:/student-groups/Group_2_2"
month_or_year = "5_year_custom_joined"

dataset_path = f"{checkpoint_path}/{month_or_year}/raw_data/training_splits"

# Read the three splits back from their parquet checkpoints.
train_df, validation_df, test_df = (
    spark.read.parquet(f"{dataset_path}/{_split}.parquet")
    for _split in ("train", "validation", "test")
)

In [0]:
# Sanity check: True when validation and test expose the same columns in the
# same order (list equality is order-sensitive).
validation_df.columns == test_df.columns

In [0]:
# Print every column that exists in the training split but not in the test
# split.  The frames loaded above are used because `check_train_df` and
# `check_test_df` are not defined at this point in the notebook.
# The loop variable is deliberately not named `col`: that name is the pyspark
# `col` function imported at the top, which later cells still call.
_test_columns = set(test_df.columns)
for column_name in train_df.columns:
    if column_name not in _test_columns:
        print(column_name)

### Drop rows with missing weather values

In [0]:
# Hourly weather measurements that must all be present for a row to be kept.
_WEATHER_COLUMNS = [
    'HourlyDryBulbTemperature',
    'HourlyDewPointTemperature',
    'HourlyRelativeHumidity',
    'HourlyAltimeterSetting',
    'HourlyVisibility',
    'HourlyStationPressure',
    'HourlyWetBulbTemperature',
    'HourlyPrecipitation',
    'HourlyCloudCoverage',
    'HourlyCloudElevation',
    'HourlyWindSpeed',
]

# Apply the same filter to every split.
train_df = train_df.dropna(subset=_WEATHER_COLUMNS)
validation_df = validation_df.dropna(subset=_WEATHER_COLUMNS)
test_df = test_df.dropna(subset=_WEATHER_COLUMNS)

In [0]:
# Feature Engineering
#
# CRS_DEP_TIME is an HHMM clock value in local time.  Converting it to minutes
# since midnight gives a usable time-of-day feature without treating the raw
# HHMM number as if it were a true UTC time.
_hhmm = F.col("CRS_DEP_TIME")
_minutes_since_midnight = F.floor(_hhmm / 100) * 60 + (_hhmm % 100)

train_df = train_df.withColumn("CRS_DEP_MINUTES", _minutes_since_midnight)
validation_df = validation_df.withColumn("CRS_DEP_MINUTES", _minutes_since_midnight)
test_df = test_df.withColumn("CRS_DEP_MINUTES", _minutes_since_midnight)


### Feature Eng.

#### Was the previous flight delayed? And by how much was the previous flight delayed?

In [0]:
# Cache all three splits before the window features below are layered on top.
train_df, validation_df, test_df = (
    _split.cache() for _split in (train_df, validation_df, test_df)
)

In [0]:
# Per origin airport, look at flights scheduled between 4 hours (14400 s) and
# 2 hours (7200 s) before the current flight.
window_4h = (
    Window
    .partitionBy("ORIGIN_AIRPORT_SEQ_ID")
    .orderBy(F.col("utc_timestamp").cast("long"))
    .rangeBetween(-14400, -7200)
)

# Number of flights in that window whose DEP_DELAY_NEW exceeds 15.
_delayed_in_window = F.count(F.when(F.col("DEP_DELAY_NEW") > 15, 1)).over(window_4h)

train_df = train_df.withColumn("origin_delays_4h", _delayed_in_window)
validation_df = validation_df.withColumn("origin_delays_4h", _delayed_in_window)
test_df = test_df.withColumn("origin_delays_4h", _delayed_in_window)

In [0]:
def _add_prev_flight_delay(flights):
    """Add the same aircraft's previous-flight delay features.

    Adds ``prev_flight_delay_in_minutes`` (DEP_DELAY_NEW of the preceding
    flight of the same TAIL_NUM ordered by utc_timestamp, or -1 when there is
    none) and ``prev_flight_delay`` (1 when that value exceeds 15, else 0).

    NOTE(review): the notebook aims to predict two hours before departure; the
    previous flight's delay may not be known by then — confirm no leakage.
    """
    by_tail = Window.partitionBy("TAIL_NUM").orderBy("utc_timestamp")
    previous_delay = F.col("prev_flight_delay_in_minutes")
    return (
        flights
        .withColumn("prev_flight_delay_in_minutes", F.lag("DEP_DELAY_NEW", 1).over(by_tail))
        .withColumn(
            "prev_flight_delay_in_minutes",
            F.when(previous_delay.isNull(), -1).otherwise(previous_delay),
        )
        .withColumn(
            "prev_flight_delay",
            F.when(previous_delay > 15, 1).otherwise(F.lit(0)),
        )
    )


train_df = _add_prev_flight_delay(train_df)
validation_df = _add_prev_flight_delay(validation_df)
test_df = _add_prev_flight_delay(test_df)


### [Feature] Delay time for flights at departure locations over the past 7 days

In [0]:
# Per origin airport: flights scheduled from 7 days (604800 s) to 4 hours
# (14400 s) before the current flight.
window_7d_origin = (
    Window
    .partitionBy("ORIGIN_AIRPORT_SEQ_ID")
    .orderBy(F.col("utc_timestamp").cast("long"))
    .rangeBetween(-604800, -14400)
)


def _add_delay_origin_7d(flights):
    """Add ``delay_origin_7d``: sum of DEP_DELAY_NEW over ``window_7d_origin``, 0 when the window is empty."""
    with_raw = flights.withColumn(
        'delay_origin_7d_sum_raw',
        F.sum('DEP_DELAY_NEW').over(window_7d_origin),
    )
    # An empty window yields null; replace it with 0 and drop the helper column.
    return with_raw.withColumn(
        'delay_origin_7d',
        F.coalesce(F.col('delay_origin_7d_sum_raw'), F.lit(0)),
    ).drop('delay_origin_7d_sum_raw')


train_df = _add_delay_origin_7d(train_df)
validation_df = _add_delay_origin_7d(validation_df)
test_df = _add_delay_origin_7d(test_df)

### [Feature] Total departure-delay minutes per origin airport and carrier over the last 7 days

In [0]:
# Per (origin airport, carrier): flights scheduled from 7 days (604800 s) to
# 4 hours (14400 s) before the current flight.
window_7d_origin_carrier = (
    Window
    .partitionBy("ORIGIN_AIRPORT_SEQ_ID", "OP_UNIQUE_CARRIER")
    .orderBy(F.col("utc_timestamp").cast("long"))
    .rangeBetween(-604800, -14400)
)


def _add_delay_origin_carrier_7d(flights):
    """Add ``delay_origin_carrier_7d``: sum of DEP_DELAY_NEW over ``window_7d_origin_carrier``, 0 when empty."""
    with_raw = flights.withColumn(
        'delay_origin_carrier_7d_raw',
        F.sum('DEP_DELAY_NEW').over(window_7d_origin_carrier),
    )
    # An empty window yields null; replace it with 0 and drop the helper column.
    return with_raw.withColumn(
        'delay_origin_carrier_7d',
        F.coalesce(F.col('delay_origin_carrier_7d_raw'), F.lit(0)),
    ).drop('delay_origin_carrier_7d_raw')


train_df = _add_delay_origin_carrier_7d(train_df)
validation_df = _add_delay_origin_carrier_7d(validation_df)
test_df = _add_delay_origin_carrier_7d(test_df)

### [Feature] Total departure-delay minutes on the route over the last 7 days

In [0]:
# Route key of the form "<ORIGIN>-<DEST>", added to every split.
_route_key = F.concat(F.col("ORIGIN"), F.lit("-"), F.col("DEST"))

train_df = train_df.withColumn("route", _route_key)
validation_df = validation_df.withColumn("route", _route_key)
test_df = test_df.withColumn("route", _route_key)

In [0]:
# Per route: flights scheduled from 7 days (604800 s) to 4 hours (14400 s)
# before the current flight.
window_7d_route = (
    Window
    .partitionBy("route")
    .orderBy(F.col("utc_timestamp").cast("long"))
    .rangeBetween(-604800, -14400)
)


def _add_delay_route_7d(flights):
    """Add ``delay_route_7d``: sum of DEP_DELAY_NEW over ``window_7d_route``, 0 when the window is empty."""
    with_raw = flights.withColumn(
        'delay_route_7d_sum_raw',
        F.sum('DEP_DELAY_NEW').over(window_7d_route),
    )
    # An empty window yields null; replace it with 0 and drop the helper column.
    return with_raw.withColumn(
        'delay_route_7d',
        F.coalesce(F.col('delay_route_7d_sum_raw'), F.lit(0)),
    ).drop('delay_route_7d_sum_raw')


train_df = _add_delay_route_7d(train_df)
validation_df = _add_delay_route_7d(validation_df)
test_df = _add_delay_route_7d(test_df)

### [Feature] Number of flights per day for one plane (running count within the flight date)

In [0]:
# One partition per aircraft and flight date, ordered by scheduled time.
# NOTE(review): the window is ordered and has no explicit frame, so this is
# presumably a running count up to the current flight (Spark's default frame
# for ordered windows) rather than the day's total — confirm.
window_flights_24h = (
    Window
    .partitionBy("TAIL_NUM", "FL_DATE")
    .orderBy(F.col("utc_timestamp").cast("long"))
)

_flight_count = F.count("*").over(window_flights_24h)

train_df = train_df.withColumn('flight_count_24h', _flight_count)
validation_df = validation_df.withColumn('flight_count_24h', _flight_count)
test_df = test_df.withColumn('flight_count_24h', _flight_count)

### [Feature] time between landed and scheduled flight

In [0]:
from pyspark.sql import functions as F

def hhmm_to_time_str(col):
    """Build a Column expression that renders an HHMM column as ``"HH:MM"``.

    Args:
        col: Name of the column holding the HHMM value.

    Returns:
        A Column expression: the value is cast to string, left-padded with
        "0" to four characters, and its first two and last two characters are
        joined with ":".
    """
    four_digits = F.lpad(F.col(col).cast("string"), 4, "0")
    hours = four_digits.substr(1, 2)
    minutes = four_digits.substr(3, 2)
    return F.concat_ws(":", hours, minutes)

In [0]:
# train_df = train_df.withColumn(
#     "CRS_ARR_TIME_STR",
#     hhmm_to_time_str("ARR_TIME")
# ).withColumn(
#     "WHEELS_ON_STR",
#     hhmm_to_time_str("WHEELS_ON")
# )

# train_df = train_df.withColumn(
#     "CRS_ARR_TIMESTAMP",
#     F.to_timestamp("CRS_ARR_TIME_STR", "HH:mm")
# ).withColumn(
#     "WHEELS_ON_TIMESTAMP",
#     F.to_timestamp("WHEELS_ON_STR", "HH:mm")
# )

# train_df = train_df.withColumn(
#     "LANDING_TIME_DIFF_MINUTES",
#     F.coalesce(
#         (
#             (F.col("WHEELS_ON_TIMESTAMP").cast("long") - 
#              F.col("CRS_ARR_TIMESTAMP").cast("long")) / 60
#         ),
#         F.lit(0)
#     )
# )

# validation_df = validation_df.withColumn(
#     "CRS_ARR_TIME_STR",
#     hhmm_to_time_str("ARR_TIME")
# ).withColumn(
#     "WHEELS_ON_STR",
#     hhmm_to_time_str("WHEELS_ON")
# )

# validation_df = validation_df.withColumn(
#     "CRS_ARR_TIMESTAMP",
#     F.to_timestamp("CRS_ARR_TIME_STR", "HH:mm")
# ).withColumn(
#     "WHEELS_ON_TIMESTAMP",
#     F.to_timestamp("WHEELS_ON_STR", "HH:mm")
# )

# validation_df = validation_df.withColumn(
#     "LANDING_TIME_DIFF_MINUTES",
#     F.coalesce(
#         (
#             (F.col("WHEELS_ON_TIMESTAMP").cast("long") - 
#              F.col("CRS_ARR_TIMESTAMP").cast("long")) / 60
#         ),
#         F.lit(0)
#     )
# )

In [0]:
def _hhmm_to_minutes(hhmm_column):
    """Convert an HHMM-encoded clock column (e.g. 1530) to minutes since midnight (e.g. 930)."""
    hhmm = hhmm_column.cast("long")
    return F.floor(hhmm / 100) * 60 + (hhmm % 100)


# Walk each aircraft's flights in chronological order.  Ordering by WHEELS_ON
# (an HHMM clock value) would interleave flights from different days, so the
# "next" row would not be the aircraft's next flight.
window_turnaround = Window \
    .partitionBy("TAIL_NUM") \
    .orderBy(F.col("utc_timestamp").cast("long"))


def _add_landing_time_diff(flights):
    """Add ``LANDING_TIME_DIFF_MINUTES`` to ``flights``.

    The value is the aircraft's next scheduled departure (CRS_DEP_TIME of the
    following flight of the same TAIL_NUM) minus this flight's WHEELS_ON, both
    converted from HHMM to minutes since midnight.  Rows without a next flight
    or without WHEELS_ON get the sentinel -999.  The column is kept as double.

    NOTE(review): differences are between clock times only, so a next
    departure on a later day can yield a negative or understated value.
    NOTE(review): WHEELS_ON of the current flight is only known after it has
    departed — confirm this feature does not leak into a pre-departure model.
    """
    with_next_departure = flights.withColumn(
        "next_scheduled_dep_ts",
        F.lead("CRS_DEP_TIME", 1).over(window_turnaround),
    )
    minutes_between = (
        _hhmm_to_minutes(F.col("next_scheduled_dep_ts"))
        - _hhmm_to_minutes(F.col("WHEELS_ON"))
    ).cast("double")
    return with_next_departure.withColumn(
        "LANDING_TIME_DIFF_MINUTES",
        F.coalesce(minutes_between, F.lit(-999)),
    ).drop("next_scheduled_dep_ts")


train_df = _add_landing_time_diff(train_df)
validation_df = _add_landing_time_diff(validation_df)
test_df = _add_landing_time_diff(test_df)

# Preview a few rows per aircraft in chronological order.
train_df.orderBy("TAIL_NUM", "utc_timestamp") \
    .select("TAIL_NUM", "WHEELS_ON", "CRS_DEP_TIME", "LANDING_TIME_DIFF_MINUTES") \
    .show(5)

### [Feature] Average Delay time by airport
- by origin airport and by destination

In [0]:
def _avg_arr_delay_by_dest(flights):
    """Return one row per DEST_AIRPORT_SEQ_ID with the mean ARR_DELAY as ``AVG_ARR_DELAY``."""
    return flights.groupBy("DEST_AIRPORT_SEQ_ID").agg(
        F.avg("ARR_DELAY").alias("AVG_ARR_DELAY")
    )


# Computed separately for each split.
avg_delay_by_airport_train = _avg_arr_delay_by_dest(train_df)
avg_delay_by_airport_val = _avg_arr_delay_by_dest(validation_df)
avg_delay_by_airport_test = _avg_arr_delay_by_dest(test_df)

In [0]:
# train_df.select("DEST", "ARR_DELAY", "AVG_ARR_DELAY").show(20, False)

In [0]:

# Per origin airport: flights scheduled from 7 days (604800 s) to 4 hours
# (14400 s) before the current flight.
window_7d_origin = (
    Window
    .partitionBy("ORIGIN_AIRPORT_SEQ_ID")
    .orderBy(F.col("utc_timestamp").cast("long"))
    .rangeBetween(-604800, -14400)
)


def _add_avg_arr_delay_origin(flights):
    """Add ``AVG_ARR_DELAY_ORIGIN``: mean ARR_DELAY over ``window_7d_origin``, 0 when the window is empty."""
    with_raw = flights.withColumn(
        'avg_delay_origin_7d_raw',
        F.avg('ARR_DELAY').over(window_7d_origin),
    )
    # An empty window yields null; replace it with 0 and drop the helper column.
    return with_raw.withColumn(
        'AVG_ARR_DELAY_ORIGIN',
        F.coalesce(F.col('avg_delay_origin_7d_raw'), F.lit(0)),
    ).drop('avg_delay_origin_7d_raw')


train_df = _add_avg_arr_delay_origin(train_df)
validation_df = _add_avg_arr_delay_origin(validation_df)
test_df = _add_avg_arr_delay_origin(test_df)

### [Feature] Average taxi-out time by airport

In [0]:
def _add_avg_taxi_out_origin(flights):
    """Add ``AVG_TAXI_OUT_ORIGIN``: mean TAXI_OUT over ``window_7d_origin``, 0 when the window is empty."""
    with_raw = flights.withColumn(
        'avg_taxi_out_origin_7d_raw',
        F.avg('TAXI_OUT').over(window_7d_origin),
    )
    # An empty window yields null; replace it with 0 and drop the helper column.
    return with_raw.withColumn(
        'AVG_TAXI_OUT_ORIGIN',
        F.coalesce(F.col('avg_taxi_out_origin_7d_raw'), F.lit(0)),
    ).drop('avg_taxi_out_origin_7d_raw')


train_df = _add_avg_taxi_out_origin(train_df)
validation_df = _add_avg_taxi_out_origin(validation_df)
test_df = _add_avg_taxi_out_origin(test_df)

### [Feature] Is holiday?
US Holidays only

In [0]:
# List the test split's columns after the feature cells above.
test_df.columns

In [0]:
# Render the training split; this evaluates every window feature added so far.
display(train_df)

In [0]:
from pyspark.sql.functions import date_add, date_sub, to_date
from datetime import date, timedelta

from datetime import date

def _dates_in_year(year, month_days):
    """Return the set of ``date`` objects for the ``(month, day)`` pairs in ``year``."""
    return {date(year, month, day) for month, day in month_days}


# US holidays per year, using the observed date where one is marked.
holidays_2015 = _dates_in_year(2015, [
    (1, 1),    # New Year's Day
    (1, 19),   # MLK Jr.'s Birthday
    (2, 16),   # Washington's Birthday (Presidents' Day)
    (5, 25),   # Memorial Day
    (7, 3),    # Independence Day (observed)
    (9, 7),    # Labor Day
    (10, 12),  # Columbus Day
    (11, 11),  # Veterans Day
    (11, 26),  # Thanksgiving Day
    (12, 25),  # Christmas Day
])

holidays_2016 = _dates_in_year(2016, [
    (1, 1),    # New Year's Day
    (1, 18),   # MLK Jr.'s Birthday
    (2, 15),   # Washington's Birthday (Presidents' Day)
    (5, 30),   # Memorial Day
    (7, 4),    # Independence Day
    (9, 5),    # Labor Day
    (10, 10),  # Columbus Day
    (11, 11),  # Veterans Day
    (11, 24),  # Thanksgiving Day
    (12, 26),  # Christmas Day (observed)
])

holidays_2017 = _dates_in_year(2017, [
    (1, 2),    # New Year's Day (observed)
    (1, 16),   # MLK Jr.'s Birthday
    (2, 20),   # Washington's Birthday (Presidents' Day)
    (5, 29),   # Memorial Day
    (7, 4),    # Independence Day
    (9, 4),    # Labor Day
    (10, 9),   # Columbus Day
    (11, 10),  # Veterans Day (observed)
    (11, 23),  # Thanksgiving Day
    (12, 25),  # Christmas Day
])

holidays_2018 = _dates_in_year(2018, [
    (1, 1),    # New Year's Day
    (1, 15),   # MLK Jr.'s Birthday
    (2, 19),   # Washington's Birthday (Presidents' Day)
    (5, 28),   # Memorial Day
    (7, 4),    # Independence Day
    (9, 3),    # Labor Day
    (10, 8),   # Columbus Day
    (11, 12),  # Veterans Day (observed)
    (11, 22),  # Thanksgiving Day
    (12, 25),  # Christmas Day
])

holidays_2019 = _dates_in_year(2019, [
    (1, 1),    # New Year's Day
    (1, 21),   # MLK Jr.'s Birthday
    (2, 18),   # Washington's Birthday (Presidents' Day)
    (5, 27),   # Memorial Day
    (7, 4),    # Independence Day
    (9, 2),    # Labor Day
    (10, 14),  # Columbus Day
    (11, 11),  # Veterans Day
    (11, 28),  # Thanksgiving Day
    (12, 25),  # Christmas Day
])

# Every holiday across the five years, as date objects.
all_holidays = set().union(
    holidays_2015, holidays_2016, holidays_2017, holidays_2018, holidays_2019
)

# The same dates as 'YYYY-MM-DD' strings, for comparison against string dates.
all_holidays_str = {holiday.isoformat() for holiday in all_holidays}

# 2019 only, as a list of 'YYYY-MM-DD' strings.
holidays_2019_str = [holiday.isoformat() for holiday in holidays_2019]

def check_holiday_window(flight_date_str, holidays_set_str, window_days=3):
    """Return 1 when a flight date lies within ``window_days`` days of a holiday, else 0.

    Args:
        flight_date_str: The flight date as a 'YYYY-MM-DD' string.  A
            ``date``/``datetime`` object is also accepted, because callers in
            this notebook pass a column produced by ``to_date``.
        holidays_set_str: Collection of holidays, either as 'YYYY-MM-DD'
            strings or as ``date`` objects (both are matched).
        window_days: Number of days before and after the flight date to check.

    Returns:
        1 if the flight date or any date within +/- ``window_days`` of it is
        in ``holidays_set_str``; 0 otherwise, including for ``None`` or an
        unparseable input.
    """
    if flight_date_str is None:
        return 0

    try:
        if isinstance(flight_date_str, date):
            # Also covers datetime (a date subclass); keep the calendar day only.
            flight_date = date(
                flight_date_str.year, flight_date_str.month, flight_date_str.day
            )
        else:
            flight_date = date.fromisoformat(flight_date_str)

        # Check the flight date itself and every date +/- window_days around it.
        for offset in range(-window_days, window_days + 1):
            target_date = flight_date + timedelta(days=offset)
            if (
                target_date in holidays_set_str
                or target_date.strftime('%Y-%m-%d') in holidays_set_str
            ):
                return 1
        return 0
    except (TypeError, ValueError):
        return 0  # Input is not a usable date

# Register the UDF with Spark.
# `udf` and `IntegerType` are imported here because no earlier cell brings
# them into scope; without this the cell fails on a fresh kernel.
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Returns 1 when the date is within 3 days of any holiday in all_holidays_str.
is_holiday_udf = udf(
    lambda d: check_holiday_window(d, all_holidays_str, window_days=3),
    IntegerType()
)

In [0]:
from pyspark.sql.functions import when, lit
from pyspark.sql.types import IntegerType

# Flag flights whose FL_DATE is itself one of the holidays.
# `all_holidays_str` (the set built in the holiday cell) is turned into a list
# here; the previously referenced `all_holidays_str_list` was never defined.
_holiday_date_strings = sorted(all_holidays_str)

# `F.col` / `F.when` are used so the cell does not depend on the bare `col`
# name, which an earlier cell can rebind.
_is_holiday = (
    F.when(F.col("FL_DATE").isin(_holiday_date_strings), 1).otherwise(0)
).cast("integer")

# train
train_df = train_df.withColumn('IS_HOLIDAY', _is_holiday)

# validation
validation_df = validation_df.withColumn('IS_HOLIDAY', _is_holiday)

# test
test_df = test_df.withColumn('IS_HOLIDAY', _is_holiday)

In [0]:
# Summary statistics of the new flag on the training split, as a sanity check.
train_df.select('IS_HOLIDAY').summary().show()

### [Feature] Within 3 days of a holiday

In [0]:
# `IntegerType` is imported locally and `F.udf` is used so that this cell does
# not depend on imports made in later cells.
from pyspark.sql.types import IntegerType

# 1 when the date is within 3 days of any holiday in `all_holidays_str`.
# The string set covering all years is used: `holidays_2019` holds `date`
# objects for a single year, which the string comparison inside
# `check_holiday_window` would never match.
is_in_holiday_window_udf = F.udf(
    lambda x: check_holiday_window(x, all_holidays_str, 3),
    IntegerType()
)


def _add_holiday_window_flag(flights):
    """Add ``IS_HOLIDAY_WINDOW`` (1/0) computed from FL_DATE."""
    # Normalise FL_DATE to a 'yyyy-MM-dd' string, the form that
    # `check_holiday_window` parses; a date-typed column would reach the UDF
    # as a Python date object instead.
    with_date = flights.withColumn(
        "FL_DATE_DT",
        F.date_format(F.to_date(F.col("FL_DATE"), "yyyy-MM-dd"), "yyyy-MM-dd"),
    )
    return with_date.withColumn(
        'IS_HOLIDAY_WINDOW',
        is_in_holiday_window_udf(F.col("FL_DATE_DT")),
    ).drop("FL_DATE_DT")


# train
train_df = _add_holiday_window_flag(train_df)

# val
validation_df = _add_holiday_window_flag(validation_df)

# test
test_df = _add_holiday_window_flag(test_df)

In [0]:
# train_df.select('IS_HOLIDAY_WINDOW').summary().show()

In [0]:
# display(validation_df)

### [Feature] Airport Size
Based on FAA Hub categories
<br>
hub category = airport's annual passenger boardings/US annual passenger boardings

- [0] Large Hub (P-L): handles 1.00% or more of total U.S. annual boardings.
- [1] Medium Hub (P-M): handles 0.25% to less than 1.00% of total U.S. annual boardings.
- [2] Small Hub (P-S): handles 0.05% to less than 0.25% of total U.S. annual boardings.
- [4] Non-Hub Primary (P-N): handles less than 0.05% but has at least 10,000 annual boardings.
- [5] Non-Primary Commercial Service (CS): has between 2,500 and 10,000 annual boardings.
- [6] Other: includes Reliever and General Aviation (GA) airports, or codes that were not valid airport identifiers. This is also the default for any airport code missing from the mapping below.

Note: code 3 is not used by the mapping below.



In [0]:
# Airport code -> hub class used for the AIRPORT_HUB_CLASS feature.
# Each code appears exactly once.  Codes that used to be listed under two
# classes now sit only under the class that was already taking effect (the
# later entry of a duplicated dict key wins), so the mapping is unchanged.
# Class codes in use are 0, 1, 2, 4, 5 and 6; 3 is not used.
# NOTE(review): several keys ("HA" and the "Missing/Uncertain" group) look
# like carrier codes rather than airports — confirm they belong here.
airport_classification_data = {
    # Large Hub (0)
    "ATL": 0, "BOS": 0, "CLT": 0, "DCA": 0, "DEN": 0, "DFW": 0, "DTW": 0, "EWR": 0,
    "FLL": 0, "IAD": 0, "IAH": 0, "JFK": 0, "LAS": 0, "LAX": 0, "LGA": 0, "MCO": 0,
    "MIA": 0, "MSP": 0, "ORD": 0, "PHL": 0, "PHX": 0, "SAN": 0, "SEA": 0, "SFO": 0,
    "SLC": 0,

    # Medium Hub (1)
    "ABQ": 1, "ANC": 1, "BNA": 1, "BWI": 1, "BUR": 1, "CVG": 1, "DAL": 1, "FAI": 1,
    "GEG": 1, "HNL": 1, "HOU": 1, "IND": 1, "MCI": 1, "MDW": 1, "MEM": 1, "MSY": 1,
    "OAK": 1, "OGG": 1, "ONT": 1, "PDX": 1, "PIT": 1, "RDU": 1, "RNO": 1, "RSW": 1,
    "SJC": 1, "SJU": 1, "SNA": 1, "TPA": 1, "TUS": 1, "XNA": 1,

    # Small Hub (2)
    "ABE": 2, "ABY": 2, "ACV": 2, "AGS": 2, "ALB": 2, "ALO": 2, "AMA": 2, "APN": 2,
    "ART": 2, "ASE": 2, "ATW": 2, "AVP": 2, "AZO": 2, "BFL": 2, "BIL": 2,
    "BIS": 2, "BJI": 2, "BLI": 2, "BOI": 2, "BQK": 2, "BRO": 2, "BTR": 2, "BTV": 2,
    "BZN": 2, "CAE": 2, "CAK": 2, "CDC": 2, "CID": 2, "CLE": 2, "CLL": 2,
    "CMH": 2, "CMX": 2, "CNY": 2, "COD": 2, "COS": 2, "COU": 2, "CPR": 2, "CRW": 2,
    "CSG": 2, "CWA": 2, "CYS": 2, "DAB": 2, "DAY": 2, "DHN": 2, "DLH": 2, "DSM": 2,
    "DRO": 2, "EAU": 2, "EGE": 2, "EKO": 2, "ELM": 2, "ELP": 2, "ERI": 2, "ESC": 2,
    "EUG": 2, "EVV": 2, "EWN": 2, "EYW": 2, "FAR": 2, "FAT": 2, "FAY": 2, "FCA": 2,
    "FLG": 2, "FSD": 2, "FSM": 2, "FWA": 2, "GCK": 2, "GFK": 2, "GJT": 2, "GNV": 2,
    "GPT": 2, "GRB": 2, "GRI": 2, "GRK": 2, "GRR": 2, "GSO": 2, "GSP": 2, "GTF": 2,
    "GUM": 2, "HA": 2, "HDN": 2, "HIB": 2, "HLN": 2, "HRL": 2, "HSV": 2, "HTS": 2,
    "HVN": 2, "HYA": 2, "IDA": 2, "ITH": 2, "JAC": 2, "JAN": 2, "JMS": 2, "JNU": 2,
    "KTN": 2, "LAN": 2, "LAR": 2, "LAW": 2, "LEX": 2, "LFT": 2, "LGB": 2, "LIH": 2,
    "LIT": 2, "LNK": 2, "LRD": 2, "LSE": 2, "LWS": 2, "LYH": 2, "MAF": 2, "MBS": 2,
    "MGM": 2, "MHK": 2, "MLI": 2, "MLU": 2, "MOB": 2, "MOT": 2,
    "MSO": 2, "MTJ": 2, "MVY": 2, "OAJ": 2, "OGS": 2, "OWB": 2,
    "PAH": 2, "PBG": 2, "PGV": 2, "PHF": 2, "PIB": 2, "PIH": 2, "PLN": 2,
    "PNS": 2, "PPG": 2, "PRC": 2, "PSC": 2, "PSG": 2, "PUB": 2,
    "RDM": 2, "RST": 2, "ROW": 2, "SAF": 2, "SBP": 2,
    "SCC": 2, "SCE": 2, "SGU": 2, "SHD": 2, "SIT": 2, "SMX": 2, "SPN": 2,
    "SPI": 2, "STC": 2, "STS": 2, "SUX": 2, "SWF": 2, "TLH": 2,
    "TRI": 2, "TVC": 2, "TXK": 2, "TYR": 2, "UIN": 2, "VLD": 2, "WYS": 2,
    "LBE": 2, "DRT": 2,

    # Non-Hub Primary (4)
    # Note: Many smaller airports fluctuate between categories, placed here for the enumeration request.
    "ABR": 4, "ACK": 4, "AKN": 4, "AZA": 4, "BGM": 4, "BKG": 4, "BRW": 4, "CDV": 4,
    "GTR": 4, "LBL": 4, "LCK": 4, "LWB": 4, "MEI": 4, "OGD": 4, "OME": 4, "OTH": 4,
    "PIE": 4, "PVU": 4, "RFD": 4, "RHI": 4, "SLN": 4, "STT": 4, "SUN": 4, "SWO": 4,
    "TTN": 4, "WRG": 4, "YAK": 4, "YUM": 4,

    # Non-Primary Commercial Service (5)
    "ATY": 5, "BFF": 5, "BTM": 5, "CIU": 5, "DBQ": 5, "DLG": 5, "GST": 5, "IMT": 5,
    "INL": 5, "LWF": 5, "MQT": 5, "PIR": 5, "PSM": 5, "TOL": 5,

    # Other/General Aviation/Reliever (6)
    "BRD": 6, "CMI": 6, "HHH": 6, "HYS": 6, "LWL": 6, "MMH": 6, "PAE": 6, "PSE": 6,
    "PSP": 6, "RKS": 6, "USA": 6, "VEL": 6,

    # Missing/Uncertain Codes (Set to 6 for consistency)
    "9E": 6, "EV": 6, "MQ": 6, "NK": 6, "OO": 6, "WN": 6, "YV": 6, "YX": 6,
}

In [0]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

def map_airport_class(airport_code):
    """Return the hub class for ``airport_code``, or 6 when the code is not in the mapping."""
    # Reading a module-level dict needs no `global` declaration.
    default_class = 6
    return airport_classification_data.get(airport_code, default_class)

# Wrap the lookup as a Spark UDF with an integer return type so it can be
# applied to a DataFrame column.
hub_class_udf = udf(map_airport_class, IntegerType())

In [0]:
# Hub class of the origin airport, added to every split.
_origin_hub_class = hub_class_udf(col("ORIGIN"))

train_df = train_df.withColumn('AIRPORT_HUB_CLASS', _origin_hub_class)
validation_df = validation_df.withColumn('AIRPORT_HUB_CLASS', _origin_hub_class)
test_df = test_df.withColumn('AIRPORT_HUB_CLASS', _origin_hub_class)

### [Feature] Airline Sentiment
- get list of unique airlines
- rate airline sentiment/perception on a Likert scale (1-5) using an LLM
  - based on:
    - on-time performance
    - cancellation rate
    - involuntary denied boarding rate
    - value
    - level of trust
    - mishandled baggage rate
    - pre/post flight experience
    - boarding process
    - flight crew service

In [0]:
def _sentiment(rating, category):
    """Build one carrier entry with its sentiment rating and airline category."""
    return {"rating": rating, "category": category}


# Carrier code -> sentiment rating and airline category
# (category: 1=Major, 2=Regional, 3=ULCC).
airline_sentiment_data = {
    "UA": _sentiment(3.8, 1),
    "NK": _sentiment(1.8, 3),
    "AA": _sentiment(3.5, 1),
    "EV": _sentiment(3.0, 2),
    "B6": _sentiment(4.2, 1),
    "DL": _sentiment(4.5, 1),
    "OO": _sentiment(3.0, 2),
    "F9": _sentiment(2.0, 3),
    "YV": _sentiment(2.8, 2),
    "MQ": _sentiment(2.8, 2),
    "OH": _sentiment(2.8, 2),
    "HA": _sentiment(4.0, 1),
    "G4": _sentiment(2.5, 3),
    "YX": _sentiment(2.8, 2),
    "AS": _sentiment(4.3, 1),
    "WN": _sentiment(4.0, 1),
    "9E": _sentiment(3.0, 2),
}

In [0]:
from pyspark.sql.types import FloatType

def map_rating(carrier_code):
    """Return the sentiment rating for ``carrier_code``; 3.0 (neutral) for an unknown carrier."""
    fallback_entry = {'rating': 3.0}
    carrier_entry = airline_sentiment_data.get(carrier_code, fallback_entry)
    return carrier_entry.get('rating')

def map_category(carrier_code):
    """Looks up the airline category, defaulting to 2 (Regional) if not found."""
    # The category is IntegerType (1=Major, 2=Regional, 3=ULCC)
    return airline_sentiment_data.get(carrier_code, {'category': 2}).get('category')

rating_udf = udf(map_rating, FloatType())
category_udf = udf(map_category, IntegerType())

carrier_col_name = "OP_UNIQUE_CARRIER"

In [0]:
def _with_airline_sentiment(split_df):
    """Return split_df with RATING and AIRLINE_CATEGORY derived from the carrier column."""
    return (
        split_df
        .withColumn('RATING', rating_udf(col(carrier_col_name)))
        .withColumn('AIRLINE_CATEGORY', category_udf(col(carrier_col_name)))
    )


# train
train_df = _with_airline_sentiment(train_df)
print("Train DataFrame updated with RATING and AIRLINE_CATEGORY.")

# val
validation_df = _with_airline_sentiment(validation_df)
print("Validation DataFrame updated with RATING and AIRLINE_CATEGORY.")

# test
test_df = _with_airline_sentiment(test_df)
print("Test DataFrame updated with RATING and AIRLINE_CATEGORY.")

# Spot-check the two new columns next to the carrier code.
train_df.select(carrier_col_name, "RATING", "AIRLINE_CATEGORY").show(5)

### [Feature] Seasonality
- for 1 year, is summer

## Check for nulls

In [0]:
# One output column per input column, each holding that column's null count.
null_count_exprs = [
    F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in validation_df.columns
]
null_counts = validation_df.select(null_count_exprs)
display(null_counts)

## Checkpoint results with feature engineering

In [0]:
%fs ls dbfs:/student-groups/Group_2_2/1_year_custom_joined/feature_eng_ph3/training_splits

In [0]:
# Require an explicit "y" before overwriting the checkpointed splits on DBFS.
confirmation = input("CAREFUL: You're about to write to DBFS. Type 'y' to continue.")
if confirmation == "y":
    for split_df, split_name in (
        (train_df, "train"),
        (validation_df, "validation"),
        (test_df, "test"),
    ):
        checkpoint_dataset(split_df, f"{month_or_year}/feature_eng_ph3/training_splits/{split_name}")

### Check data checkpoint

In [0]:
checkpoint_path = "dbfs:/student-groups/Group_2_2"
# No trailing slash here: the reads below add the separator themselves, so a
# trailing slash produced "training_splits//train.parquet".
# NOTE(review): the folder is hard-coded to 5_year_custom_joined, while the
# checkpoint cell above writes under `month_or_year` — confirm this is the
# dataset you intend to verify.
dataset_path = f"{checkpoint_path}/5_year_custom_joined/feature_eng_ph3/training_splits"

# Read datasets from checkpoint
check_train_df = spark.read.parquet(f"{dataset_path}/train.parquet")
check_validation_df = spark.read.parquet(f"{dataset_path}/validation.parquet")
check_test_df = spark.read.parquet(f"{dataset_path}/test.parquet")

In [0]:
# List equality: True only when both splits have the same column names in the same order.
check_train_df.columns == check_test_df.columns

In [0]:
# Print every training column that is absent from the test split.
# The loop variable is intentionally NOT called `col`: that name is the
# function imported from pyspark.sql.functions at the top of the notebook, and
# rebinding it to a string would break every later (or re-run) `col(...)` call.
test_column_names = set(check_test_df.columns)
for column_name in check_train_df.columns:
    if column_name not in test_column_names:
        print(column_name)

In [0]:
# Render the training split that was read back from the checkpoint.
display(check_train_df)
# display(check_validation_df)

In [0]:
check_train_df.columns

move to modeling!

# Multitower features, 5 year training eda


In [0]:

import matplotlib.pyplot as plt
import numpy as np

In [0]:
# Load the training split from the fe_graph_and_holiday_nnfeat parquet checkpoint (5-year custom join).
train_multitower_df = spark.read.parquet("dbfs:/student-groups/Group_2_2/5_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/train.parquet/")

In [0]:
train_multitower_df.columns

In [0]:
# get date range for multi-tower training
# (the min / max rows of the describe() summary give the first and last FL_DATE)
train_multitower_df.describe(['FL_DATE']).show()

#### Feature eng eda

In [0]:
# Project a subset of graph, delay, carrier, and weather columns for the EDA
# plots and drop exact duplicate rows.
plot_data_df = train_multitower_df.select(
    "ORIGIN_AIRPORT_SEQ_ID",
    "page_rank",
    "out_degree",
    "weighted_out_degree",
    "closeness",
    "betweenness",
    "avg_origin_dep_delay",
    'prev_flight_delay_in_minutes',
    'prev_flight_delay',
    'RATING',
    'OP_CARRIER_AIRLINE_ID',
    'OP_CARRIER',
    'avg_daily_route_flights',
    'HourlyVisibility',
    'weighted_in_degree',
    'route'
).distinct() # NOTE(review): distinct() de-duplicates on ALL selected columns, so this is not guaranteed to yield one row per airport — confirm the intended grain

# Collect data to Pandas for plotting
# NOTE(review): toPandas() pulls every remaining row to the driver; verify the
# de-duplicated frame is small enough for that.
plot_data_pd = plot_data_df.toPandas()

### Numerical Features

### carrier vs rating

In [0]:
from pyspark.sql import functions as F
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Mean RATING and flight count per OP_CARRIER, aggregated in Spark, collected
# to pandas, and ordered from the highest to the lowest average rating.
carrier_summary_df = (
    train_multitower_df
    .groupBy("OP_CARRIER")
    .agg(
        F.avg("RATING").alias("Avg_Rating"),
        F.count("*").alias("Total_Flights"),
    )
    .toPandas()
    .sort_values(by="Avg_Rating", ascending=False)
)

# Bar chart: one bar per carrier, in the sorted order above.
ax = plt.figure(figsize=(12, 6)).gca()
sns.barplot(
    data=carrier_summary_df,
    x='OP_CARRIER',
    y='Avg_Rating',
    palette='viridis',
    ax=ax,
)

ax.set_title('Average Rating Score by Operating Carrier', fontsize=16)
ax.set_xlabel('Operating Carrier Airline', fontsize=14)
ax.set_ylabel('Carrier Rating', fontsize=14)
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()

### Graph Features

##### PageRank by state

In [0]:
# Mean page_rank per origin state, with the state column exposed as "state_abbr".
page_rank_by_state_df = (
    train_multitower_df
    .groupBy("ORIGIN_STATE_ABR")
    .agg(F.mean("page_rank").alias("avg_page_rank"))
    .withColumnRenamed("ORIGIN_STATE_ABR", "state_abbr")
)

# Collect the per-state aggregate to pandas for plotting.
page_rank_pd = page_rank_by_state_df.toPandas()

In [0]:
import plotly.express as px
# US choropleth of the state-level averages held in page_rank_pd.
choropleth_options = dict(
    locations='state_abbr',
    locationmode="USA-states",
    color='avg_page_rank',
    scope="usa",
    color_continuous_scale="Viridis",
    title='Average Airport PageRank by State (Network Influence)',
)
fig = px.choropleth(page_rank_pd, **choropleth_options)

fig.show()

##### unique routes vs traffic volumes

In [0]:
# Scatter of out_degree (x) against weighted_out_degree (y) from plot_data_pd.
ax = plt.figure(figsize=(10, 6)).gca()
ax.scatter(
    plot_data_pd['out_degree'],
    plot_data_pd['weighted_out_degree'],
    s=20,
    color='darkblue',
    alpha=0.6,
)

ax.set_title('Unique Outbound Routes vs. Total Outbound Traffic Volume', fontsize=14)
ax.set_xlabel('Out-Degree (Number of Unique Destinations)', fontsize=12)
ax.set_ylabel('Weighted Out-Degree (Total Flights)', fontsize=12)
ax.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

### Temporal Features

#### previous flight delay in minutes
- by carrier
- by origin
- by route

In [0]:
from pyspark.sql import functions as F

def get_top_n_avg_min_max_delay(df, group_col, n=15, delay_col='prev_flight_delay_in_minutes'):
    """Summarise a delay column for the ``n`` most frequent values of ``group_col``.

    Args:
        df: Spark DataFrame containing ``group_col`` and ``delay_col``.
        group_col: Name of the column to group by.
        n: Number of most frequent groups (by row count) to keep.
        delay_col: Name of the column to aggregate. Defaults to
            ``'prev_flight_delay_in_minutes'``, the column this helper was
            originally hard-wired to.

    Returns:
        pandas.DataFrame with one row per kept group and the columns
        ``group_col``, ``Avg_Prev_Delay``, ``Min_Prev_Delay``,
        ``Max_Prev_Delay`` and ``Count``, ordered by ``Avg_Prev_Delay``
        descending.
    """
    # 1. Identify the n most frequent groups. The group values are read from
    #    the collected Row objects directly, with no detour through the RDD API.
    top_n_rows = (
        df.groupBy(group_col)
        .count()
        .orderBy(F.col("count").desc())
        .limit(n)
        .select(group_col)
        .collect()
    )
    top_n_categories = [row[0] for row in top_n_rows]

    # 2. Keep only rows belonging to those groups.
    df_filtered = df.filter(F.col(group_col).isin(top_n_categories))

    # 3. Aggregate the delay column per group, highest average first.
    df_agg = (
        df_filtered.groupBy(group_col)
        .agg(
            F.avg(delay_col).alias('Avg_Prev_Delay'),
            F.min(delay_col).alias('Min_Prev_Delay'),
            F.max(delay_col).alias('Max_Prev_Delay'),
            F.count('*').alias('Count'),
        )
        .orderBy(F.col('Avg_Prev_Delay').desc())
    )

    # 4. The result has at most n rows, so collecting to pandas is cheap.
    return df_agg.toPandas()

In [0]:
# Per-group delay summaries (top 15 groups by flight count; fewer if fewer exist).
df_carrier = get_top_n_avg_min_max_delay(train_multitower_df, 'OP_CARRIER', n=15)
df_origin = get_top_n_avg_min_max_delay(train_multitower_df, 'ORIGIN', n=15)
df_origin['ORIGIN'] = df_origin['ORIGIN'].astype(str)  # plain strings for the x-axis
df_route = get_top_n_avg_min_max_delay(train_multitower_df, 'route', n=15)

# One entry per subplot: (data, x column, palette, title, x-axis label).
panel_specs = [
    (
        df_carrier, 'OP_CARRIER', 'viridis',
        'Average Previous Delay by Operating Carrier',
        'Operating Carrier Airline',
    ),
    (
        df_origin, 'ORIGIN', 'magma',
        f'Average Previous Delay by Origin Airport (Top {len(df_origin)})',
        'Origin Airport',
    ),
    (
        df_route, 'route', 'cividis',
        f'Average Previous Delay by Route (Top {len(df_route)})',
        'Route',
    ),
]

fig, axes = plt.subplots(3, 1, figsize=(12, 18))
plt.suptitle('Average Previous Flight Delay by Carrier, Origin, and Route (Top 15)', fontsize=18, y=1.02)

for panel_ax, (panel_df, x_col, palette_name, panel_title, x_label) in zip(axes, panel_specs):
    sns.barplot(
        ax=panel_ax,
        x=x_col,
        y='Avg_Prev_Delay',
        data=panel_df,
        palette=palette_name,
    )
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel(x_label, fontsize=12)
    panel_ax.set_ylabel('Avg. Previous Delay (minutes)', fontsize=12)
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout(rect=[0, 0, 1, 0.98])
plt.show()

In [0]:
# Summary statistics of the carrier aggregate, printed as a Markdown table
# (DataFrame.to_markdown needs the optional `tabulate` package).
print(df_carrier.describe().to_markdown())

In [0]:
# Summary statistics of the origin-airport aggregate.
print(df_origin.describe())

In [0]:
# Summary statistics of the route aggregate.
print(df_route.describe())

### Weather - visibility

In [0]:
# --- CONFIGURATION (Change these values) ---
SFO_AIRPORT_ID = 14771 # Replace with SFO's actual ID if different
TARGET_DATE = "2016-03-15" # Use your date of interest

# Assumes 'train_multitower_df' is a PySpark DataFrame with the columns
# 'ORIGIN_AIRPORT_SEQ_ID', 'HourlyVisibility', and 'FL_DATE'.
from pyspark.sql.types import DateType


# 1. Filter the data for SFO and the specific date
# NOTE(review): SFO_AIRPORT_ID is compared against ORIGIN_AIRPORT_SEQ_ID —
# confirm the constant uses that column's id encoding, otherwise the filter
# matches no rows.
df_sfo_day = train_multitower_df.filter(
    (F.col("ORIGIN_AIRPORT_SEQ_ID") == SFO_AIRPORT_ID) &
    (F.col("FL_DATE").cast(DateType()) == TARGET_DATE)
).select(
    "FL_DATE",
    "HourlyVisibility"
)

# 2. Extract the hour from the timestamp for the X-axis
# NOTE(review): the hour is taken from FL_DATE itself; if FL_DATE carries no
# time-of-day component every row falls into a single hour — confirm, or
# switch to a column that holds the departure time.
df_sfo_day = df_sfo_day.withColumn("HourOfDay", F.hour(F.col("FL_DATE")))

# 3. Aggregate to ensure only one visibility reading per hour (using the average)
df_sfo_agg = df_sfo_day.groupBy("HourOfDay").agg(
    F.avg("HourlyVisibility").alias("Hourly_Visibility")
).orderBy("HourOfDay")

# 4. Convert to Pandas for plotting
df_sfo_pd = df_sfo_agg.toPandas()

In [0]:
df_sfo_pd

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Requires df_sfo_pd (columns HourOfDay and Hourly_Visibility) from the Spark
# aggregation cell above.
ax = plt.figure(figsize=(12, 6)).gca()

# Visibility trace, one marker per hour present in df_sfo_pd.
sns.lineplot(
    data=df_sfo_pd,
    x='HourOfDay',
    y='Hourly_Visibility',
    marker='o',
    linestyle='-',
    color='blue',
    linewidth=2,
)

# Horizontal reference lines:
# (line y, colour, line style, legend label, caption y, caption text)
threshold_specs = [
    (0.5, 'red', '--', 'CAT I (Critical Visibility)', 0.6, '0.5 Miles'),
    (3.0, 'orange', ':', 'Marginal VFR', 3.1, '3.0 Miles'),
]
for line_y, shade, dash, legend_label, caption_y, caption in threshold_specs:
    ax.axhline(y=line_y, color=shade, linestyle=dash, linewidth=1.5, label=legend_label)
    ax.text(x=23, y=caption_y, s=caption, color=shade, ha='right')

# Titles, axis labels, and fixed axis ranges.
ax.set_title(f'Hourly Visibility at SFO on {TARGET_DATE}', fontsize=16)
ax.set_xlabel('Time of Day (Hour)', fontsize=14)
ax.set_ylabel('Average Hourly Visibility (Miles)', fontsize=14)
ax.set_xticks(np.arange(0, 24, 2))  # every second hour
ax.set_ylim(0, 10.5)
ax.grid(axis='both', linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Bin HourlyVisibility into labelled ranges ---
# The branches are evaluated in order, so each `<` test only sees values that
# failed the previous ones. Missing readings get their own bin: without the
# explicit null branch they would fall through to `otherwise` and be counted
# as "10+ Mi (Clear)".
visibility_col = F.col("HourlyVisibility")
train_multitower_df_spark = train_multitower_df.withColumn(
    "Visibility_Bin",
    F.when(visibility_col.isNull(), F.lit("Unknown (missing)"))
     .when(visibility_col < 1.0, F.lit("< 1 Mi (Critical)"))
     .when(visibility_col < 3.0, F.lit("1-3 Mi (Low IFR)"))
     .when(visibility_col < 5.0, F.lit("3-5 Mi (Marginal VFR)"))
     .when(visibility_col < 10.0, F.lit("5-10 Mi"))
     .otherwise(F.lit("10+ Mi (Clear)"))
)

# --- Count rows per bin in Spark, largest bin first ---
df_agg_count_spark = (
    train_multitower_df_spark.groupBy("Visibility_Bin")
    .count()
    .orderBy(F.col("count").desc())
)

# --- Collect the handful of bin rows to pandas for plotting ---
df_agg_count_pd = df_agg_count_spark.toPandas().rename(columns={'count': 'Count'})

# --- Bar chart of the bin counts ---
plt.figure(figsize=(10, 6))

sns.barplot(
    x='Visibility_Bin',
    y='Count',
    data=df_agg_count_pd,
    palette='Blues_d'
)

# Titles and labels
plt.title('Distribution of Hourly Visibility in Training Data (2015-2018)', fontsize=16)
plt.xlabel('Hourly Visibility Range', fontsize=14)
plt.ylabel('Observation Count', fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.xticks(rotation=15, ha='right')

# Plain (non-scientific) tick labels on the count axis.
plt.ticklabel_format(style='plain', axis='y', scilimits=(0, 0))

plt.tight_layout()
plt.show()

In [0]:
# =========================================================================
# === STEP 1: BINNING AND AGGREGATION (PySpark) ===
# =========================================================================

# The branches are evaluated in order, so each `<` test only sees values that
# failed the previous ones. Missing readings get their own bin: without the
# explicit null branch they would fall through to `otherwise` and be counted
# as "10+ Mi (Clear)".
visibility_col = F.col("HourlyVisibility")
train_multitower_df_spark = train_multitower_df.withColumn(
    "Visibility_Bin",
    F.when(visibility_col.isNull(), F.lit("Unknown (missing)"))
     .when(visibility_col < 1.0, F.lit("< 1 Mi (Critical)"))
     .when(visibility_col < 3.0, F.lit("1-3 Mi (Low IFR)"))
     .when(visibility_col < 5.0, F.lit("3-5 Mi (Marginal VFR)"))
     .when(visibility_col < 10.0, F.lit("5-10 Mi"))
     .otherwise(F.lit("10+ Mi (Clear)"))
)

# Count rows per bin in Spark, largest bin first.
df_agg_count_spark = (
    train_multitower_df_spark.groupBy("Visibility_Bin")
    .count()
    .orderBy(F.col("count").desc())
)

# Collect the handful of bin rows to pandas and add each bin's share of the total.
df_agg_count_pd = df_agg_count_spark.toPandas().rename(columns={'count': 'Count'})

total_count = df_agg_count_pd['Count'].sum()
df_agg_count_pd['Percentage'] = (df_agg_count_pd['Count'] / total_count) * 100

# =========================================================================
# === STEP 2: PLOTTING BLOCK WITH ANNOTATIONS ===
# =========================================================================

plt.figure(figsize=(10, 6))
ax = plt.gca()

# Plot the counts
sns.barplot(
    x='Visibility_Bin',
    y='Count',
    data=df_agg_count_pd,
    palette='Blues_d',
    ax=ax
)

# --- Add Annotations (Percentages) ---
# Bars are drawn in row order, so the row's position (not its index label) is
# the bar's x coordinate.
for bar_position, (count_value, percentage) in enumerate(
    zip(df_agg_count_pd['Count'], df_agg_count_pd['Percentage'])
):
    ax.text(
        bar_position,           # X position (position of the bar)
        count_value * 1.01,     # Y position (slightly above the bar)
        f'{percentage:.1f}%',   # Text to display
        color='black',
        ha="center",
        va="bottom",
        fontsize=10
    )

# Titles and labels
plt.title('Distribution of Hourly Visibility in Training Data', fontsize=16)
plt.xlabel('Hourly Visibility Range', fontsize=14)
plt.ylabel('Observation Count', fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.xticks(rotation=15, ha='right')

# Plain (non-scientific) tick labels on the count axis.
plt.ticklabel_format(style='plain', axis='y', scilimits=(0, 0))
plt.ylim(0, df_agg_count_pd['Count'].max() * 1.1) # headroom for the percentage labels

plt.tight_layout()
plt.show()

#### Categorical - Routes

In [0]:
# A route is an (origin, destination) pair of airport sequence ids.
route_key_columns = ["ORIGIN_AIRPORT_SEQ_ID", "DEST_AIRPORT_SEQ_ID"]
df_routes = train_multitower_df_spark.select(*route_key_columns)

# Number of distinct pairs present in the data.
unique_route_count = df_routes.distinct().count()

print(f"The total number of unique routes (Origin-Destination pairs) is: {unique_route_count}")

In [0]:
ROUTE_COLUMN_NAME = "route"

# Row count per route, keeping the 20 most frequent.
route_counts = train_multitower_df_spark.groupBy(ROUTE_COLUMN_NAME).count()
df_agg_routes_spark = route_counts.orderBy(F.col("count").desc()).limit(20)

# At most 20 rows, so collecting to pandas is cheap.
df_agg_routes_pd = df_agg_routes_spark.toPandas()

display(df_agg_routes_pd)

In [0]:
ax = plt.figure(figsize=(12, 6)).gca()

# One bar per route, in the order of df_agg_routes_pd.
sns.barplot(
    data=df_agg_routes_pd,
    x=ROUTE_COLUMN_NAME,
    y='count',
    palette='viridis',
    ax=ax,
)

ax.set_title('Top 20 Most Frequent Routes (Training 2015-2018)', fontsize=16)
ax.set_xlabel('Route', fontsize=14)
ax.set_ylabel('Total Count of Flights', fontsize=14)
plt.xticks(rotation=45, ha='right')  # slanted labels so route names do not overlap
ax.grid(axis='y', linestyle='--', alpha=0.6)

# Plain (non-scientific) tick labels on the count axis.
ax.ticklabel_format(style='plain', axis='y', scilimits=(0, 0))

plt.tight_layout()

#### Airline category

In [0]:
import pandas as pd
import seaborn as sns

# Display labels for the AIRLINE_CATEGORY codes. They follow the definition
# used when the feature is built (1=Major, 2=Regional, 3=ULCC); the previous
# labels ("National" for 2, "Regional" for 3) contradicted it.
category_labels = {
    1: '(1) Major Airlines',
    2: '(2) Regional Airlines',
    3: '(3) Ultra Low Cost Airlines',
}

# Flight count per category code, with a readable label column attached.
# A code outside 1-3 gets a null label.
category_code = F.col("AIRLINE_CATEGORY")
df_category_counts = (
    train_multitower_df.groupBy("AIRLINE_CATEGORY")
    .count()
    .withColumnRenamed("count", "Count")
    .withColumn(
        "AirlineCategoryRemapped",
        F.when(category_code == 1, category_labels[1])
        .when(category_code == 2, category_labels[2])
        .when(category_code == 3, category_labels[3])
    )
    .orderBy(F.col("Count").desc())
)

# Convert to Pandas for plotting
df_plot = df_category_counts.toPandas()

# --- PLOTTING BLOCK (Pandas/Matplotlib) ---

# Fix the bar order to the category code order rather than the count order.
plot_order = list(category_labels.values())
df_plot['AirlineCategoryRemapped'] = pd.Categorical(df_plot['AirlineCategoryRemapped'], categories=plot_order, ordered=True)
df_plot = df_plot.sort_values(by='AirlineCategoryRemapped')

plt.figure(figsize=(10, 6))

sns.barplot(
    x='AirlineCategoryRemapped',
    y='Count',
    data=df_plot,
    palette='viridis'
)

# Titles and labels
plt.title('Distribution of Flights by Airline Category', fontsize=16)
plt.xlabel('Airline Category', fontsize=14)
plt.ylabel('Total Flight Count', fontsize=14)
plt.xticks(rotation=15, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.6)

# Plain (non-scientific) tick labels on the count axis.
plt.ticklabel_format(style='plain', axis='y', scilimits=(0, 0))

plt.tight_layout()
plt.show()

In [0]:
# Preview the category code next to the carrier code on the first 20 rows.
category_carrier_columns = ["AIRLINE_CATEGORY", "OP_CARRIER"]
df_subset = train_multitower_df.select(*category_carrier_columns)
df_subset.show(n=20)

In [0]:
CATEGORY_COL = "AIRLINE_CATEGORY"
CARRIER_COL = "OP_CARRIER"

# Distinct (category, carrier) combinations, ordered by category and then by
# carrier code.
df_mapping_spark = (
    train_multitower_df
    .select(CATEGORY_COL, CARRIER_COL)
    .distinct()
    .orderBy(F.col(CATEGORY_COL).asc(), F.col(CARRIER_COL).asc())
)

# Collect to pandas and switch to presentation-friendly column headers.
display_names = {CATEGORY_COL: 'Airline Category', CARRIER_COL: 'Carrier Code'}
df_mapping_pd = df_mapping_spark.toPandas().rename(columns=display_names)

In [0]:
# Carrier codes grouped under a "(code) description" category label.
carrier_mapping_dict = {
    '(1) Major Airlines': ['AA', 'AS', 'B6', 'DL', 'HA', 'UA', 'WN'],
    '(2) Regional Airlines': ['9E', 'EV', 'MQ', 'OH', 'OO', 'US', 'VX', 'YV', 'YX'],
    '(3) Ultra Low Cost Airlines': ['F9', 'G4', 'NK']
}

# Heading, intro sentence, and the Markdown table header with its alignment row.
table_parts = [
    "##  Airline Carrier Category Mapping\n\n",
    "This table shows the assumed classification for each carrier code.\n\n",
    "| Category Code | Airline Category | Carrier Codes |\n",
    "|:---:|:---|:---|\n",
]

# One table row per category: '(1) Major Airlines' -> code '1',
# description 'Major Airlines', carriers joined with ", ".
for label, carrier_codes in carrier_mapping_dict.items():
    bracketed_code, _, label_text = label.partition(' ')
    table_parts.append(
        f"| {bracketed_code.strip('()')} | {label_text} | {', '.join(carrier_codes)} |\n"
    )

markdown_table = "".join(table_parts)

# Emit the assembled Markdown source as cell output.
print(markdown_table)

In [0]:
# Cyclical features

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Leakage check: scatter one weather feature against departure delay for a
# single origin airport.

# 1. Define the variables
FEATURE_COLUMN = 'HourlyAltimeterSetting'
TARGET_COLUMN = 'DEP_DELAY'
AIRPORT_FILTER = 'ATL' # Focus on a high-volume hub for a clear test case

# 2. Build the filtered frame inside this cell. The cell previously read a
#    `df_filtered` variable that no notebook-level code defines, so it failed
#    with NameError on a fresh kernel and never applied AIRPORT_FILTER.
#    Only the two plotted columns are kept so the collect stays small.
# NOTE(review): assumes ORIGIN holds airport codes such as 'ATL' — confirm.
df_filtered = (
    train_multitower_df
    .filter(F.col("ORIGIN") == AIRPORT_FILTER)
    .select(FEATURE_COLUMN, TARGET_COLUMN)
)

# Convert the filtered Spark DataFrame to Pandas
df_filtered_pd = df_filtered.toPandas()

plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df_filtered_pd,
    x=FEATURE_COLUMN,
    y=TARGET_COLUMN,
    alpha=0.3,
    s=20
)

plt.title(f'Leakage Check: "{FEATURE_COLUMN}" vs. Delay at {AIRPORT_FILTER}', fontsize=16)
plt.xlabel(f'{FEATURE_COLUMN} (Barometric Pressure)', fontsize=12)
plt.ylabel('Flight Delay (Minutes)', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)
# NOTE(review): 2.80 lies outside the x-range set below (20 to 35), so this
# marker line is not visible — confirm the intended value.
SUSPICIOUS_VALUE = 2.80
plt.axvline(x=SUSPICIOUS_VALUE, color='red', linestyle='-', linewidth=2, label=f'MI Spike Value ({SUSPICIOUS_VALUE})')
plt.ylim(0, 180)
plt.xlim(20,35)
plt.legend()
plt.show()

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Synthetic frame covering the 7 days of the week (0=Monday, 6=Sunday) with
# the sine/cosine encoding of the day index.
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
days_of_week = pd.Series(range(7))

dow_sin = np.sin(2 * np.pi * days_of_week / 7)
dow_cos = np.cos(2 * np.pi * days_of_week / 7)

mt_test_df = pd.DataFrame({
    'DOW': days_of_week,
    'DayName': day_names,
    'dow_sin': dow_sin,
    'dow_cos': dow_cos
})

# Each day is plotted at (cos, sin), i.e. on the unit circle.
ax = plt.figure(figsize=(7, 7)).gca()
ax.scatter(mt_test_df['dow_cos'], mt_test_df['dow_sin'], s=100)

# Day labels sit 10% further out than their points so they do not overlap them.
for day, x_pos, y_pos in zip(mt_test_df['DayName'], mt_test_df['dow_cos'], mt_test_df['dow_sin']):
    ax.annotate(day, (x_pos * 1.1, y_pos * 1.1), fontsize=10, ha='center')

# Equal aspect ratio keeps the circle round.
ax.set_aspect('equal', adjustable='box')

# Dashed outline of the unit circle for reference.
circle = plt.Circle((0, 0), 1, color='gray', fill=False, linestyle='--', alpha=0.5)
ax.add_artist(circle)

ax.set_xlim(-1.2, 1.2)
ax.set_ylim(-1.2, 1.2)

ax.set_title('Cyclical Encoding of Day of Week (DOW)')
ax.set_xlabel('dow_cos')
ax.set_ylabel('dow_sin')
ax.grid(True, linestyle=':', alpha=0.6)

In [0]:
# Column-name groups, one list per feature family.
# NOTE(review): presumably these select model inputs from the multitower
# dataset — confirm every name exists in the DataFrame they are applied to.

# Identifier-like columns (carrier, airports, route).
categorical_cols = [
    "OP_CARRIER",
    "ORIGIN_AIRPORT_SEQ_ID",
    "DEST_AIRPORT_SEQ_ID",
    "route",
]

# Calendar and schedule columns.
numerical_cols = [
    "QUARTER",
    "MONTH",
    "YEAR",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",
    "CRS_DEP_MINUTES",
    "CRS_ELAPSED_TIME",
    "DISTANCE",
]

# Timestamp plus delay-history and traffic columns.
temporal_cols = [
    "utc_timestamp",
    "prev_flight_delay_in_minutes",
    "prev_flight_delay",
    "origin_delays_4h",
    "delay_origin_7d",
    "delay_origin_carrier_7d",
    "delay_route_7d",
    "flight_count_24h",
    "LANDING_TIME_DIFF_MINUTES",
    "AVG_ARR_DELAY_ORIGIN",
    "AVG_TAXI_OUT_ORIGIN",
]

# Hourly weather observation columns.
weather_cols = [
    "HourlyDryBulbTemperature",
    "HourlyDewPointTemperature",
    "HourlyRelativeHumidity",
    "HourlyAltimeterSetting",
    "HourlyVisibility",
    "HourlyStationPressure",
    "HourlyWetBulbTemperature",
    "HourlyPrecipitation",
    "HourlyCloudCoverage",
    "HourlyCloudElevation",
    "HourlyWindSpeed",
]


In [0]:
categorical = []