# feature engineering sandbox

Feature eng ideas:
- Number of delayed flights in departure and arrival location (total or 4 hours before, 6 hours before, etc.)
- Number of delays in the route in the last 30 days
- Number of flights plane has flown that day
- Total number of flights the plane has flown up to a certain time

# Imports

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


import random

import mlflow
print(mlflow.__version__)

import os
# Runtime switches for the Spark / MLflow integration used by this notebook.
os.environ['PYSPARK_PIN_THREAD'] = 'false'
spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

RANDOM_SEED = 0
# Workspace path of the shared team experiment.
EXPERIMENT_NAME = "/Shared/team_2_2/mlflow-baseline"

# Look the experiment up (creating it on first use) and make it the active one.
try:
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is not None:
        print(f"Using existing experiment: {experiment.name}")
    else:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Created new experiment with ID: {experiment_id}")
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as e:
    print(f"Error with experiment setup: {e}")
    # Best-effort fallback: use a per-user default experiment so runs are still tracked.
    current_user = (
        dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()
    )
    mlflow.set_experiment(f"/Users/{current_user}/default")



# Data

In [0]:
# Root of the shared data mount; list its contents as a quick sanity check.
data_BASE_DIR = "dbfs:/mnt/mids-w261/"
mount_listing = dbutils.fs.ls(data_BASE_DIR)
display(mount_listing)

## Helper Function

In [0]:
def checkpoint_dataset(dataset, file_path):
    """Write `dataset` as a parquet checkpoint under the team's DBFS folder.

    Args:
        dataset: object exposing ``.write.mode(...).parquet(...)`` (a Spark
            DataFrame in this notebook).
        file_path: destination relative to the team folder, without the
            ``.parquet`` suffix; may contain sub-directories.

    Existing data at the destination is overwritten.
    """
    section, number = "2", "2"
    base_folder = f"dbfs:/student-groups/Group_{section}_{number}"
    dbutils.fs.mkdirs(base_folder)

    # Make sure any sub-directories implied by file_path exist before writing.
    full_path = f"{base_folder}/{file_path}.parquet"
    parent_dir, _, _ = full_path.rpartition("/")
    dbutils.fs.mkdirs(parent_dir)

    dataset.write.mode("overwrite").parquet(full_path)
    print(f"Checkpointed {file_path}")

## EDA - training custom join with graph features, 1 year

In [0]:
%fs ls dbfs:/student-groups/Group_2_2/1_year_custom_joined/feature_eng_ph3/training_splits/train.parquet

In [0]:
checkpoint_path = "dbfs:/student-groups/Group_2_2"
month_or_year = "1_year_custom_joined"

# Training split with graph features lives at
# <checkpoint_path>/1_year_custom_joined/graph_feature_splits/train
dataset_path = "/".join(
    [checkpoint_path, month_or_year, "graph_feature_splits", "train"]
)

# Only the training split is loaded here; the other splits are left disabled.
train_df = spark.read.parquet(dataset_path)
# validation_df = spark.read.parquet(f"{dataset_path}/validation.parquet")
# test_df = spark.read.parquet(f"{dataset_path}/test.parquet")

In [0]:
# Render the 1-year training split for visual inspection.
display(train_df)

In [0]:
# 1474703 # seattle
# 1288903 # las vegas
# 1289208 # LA
# 1402702 # Palm Beach, FL

In [0]:
# List the available column names of the training split.
train_df.columns

In [0]:
# Graph feature EDA: distribution of airport in-degree

from pyspark.sql import functions as F
import pandas as pd
import matplotlib.pyplot as plt

# 1. AGGREGATION (PySpark): count how many airports share each in_degree value.
# train_df appears to be flight-level (one row per flight), so grouping it
# directly would count flight records, not airports. De-duplicate to one row
# per airport first so the y-axis really is a count of airports.
# NOTE(review): assumes in_degree is keyed by ORIGIN_AIRPORT_SEQ_ID like the
# other graph features used in this notebook - confirm against the join.
in_degree_counts_df = (
    train_df
    .select("ORIGIN_AIRPORT_SEQ_ID", "in_degree")
    .distinct()
    .groupBy("in_degree")
    .count()
    .orderBy("in_degree")
)

# 2. COLLECTION: Move the aggregated data to the driver for plotting.
# The result has one row per distinct in_degree value, so it must be small
# enough for driver memory; filter first if in_degree has many unique values.
in_degree_counts_pd = in_degree_counts_df.toPandas()

# 3. VISUALIZATION (Matplotlib): bar chart of in_degree (x) vs. airport count (y)
plt.figure(figsize=(12, 6))
plt.bar(in_degree_counts_pd['in_degree'], in_degree_counts_pd['count'], color='#5B9BD5', width=0.8)

# A log-scaled y-axis keeps rare in_degree values visible next to common ones.
plt.yscale('log')

plt.title('Distribution of Airport In-Degree', fontsize=16)
plt.xlabel('In-Degree (Number of Unique Incoming Routes)', fontsize=12)
plt.ylabel('Count of Airports (Log Scale)', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show() # Use plt.savefig('in_degree_histogram.png') in a production environment

### Page Rank, Choropleth Map

In [0]:
# Mean page_rank per origin state, with the state column exposed as
# "state_abbr" for the map below. The mean is taken over the rows of
# train_df, so each state's value is weighted by how many rows it has.
state_key = F.col("ORIGIN_STATE_ABR").alias("state_abbr")
page_rank_by_state_df = train_df.groupBy(state_key).agg(
    F.avg("page_rank").alias("avg_page_rank")
)

# One row per state: small enough to pull to the driver.
page_rank_pd = page_rank_by_state_df.toPandas()

In [0]:
import plotly.express as px

# Map settings: state abbreviations are matched against plotly's built-in
# "USA-states" location mode and coloured by the per-state mean PageRank
# held in page_rank_pd.
choropleth_options = dict(
    locations='state_abbr',
    locationmode="USA-states",
    color='avg_page_rank',
    scope="usa",
    color_continuous_scale="Viridis",
    title='Average Airport PageRank by State (Network Influence)',
)

fig = px.choropleth(page_rank_pd, **choropleth_options)
fig.show()

### Scatter plots

In [0]:
# Airport key plus the graph/delay columns needed by the scatter plots.
scatter_columns = [
    "ORIGIN_AIRPORT_SEQ_ID",
    "page_rank",
    "out_degree",
    "weighted_out_degree",
    "closeness",
    "betweenness",
    "avg_origin_dep_delay",
]

# distinct() collapses repeated rows so each unique combination appears once.
plot_data_df = train_df.select(*scatter_columns).distinct()

# Pull the de-duplicated rows to the driver for plotting.
plot_data_pd = plot_data_df.toPandas()

In [0]:
# --- Scatter Plot 1: PageRank vs. Average Departure Delay ---
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(
    plot_data_pd['page_rank'],
    plot_data_pd['avg_origin_dep_delay'],
    s=20,
    color='darkred',
    alpha=0.6,
)

scatter_ax.set_title('Network Influence (PageRank) vs. Average Departure Delay', fontsize=14)
scatter_ax.set_xlabel('PageRank (Network Influence)', fontsize=12)
scatter_ax.set_ylabel('Average Origin Departure Delay (Minutes)', fontsize=12)
scatter_ax.grid(True, linestyle='--', alpha=0.7)
scatter_fig.tight_layout()
plt.show()

In [0]:
# --- Scatter Plot 2: Unique Routes vs. Traffic Volume ---
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(
    plot_data_pd['out_degree'],
    plot_data_pd['weighted_out_degree'],
    s=20,
    color='darkblue',
    alpha=0.6,
)

scatter_ax.set_title('Unique Outbound Routes vs. Total Outbound Traffic Volume', fontsize=14)
scatter_ax.set_xlabel('Out-Degree (Number of Unique Destinations)', fontsize=12)
scatter_ax.set_ylabel('Weighted Out-Degree (Total Flights)', fontsize=12)
scatter_ax.grid(True, linestyle='--', alpha=0.7)
scatter_fig.tight_layout()
plt.show()

In [0]:
# --- Scatter Plot 3: Closeness vs. Betweenness Centrality ---
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(
    plot_data_pd['closeness'],
    plot_data_pd['betweenness'],
    s=20,
    color='forestgreen',
    alpha=0.6,
)

scatter_ax.set_title('Closeness vs. Betweenness Centrality', fontsize=14)
scatter_ax.set_xlabel('Closeness Centrality (Efficiency)', fontsize=12)
scatter_ax.set_ylabel('Betweenness Centrality (Bridging Role)', fontsize=12)
scatter_ax.grid(True, linestyle='--', alpha=0.7)
scatter_fig.tight_layout()
plt.show()

### KDE plot

In [0]:
import seaborn as sns

# Unique (airport, average origin delay) pairs from the training split.
# Note: this rebinds plot_data_df / plot_data_pd to a two-column frame.
plot_data_df = train_df.select("ORIGIN_AIRPORT_SEQ_ID", "avg_origin_dep_delay").distinct()
plot_data_pd = plot_data_df.toPandas()

# Kernel density estimate of the per-airport average departure delay.
kde_style = dict(fill=True, alpha=0.6, color='purple', linewidth=2)

plt.figure(figsize=(10, 6))
sns.kdeplot(data=plot_data_pd, x='avg_origin_dep_delay', **kde_style)

plt.title('KDE of Average Origin Departure Delay by Airport', fontsize=16)
plt.xlabel('Average Departure Delay (Minutes)', fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [0]:
# --- 1. Define Filter Criteria ---

# Earliest day in the dataset, truncated from utc_timestamp (or choose any
# specific date). Computed once; it is compared against the same truncation
# of utc_timestamp in the filter below.
first_date = train_df.select(
    F.date_trunc('day', F.min("utc_timestamp")).alias("day")
).collect()[0][0]

# Find the ID of the airport with the maximum out_degree (likely a major hub)
# single_airport_id = train_df.groupBy("ORIGIN_AIRPORT_SEQ_ID").agg(
#     F.max("out_degree").alias("max_out_degree")
# ).orderBy(F.desc("max_out_degree")).select("ORIGIN_AIRPORT_SEQ_ID").limit(1).collect()[0][0]
single_airport_id = 1295304 # ny

# 1323202 #chicago
# 1379608 # oakland
# 1295304 # ny
# 1402702 # Palm Beach, FL
# 1474703 # seattle
# 1288903 # las vegas
# 1289208 # LA

# --- 2. Filter, Process, and Select Data ---

plot_data_df = (
    train_df
    # Keep only the chosen day AND the chosen origin airport
    .filter(
        (F.date_trunc('day', F.col("utc_timestamp")) == F.lit(first_date)) &
        (F.col("ORIGIN_AIRPORT_SEQ_ID") == single_airport_id)
    )
    .select(
        "utc_timestamp",
        "HourlyDryBulbTemperature",
        "HourlyDewPointTemperature",
        "HourlyRelativeHumidity"
    )
    # Hour of day for the X-axis
    .withColumn("hour", F.hour(F.col("utc_timestamp")))
    # De-duplicate BEFORE sorting: distinct() shuffles rows, so a sort applied
    # ahead of it is not guaranteed to survive.
    .distinct()
    .orderBy("hour")
)

# --- 3. Convert for Visualization ---

# Collect the small single-day, single-airport slice to Pandas
weather_data_pd = plot_data_df.toPandas()

print(f"Data prepared for Airport ID: {single_airport_id} on date: {first_date}")

In [0]:
def plot_weather_diurnal_cycle_scatter(
    weather_data_pd,
    title='Hourly Weather Data (New York, NY - Jan 1, 2019)',
):
    """
    Draw a dual-axis scatter plot of hourly temperature and relative humidity.

    Args:
        weather_data_pd: pandas DataFrame with the columns 'hour',
            'HourlyDryBulbTemperature', 'HourlyDewPointTemperature' and
            'HourlyRelativeHumidity'.
        title: figure title. The default keeps the original hard-coded text;
            pass a different string when plotting another airport or date so
            the title matches the data.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fig, ax1 = plt.subplots(figsize=(12, 6))

    # --- Primary Y-Axis (Temperature) ---
    color_temp = 'tab:red'
    ax1.set_xlabel('Hour of Day', fontsize=12)
    ax1.set_ylabel('Temperature (F) (Dry Bulb / Dew Point)', color=color_temp, fontsize=12)

    # Dry bulb and dew point share the left axis; markers tell them apart.
    ax1.scatter(weather_data_pd['hour'], weather_data_pd['HourlyDryBulbTemperature'],
                color=color_temp, marker='o', label='Dry Bulb Temp.')
    ax1.scatter(weather_data_pd['hour'], weather_data_pd['HourlyDewPointTemperature'],
                color='red', marker='x', label='Dew Point Temp.')

    ax1.tick_params(axis='y', labelcolor=color_temp)
    ax1.grid(axis='y', linestyle='--', alpha=0.5)

    # --- Secondary Y-Axis (Relative Humidity) ---
    ax2 = ax1.twinx()  # second axes sharing the same x-axis
    color_humidity = 'tab:blue'
    ax2.set_ylabel('Relative Humidity (%)', color=color_humidity, fontsize=12)

    ax2.scatter(weather_data_pd['hour'], weather_data_pd['HourlyRelativeHumidity'],
                color=color_humidity, marker='s', label='Relative Humidity')

    ax2.tick_params(axis='y', labelcolor=color_humidity)

    # --- Final Touches ---
    plt.title(title, fontsize=16)
    fig.tight_layout()

    # Each axes keeps its own legend entries; merge them into one legend.
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')

    # Display or save the plot
    plt.show()

plot_weather_diurnal_cycle_scatter(weather_data_pd)

### Combined dataset

In [0]:
# paths
# Parquet locations of the V2 custom joins: 3-month ("3m") and 1-year ("1y") extracts.
custom_join_3m_path = "dbfs:/mnt/mids-w261/daniel_costa@berkeley.edu/Custom_Joins/V2/custom_join_v2_3m.parquet"
custom_join_1y_path ='dbfs:/mnt/mids-w261/daniel_costa@berkeley.edu/Custom_Joins/V2/custom_join_v2_1y.parquet'

In [0]:
# Load the 3-month custom join and mark it for caching so later cells reuse it.
join_data_3m = spark.read.parquet(custom_join_3m_path)
join_data_3m.cache()

# Keep only rows that carry a flight_uid.
join_data_3m_df = join_data_3m.dropna(subset=['flight_uid'])
display(join_data_3m_df)

In [0]:
# Load the 1-year custom join and mark it for caching so later cells reuse it.
join_data_1y = spark.read.parquet(custom_join_1y_path)
join_data_1y.cache()

# Keep only rows that carry a flight_uid.
join_data_1y_df = join_data_1y.dropna(subset=['flight_uid'])
display(join_data_1y_df)

In [0]:
# Scheduled departure timestamp built from FL_DATE plus CRS_DEP_TIME,
# where CRS_DEP_TIME is left-padded to four digits and parsed as HHmm.
# NOTE(review): the column is named "utc", but no time-zone conversion happens
# here - confirm that FL_DATE / CRS_DEP_TIME are already expressed in UTC.
crs_dep_hhmm = F.lpad(F.col("CRS_DEP_TIME").cast("string"), 4, "0")
crs_dep_text = F.concat(F.col("FL_DATE"), F.lit(" "), crs_dep_hhmm)

join_data_3m_df = join_data_3m_df.withColumn(
    "crs_dep_utc_timestamp",
    F.to_timestamp(crs_dep_text, "yyyy-MM-dd HHmm"),
)

In [0]:
# Inspect the 3-month data after adding the scheduled-departure timestamp.
display(join_data_3m_df)

In [0]:
# Scheduled departure timestamp built from FL_DATE plus CRS_DEP_TIME,
# where CRS_DEP_TIME is left-padded to four digits and parsed as HHmm.
# NOTE(review): the column is named "utc", but no time-zone conversion happens
# here - confirm that FL_DATE / CRS_DEP_TIME are already expressed in UTC.
crs_dep_hhmm = F.lpad(F.col("CRS_DEP_TIME").cast("string"), 4, "0")
crs_dep_text = F.concat(F.col("FL_DATE"), F.lit(" "), crs_dep_hhmm)

join_data_1y_df = join_data_1y_df.withColumn(
    "crs_dep_utc_timestamp",
    F.to_timestamp(crs_dep_text, "yyyy-MM-dd HHmm"),
)

## Drop rows with missing hourly weather values (for now)

In [0]:
# Hourly weather measurements that must all be present for a row to be kept.
required_weather_columns = [
    'HourlyDryBulbTemperature',
    'HourlyDewPointTemperature',
    'HourlyRelativeHumidity',
    'HourlyAltimeterSetting',
    'HourlyVisibility',
    'HourlyStationPressure',
    'HourlyWetBulbTemperature',
    'HourlyPrecipitation',
    'HourlyCloudCoverage',
    'HourlyCloudElevation',
    'HourlyWindSpeed',
]

# Drops ROWS (not columns) where any of the listed measurements is null.
join_data_3m_df = join_data_3m_df.dropna(subset=required_weather_columns)

### Filter cancelled flights

In [0]:
# Keep only flights that were not cancelled. Rows whose CANCELLED value is
# null do not satisfy the predicate and are removed as well.
is_cancelled = F.col("CANCELLED") == 1
join_data_3m_df = join_data_3m_df.where(~is_cancelled)
display(join_data_3m_df.limit(10))

In [0]:
# check for nulls
# Count nulls for every column in ONE aggregation pass instead of launching a
# separate filter/count job per column. The printed output is unchanged.
null_counts = join_data_3m_df.select([
    F.count(F.when(F.col(column_name).isNull(), 1)).alias(column_name)
    for column_name in join_data_3m_df.columns
]).first()

# The result row is positional, so pair it with the column list by order.
for column_name, null_count in zip(join_data_3m_df.columns, null_counts):
    print(f"{column_name} ------> {null_count}")

Feature eng ideas:

- Number of delayed flights in departure location (total or 4 hours before, 6 hours before, etc.)
  - per airline?
- Number of delays in the route in the last 30 days
- Number of flights plane has flown that day

### Feature - total departure delay at departure location over the last 7 days

In [0]:
# NOTE: this import rebinds `sum` to the Spark column function for the rest of
# the notebook (shadowing the Python builtin); prefer F.sum in new code.
from pyspark.sql.functions import col, sum, asc

# Trailing window per origin airport over the scheduled departure time in
# epoch seconds: from 7 days before the current row up to 4 hours before it,
# so the current flight and the most recent 4 hours are excluded.
window_7d_origin = (
    Window
    .partitionBy("ORIGIN")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
    .rangeBetween(-7 * 24 * 3600, -4 * 3600)  # -7 days, -4 hours
)

# Sum of DEP_DELAY_NEW inside the window; an empty window yields null, which
# is replaced by 0.
join_data_3m_df = join_data_3m_df.withColumn(
    'delay_origin_7d',
    F.coalesce(F.sum('DEP_DELAY_NEW').over(window_7d_origin), F.lit(0)),
)

# display(join_data_3m_df)

# display(join_data_3m_df)

In [0]:
# Same per-origin trailing-window feature, applied to the 1-year data.
# The windowed sum is null when the window is empty, so fall back to 0.
origin_delay_7d = F.coalesce(
    F.sum('DEP_DELAY_NEW').over(window_7d_origin),
    F.lit(0),
)
join_data_1y_df = join_data_1y_df.withColumn('delay_origin_7d', origin_delay_7d)

display(join_data_1y_df)

### Feature - total departure delay per departure location and carrier over the last 7 days

In [0]:
# Trailing window per (origin airport, carrier) over the scheduled departure
# time in epoch seconds: from 7 days before the current row up to 4 hours
# before it.
window_7d_origin_carrier = (
    Window
    .partitionBy("ORIGIN", "OP_UNIQUE_CARRIER")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
    .rangeBetween(-7 * 24 * 3600, -4 * 3600)  # -7 days, -4 hours
)

# Sum of DEP_DELAY_NEW inside the window; an empty window yields null, which
# is replaced by 0.
join_data_3m_df = join_data_3m_df.withColumn(
    'delay_origin_carrier_7d',
    F.coalesce(F.sum('DEP_DELAY_NEW').over(window_7d_origin_carrier), F.lit(0)),
)

# display(join_data_3m_df)

# display(join_data_3m_df)

In [0]:
# Per-(origin, carrier) trailing-window delay total for the 1-year data.
# Computed once with F.sum: no dependency on a bare `sum` name being rebound
# to the Spark function, and no throw-away column that is overwritten.
join_data_1y_df = join_data_1y_df.withColumn(
    'delay_origin_carrier_7d_raw',
    F.sum('DEP_DELAY_NEW').over(window_7d_origin_carrier)
)

# Handle the nulls (empty window) by coalescing the raw feature with 0
join_data_1y_df = join_data_1y_df.withColumn(
    'delay_origin_carrier_7d',
    F.coalesce(F.col('delay_origin_carrier_7d_raw'), F.lit(0))
).drop('delay_origin_carrier_7d_raw')

# display(join_data_1y_df)

# display(join_data_1y_df)

### Feature - Total departure delay on the route in the last 7 days
- route: origin to destination

In [0]:
# Route key "<ORIGIN>-<DEST>"; the same expression is attached to both datasets.
route_key = F.concat(F.col("ORIGIN"), F.lit("-"), F.col("DEST"))

join_data_3m_df = join_data_3m_df.withColumn("route", route_key)
join_data_1y_df = join_data_1y_df.withColumn("route", route_key)

In [0]:
# Trailing window per route over the scheduled departure time in epoch
# seconds: from 7 days before the current row up to 4 hours before it.
window_7d_route = (
    Window
    .partitionBy("route")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
    .rangeBetween(-7 * 24 * 3600, -4 * 3600)  # -7 days, -4 hours
)

# Sum of DEP_DELAY_NEW inside the window; an empty window yields null, which
# is replaced by 0.
join_data_3m_df = join_data_3m_df.withColumn(
    'delay_route_7d',
    F.coalesce(F.sum('DEP_DELAY_NEW').over(window_7d_route), F.lit(0)),
)

# display(join_data_3m_df)

# display(join_data_3m_df)


In [0]:
# Per-route trailing-window delay total for the 1-year data.
# The windowed sum is null when the window is empty, so fall back to 0.
route_delay_7d = F.coalesce(
    F.sum('DEP_DELAY_NEW').over(window_7d_route),
    F.lit(0),
)
join_data_1y_df = join_data_1y_df.withColumn('delay_route_7d', route_delay_7d)

# display(join_data_1y_df)

### Feature - number of flights per day for one plane


In [0]:
# Window per aircraft (TAIL_NUM) and flight date, ordered by scheduled
# departure. The frame is written out explicitly; it is the frame an ordered
# window uses by default, i.e. everything up to and including the current row.
# The count is therefore a running count of that aircraft's flights so far on
# that date, not the day's total.
window_flights_24h = (
    Window
    .partitionBy("TAIL_NUM", "FL_DATE")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)

flight_count_expr = F.count("*").over(window_flights_24h)
join_data_3m_df = join_data_3m_df.withColumn('flight_count_24h', flight_count_expr)

# display(join_data_3m_df)

In [0]:
# Attach the per-aircraft, per-date flight count to the 1-year data using the
# window defined for the 3-month data.
flight_count_expr = F.count("*").over(window_flights_24h)
join_data_1y_df = join_data_1y_df.withColumn('flight_count_24h', flight_count_expr)

display(join_data_1y_df)

In [0]:
# check for nulls
# Count nulls for every column in ONE aggregation pass instead of launching a
# separate filter/count job per column. The printed output is unchanged.
null_counts = join_data_3m_df.select([
    F.count(F.when(F.col(column_name).isNull(), 1)).alias(column_name)
    for column_name in join_data_3m_df.columns
]).first()

# The result row is positional, so pair it with the column list by order.
for column_name, null_count in zip(join_data_3m_df.columns, null_counts):
    print(f"{column_name} ------> {null_count}")

## Get Splits from checkpoint - 3 month data


In [0]:
checkpoint_path = "dbfs:/student-groups/Group_2_2"
dataset_path = checkpoint_path + "/3_month_custom_joined/raw_data/training_splits"

# Load the 3-month train/validation splits from the checkpoint folder.
# Note: this rebinds train_df, which earlier held the 1-year graph-feature split.
train_df, validation_df = (
    spark.read.parquet(f"{dataset_path}/{split_name}.parquet")
    for split_name in ("train", "validation")
)

In [0]:
# Mark both 3-month splits for caching and bind them to the *_3m_df names
# that the feature cells below build on.
train_3m_df = train_df.cache()
validation_3m_df = validation_df.cache()

### Convert departure time to UTC

In [0]:
# Scheduled departure timestamp built from FL_DATE plus CRS_DEP_TIME,
# where CRS_DEP_TIME is left-padded to four digits and parsed as HHmm.
# NOTE(review): the column is named "utc", but no time-zone conversion happens
# here - confirm that FL_DATE / CRS_DEP_TIME are already expressed in UTC.
crs_dep_timestamp = F.to_timestamp(
    F.concat(
        F.col("FL_DATE"),
        F.lit(" "),
        F.lpad(F.col("CRS_DEP_TIME").cast("string"), 4, "0"),
    ),
    "yyyy-MM-dd HHmm",
)

# The same expression is applied to both splits.
train_3m_df = train_3m_df.withColumn("crs_dep_utc_timestamp", crs_dep_timestamp)
validation_3m_df = validation_3m_df.withColumn("crs_dep_utc_timestamp", crs_dep_timestamp)

### feature - total delay time for flights at departure locations over the past 7 days

In [0]:
# incorporate features in splits
# Trailing window per origin airport over the scheduled departure time in
# epoch seconds, from 7 days before the current row up to 4 hours before it.
window_7d_origin = (
    Window
    .partitionBy("ORIGIN")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
    .rangeBetween(-604800, -14400)  # -7 days, -4 hours
)

# Windowed sum of DEP_DELAY_NEW, with 0 substituted when the window is empty.
# The window is evaluated inside each split separately, so validation rows
# only see history from other validation rows.
origin_delay_7d = F.coalesce(
    F.sum('DEP_DELAY_NEW').over(window_7d_origin),
    F.lit(0),
)

train_3m_df = train_3m_df.withColumn('delay_origin_7d', origin_delay_7d)
validation_3m_df = validation_3m_df.withColumn('delay_origin_7d', origin_delay_7d)



### Feature - total departure delay per departure location and carrier over the last 7 days

In [0]:
# Trailing window per (origin airport, carrier) over the scheduled departure
# time in epoch seconds, from 7 days before the current row up to 4 hours
# before it.
window_7d_origin_carrier = (
    Window
    .partitionBy("ORIGIN", "OP_UNIQUE_CARRIER")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
    .rangeBetween(-604800, -14400)  # -7 days, -4 hours
)

# Windowed sum of DEP_DELAY_NEW, with 0 substituted when the window is empty.
# The window is evaluated inside each split separately.
origin_carrier_delay_7d = F.coalesce(
    F.sum('DEP_DELAY_NEW').over(window_7d_origin_carrier),
    F.lit(0),
)

train_3m_df = train_3m_df.withColumn('delay_origin_carrier_7d', origin_carrier_delay_7d)
validation_3m_df = validation_3m_df.withColumn('delay_origin_carrier_7d', origin_carrier_delay_7d)


### Feature - total departure delay on the route in the last 7 days
- route: origin to destination

In [0]:
# Route key "<ORIGIN>-<DEST>"; the same expression is attached to both splits.
route_key = F.concat(F.col("ORIGIN"), F.lit("-"), F.col("DEST"))

train_3m_df = train_3m_df.withColumn("route", route_key)
validation_3m_df = validation_3m_df.withColumn("route", route_key)

In [0]:
# Trailing window per route over the scheduled departure time in epoch
# seconds, from 7 days before the current row up to 4 hours before it.
window_7d_route = (
    Window
    .partitionBy("route")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
    .rangeBetween(-604800, -14400)  # -7 days, -4 hours
)

# Windowed sum of DEP_DELAY_NEW, with 0 substituted when the window is empty.
# The window is evaluated inside each split separately.
route_delay_7d = F.coalesce(
    F.sum('DEP_DELAY_NEW').over(window_7d_route),
    F.lit(0),
)

train_3m_df = train_3m_df.withColumn('delay_route_7d', route_delay_7d)
validation_3m_df = validation_3m_df.withColumn('delay_route_7d', route_delay_7d)

### Feature - number of flights per day for one plane

In [0]:
# Window per aircraft (TAIL_NUM) and flight date, ordered by scheduled
# departure. The frame is written out explicitly; it is the frame an ordered
# window uses by default, i.e. everything up to and including the current row.
# The count is therefore a running count of that aircraft's flights so far on
# that date, not the day's total.
window_flights_24h = (
    Window
    .partitionBy("TAIL_NUM", "FL_DATE")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)

flight_count_expr = F.count("*").over(window_flights_24h)

train_3m_df = train_3m_df.withColumn('flight_count_24h', flight_count_expr)
validation_3m_df = validation_3m_df.withColumn('flight_count_24h', flight_count_expr)

In [0]:
# check for nulls
# Count nulls for every column in ONE aggregation pass instead of launching a
# separate filter/count job per column. The printed output is unchanged.
null_counts = train_3m_df.select([
    F.count(F.when(F.col(column_name).isNull(), 1)).alias(column_name)
    for column_name in train_3m_df.columns
]).first()

# The result row is positional, so pair it with the column list by order.
for column_name, null_count in zip(train_3m_df.columns, null_counts):
    print(f"{column_name} ------> {null_count}")

## Model

In [0]:
# get baseline columns

# Columns kept for the baseline model, grouped by theme and concatenated in
# the same order as before: calendar, flight identity, target/timestamp,
# hourly weather.
baselines_columns = (
    ["QUARTER", "MONTH", "YEAR", "DAY_OF_MONTH", "DAY_OF_WEEK"]
    + [
        "OP_CARRIER",
        "TAIL_NUM",
        "ORIGIN_AIRPORT_SEQ_ID",
        "DEST_AIRPORT_SEQ_ID",
        # "CRS_ELAPSED_TIME",
        # "DISTANCE",
    ]
    + [
        "DEP_DELAY_NEW",
        "crs_dep_utc_timestamp",
        # "prev_flight_delay_in_minutes",
        # "prev_flight_delay",
        # "origin_delays_4h",
    ]
    + [
        'HourlyDryBulbTemperature',
        'HourlyDewPointTemperature',
        'HourlyRelativeHumidity',
        'HourlyAltimeterSetting',
        'HourlyVisibility',
        'HourlyStationPressure',
        'HourlyWetBulbTemperature',
        'HourlyPrecipitation',
        'HourlyCloudCoverage',
        'HourlyCloudElevation',
        'HourlyWindSpeed',
    ]
)

In [0]:
# Categorical encoding
def _index_and_encode(input_col, prefix):
    """Return the (StringIndexer, OneHotEncoder) pair for one categorical column.

    The indexer writes "<prefix>_idx" with handleInvalid="keep"; the encoder
    turns that index into "<prefix>_vec".
    """
    indexer = StringIndexer(
        inputCol=input_col, outputCol=f"{prefix}_idx", handleInvalid="keep"
    )
    encoder = OneHotEncoder(inputCol=f"{prefix}_idx", outputCol=f"{prefix}_vec")
    return indexer, encoder


carrier_indexer, carrier_encoder = _index_and_encode("OP_CARRIER", "carrier")
origin_indexer, origin_encoder = _index_and_encode("ORIGIN_AIRPORT_SEQ_ID", "origin")
dest_indexer, dest_encoder = _index_and_encode("DEST_AIRPORT_SEQ_ID", "dest")
tail_num_indexer, tail_num_encoder = _index_and_encode("TAIL_NUM", "tail_num")


In [0]:
# Assemble all features
# Input columns in their original order: calendar fields, one-hot vectors for
# the categorical columns, flight length fields, then hourly weather.
assembler_inputs = (
    ["QUARTER", "MONTH", "YEAR", "DAY_OF_MONTH", "DAY_OF_WEEK"]
    + ["carrier_vec", "origin_vec", "dest_vec", "tail_num_vec"]
    + ["CRS_ELAPSED_TIME", "DISTANCE"]
    + [
        'HourlyDryBulbTemperature',
        'HourlyDewPointTemperature',
        'HourlyRelativeHumidity',
        'HourlyAltimeterSetting',
        'HourlyVisibility',
        'HourlyStationPressure',
        'HourlyWetBulbTemperature',
        'HourlyPrecipitation',
        'HourlyCloudCoverage',
        'HourlyCloudElevation',
        'HourlyWindSpeed',
    ]
)

assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

In [0]:
# linear regression baseline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from mlflow.models import infer_signature

mlflow.spark.autolog()
with mlflow.start_run(run_name="lr - weather baseline 3m"):
    MODEL_NAME = "LR_WEATHER_BASELINE_3M"

    # Regularised linear regression on the assembled feature vector.
    # (A RandomForestRegressor with numTrees=20, maxDepth=10 was the
    # alternative tried here; it is currently not part of the pipeline.)
    linear_reg = LinearRegression(
        featuresCol="features",
        labelCol="DEP_DELAY_NEW",
        maxIter=10,
        regParam=0.3,
    )

    # Pipeline order: index categoricals -> one-hot encode -> assemble -> fit.
    indexing_stages = [carrier_indexer, origin_indexer, dest_indexer, tail_num_indexer]
    encoding_stages = [carrier_encoder, origin_encoder, dest_encoder, tail_num_encoder]
    pipeline = Pipeline(stages=indexing_stages + encoding_stages + [assembler, linear_reg])

    model = pipeline.fit(train_3m_df)
    training_predictions = model.transform(train_3m_df)
    validation_predictions = model.transform(validation_3m_df)

    # One evaluator per metric, both comparing "prediction" to DEP_DELAY_NEW.
    mae_evaluator = RegressionEvaluator(
        labelCol="DEP_DELAY_NEW", predictionCol="prediction", metricName="mae"
    )
    rmse_evaluator = RegressionEvaluator(
        labelCol="DEP_DELAY_NEW", predictionCol="prediction", metricName="rmse"
    )

    # Train / validation scores for each metric.
    mae_t = mae_evaluator.evaluate(training_predictions)
    mae_v = mae_evaluator.evaluate(validation_predictions)
    rmse_t = rmse_evaluator.evaluate(training_predictions)
    rmse_v = rmse_evaluator.evaluate(validation_predictions)

    # Signature and input example are taken from train_df (the split as loaded,
    # before the extra feature columns were added to train_3m_df).
    signature = infer_signature(train_df, training_predictions)

    mlflow.spark.log_model(
        model,
        MODEL_NAME,
        input_example=train_df.limit(1).toPandas(),
        signature=signature,
        registered_model_name="flight_delay_prediction_baseline",
    )

    run_metrics = (
        ("train_mae", mae_t),
        ("validation_mae", mae_v),
        ("train_rmse", rmse_t),
        ("validation_rmse", rmse_v),
    )
    for metric_name, metric_value in run_metrics:
        mlflow.log_metric(metric_name, metric_value)

In [0]:
from mlflow.tracking import MlflowClient

# Initialize the MLflow client
client = MlflowClient()

# Identify the target experiment.
# Runs in this notebook are logged to the experiment selected during setup
# (EXPERIMENT_NAME). "flight_delay_prediction_baseline" is the *registered
# model* name passed to log_model, not an experiment name, so looking it up
# as an experiment would not find this notebook's runs.
experiment_name = EXPERIMENT_NAME
experiment = client.get_experiment_by_name(experiment_name)

In [0]:
if not experiment:
    print(f"Experiment '{experiment_name}' not found.")
else:
    # Restrict the search to runs whose status is FINISHED.
    success_filter = "attribute.status = 'FINISHED'"
    successful_runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        filter_string=success_filter,
    )

    # Number of runs returned by the search.
    num_successful_runs = len(successful_runs)
    print(f"Total successful runs in '{experiment_name}': {num_successful_runs}")

    # Show one run id as a spot check when anything was found.
    if successful_runs:
        print(f"Example successful Run ID: {successful_runs[0].info.run_id}")

In [0]:
from mlflow.tracking import MlflowClient

# Initialize the client
client = MlflowClient()

# --- Step 1: Retrieve All Experiments ---
# 'ACTIVE_ONLY' avoids counting deleted experiments
all_experiments = client.search_experiments(view_type='ACTIVE_ONLY')

total_runs = 0
# Run counts per experiment, keyed by experiment name
experiment_run_counts = {}

# --- Step 2: Iterate and Sum Runs ---
for exp in all_experiments:
    try:
        # search_runs returns one page at a time, so follow the page token
        # until it is exhausted; counting only the first page would cap the
        # total at the page size.
        num_runs_in_exp = 0
        page_token = None
        while True:
            runs = client.search_runs(
                experiment_ids=[exp.experiment_id],
                filter_string="", # No filter applied, count all
                page_token=page_token,
            )
            num_runs_in_exp += len(runs)
            page_token = runs.token
            if not page_token:
                break

        total_runs += num_runs_in_exp
        experiment_run_counts[exp.name] = num_runs_in_exp

    except Exception as e:
        # Best effort: an inaccessible experiment is reported and skipped
        print(f"Skipping experiment {exp.name} due to error: {e}")

print("--- MLflow Run Summary ---")
print(f"Total Runs Logged Across All Experiments: **{total_runs}**")
print("---")
# Optionally display the breakdown
# print("Breakdown by Experiment:")
# for name, count in experiment_run_counts.items():
#     print(f"  - {name}: {count} runs")

In [0]:
# 1. Get all *active* experiments
all_experiments = client.search_experiments(view_type='ACTIVE_ONLY')

# Start from zero: total_runs may still hold a total from an earlier cell, and
# adding to it would mix "all runs" with "finished runs".
total_runs = 0

# 2. Iterate through each experiment
for exp in all_experiments:
    # 3. Count only the runs that FINISHED, following page tokens so that
    #    experiments with more runs than one result page are counted fully.
    page_token = None
    while True:
        runs = client.search_runs(
            experiment_ids=[exp.experiment_id],
            filter_string="attributes.status = 'FINISHED'",
            page_token=page_token,
        )

        # 4. Sum the count
        total_runs += len(runs)

        page_token = runs.token
        if not page_token:
            break

print(total_runs)