# XGBOOST model baseline - 1 year - hyperparameter tuning testing
- run model from 1_year_combined data with feature engineering
  - TAIL_NUM causes OOM error, comment out for now
- feature engineering handled in https://dbc-fae72cab-cf59.cloud.databricks.com/editor/notebooks/1792055957780055?o=4021782157704243



## Imports

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from xgboost.spark import SparkXGBRegressor

from mlflow.models import infer_signature


import random
import numpy as np
import pandas as pd

import mlflow
print(mlflow.__version__)

import os

# Set the Databricks Spark conf flag for MLflow tracking of MLlib.
spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

RANDOM_SEED = 0

# Workspace path of the shared MLflow experiment used by this notebook.
EXPERIMENT_NAME = "/Shared/team_2_2/mlflow-baseline"

# Select the shared experiment, creating it first when it does not exist yet.
# Any failure falls back to a per-user "default" experiment instead.
try:
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is not None:
        print(f"Using existing experiment: {experiment.name}")
    else:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Created new experiment with ID: {experiment_id}")
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as e:
    print(f"Error with experiment setup: {e}")
    # Look up the current user name through the Databricks notebook context.
    notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    mlflow.set_experiment(f"/Users/{notebook_context.userName().get()}/default")



## Helper Functions


In [0]:
def checkpoint_dataset(dataset, file_path):
    """Save ``dataset`` as a parquet checkpoint under the group's DBFS folder.

    Args:
        dataset: Object exposing ``write.mode(...).parquet(...)`` (presumably a
            Spark DataFrame -- confirm against callers).
        file_path: Location relative to the group folder, without the
            ``.parquet`` suffix; may contain ``/``-separated subdirectories.
    """
    section, number = "2", "2"
    base_folder = f"dbfs:/student-groups/Group_{section}_{number}"
    dbutils.fs.mkdirs(base_folder)

    full_path = f"{base_folder}/{file_path}.parquet"
    # Everything before the last "/" is the directory that must exist.
    parent_dir, _, _ = full_path.rpartition("/")
    dbutils.fs.mkdirs(parent_dir)

    # Replace any checkpoint already stored at this path.
    dataset.write.mode("overwrite").parquet(full_path)
    print(f"Checkpointed {file_path}")

## Datasets - custom join
- get checkpoint data
  - 1 year combined join, with feature engineering

In [0]:
# %fs ls dbfs:/student-groups/Group_2_2/1_year_custom_joined/feature_eng

In [0]:
# Root of the group's checkpoint area on DBFS.
checkpoint_path = "dbfs:/student-groups/Group_2_2"
# Training splits of the 1-year custom join, read from the feature_eng folder.
# (raw-data alternative: f"{checkpoint_path}/1_year_custom_joined/raw_data/training_splits")
dataset_path = checkpoint_path + "/1_year_custom_joined/feature_eng/training_splits"

# Read the train and validation splits from their parquet checkpoints.
train_df, validation_df = (
    spark.read.parquet(f"{dataset_path}/{split_name}.parquet")
    for split_name in ("train", "validation")
)

# Feature Selection

In [0]:
# Columns kept from the checkpointed splits for the baseline models.
baselines_columns = [
    # --- calendar fields ---
    "QUARTER",
    "MONTH",
    "YEAR",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",
    # --- carrier / airports ---
    "OP_CARRIER",
    # "TAIL_NUM",  # left out for now: causes OOM errors
    "ORIGIN_AIRPORT_SEQ_ID",
    "DEST_AIRPORT_SEQ_ID",
    # --- schedule ---
    "CRS_ELAPSED_TIME",
    "DISTANCE",
    # --- label and timestamp ---
    "DEP_DELAY_NEW",
    "utc_timestamp",
    # --- engineered features ---
    "CRS_DEP_MINUTES",
    "prev_flight_delay_in_minutes",
    "prev_flight_delay",
    "origin_delays_4h",
    "delay_origin_7d",
    "delay_origin_carrier_7d",
    "delay_route_7d",
    "flight_count_24h",
    "LANDING_TIME_DIFF_MINUTES",
    "AVG_ARR_DELAY_ORIGIN",
    "AVG_TAXI_OUT_ORIGIN",
    # --- hourly weather ---
    "HourlyDryBulbTemperature",
    "HourlyDewPointTemperature",
    "HourlyRelativeHumidity",
    "HourlyAltimeterSetting",
    "HourlyVisibility",
    "HourlyStationPressure",
    "HourlyWetBulbTemperature",
    "HourlyPrecipitation",
    "HourlyCloudCoverage",
    "HourlyCloudElevation",
    "HourlyWindSpeed",
]

In [0]:
# Drop rows without a label, then keep only the baseline columns.
label_present = F.col("DEP_DELAY_NEW").isNotNull()
train_df = train_df.where(label_present).select(*baselines_columns)
validation_df = validation_df.where(label_present).select(*baselines_columns)

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor

# Categorical encoding
carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="carrier_idx", handleInvalid="keep")
origin_indexer = StringIndexer(inputCol="ORIGIN_AIRPORT_SEQ_ID", outputCol="origin_idx", handleInvalid="keep")
dest_indexer = StringIndexer(inputCol="DEST_AIRPORT_SEQ_ID", outputCol="dest_idx", handleInvalid="keep")
tail_num_indexer = StringIndexer(inputCol="TAIL_NUM", outputCol="tail_num_idx", handleInvalid="keep")

carrier_encoder = OneHotEncoder(inputCol="carrier_idx", outputCol="carrier_vec")
origin_encoder = OneHotEncoder(inputCol="origin_idx", outputCol="origin_vec")
dest_encoder = OneHotEncoder(inputCol="dest_idx", outputCol="dest_vec")
tail_num_encoder = OneHotEncoder(inputCol="tail_num_idx", outputCol="tail_num_vec")



In [0]:
# Assemble all model inputs into the single "features" vector column.
# The column order below is the order of the entries in that vector.
assembler = VectorAssembler(
    inputCols=(
        # calendar fields
        ["QUARTER", "MONTH", "YEAR", "DAY_OF_MONTH", "DAY_OF_WEEK"]
        # one-hot encoded categoricals ("tail_num_vec" is left out)
        + ["carrier_vec", "origin_vec", "dest_vec"]
        # schedule
        + ["CRS_ELAPSED_TIME", "DISTANCE"]
        # engineered features
        + [
            "CRS_DEP_MINUTES",
            "prev_flight_delay_in_minutes",
            "prev_flight_delay",
            "origin_delays_4h",
            "delay_origin_7d",
            "delay_origin_carrier_7d",
            "delay_route_7d",
            "flight_count_24h",
            "LANDING_TIME_DIFF_MINUTES",
            "AVG_ARR_DELAY_ORIGIN",
            "AVG_TAXI_OUT_ORIGIN",
        ]
        # hourly weather
        + [
            "HourlyDryBulbTemperature",
            "HourlyDewPointTemperature",
            "HourlyRelativeHumidity",
            "HourlyAltimeterSetting",
            "HourlyVisibility",
            "HourlyStationPressure",
            "HourlyWetBulbTemperature",
            "HourlyPrecipitation",
            "HourlyCloudCoverage",
            "HourlyCloudElevation",
            "HourlyWindSpeed",
        ]
    ),
    outputCol="features",
)

# Hyperparameter Tuning

In [0]:
# Repartition the training split to a fixed partition count, then mark it for
# caching since it is reused by every fit and evaluation in the tuning loop.
NUM_PARTITIONS = 400
train_df = train_df.repartition(NUM_PARTITIONS)
train_df = train_df.cache()

In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from xgboost.spark import SparkXGBRegressor
from pyspark.ml import Pipeline

# --- Model Estimators ---
# Preprocessing shared by every candidate model: index and one-hot encode the
# carrier/origin/destination columns, then assemble the feature vector.
preprocessing_stages = [
    carrier_indexer,
    origin_indexer,
    dest_indexer,
    carrier_encoder,
    origin_encoder,
    dest_encoder,
    assembler,
]

# A. XGBoost Regressor
xgb = SparkXGBRegressor(
    features_col="features",
    label_col="DEP_DELAY_NEW",
    num_workers=2,
)

# B. Random Forest Regressor
rf = RandomForestRegressor(featuresCol="features", labelCol="DEP_DELAY_NEW")

# C. Linear Regression
lr = LinearRegression(featuresCol="features", labelCol="DEP_DELAY_NEW")

# --- Parameter Grids ---
# A. XGBoost Grid
#   max_depth     [2, 4, 6]
#   n_estimators  [10, 20, 100]
#   learning_rate [0.05, 0.1, 0.3]
#   * Had to run in chunks, running into:
#       * Executor/Worker instability - OOM, heartbeat failures
#       * Resource starvation
#       * Synchronous job failure
xgb_grid = (
    ParamGridBuilder()
    .addGrid(xgb.max_depth, [2, 4, 6])
    .addGrid(xgb.n_estimators, [10, 20, 100])
    .addGrid(xgb.learning_rate, [0.05, 0.1, 0.3])
    .build()
)

# B. Random Forest Grid
rf_grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20])
    .addGrid(rf.maxDepth, [3, 5])
    .addGrid(rf.maxBins, [20, 32, 40])
    .build()
)

# C. Linear Regression Grid (Regularization/ElasticNet)
# Used to check that everything links together properly, not as a final model.
lr_grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.1, 1.0])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# --- Evaluators ---
# Both score the "prediction" column against the DEP_DELAY_NEW label.
rmse_evaluator = RegressionEvaluator(
    labelCol="DEP_DELAY_NEW",
    predictionCol="prediction",
    metricName="rmse",
)

mae_evaluator = RegressionEvaluator(
    labelCol="DEP_DELAY_NEW",
    predictionCol="prediction",
    metricName="mae",
)

In [0]:
# Empty results table: one column per tuned XGBoost hyperparameter (names
# taken from the first grid entry) followed by the four metric columns.
param_names = [param.name for param in xgb_grid[0]]

hyperparam_xgb_df = pd.DataFrame(
    columns=[*param_names, 'train_mae', 'validation_mae', 'train_rmse', 'validation_rmse']
)

hyperparam_xgb_df

In [0]:
# Step through the XGBoost grid manually, one MLflow run per combination.
# Each run fits the preprocessing + XGBoost pipeline on train_df, scores it on
# the training and validation splits (MAE and RMSE), logs the hyperparameters,
# metrics and fitted model to MLflow, and adds a row to hyperparam_xgb_df.
alg = 'xgb'
MODEL_NAME = "TEST_XGB_1y_HPTUNE"

# Loop-invariant setup, built once instead of once per grid combination.
mae_evaluator = RegressionEvaluator(
    labelCol="DEP_DELAY_NEW",
    predictionCol="prediction",
    metricName="mae"
)
rmse_evaluator = RegressionEvaluator(
    labelCol="DEP_DELAY_NEW",
    predictionCol="prediction",
    metricName="rmse"
)
# The one-row input example does not depend on the hyperparameters.
input_example = train_df.limit(1).toPandas()

# Start from the rows the results table already holds, so running the grid in
# several chunks keeps accumulating into the same table. Rows are collected in
# a list and the frame is rebuilt from it, which avoids concatenating onto an
# empty DataFrame.
results_columns = list(hyperparam_xgb_df.columns)
results_rows = hyperparam_xgb_df.to_dict("records")

for params_ in xgb_grid:
    # Plain {name: value} view of this combination; reused for the run name,
    # the MLflow params and the results row.
    named_params = {p.name: value for p, value in params_.items()}

    estimator_with_params = xgb.copy(params_)
    pipeline = Pipeline(stages=preprocessing_stages + [estimator_with_params])

    param_str = "_".join(f"{name}_{value}" for name, value in named_params.items())
    run_name_suffix = f"{alg}_{param_str}"

    with mlflow.start_run(run_name=f"HPTune_{run_name_suffix}"):
        print(f"Starting tuning for {run_name_suffix}...")

        # Log the combination before fitting so the run can be searched and
        # compared by its hyperparameters, not only by its name.
        mlflow.log_params(named_params)

        # The hyperparameters are already set on estimator_with_params, so no
        # extra param map is passed to fit().
        model = pipeline.fit(train_df)

        training_predictions = model.transform(train_df)
        validation_predictions = model.transform(validation_df)

        # Calculate MAE
        mae_t = mae_evaluator.evaluate(training_predictions)
        mae_v = mae_evaluator.evaluate(validation_predictions)

        # Calculate RMSE
        rmse_t = rmse_evaluator.evaluate(training_predictions)
        rmse_v = rmse_evaluator.evaluate(validation_predictions)

        signature = infer_signature(train_df, training_predictions)

        mlflow.spark.log_model(
            model,
            MODEL_NAME,
            input_example=input_example,
            signature=signature,
            registered_model_name="flight_delay_prediction_baseline"
        )

        metrics = {
            'train_mae': mae_t,
            'validation_mae': mae_v,
            'train_rmse': rmse_t,
            'validation_rmse': rmse_v
        }
        mlflow.log_metrics(metrics)

        # One results row per combination: hyperparameter values plus metrics.
        results_rows.append({**named_params, **metrics})
        hyperparam_xgb_df = pd.DataFrame(results_rows, columns=results_columns)

        print('tuning summary:')
        print(hyperparam_xgb_df)

In [0]:
# Display the accumulated hyperparameter tuning results table.
hyperparam_xgb_df