# XGBOOST model baseline - 1 year - hyperparameter tuning testing
- run model from 1_year_combined data with feature engineering
  - TAIL_NUM causes OOM error, commented out for now
- feature engineering handled in https://dbc-fae72cab-cf59.cloud.databricks.com/editor/notebooks/1792055957780055?o=4021782157704243



## Imports

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from xgboost.spark import SparkXGBRegressor

from mlflow.models import infer_signature


import random
import numpy as np
import pandas as pd

import mlflow
print(mlflow.__version__)

import os

# Turn on the Databricks `trackMLlib` MLflow setting for this Spark session.
spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

RANDOM_SEED = 0
# Workspace path of the shared team experiment.
EXPERIMENT_NAME = "/Shared/team_2_2/mlflow-baseline"

# Select the shared experiment, creating it on first use. Any failure is
# reported and the notebook falls back to a per-user default experiment.
try:
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is not None:
        print(f"Using existing experiment: {experiment.name}")
    else:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Created new experiment with ID: {experiment_id}")
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as setup_error:
    print(f"Error with experiment setup: {setup_error}")
    # The fallback experiment path is built from the user name reported by
    # the notebook context.
    notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    mlflow.set_experiment(f"/Users/{notebook_context.userName().get()}/default")



## Helper Functions


In [0]:
def checkpoint_dataset(dataset, file_path):
    """Write ``dataset`` as parquet under the team's DBFS folder.

    The output is written to
    ``dbfs:/student-groups/Group_2_2/<file_path>.parquet`` in ``overwrite``
    mode, so anything already stored at that location is replaced.

    Args:
        dataset: Object exposing the DataFrame writer chain
            ``.write.mode(...).parquet(...)``.
        file_path: Destination relative to the team folder, without the
            ``.parquet`` suffix. May contain ``/``-separated sub-directories.
    """
    section, number = "2", "2"
    team_root = f"dbfs:/student-groups/Group_{section}_{number}"
    target = f"{team_root}/{file_path}.parquet"
    # Everything before the last "/" is the directory that will hold the output.
    parent_dir = target.rsplit("/", 1)[0]

    # Create the team folder first, then any nested sub-directory.
    for folder in (team_root, parent_dir):
        dbutils.fs.mkdirs(folder)

    dataset.write.mode("overwrite").parquet(target)
    print(f"Checkpointed {file_path}")

## Datasets - custom join
- get checkpoint data
  - 1 year combined join, with feature engineering

In [0]:
# Stephanie's latest features = 1_year_custom_joined/feature_eng_ph3/training_splits/
# Daniel's Graph features = 1_year_custom_joined/graph_feature_splits
# Raw splits = 3_month_custom_joined/raw_data/training_splits/

In [0]:
%fs ls dbfs:/student-groups/Group_2_2/5_year_custom_joined/raw_data/training_splits

In [0]:
# Load the train / validation / test splits from the team's DBFS folder.
# NOTE(review): this reads the 5_year_custom_joined splits while the notebook
# title refers to 1 year - confirm this is the intended dataset.
_splits_root = "dbfs:/student-groups/Group_2_2/5_year_custom_joined/raw_data/training_splits"
train_splits = spark.read.parquet(f"{_splits_root}/train.parquet/")
val_splits = spark.read.parquet(f"{_splits_root}/validation.parquet/")
test_splits = spark.read.parquet(f"{_splits_root}/test.parquet/")

In [0]:
# Row counts as a (train, validation, test) tuple.
tuple(split.count() for split in (train_splits, val_splits, test_splits))
# Counts recorded from earlier runs (presumably one line per dataset size - TODO confirm):
# (951978, 133991, 271994)
# (5100978, 726381, 1457423)
# (19175825, 4791467, 7287112)

# Feature Selection

In [0]:
# Columns kept for the baseline model, grouped by origin.
baselines_columns = [
    # --- calendar fields ---
    "QUARTER",
    "MONTH",
    "YEAR",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",
    # --- carrier / route ---
    "OP_CARRIER",
    # "TAIL_NUM",  # left out for now: causes OOM errors
    "ORIGIN_AIRPORT_SEQ_ID",
    "DEST_AIRPORT_SEQ_ID",
    "CRS_ELAPSED_TIME",
    "DISTANCE",
    # --- label and timestamp ---
    "DEP_DELAY_NEW",
    "utc_timestamp",
    # --- engineered features ---
    "CRS_DEP_MINUTES",
    "prev_flight_delay_in_minutes",
    "prev_flight_delay",
    "origin_delays_4h",
    "delay_origin_7d",
    "delay_origin_carrier_7d",
    "delay_route_7d",
    "flight_count_24h",
    "LANDING_TIME_DIFF_MINUTES",
    "AVG_ARR_DELAY_ORIGIN",
    "AVG_TAXI_OUT_ORIGIN",
    # --- hourly weather ---
    "HourlyDryBulbTemperature",
    "HourlyDewPointTemperature",
    "HourlyRelativeHumidity",
    "HourlyAltimeterSetting",
    "HourlyVisibility",
    "HourlyStationPressure",
    "HourlyWetBulbTemperature",
    "HourlyPrecipitation",
    "HourlyCloudCoverage",
    "HourlyCloudElevation",
    "HourlyWindSpeed",
]

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor

# Categorical encoding
carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="carrier_idx", handleInvalid="keep")
origin_indexer = StringIndexer(inputCol="ORIGIN_AIRPORT_SEQ_ID", outputCol="origin_idx", handleInvalid="keep")
dest_indexer = StringIndexer(inputCol="DEST_AIRPORT_SEQ_ID", outputCol="dest_idx", handleInvalid="keep")
tail_num_indexer = StringIndexer(inputCol="TAIL_NUM", outputCol="tail_num_idx", handleInvalid="keep")

carrier_encoder = OneHotEncoder(inputCol="carrier_idx", outputCol="carrier_vec")
origin_encoder = OneHotEncoder(inputCol="origin_idx", outputCol="origin_vec")
dest_encoder = OneHotEncoder(inputCol="dest_idx", outputCol="dest_vec")
tail_num_encoder = OneHotEncoder(inputCol="tail_num_idx", outputCol="tail_num_vec")



In [0]:
# Combine the numeric columns and the one-hot vectors into a single
# "features" vector column.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        # --- calendar fields ---
        "QUARTER",
        "MONTH",
        "YEAR",
        "DAY_OF_MONTH",
        "DAY_OF_WEEK",
        # --- one-hot encoded categoricals ---
        "carrier_vec",
        "origin_vec",
        "dest_vec",
        # --- schedule ---
        "CRS_ELAPSED_TIME",
        "DISTANCE",
        # "tail_num_vec",
        # --- engineered features ---
        "CRS_DEP_MINUTES",
        "prev_flight_delay_in_minutes",
        "prev_flight_delay",
        "origin_delays_4h",
        "delay_origin_7d",
        "delay_origin_carrier_7d",
        "delay_route_7d",
        "flight_count_24h",
        "LANDING_TIME_DIFF_MINUTES",
        "AVG_ARR_DELAY_ORIGIN",
        "AVG_TAXI_OUT_ORIGIN",
        # --- hourly weather ---
        "HourlyDryBulbTemperature",
        "HourlyDewPointTemperature",
        "HourlyRelativeHumidity",
        "HourlyAltimeterSetting",
        "HourlyVisibility",
        "HourlyStationPressure",
        "HourlyWetBulbTemperature",
        "HourlyPrecipitation",
        "HourlyCloudCoverage",
        "HourlyCloudElevation",
        "HourlyWindSpeed",
    ],
)

# Hyperparameter Tuning

In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from xgboost.spark import SparkXGBRegressor
from pyspark.ml import Pipeline

# --- Preprocessing stages shared by every candidate model ---
# Indexers come first, then the encoders that read their output columns,
# and finally the assembler that reads the encoded vectors.
preprocessing_stages = [
    carrier_indexer,
    origin_indexer,
    dest_indexer,
    carrier_encoder,
    origin_encoder,
    dest_encoder,
    assembler,
]

# --- Model Estimators ---
# A. XGBoost Regressor
xgb = SparkXGBRegressor(
    label_col="DEP_DELAY_NEW",
    features_col="features",
    num_workers=2,
)

# # B. Random Forest Regressor
# rf = RandomForestRegressor(
#     featuresCol="features",
#     labelCol="DEP_DELAY_NEW"
# )

# # C. Linear Regression
# lr = LinearRegression(
#     featuresCol="features",
#     labelCol="DEP_DELAY_NEW"
# )

# --- Parameter Grids ---
# A. XGBoost Grid
# Full grid originally planned:
#   max_depth [2, 4, 6]
#   n_estimators [10, 20, 100]
#   learning rate [0.05, 0.1, 0.3]
#   * Had to run in chunks, running into:
#       * Executor/Worker instability - OOM, heartbeat failures
#       * Resource starvation
#       * Synchronous job failure
# Grid currently in use: 2 x 3 x 2 = 12 combinations.
xgb_grid = (
    ParamGridBuilder()
    .addGrid(xgb.max_depth, [4, 6])
    .addGrid(xgb.n_estimators, [20, 50, 100])
    .addGrid(xgb.learning_rate, [0.05, 0.1])
    .build()
)

# xgb_grid = ParamGridBuilder() \
#     .addGrid(xgb.max_depth, [2, 4, 6]) \
#     .addGrid(xgb.n_estimators, [10, 20, 100]) \
#     .addGrid(xgb.learning_rate, [0.05, 0.1, 0.3]) \
#     .build()


# # B. Random Forest Grid
# rf_grid = ParamGridBuilder() \
#     .addGrid(rf.numTrees, [10, 20]) \
#     .addGrid(rf.maxDepth, [3, 5]) \
#     .addGrid(rf.maxBins, [20, 32, 40]) \
#     .build()

# # C. Linear Regression Grid (Regularization/ElasticNet)
# # use this to test if everything links properly, but not for final model
# lr_grid = ParamGridBuilder() \
#     .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
#     .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
#     .build()

# --- Evaluators ---
# Both compare the "prediction" column against the DEP_DELAY_NEW label.
rmse_evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="DEP_DELAY_NEW",
    predictionCol="prediction",
)

mae_evaluator = RegressionEvaluator(
    metricName="mae",
    labelCol="DEP_DELAY_NEW",
    predictionCol="prediction",
)

In [0]:
# param_names = [p.name for p in xgb_grid[0].keys()]

# hyperparam_xgb_df = pd.DataFrame(
#     columns=param_names + ['train_mae', 'validation_mae', 'train_rmse', 'validation_rmse']
# )

# hyperparam_xgb_df

In [0]:
def read_specific_fold(path: str, fold_id: int, split_type: str):
    """Read one fold / split from partitioned parquet data.

    First tries to read the partition directory
    ``<path>/fold_id=<fold_id>/split_type=<split_type>`` directly. If that
    raises, the whole dataset at ``path`` is read and filtered on the
    ``fold_id`` and ``split_type`` columns instead.

    Args:
        path: Root folder of the partitioned parquet dataset.
        fold_id: Fold number to load.
        split_type: Split name to load; callers in this notebook pass
            ``"train"`` or ``"validation"``.

    Returns:
        The DataFrame produced by ``spark.read.parquet`` (direct read) or the
        filtered DataFrame (fallback).

    NOTE(review): reading a partition directory directly normally omits the
    partition columns (``fold_id`` / ``split_type``) from the result, while
    the fallback keeps them - confirm no caller relies on those columns.
    """
    fold_path = f"{path}/fold_id={fold_id}/split_type={split_type}"

    try:
        # Try direct partition read
        return spark.read.parquet(fold_path)
    except Exception as exc:
        # `except Exception` instead of a bare `except:` so that
        # KeyboardInterrupt / SystemExit (e.g. cancelling the cell) are not
        # swallowed and turned into a full-dataset scan.
        print(f"Direct read failed for fold {fold_id}, using filter method...")
        print(f"  reason: {type(exc).__name__}")
        # Fallback: read all data and filter
        all_data = spark.read.parquet(path)
        return all_data.filter(
            (all_data.fold_id == fold_id) &
            (all_data.split_type == split_type)
        )

In [0]:
# Hyperparameter tuning with CV for XGBoost - OPTIMIZED
#
# Manual grid search over `xgb_grid`: every parameter combination is fitted on
# each pre-built CV fold and scored with MAE / RMSE. MLflow runs are nested as
#   parent run -> one run per parameter combination -> one run per fold.
# The combination with the lowest average validation MAE is reported as best.
import numpy as np
import pandas as pd

alg = 'XGB'
n_folds = 5  # CHANGE 1: Reduced from 10 to 5 for 2x speedup
month_or_year = "1_year_custom_joined"
# Root folder of the CV folds; read_specific_fold() appends fold_id / split_type.
cv_splits_path = f"dbfs:/student-groups/Group_2_2/{month_or_year}/cv_splits"

mae_evaluator = RegressionEvaluator(
    labelCol="DEP_DELAY_NEW",
    predictionCol="prediction",
    metricName="mae"
)

rmse_evaluator = RegressionEvaluator(
    labelCol="DEP_DELAY_NEW",
    predictionCol="prediction",
    metricName="rmse"
)

# Store results for all hyperparameter combinations
hyperparam_results = []

# Parent run for entire hyperparameter tuning experiment
with mlflow.start_run(run_name="XGB_HPTUNE_WITH_CV_1_YEAR") as hptune_parent_run:
    mlflow.log_param("algorithm", "XGBoost")
    mlflow.log_param("n_folds", n_folds)
    mlflow.log_param("dataset", month_or_year)
    mlflow.log_param("n_param_combinations", len(xgb_grid))

    # Iterate through each hyperparameter combination
    for param_idx, params_ in enumerate(xgb_grid):
        # Fresh estimator copy carrying this combination's parameter values.
        estimator_with_params = xgb.copy(params_)
        pipeline = Pipeline(stages=preprocessing_stages + [estimator_with_params])

        param_str = "_".join([f"{p.name}_{params_[p]}" for p in params_])

        # Child run for each hyperparameter combination
        with mlflow.start_run(run_name=f"params_{param_idx+1}_{param_str}", nested=True) as param_run:

            # Log hyperparameters for this combination
            mlflow.log_param("max_depth", params_[xgb.max_depth])
            mlflow.log_param("n_estimators", params_[xgb.n_estimators])
            mlflow.log_param("learning_rate", params_[xgb.learning_rate])

            cv_results = []
            fold_metrics = {
                'train_mae': [], 'val_mae': [],
                'train_rmse': [], 'val_rmse': []
            }

            print(f"\n{'='*120}")
            print(f"Hyperparameter Combination {param_idx+1}/{len(xgb_grid)}: {param_str}")
            print(f"{'='*120}\n")

            # CV loop for this hyperparameter combination
            for fold_id in range(1, n_folds + 1):
                # Nested run for each fold (nested within the param run)
                with mlflow.start_run(run_name=f"fold_{fold_id}", nested=True) as fold_run:

                    # CHANGE 2: Cache data for speed
                    fold_train = read_specific_fold(
                        path=cv_splits_path,
                        fold_id=fold_id,
                        split_type="train"
                    ).cache()

                    fold_val = read_specific_fold(
                        path=cv_splits_path,
                        fold_id=fold_id,
                        split_type="validation"
                    ).cache()

                    # try/finally: the cached folds are released even when
                    # fitting or evaluation fails, so a failed fold does not
                    # leave cached data behind for later runs on the cluster.
                    try:
                        # Materialize both caches up front
                        fold_train.count()
                        fold_val.count()

                        print(f"Training fold {fold_id}/{n_folds}...")

                        # Train model
                        model = pipeline.fit(fold_train)

                        # Make predictions (keeping both train and val)
                        training_predictions = model.transform(fold_train)
                        validation_predictions = model.transform(fold_val)

                        # Evaluate
                        mae_t = mae_evaluator.evaluate(training_predictions)
                        mae_v = mae_evaluator.evaluate(validation_predictions)
                        rmse_t = rmse_evaluator.evaluate(training_predictions)
                        rmse_v = rmse_evaluator.evaluate(validation_predictions)

                        fold_metrics['train_mae'].append(mae_t)
                        fold_metrics['val_mae'].append(mae_v)
                        fold_metrics['train_rmse'].append(rmse_t)
                        fold_metrics['val_rmse'].append(rmse_v)

                        # Log to fold run
                        mlflow.log_metrics({
                            "train_mae": mae_t,
                            "val_mae": mae_v,
                            "train_rmse": rmse_t,
                            "val_rmse": rmse_v,
                        })

                        cv_results.append({
                            'fold': fold_id,
                            'train_mae': mae_t,
                            'val_mae': mae_v,
                            'train_rmse': rmse_t,
                            'val_rmse': rmse_v
                        })

                        print(f"Fold {fold_id} - Train MAE: {mae_t:.4f}, Val MAE: {mae_v:.4f}")
                    finally:
                        # CHANGE 3: Unpersist cache after fold completes
                        fold_train.unpersist()
                        fold_val.unpersist()

                # Log fold metrics to param run (after fold run closes)
                mlflow.log_metrics({
                    f"fold_{fold_id}_train_mae": mae_t,
                    f"fold_{fold_id}_val_mae": mae_v,
                    f"fold_{fold_id}_train_rmse": rmse_t,
                    f"fold_{fold_id}_val_rmse": rmse_v,
                })

            # Calculate and log aggregated CV metrics for this param combination
            avg_metrics = {
                "avg_train_mae": np.mean(fold_metrics['train_mae']),
                "avg_val_mae": np.mean(fold_metrics['val_mae']),
                "std_val_mae": np.std(fold_metrics['val_mae']),
                "avg_train_rmse": np.mean(fold_metrics['train_rmse']),
                "avg_val_rmse": np.mean(fold_metrics['val_rmse']),
                "std_val_rmse": np.std(fold_metrics['val_rmse'])
            }
            mlflow.log_metrics(avg_metrics)

            # Log CV results table
            results_df = pd.DataFrame(cv_results)
            mlflow.log_table(data=results_df, artifact_file="cv_fold_results.json")

            # Store results for comparison across all param combinations
            hyperparam_results.append({
                'param_idx': param_idx + 1,
                'max_depth': params_[xgb.max_depth],
                'n_estimators': params_[xgb.n_estimators],
                'learning_rate': params_[xgb.learning_rate],
                'avg_train_mae': avg_metrics['avg_train_mae'],
                'avg_val_mae': avg_metrics['avg_val_mae'],
                'std_val_mae': avg_metrics['std_val_mae'],
                'avg_train_rmse': avg_metrics['avg_train_rmse'],
                'avg_val_rmse': avg_metrics['avg_val_rmse'],
                'std_val_rmse': avg_metrics['std_val_rmse']
            })

            print(f"\nParam Combo {param_idx+1} Complete - Avg Val MAE: {avg_metrics['avg_val_mae']:.4f} ± {avg_metrics['std_val_mae']:.4f}")
            print(f"{'='*120}\n")

    # Log summary of all hyperparameter combinations
    hyperparam_df = pd.DataFrame(hyperparam_results)
    mlflow.log_table(data=hyperparam_df, artifact_file="hyperparam_comparison.json")

    # Find and log best parameters.
    # idxmin() returns an index *label*, so the row is selected with .loc
    # rather than the positional .iloc.
    best_idx = hyperparam_df['avg_val_mae'].idxmin()
    best_params = hyperparam_df.loc[best_idx]
    # A row taken from a frame with both int and float columns is upcast to
    # float, which would report max_depth / n_estimators as e.g. "4.0".
    # Cast back so the logged and printed values match the grid values.
    best_max_depth = int(best_params['max_depth'])
    best_n_estimators = int(best_params['n_estimators'])
    best_learning_rate = float(best_params['learning_rate'])

    mlflow.log_metrics({
        "best_avg_val_mae": best_params['avg_val_mae'],
        "best_std_val_mae": best_params['std_val_mae'],
    })

    mlflow.log_params({
        "best_max_depth": best_max_depth,
        "best_n_estimators": best_n_estimators,
        "best_learning_rate": best_learning_rate,
    })

    print("\n" + "="*120)
    print("HYPERPARAMETER TUNING COMPLETE")
    print("="*120)
    print("\nAll Parameter Combinations:")
    print(hyperparam_df.to_string(index=False))
    print(f"\nBest Parameters (by Val MAE):")
    print(f"  max_depth: {best_max_depth}")
    print(f"  n_estimators: {best_n_estimators}")
    print(f"  learning_rate: {best_learning_rate}")
    print(f"  Avg Val MAE: {best_params['avg_val_mae']:.4f} ± {best_params['std_val_mae']:.4f}")
    print("="*120)

In [0]:
# hyperparam_xgb_df