## Imports

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from xgboost.spark import SparkXGBRegressor

from mlflow.models import infer_signature

import random
import numpy as np
import pandas as pd

import mlflow
print(mlflow.__version__)

import os

# Set the Databricks `trackMLlib` MLflow flag on this Spark session.
spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

RANDOM_SEED = 0
# Workspace path of the shared MLflow experiment used by this notebook.
EXPERIMENT_NAME = "/Shared/team_2_2/mlflow-baseline"
# Look the experiment up first; create it only when the lookup returns nothing,
# then make it the active experiment either way.
try:
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is not None:
        print(f"Using existing experiment: {experiment.name}")
    else:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Created new experiment with ID: {experiment_id}")
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as e:
    print(f"Error with experiment setup: {e}")
    # Best-effort fallback: use a "default" experiment under the folder of the
    # user reported by the notebook context.
    notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    current_user = notebook_context.userName().get()
    mlflow.set_experiment(f"/Users/{current_user}/default")



In [0]:
# from pyspark.ml import Estimator, Model
# from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
# from pyspark.sql.functions import col, lit, when
# from pyspark.sql import DataFrame
# from pyspark.sql.functions import count as f_count

# class IntervalClassifier(Estimator, DefaultParamsReadable, DefaultParamsWritable):

#     def __init__(self,
#                  lowerEstimator,
#                  upperEstimator,
#                  baseClassifier,
#                  labelCol="delay_minutes",
#                  featuresCol="features",
#                  predictionCol="final_prediction",
#                  threshold=15.0,
#                  quantile_gap=0.1,
#                  qLowCol="low_pred",
#                  qHighCol="high_pred",
#                  clfPredictionCol="clf_prediction",
#                  undersample_majority=True,
#                  undersample_seed=42):
#         super().__init__()
#         self.lowerEstimator = lowerEstimator
#         self.upperEstimator = upperEstimator
#         self.baseClassifier = baseClassifier
#         self.labelCol = labelCol
#         self.featuresCol = featuresCol
#         self.predictionCol = predictionCol
#         self.threshold = float(threshold)
#         self.quantile_gap = float(quantile_gap)
#         self.qLowCol = qLowCol
#         self.qHighCol = qHighCol
#         self.clfPredictionCol = clfPredictionCol
#         self.undersample_majority = undersample_majority
#         self.undersample_seed = undersample_seed

#     def _fit(self, dataset: DataFrame) -> Model:
#         # 1) Fit quantile regressors on ALL data
#         print("Training Lower Estimator...")
#         lowerModel = self.lowerEstimator.fit(dataset)
#         print("Training Upper Estimator...")
#         upperModel = self.upperEstimator.fit(dataset)

#         # ---------------------------------------------------------------
#         """
#         booster = lowerModel.get_booster()

#         # ---- Gain importance from Booster ----
#         # XGBoost uses f0, f1, ... in order of the feature vector
#         gain_dict = booster.get_score(importance_type="gain")  # e.g. {"f0": 0.3, "f2": 0.1, ...}

#         # Map f0, f1, ... back to your feature names
#         rows = []
#         for i, feat_name in enumerate(model_cols_final):
#             key = f"f{i}"
#             gain = gain_dict.get(key, 0.0)
#             rows.append((feat_name, gain))

#         gain_df = pd.DataFrame(rows, columns=["feature", "gain"]).sort_values("gain", ascending=False)
#         print("Gain:")
#         print(gain_df)

#         sample_vec = assembler.transform(train_df).select("features").limit(500)  # or whatever size

#         X_local = np.stack(
#             sample_vec.rdd.map(lambda row: row["features"].toArray()).collect()
#         )

#         # ---- SHAP on the Booster ----
#         explainer = shap.TreeExplainer(booster)
#         shap_values = explainer.shap_values(X_local)

#         # Mean abs SHAP importance per feature
#         shap_importance = (
#             pd.DataFrame({
#                 "feature": model_cols_final,
#                 "shap_importance": np.abs(shap_values).mean(axis=0)
#             })
#             .sort_values("shap_importance", ascending=False)
#         )

#         print(shap_importance.head(20))


#         # ---------------------------------------------------------------
#         print("END")
#         """
        
#         print("Configuring Classifier...")
#         # 2) Add qLow and qHigh predictions
#         df_q = lowerModel.transform(dataset) \
#                          .withColumnRenamed("prediction", self.qLowCol)

#         df_q = upperModel.transform(df_q) \
#                          .withColumnRenamed("prediction", self.qHighCol)

#         # 3) Keep only examples where 15 is between qLow and qHigh
#         print("Filtering ambiguous cases...")
#         thr = lit(self.threshold)
#         df_ambig = df_q.filter(
#             (col(self.qLowCol) <= thr) & (thr <= col(self.qHighCol))
#         )

#         # 4) Build binary label: 1 if delay ≥ threshold, else 0
#         df_ambig = df_ambig.withColumn(
#             "bin_label",
#             (col(self.labelCol) >= thr).cast("double")
#         )

#         # save df_ambig to disk
#         run_name = 'XGB-3m_2_stage_opt_low_0.380_high_1.000'
        
#         ambig_dest = f"dbfs:/student-groups/Group_2_2/2_stage_dev_files/df_ambig_{run_name}.parquet"
#         print("Saving ambig dataset to parquet at", ambig_dest)
#         df_ambig.write.mode("overwrite").parquet(ambig_dest)

#         print("Undersampling...")
#         # --- undersample majority class among ambiguous cases ---
#         if self.undersample_majority:
#             # class counts on driver (only 2 classes)
#             class_counts = (
#                 df_ambig.groupBy("bin_label")
#                         .agg(f_count("*").alias("cnt"))
#                         .collect()
#             )

#             # If we have both classes, find minority/majority and undersample
#             if len(class_counts) != 2:
#                 raise ValueError("Ambiguous cases must contain both classes.")

#             (label0, cnt0), (label1, cnt1) = [
#                 (row["bin_label"], row["cnt"]) for row in class_counts
#             ]

#             if cnt0 <= cnt1:
#                 minority_label, minority_cnt = label0, cnt0
#                 majority_label, majority_cnt = label1, cnt1
#             else:
#                 minority_label, minority_cnt = label1, cnt1
#                 majority_label, majority_cnt = label0, cnt0

#             print(f"Undersampling majority class {majority_label} from {majority_cnt} to {minority_cnt}.")

#             if majority_cnt > 0 and minority_cnt > 0:
#                 frac_majority = float(minority_cnt) / float(majority_cnt)

#                 # sampleBy keeps all minority, downsamples majority
#                 fractions = {
#                     float(minority_label): 1.0,
#                     float(majority_label): frac_majority
#                 }

#                 df_ambig = df_ambig.sampleBy(
#                     "bin_label",
#                     fractions=fractions,
#                     seed=self.undersample_seed
#                 )
#         # --- END undersampling block ---

#         print("Training Base Classifier...")
#         # 5) Fit classifier on (possibly undersampled) ambiguous region
#         # NOTE: If you change the baseClassifier, make sure that these methods still work. If not, look up
#         # what the corresponding methods are to achieve the same functionality as the .setParams and .fit methods
#         clf = self.baseClassifier
        
#         clf.setParams(
#             label_col="bin_label",
#             features_col=self.featuresCol,
#             prediction_col=self.clfPredictionCol
#         )

#         clfModel = clf.fit(df_ambig)

#         return IntervalClassifierModel(
#             lowerModel=lowerModel,
#             upperModel=upperModel,
#             clfModel=clfModel,
#             labelCol=self.labelCol,
#             featuresCol=self.featuresCol,
#             predictionCol=self.predictionCol,
#             threshold=self.threshold,
#             qLowCol=self.qLowCol,
#             qHighCol=self.qHighCol,
#             clfPredictionCol=self.clfPredictionCol
#         )

# class IntervalClassifierModel(Model, DefaultParamsReadable, DefaultParamsWritable):

#     def __init__(self,
#                  lowerModel,
#                  upperModel,
#                  clfModel,
#                  labelCol,
#                  featuresCol,
#                  predictionCol,
#                  threshold,
#                  qLowCol,
#                  qHighCol,
#                  clfPredictionCol):
#         super().__init__()
#         self.lowerModel = lowerModel
#         self.upperModel = upperModel
#         self.clfModel = clfModel
#         self.labelCol = labelCol
#         self.featuresCol = featuresCol
#         self.predictionCol = predictionCol
#         self.threshold = float(threshold)
#         self.qLowCol = qLowCol
#         self.qHighCol = qHighCol
#         self.clfPredictionCol = clfPredictionCol

#     def _transform(self, dataset: DataFrame) -> DataFrame:
#         # 1) Predict quantiles
#         df_q = self.lowerModel.transform(dataset) \
#                               .withColumnRenamed("prediction", self.qLowCol)

#         df_q = self.upperModel.transform(df_q) \
#                               .withColumnRenamed("prediction", self.qHighCol)

#         # 2) Get classifier predictions (on all rows, cheap and simple)
#         print("Classifying **all** points with stage 2 classifier...")
#         # NOTE: make sure that if you change clfModel, you still have a .transform method or some equivalent
#         df_clf = self.clfModel.transform(df_q)

#         thr = lit(self.threshold)

#         # Assume classifier is a binary classifier with predictions 0/1
#         # Override classifier predictions where interval says we are confident.
#         print("Classifying points with ambiguous predictions...")
#         df_final = df_clf.withColumn(
#             self.predictionCol,
#             when(thr < col(self.qLowCol), lit(1.0))      # confidently ≥ threshold
#             .when(thr > col(self.qHighCol), lit(0.0))     # confidently < threshold
#             .otherwise(col(self.clfPredictionCol))      # ambiguous → use classifier
#         )

#         # Strategy column to debug
#         df_final = df_final.withColumn(
#             "decision_source",
#             when(thr < col(self.qLowCol), lit("quantile_high"))
#             .when(thr > col(self.qHighCol), lit("quantile_low"))
#             .otherwise(lit("classifier"))
#         )

#         return df_final

def deduplicate_cols(df):
    """Return ``df`` keeping only the first column for each distinct name.

    Names are compared exactly (case-sensitively) and the surviving columns
    keep their original relative order.

    Selecting a column *by name* from a Spark DataFrame that really contains
    duplicate names fails with an ambiguous-reference error, so when
    duplicates are present every column is first renamed positionally with
    ``toDF`` to a unique temporary name. The first occurrence of each original
    name is then selected and aliased back to that name.

    Args:
        df: DataFrame-like object exposing ``columns``, ``toDF``,
            ``__getitem__`` and ``select`` (for example a PySpark DataFrame).

    Returns:
        ``df`` itself when its column names are already unique; otherwise a
        new DataFrame with one column per distinct name.
    """
    names = list(df.columns)
    if len(set(names)) == len(names):
        # Nothing to drop, and no need to touch the plan.
        return df

    # Positional temporary names make every column addressable unambiguously.
    temp_names = [f"__dedup_{position}__" for position in range(len(names))]
    positional = df.toDF(*temp_names)

    # dict preserves insertion order, so this keeps the first occurrence of
    # each name in its original position.
    first_seen = {}
    for temp_name, name in zip(temp_names, names):
        first_seen.setdefault(name, temp_name)

    return positional.select(
        [positional[temp_name].alias(name) for name, temp_name in first_seen.items()]
    )

## Helper Functions


In [0]:
def checkpoint_dataset(dataset, file_path, section="2", number="2"):
    """Write ``dataset`` as parquet under the group's DBFS folder.

    The target is
    ``dbfs:/student-groups/Group_{section}_{number}/{file_path}.parquet``.
    Data already at that location is overwritten.

    Args:
        dataset: Object exposing the ``write.mode(...).parquet(...)`` API
            (for example a Spark DataFrame).
        file_path: Destination relative to the group folder, without the
            ``.parquet`` suffix. It may contain sub-directories.
        section: Section part of the group folder name. The default keeps the
            previously hard-coded value.
        number: Group-number part of the group folder name. The default keeps
            the previously hard-coded value.
    """
    base_folder = f"dbfs:/student-groups/Group_{section}_{number}"
    dbutils.fs.mkdirs(base_folder)

    full_path = f"{base_folder}/{file_path}.parquet"
    # Everything before the last "/" is the directory that will hold the
    # parquet output; create it in case file_path names sub-directories.
    subfolder = full_path.rsplit("/", 1)[0]
    dbutils.fs.mkdirs(subfolder)

    dataset.write.mode("overwrite").parquet(full_path)
    print(f"Checkpointed {file_path}")

## Baseline 5-year

In [0]:
%fs ls dbfs:/student-groups/Group_2_2/1_year_custom_joined/fe_graph_and_holiday/training_splits/test.parquet/

In [0]:
MONTH_OR_YEAR = "5_year_custom_joined"

# The train / validation / test parquet folders sit side by side under the
# same training_splits directory, so build that prefix once.
splits_root = f"dbfs:/student-groups/Group_2_2/{MONTH_OR_YEAR}/fe_graph_and_holiday/training_splits"
train_df, validation_df, test_df = (
    spark.read.parquet(f"{splits_root}/{split_name}.parquet/")
    for split_name in ("train", "validation", "test")
)

# train_df = deduplicate_cols(train_df)
# validation_df = deduplicate_cols(validation_df)
# test_df = deduplicate_cols(test_df)

In [0]:
# Sanity check: compare the column listings of the train and test splits.
# As the cell's only expression, its value is what the notebook shows.
train_df.columns == test_df.columns

In [0]:
from pyspark.sql.functions import year

# Derive a "year" column for the test split from its utc_timestamp column.
test_df = test_df.withColumn("year", F.year(F.col("utc_timestamp")))

In [0]:
# Print every column that exists in the training split but is missing from the
# test split. `train_cols` was previously never assigned, so this cell raised
# NameError; it is now taken from train_df.
train_cols = train_df.columns
test_cols = test_df.columns
for column in train_cols:
    if column not in test_cols:
        print(column)

In [0]:
# Persist the test split (now carrying the derived "year" column) to
# .../{MONTH_OR_YEAR}/fe_graph_and_holiday/training_splits/test.parquet.
# NOTE(review): this is the same location test_df was read from earlier in the
# notebook, and checkpoint_dataset writes with mode("overwrite"). Overwriting
# the path that still backs the DataFrame being written looks unsafe — confirm
# Spark accepts this, or materialize / write to a temporary path first.
checkpoint_dataset(test_df, f"{MONTH_OR_YEAR}/fe_graph_and_holiday/training_splits/test")

# Feature Selection

In [0]:
# Raw flight-schedule columns used by the baseline model; the last entry,
# DEP_DELAY_NEW, is the regression label.
baseline_columns = [
    "QUARTER",
    "MONTH",
    "YEAR",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",
    "OP_CARRIER",
    "ORIGIN_AIRPORT_SEQ_ID",
    "DEST_AIRPORT_SEQ_ID",
    "CRS_ELAPSED_TIME",
    "DISTANCE",
    "DEP_DELAY_NEW",
]

# Hourly weather observation columns.
weather_columns = [
    "HourlyDryBulbTemperature",
    "HourlyDewPointTemperature",
    "HourlyRelativeHumidity",
    "HourlyAltimeterSetting",
    "HourlyVisibility",
    "HourlyStationPressure",
    "HourlyWetBulbTemperature",
    "HourlyPrecipitation",
    "HourlyCloudCoverage",
    "HourlyCloudElevation",
    "HourlyWindSpeed",
]

# Graph-derived and route/airport aggregate columns.
graph_columns = [
    "page_rank",
    "out_degree",
    "in_degree",
    "weighted_out_degree",
    "weighted_in_degree",
    "N_RUNWAYS",
    "betweenness_unweighted",
    "closeness",
    "betweenness",
    "avg_origin_dep_delay",
    "avg_dest_arr_delay",
    "avg_daily_route_flights",
    "avg_route_delay",
    "avg_hourly_flights",
]

# Hand-engineered delay / traffic features.
engineered_features = [
    "CRS_DEP_MINUTES",
    "prev_flight_delay_in_minutes",
    "prev_flight_delay",
    "origin_delays_4h",
    "delay_origin_7d",
    "delay_origin_carrier_7d",
    "delay_route_7d",
    "flight_count_24h",
    "LANDING_TIME_DIFF_MINUTES",
    "AVG_ARR_DELAY_ORIGIN",
    "AVG_TAXI_OUT_ORIGIN",
]

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor

# Categorical encoding
carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="carrier_idx", handleInvalid="keep")
origin_indexer = StringIndexer(inputCol="ORIGIN_AIRPORT_SEQ_ID", outputCol="origin_idx", handleInvalid="keep")
dest_indexer = StringIndexer(inputCol="DEST_AIRPORT_SEQ_ID", outputCol="dest_idx", handleInvalid="keep")
tail_num_indexer = StringIndexer(inputCol="TAIL_NUM", outputCol="tail_num_idx", handleInvalid="keep")

carrier_encoder = OneHotEncoder(inputCol="carrier_idx", outputCol="carrier_vec")
origin_encoder = OneHotEncoder(inputCol="origin_idx", outputCol="origin_vec")
dest_encoder = OneHotEncoder(inputCol="dest_idx", outputCol="dest_vec")
tail_num_encoder = OneHotEncoder(inputCol="tail_num_idx", outputCol="tail_num_vec")



In [0]:
# Assemble all features: calendar fields first, then the three one-hot
# vectors, then the two flight-length columns.
calendar_inputs = ["QUARTER", "MONTH", "YEAR", "DAY_OF_MONTH", "DAY_OF_WEEK"]
encoded_inputs = ["carrier_vec", "origin_vec", "dest_vec"]
flight_length_inputs = ["CRS_ELAPSED_TIME", "DISTANCE"]
transformed_baseline = calendar_inputs + encoded_inputs + flight_length_inputs

assembler = VectorAssembler(outputCol="features", inputCols=transformed_baseline)

# Raw columns selected from each split before it enters the pipeline. This is
# the same list object as baseline_columns, not a copy.
train_columns = baseline_columns

In [0]:

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from xgboost.spark import SparkXGBRegressor, SparkXGBClassifier
from pyspark.ml import Pipeline

# Both evaluators compare the "prediction" column with the DEP_DELAY_NEW label
# and differ only in the metric they report.
regression_eval_args = {"labelCol": "DEP_DELAY_NEW", "predictionCol": "prediction"}
rmse_evaluator = RegressionEvaluator(metricName="rmse", **regression_eval_args)
mae_evaluator = RegressionEvaluator(metricName="mae", **regression_eval_args)


# Baseline XGBoost regressor predicting DEP_DELAY_NEW from the assembled
# "features" vector.
xgb = SparkXGBRegressor(
    features_col="features",
    label_col="DEP_DELAY_NEW",
    # `device` is the only device selector passed. The deprecated `use_gpu`
    # flag that used to be set to False alongside it was dropped as redundant.
    device="cpu",
    learning_rate=0.1,
    n_estimators=100,
    max_depth=6,
    num_workers=4,
    # Settings below were lowered to reduce memory pressure.
    tree_method="hist",
    max_bin=128,  # reduced from 256
    subsample=0.6,  # reduced from 0.8
    colsample_bytree=0.6,  # reduced from 0.8
    # Per-worker parallelism limit is left disabled:
    # nthread=1,
    missing=np.nan,
)


# --- Model Estimators ---
# Stage order: indexers, then the encoders that read their index columns, then
# the assembler, with the XGBoost estimator appended last.
preprocessing_stages = [
    *(carrier_indexer, origin_indexer, dest_indexer),
    *(carrier_encoder, origin_encoder, dest_encoder),
    assembler,
]
pipeline = Pipeline(stages=[*preprocessing_stages, xgb])

# Fit the baseline pipeline once and record MAE / RMSE for the train,
# validation and test splits in a single MLflow run.
with mlflow.start_run(run_name="Ankush-XGB-5-YEAR-BASELINE"):
    MODEL_NAME = "ANKUSH_XGB_5_YEAR_BASELINE"

    # Each split is narrowed to the raw baseline columns before the pipeline
    # sees it; the training projection is shared by fit and transform.
    train_input = train_df.select(train_columns)

    print("Training pipeline model...")
    model = pipeline.fit(train_input)

    print("Generating Training Predictions...")
    training_predictions = model.transform(train_input)
    print("Generating Validation Predictions...")
    validation_predictions = model.transform(validation_df.select(train_columns))
    print("Generating Test Predictions...")
    test_predictions = model.transform(test_df.select(train_columns))

    # MAE for the three splits first, then RMSE, in train / validation / test order.
    mae_train, mae_val, mae_test = (
        mae_evaluator.evaluate(predictions)
        for predictions in (training_predictions, validation_predictions, test_predictions)
    )
    rmse_train, rmse_val, rmse_test = (
        rmse_evaluator.evaluate(predictions)
        for predictions in (training_predictions, validation_predictions, test_predictions)
    )

    for metric_name, metric_value in (
        ("train_mae", mae_train),
        ("validation_mae", mae_val),
        ("test_mae", mae_test),
        ("train_rmse", rmse_train),
        ("validation_rmse", rmse_val),
        ("test_rmse", rmse_test),
    ):
        mlflow.log_metric(metric_name, metric_value)


In [0]:
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# print("Creating binary label columns for evaluation...")
# def with_binary_label(df, label_col="DEP_DELAY_NEW", out_col="label_bin", threshold=DELAY_THRESHOLD):
#     return df.withColumn(out_col, (col(label_col) >= lit(threshold)).cast("double"))

# training_predictions = with_binary_label(training_predictions)
# validation_predictions = with_binary_label(validation_predictions)
# test_predictions = with_binary_label(test_predictions)

# f2_eval = MulticlassClassificationEvaluator(
#             labelCol="label_bin",
#             predictionCol="final_prediction",
#             metricName="fMeasureByLabel"   # requires setting beta below
#         ).setMetricLabel(1).setBeta(2.0)    # evaluate F2 for the positive class (label=1)

# f2_t = f2_eval.evaluate(training_predictions)
# f2_v = f2_eval.evaluate(validation_predictions)
# f2_test = f2_eval.evaluate(test_predictions)


In [0]:
# %python


# use best hyperparameters from Phase 2
# NOTE: This is the meat of what we will need to adjust through our hyperparameter tuning and model debugging. Don't run this cell, but feel free to use it as a sandbox to change parameters, model definitions, etc. Make sure that if one of the model objects below is used elsewhere in the notebook, it is kept consistent with any changes made here.

# high_quantile, low_quantile = 0.95, 0.5
# train_columns = baseline_columns


# # Set num_workers to match your cluster
# NUM_WORKERS = 8  # Your current cluster size

# high_quantile, low_quantile = 0.95, 0.5

# xgb_regressor_high = SparkXGBRegressor(
#     objective="reg:quantileerror",
#     quantile_alpha=high_quantile,
#     features_col="features",
#     label_col="DEP_DELAY_NEW",
#     num_workers=NUM_WORKERS,
#     max_depth=6,
#     n_estimators=100,
#     learning_rate=0.05,
#     device='cpu',
#     num_round=200
# )

# xgb_regressor_low = SparkXGBRegressor(
#     objective="reg:quantileerror",
#     quantile_alpha=low_quantile,
#     features_col="features",
#     label_col="DEP_DELAY_NEW",
#     num_workers=NUM_WORKERS,
#     max_depth=6,
#     n_estimators=100,
#     learning_rate=0.05,
#     device='cpu',
#     num_round=200
# )

# classifier = SparkXGBClassifier(
#     features_col="features",
#     label_col="bin_label",
#     prediction_col="clf_prediction",
#     max_depth=6,
#     n_estimators=100,
#     learning_rate=0.05,
#     num_workers=NUM_WORKERS,
#     device='cpu',
#     num_round=200
# )

# # Example classifier
# # NOTE: This will need to be tuned once we have selected optimal low_quantile/high_quantile values
# # NOTE: This xgb classifier may not be the best model type for our use case, please feel free to try others
# # NOTE: Once we select a low_quantile/high_quantile, you can train those again and save the ambiguous training examples to further fine tune the classifier. This will help in model selection and narrowing our hyperparameter/model grid search, but eventually we will need to perform hyperparameter tuning on the entire pipeline end-to-end with no modular training/eval.


# print("Initializing interval classifier...")
# interval_clf = IntervalClassifier(
#     lowerEstimator=xgb_regressor_low,
#     upperEstimator=xgb_regressor_high,
#     baseClassifier=classifier,
#     labelCol="DEP_DELAY_NEW",
#     featuresCol="features",
#     threshold=15.0,
#     predictionCol="final_prediction"
# )

# print("Building pipeline...")
# pipeline = Pipeline(stages=[carrier_indexer, origin_indexer, dest_indexer, 
#     carrier_encoder, origin_encoder, dest_encoder, 
#     assembler,
#     interval_clf
# ])

# DELAY_THRESHOLD = 15.0  # minutes

# metrics_dict = {
#     "f2_train": f2_eval.evaluate(training_predictions),
#     "f2_val": f2_eval.evaluate(validation_predictions),
#     "f2_test": f2_eval.evaluate(test_predictions)
# }
# display(metrics_dict)

## Hyperparameter tuning (this section was not run)

In [0]:
# Hyperparameter tuning with CV for XGBoost - OPTIMIZED
import numpy as np
import pandas as pd

# Tuning configuration.
month_or_year = "1_year_custom_joined"
alg = "XGB"
n_folds = 5  # CHANGE 1: reduced from 10 to 5 for 2x speedup

# MAE and RMSE evaluators over the "prediction" column against DEP_DELAY_NEW.
cv_eval_args = {"labelCol": "DEP_DELAY_NEW", "predictionCol": "prediction"}
mae_evaluator = RegressionEvaluator(metricName="mae", **cv_eval_args)
rmse_evaluator = RegressionEvaluator(metricName="rmse", **cv_eval_args)

# One summary dict per hyperparameter combination is appended to this list.
hyperparam_results = []

# Manual grid search with cross-validation, tracked in MLflow.
# Run hierarchy: one parent run for the whole tuning experiment, one nested run
# per hyperparameter combination, and one nested run per CV fold inside that.
# NOTE(review): `xgb_grid` and `read_specific_fold` are used below but are not
# defined anywhere in the visible part of this notebook — confirm they are
# defined before this cell is run.
with mlflow.start_run(run_name="XGB_HPTUNE_WITH_CV_1_YEAR") as hptune_parent_run:
    mlflow.log_param("algorithm", "XGBoost")
    mlflow.log_param("n_folds", n_folds)
    mlflow.log_param("dataset", month_or_year)
    mlflow.log_param("n_param_combinations", len(xgb_grid))

    # Iterate through each hyperparameter combination
    for param_idx, params_ in enumerate(xgb_grid):
        # Copy of the base estimator with this combination's overrides applied,
        # placed behind the shared preprocessing stages.
        estimator_with_params = xgb.copy(params_)
        pipeline = Pipeline(stages=preprocessing_stages + [estimator_with_params])

        # Readable "<name>_<value>" label for this combination, used in the
        # run name and console output.
        param_str = "_".join([f"{p.name}_{params_[p]}" for p in params_])

        # Child run for each hyperparameter combination
        with mlflow.start_run(run_name=f"params_{param_idx+1}_{param_str}", nested=True) as param_run:

            # Log hyperparameters for this combination
            # (each lookup requires the grid entry to contain that param).
            mlflow.log_param("max_depth", params_[xgb.max_depth])
            mlflow.log_param("n_estimators", params_[xgb.n_estimators])
            mlflow.log_param("learning_rate", params_[xgb.learning_rate])

            # cv_results: one row per fold (logged as a table below).
            # fold_metrics: per-metric lists used for the mean / std aggregates.
            cv_results = []
            fold_metrics = {
                'train_mae': [], 'val_mae': [],
                'train_rmse': [], 'val_rmse': []
            }

            print(f"\n{'='*120}")
            print(f"Hyperparameter Combination {param_idx+1}/{len(xgb_grid)}: {param_str}")
            print(f"{'='*120}\n")

            # CV loop for this hyperparameter combination (fold ids are 1-based)
            for fold_id in range(1, n_folds + 1):
                # Nested run for each fold (nested within the param run)
                with mlflow.start_run(run_name=f"fold_{fold_id}", nested=True) as fold_run:

                    # CHANGE 2: Cache data for speed
                    fold_train = read_specific_fold(
                        path=f"dbfs:/student-groups/Group_2_2/{month_or_year}/cv_splits", 
                        fold_id=fold_id, 
                        split_type="train"
                    ).cache()

                    fold_val = read_specific_fold(
                        path=f"dbfs:/student-groups/Group_2_2/{month_or_year}/cv_splits", 
                        fold_id=fold_id, 
                        split_type="validation"
                    ).cache()

                    # Materialize both caches with one count each
                    fold_train.count()
                    fold_val.count()

                    print(f"Training fold {fold_id}/{n_folds}...")

                    # Train model
                    model = pipeline.fit(fold_train)

                    # Make predictions (keeping both train and val)
                    training_predictions = model.transform(fold_train)
                    validation_predictions = model.transform(fold_val)

                    # Evaluate
                    mae_t = mae_evaluator.evaluate(training_predictions)
                    mae_v = mae_evaluator.evaluate(validation_predictions)
                    rmse_t = rmse_evaluator.evaluate(training_predictions)
                    rmse_v = rmse_evaluator.evaluate(validation_predictions)

                    fold_metrics['train_mae'].append(mae_t)
                    fold_metrics['val_mae'].append(mae_v)
                    fold_metrics['train_rmse'].append(rmse_t)
                    fold_metrics['val_rmse'].append(rmse_v)

                    # Log to fold run
                    mlflow.log_metrics({
                        "train_mae": mae_t,
                        "val_mae": mae_v,
                        "train_rmse": rmse_t,
                        "val_rmse": rmse_v,
                    })

                    cv_results.append({
                        'fold': fold_id,
                        'train_mae': mae_t,
                        'val_mae': mae_v,
                        'train_rmse': rmse_t,
                        'val_rmse': rmse_v
                    })

                    print(f"Fold {fold_id} - Train MAE: {mae_t:.4f}, Val MAE: {mae_v:.4f}")

                    # CHANGE 3: Unpersist cache after fold completes
                    # (not reached if anything above raises).
                    fold_train.unpersist()
                    fold_val.unpersist()

                # Log fold metrics to param run (after fold run closes); the
                # mae_* / rmse_* names still hold the values of the fold that
                # just finished.
                mlflow.log_metrics({
                    f"fold_{fold_id}_train_mae": mae_t,
                    f"fold_{fold_id}_val_mae": mae_v,
                    f"fold_{fold_id}_train_rmse": rmse_t,
                    f"fold_{fold_id}_val_rmse": rmse_v,
                })

            # Calculate and log aggregated CV metrics for this param combination
            avg_metrics = {
                "avg_train_mae": np.mean(fold_metrics['train_mae']),
                "avg_val_mae": np.mean(fold_metrics['val_mae']),
                "std_val_mae": np.std(fold_metrics['val_mae']),
                "avg_train_rmse": np.mean(fold_metrics['train_rmse']),
                "avg_val_rmse": np.mean(fold_metrics['val_rmse']),
                "std_val_rmse": np.std(fold_metrics['val_rmse'])
            }
            mlflow.log_metrics(avg_metrics)

            # Log CV results table
            results_df = pd.DataFrame(cv_results)
            mlflow.log_table(data=results_df, artifact_file="cv_fold_results.json")

            # Store results for comparison across all param combinations
            hyperparam_results.append({
                'param_idx': param_idx + 1,
                'max_depth': params_[xgb.max_depth],
                'n_estimators': params_[xgb.n_estimators],
                'learning_rate': params_[xgb.learning_rate],
                'avg_train_mae': avg_metrics['avg_train_mae'],
                'avg_val_mae': avg_metrics['avg_val_mae'],
                'std_val_mae': avg_metrics['std_val_mae'],
                'avg_train_rmse': avg_metrics['avg_train_rmse'],
                'avg_val_rmse': avg_metrics['avg_val_rmse'],
                'std_val_rmse': avg_metrics['std_val_rmse']
            })

            print(f"\nParam Combo {param_idx+1} Complete - Avg Val MAE: {avg_metrics['avg_val_mae']:.4f} ± {avg_metrics['std_val_mae']:.4f}")
            print(f"{'='*120}\n")

    # Log summary of all hyperparameter combinations (to the parent run)
    hyperparam_df = pd.DataFrame(hyperparam_results)
    mlflow.log_table(data=hyperparam_df, artifact_file="hyperparam_comparison.json")

    # Find and log best parameters (lowest average validation MAE).
    # NOTE(review): idxmin() returns an index label while .iloc is positional;
    # they coincide only for the default 0..n-1 index expected from
    # pd.DataFrame(hyperparam_results) — confirm if the frame is ever re-indexed.
    best_idx = hyperparam_df['avg_val_mae'].idxmin()
    best_params = hyperparam_df.iloc[best_idx]

    mlflow.log_metrics({
        "best_avg_val_mae": best_params['avg_val_mae'],
        "best_std_val_mae": best_params['std_val_mae'],
    })

    mlflow.log_params({
        "best_max_depth": best_params['max_depth'],
        "best_n_estimators": best_params['n_estimators'],
        "best_learning_rate": best_params['learning_rate'],
    })

    # Console summary of every combination plus the winner.
    print("\n" + "="*120)
    print("HYPERPARAMETER TUNING COMPLETE")
    print("="*120)
    print("\nAll Parameter Combinations:")
    print(hyperparam_df.to_string(index=False))
    print(f"\nBest Parameters (by Val MAE):")
    print(f"  max_depth: {best_params['max_depth']}")
    print(f"  n_estimators: {best_params['n_estimators']}")
    print(f"  learning_rate: {best_params['learning_rate']}")
    print(f"  Avg Val MAE: {best_params['avg_val_mae']:.4f} ± {best_params['std_val_mae']:.4f}")
    print("="*120)

In [0]:
# hyperparam_xgb_df