# XGBOOST model baseline - 1 year
- run model from 1_year_combined data with feature engineering
  - TAIL_NUM causes OOM error, comment out for now
- feature engineering handled in https://dbc-fae72cab-cf59.cloud.databricks.com/editor/notebooks/1792055957780055?o=4021782157704243



## Imports

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


import random

import mlflow
print(mlflow.__version__)

import os

# Databricks config flag; going by its name it switches on MLflow tracking for MLlib runs.
spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

RANDOM_SEED = 0
# Workspace path of the MLflow experiment that collects this notebook's runs.
EXPERIMENT_NAME = "/Shared/team_2_2/mlflow-2-stage-dev"
try:
    # Look the experiment up first so the cell can report whether it was reused or created.
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is not None:
        print(f"Using existing experiment: {experiment.name}")
    else:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Created new experiment with ID: {experiment_id}")
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as e:
    # Deliberate best effort: report the problem and keep the notebook usable by
    # switching to a per-user default experiment instead of aborting.
    print(f"Error with experiment setup: {e}")
    notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    mlflow.set_experiment(f"/Users/{notebook_context.userName().get()}/default")



## Helper Functions


In [0]:
def checkpoint_dataset(dataset, file_path):
    """Persist ``dataset`` as parquet below the group's DBFS folder.

    Args:
        dataset: Object exposing the Spark writer chain
            ``.write.mode(...).parquet(...)``.
        file_path: Path relative to the group folder, without the
            ``.parquet`` suffix; it may contain ``/`` separated sub-folders.

    Any data already stored at the target location is overwritten.
    """
    section, number = "2", "2"
    group_root = f"dbfs:/student-groups/Group_{section}_{number}"
    dbutils.fs.mkdirs(group_root)

    # Everything before the last "/" is the directory that must exist
    # before the parquet output can be written.
    target = f"{group_root}/{file_path}.parquet"
    parent_dir, _, _ = target.rpartition("/")
    dbutils.fs.mkdirs(parent_dir)

    dataset.write.mode("overwrite").parquet(target)
    print(f"Checkpointed {file_path}")

## Datasets - custom join

In [0]:
%fs ls dbfs:/student-groups/Group_2_2/3_month_custom_joined/

In [0]:
# Root DBFS folder holding the group's checkpoints.
checkpoint_path = "dbfs:/student-groups/Group_2_2"
# dataset_path = f"{checkpoint_path}/1_year_custom_joined/raw_data/training_splits"
dataset_path = "/".join(
    [checkpoint_path, "3_month_custom_joined", "feature_eng", "training_splits"]
)

# Load the train and validation splits from their parquet checkpoints.
train_df, validation_df = (
    spark.read.parquet(f"{dataset_path}/{split_name}.parquet")
    for split_name in ("train", "validation")
)

In [0]:
# Row counts before any filtering; each count() runs a Spark job.
orig_train_df_size, orig_validation_df_size = train_df.count(), validation_df.count()
for _split_name, _split_size in (
    ("train_df", orig_train_df_size),
    ("validation_df", orig_validation_df_size),
):
    print(f"Size of {_split_name}: {_split_size}")

# Feature Selection

In [0]:
# Columns kept from the joined dataset, grouped by origin. The order is the
# order in which they are selected from the data frames.
baselines_columns = (
    # Calendar fields and carrier ("TAIL_NUM" is left out on purpose, see the notebook header).
    ["QUARTER", "MONTH", "YEAR", "DAY_OF_MONTH", "DAY_OF_WEEK", "OP_CARRIER"]
    # Route, schedule, the label column (DEP_DELAY_NEW) and the UTC timestamp.
    + [
        "ORIGIN_AIRPORT_SEQ_ID",
        "DEST_AIRPORT_SEQ_ID",
        "CRS_ELAPSED_TIME",
        "DISTANCE",
        "DEP_DELAY_NEW",
        "utc_timestamp",
    ]
    # Engineered features.
    + [
        "CRS_DEP_MINUTES",
        "prev_flight_delay_in_minutes",
        "prev_flight_delay",
        "origin_delays_4h",
        "delay_origin_7d",
        "delay_origin_carrier_7d",
        "delay_route_7d",
        "flight_count_24h",
        "LANDING_TIME_DIFF_MINUTES",
        "AVG_ARR_DELAY_ORIGIN",
        "AVG_TAXI_OUT_ORIGIN",
    ]
    # Hourly weather observations.
    + [
        "HourlyDryBulbTemperature",
        "HourlyDewPointTemperature",
        "HourlyRelativeHumidity",
        "HourlyAltimeterSetting",
        "HourlyVisibility",
        "HourlyStationPressure",
        "HourlyWetBulbTemperature",
        "HourlyPrecipitation",
        "HourlyCloudCoverage",
        "HourlyCloudElevation",
        "HourlyWindSpeed",
    ]
)

In [0]:
# Drop rows without a label, then keep only the modelling columns, for both splits.
has_label = F.col("DEP_DELAY_NEW").isNotNull()
train_df, validation_df = [
    split_df.where(has_label).select(*baselines_columns)
    for split_df in (train_df, validation_df)
]

In [0]:
# Row counts after the null-label filter; each count() runs a Spark job.
for _split_name, _split_df in (("train_df", train_df), ("validation_df", validation_df)):
    print(f"Size of cleaned {_split_name}: {_split_df.count()}")

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor

# Categorical encoding
carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="carrier_idx", handleInvalid="keep")
origin_indexer = StringIndexer(inputCol="ORIGIN_AIRPORT_SEQ_ID", outputCol="origin_idx", handleInvalid="keep")
dest_indexer = StringIndexer(inputCol="DEST_AIRPORT_SEQ_ID", outputCol="dest_idx", handleInvalid="keep")
tail_num_indexer = StringIndexer(inputCol="TAIL_NUM", outputCol="tail_num_idx", handleInvalid="keep")

carrier_encoder = OneHotEncoder(inputCol="carrier_idx", outputCol="carrier_vec")
origin_encoder = OneHotEncoder(inputCol="origin_idx", outputCol="origin_vec")
dest_encoder = OneHotEncoder(inputCol="dest_idx", outputCol="dest_vec")
tail_num_encoder = OneHotEncoder(inputCol="tail_num_idx", outputCol="tail_num_vec")



In [0]:
# Assemble all features into the single "features" vector column.
# The label (DEP_DELAY_NEW) and utc_timestamp are not part of the inputs.
assembler_inputs = (
    # Calendar fields.
    ["QUARTER", "MONTH", "YEAR", "DAY_OF_MONTH", "DAY_OF_WEEK"]
    # One-hot encoded categoricals ("tail_num_vec" stays disabled).
    + ["carrier_vec", "origin_vec", "dest_vec"]
    # Schedule.
    + ["CRS_ELAPSED_TIME", "DISTANCE"]
    # Engineered features.
    + [
        "CRS_DEP_MINUTES",
        "prev_flight_delay_in_minutes",
        "prev_flight_delay",
        "origin_delays_4h",
        "delay_origin_7d",
        "delay_origin_carrier_7d",
        "delay_route_7d",
        "flight_count_24h",
        "LANDING_TIME_DIFF_MINUTES",
        "AVG_ARR_DELAY_ORIGIN",
        "AVG_TAXI_OUT_ORIGIN",
    ]
    # Hourly weather observations.
    + [
        "HourlyDryBulbTemperature",
        "HourlyDewPointTemperature",
        "HourlyRelativeHumidity",
        "HourlyAltimeterSetting",
        "HourlyVisibility",
        "HourlyStationPressure",
        "HourlyWetBulbTemperature",
        "HourlyPrecipitation",
        "HourlyCloudCoverage",
        "HourlyCloudElevation",
        "HourlyWindSpeed",
    ]
)
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

# Model

In [0]:
# baseline
"""
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from xgboost.spark import SparkXGBRegressor

from mlflow.models import infer_signature

mlflow.spark.autolog()
with mlflow.start_run(run_name="XGB-3m_2_stage_all_feat"):
    MODEL_NAME = "XGB_3m_2_STAGE"

    # linear_reg = LinearRegression(
    #     featuresCol="features",
    #     labelCol="DEP_DELAY_NEW",
    #     # Linear Regression has different parameters than Random Forest
    #     maxIter=10, 
    #     regParam=0.3
    # )
    # rf = RandomForestRegressor(
    #     featuresCol="features",  
    #     labelCol="DEP_DELAY_NEW",   
    #     numTrees=20,
    #     maxDepth=10
    # )

    quantile_width = 0.1

    xgb_regressor_high = SparkXGBRegressor(
        objective="reg:quantileerror",
        quantile_alpha=1 - quantile_width,
        num_round=200,
        features_col="features",
        label_col="DEP_DELAY_NEW",
        num_workers=2,
        max_depth=6,
        n_estimators=100,
        learning_rate=0.3
    )

    xgb_regressor_low = SparkXGBRegressor(
        objective="reg:quantileerror",
        quantile_alpha=quantile_width,
        num_round=200,
        features_col="features",
        label_col="DEP_DELAY_NEW",
        num_workers=2,
        max_depth=6,
        n_estimators=100,
        learning_rate=0.3
    )

    # Create pipeline
    pipeline_high = Pipeline(stages=[
        carrier_indexer, origin_indexer, dest_indexer, 
        carrier_encoder, origin_encoder, dest_encoder, 
        assembler,
        # linear_reg
        # rf
        xgb_regressor_high
    ])
    # Create pipeline
    pipeline_low = Pipeline(stages=[
        carrier_indexer, origin_indexer, dest_indexer, 
        carrier_encoder, origin_encoder, dest_encoder, 
        assembler,
        # linear_reg
        # rf
        xgb_regressor_low
    ])

    model_high = pipeline_high.fit(train_df)
    training_predictions_high = model_high.transform(train_df)
    validation_predictions_high = model_high.transform(validation_df)

    model_low = pipeline_low.fit(train_df)
    training_predictions_low = model_low.transform(train_df)
    validation_predictions_low = model_low.transform(validation_df)

    mae_evaluator = RegressionEvaluator(
        labelCol="DEP_DELAY_NEW",      
        predictionCol="prediction", 
        metricName="mae"           
    )

    rmse_evaluator = RegressionEvaluator(
        labelCol="DEP_DELAY_NEW",      
        predictionCol="prediction", 
        metricName="rmse"
    )

    # Calculate MAE
    mae_t_high = mae_evaluator.evaluate(training_predictions_high)
    mae_v_high = mae_evaluator.evaluate(validation_predictions_high)
    # Calculate RMSE
    rmse_t_high = rmse_evaluator.evaluate(training_predictions_high)
    rmse_v_high = rmse_evaluator.evaluate(validation_predictions_high)

    signature_high = infer_signature(train_df, training_predictions_high)

    mlflow.spark.log_model(
        model_high, 
        f"{MODEL_NAME}_high",
        input_example=train_df.limit(1).toPandas(),
        signature=signature_high,
        registered_model_name="2_stage_high_dev"
        )

    mlflow.log_metric("train_mae", mae_t_high)
    mlflow.log_metric("validation_mae", mae_v_high)
    mlflow.log_metric("train_rmse", rmse_t_high)
    mlflow.log_metric("validation_rmse", rmse_v_high)


    # ----------- Low -------------
    # Calculate MAE
    mae_t_low = mae_evaluator.evaluate(training_predictions_low)
    mae_v_low = mae_evaluator.evaluate(validation_predictions_low)
    # Calculate RMSE
    rmse_t_low = rmse_evaluator.evaluate(training_predictions_low)
    rmse_v_low = rmse_evaluator.evaluate(validation_predictions_low)

    signature_low = infer_signature(train_df, training_predictions_low)

    mlflow.spark.log_model(
        model_low, 
        f"{MODEL_NAME}_low",
        input_example=train_df.limit(1).toPandas(),
        signature=signature_low,
        registered_model_name="2_stage_low_dev"
        )

    mlflow.log_metric("train_mae", mae_t_low)
    mlflow.log_metric("validation_mae", mae_v_low)
    mlflow.log_metric("train_rmse", rmse_t_low)
    mlflow.log_metric("validation_rmse", rmse_v_low)


"""

# Throwaway assignment after the string literal that holds the disabled baseline code.
# NOTE(review): presumably here so the cell ends in a statement and the notebook
# does not echo the string — confirm, then delete it together with the disabled code.
aaaaaa = 1


## Save best model results

In [0]:
# best model information
# The code below is disabled: it sits inside a string literal and never runs.
# When enabled it would load a Spark model from the MLflow run/artifact named in it.
"""
RUN_ID = "100863169f14462fb514efa6483a170e"
ARTIFACT_PATH = "XGB_1y_BASELINE_FEAT_ENG"

MODEL_URI = f"runs:/{RUN_ID}/{ARTIFACT_PATH}"
# Load the model
loaded_model = mlflow.spark.load_model(MODEL_URI)
"""
# Throwaway assignment following the string literal.
# NOTE(review): presumably keeps the notebook from echoing the string — confirm.
aaaaaaa = 2

### 2-Stage Modeling Pipeline

In [0]:
from pyspark.ml import Estimator, Model
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql.functions import col, lit, when
from pyspark.sql import DataFrame
from pyspark.sql.functions import count as f_count

class IntervalClassifier(Estimator, DefaultParamsReadable, DefaultParamsWritable):
    """Two-stage delay classifier built from a quantile interval and a classifier.

    Stage 1 fits a lower and an upper quantile regressor on all rows. Rows whose
    predicted interval ``[qLow, qHigh]`` contains ``threshold`` are "ambiguous";
    stage 2 fits ``baseClassifier`` on those rows only, using the binary label
    ``labelCol >= threshold``.

    Args:
        lowerEstimator: Estimator for the lower quantile; its model must write
            its output to a column named ``prediction``.
        upperEstimator: Estimator for the upper quantile; same requirement.
        baseClassifier: Classifier estimator exposing
            ``setParams(label_col=..., features_col=..., prediction_col=...)``.
            It is reconfigured in place during ``fit``.
        labelCol: Numeric column that is compared against ``threshold``.
        featuresCol: Feature vector column handed to the classifier.
        predictionCol: Name of the final 0/1 prediction column of the model.
        threshold: Cut-off separating the positive class (``>= threshold``).
        quantile_gap: Stored on the instance; not read anywhere in this class.
        qLowCol: Column name for the lower quantile prediction.
        qHighCol: Column name for the upper quantile prediction.
        clfPredictionCol: Column name for the stage-2 classifier prediction.
        undersample_majority: When true, the majority class of the ambiguous
            rows is downsampled to roughly the minority class size.
        undersample_seed: Seed for that sampling.

    NOTE(review): every setting here is a plain attribute, not a pyspark
    ``Param``. The DefaultParams reader/writer mixins are documented as
    param-based, so a save/load round trip may not restore these attributes —
    confirm before relying on a persisted copy.
    """

    def __init__(self,
                 lowerEstimator,
                 upperEstimator,
                 baseClassifier,
                 labelCol="delay_minutes",
                 featuresCol="features",
                 predictionCol="final_prediction",
                 threshold=15.0,
                 quantile_gap=0.1,
                 qLowCol="low_pred",
                 qHighCol="high_pred",
                 clfPredictionCol="clf_prediction",
                 undersample_majority=True,
                 undersample_seed=42):
        super().__init__()
        self.lowerEstimator = lowerEstimator
        self.upperEstimator = upperEstimator
        self.baseClassifier = baseClassifier
        self.labelCol = labelCol
        self.featuresCol = featuresCol
        self.predictionCol = predictionCol
        self.threshold = float(threshold)
        self.quantile_gap = float(quantile_gap)
        self.qLowCol = qLowCol
        self.qHighCol = qHighCol
        self.clfPredictionCol = clfPredictionCol
        self.undersample_majority = undersample_majority
        self.undersample_seed = undersample_seed

    def _undersample(self, df_ambig: DataFrame) -> DataFrame:
        """Downsample the majority ``bin_label`` class to about the minority size.

        Raises:
            ValueError: If ``bin_label`` does not take exactly two distinct values.
        """
        # Only a handful of rows come back, so collecting to the driver is cheap.
        class_counts = (
            df_ambig.groupBy("bin_label")
                    .agg(f_count("*").alias("cnt"))
                    .collect()
        )
        if len(class_counts) != 2:
            raise ValueError("Ambiguous cases must contain both classes.")

        # The sort is stable, so on a tie the first returned class is the "minority".
        (minority_label, minority_cnt), (majority_label, majority_cnt) = sorted(
            ((row["bin_label"], row["cnt"]) for row in class_counts),
            key=lambda label_and_count: label_and_count[1],
        )

        print(f"Undersampling majority class {majority_label} from {majority_cnt} to {minority_cnt}.")

        # Both counts come from a groupBy, so each is at least 1 and the ratio is well defined.
        frac_majority = float(minority_cnt) / float(majority_cnt)

        # sampleBy keeps every minority row and samples the majority at the
        # given rate, so the resulting class sizes are approximate.
        fractions = {
            float(minority_label): 1.0,
            float(majority_label): frac_majority,
        }
        return df_ambig.sampleBy(
            "bin_label",
            fractions=fractions,
            seed=self.undersample_seed,
        )

    def _fit(self, dataset: DataFrame) -> Model:
        """Fit both quantile regressors and the ambiguous-region classifier.

        Returns:
            An ``IntervalClassifierModel`` wrapping the three fitted models.

        Raises:
            ValueError: If undersampling is enabled and the ambiguous rows do
                not contain both classes.
        """
        # 1) Fit quantile regressors on ALL data
        print("Training Lower Estimator...")
        lowerModel = self.lowerEstimator.fit(dataset)
        print("Training Upper Estimator...")
        upperModel = self.upperEstimator.fit(dataset)

        print("Configuring Classifier...")
        # 2) Add the quantile predictions. Each model writes "prediction", which
        #    is renamed immediately so the next model can reuse that name.
        df_q = lowerModel.transform(dataset) \
                         .withColumnRenamed("prediction", self.qLowCol)
        df_q = upperModel.transform(df_q) \
                         .withColumnRenamed("prediction", self.qHighCol)

        # 3) Keep only rows whose interval contains the threshold (bounds inclusive)
        print("Filtering ambiguous cases...")
        thr = lit(self.threshold)
        df_ambig = df_q.filter(
            (col(self.qLowCol) <= thr) & (thr <= col(self.qHighCol))
        )

        # 4) Build binary label: 1 if label ≥ threshold, else 0
        df_ambig = df_ambig.withColumn(
            "bin_label",
            (col(self.labelCol) >= thr).cast("double")
        )

        print("Undersampling...")
        if self.undersample_majority:
            df_ambig = self._undersample(df_ambig)

        print("Training Base Classifier...")
        # 5) Fit classifier on (possibly undersampled) ambiguous region.
        #    setParams mutates the estimator that was passed to __init__.
        clf = self.baseClassifier
        clf.setParams(
            label_col="bin_label",
            features_col=self.featuresCol,
            prediction_col=self.clfPredictionCol
        )
        clfModel = clf.fit(df_ambig)

        return IntervalClassifierModel(
            lowerModel=lowerModel,
            upperModel=upperModel,
            clfModel=clfModel,
            labelCol=self.labelCol,
            featuresCol=self.featuresCol,
            predictionCol=self.predictionCol,
            threshold=self.threshold,
            qLowCol=self.qLowCol,
            qHighCol=self.qHighCol,
            clfPredictionCol=self.clfPredictionCol
        )

class IntervalClassifierModel(Model, DefaultParamsReadable, DefaultParamsWritable):
    """Fitted two-stage classifier: quantile interval first, classifier as tie-breaker.

    ``transform`` adds the two quantile columns, the stage-2 classifier output,
    the final prediction column and a ``decision_source`` column naming which
    stage produced each final prediction.
    """

    def __init__(self,
                 lowerModel,
                 upperModel,
                 clfModel,
                 labelCol,
                 featuresCol,
                 predictionCol,
                 threshold,
                 qLowCol,
                 qHighCol,
                 clfPredictionCol):
        super().__init__()
        # Fitted stages.
        self.lowerModel = lowerModel
        self.upperModel = upperModel
        self.clfModel = clfModel
        # Column names.
        self.labelCol = labelCol
        self.featuresCol = featuresCol
        self.predictionCol = predictionCol
        self.qLowCol = qLowCol
        self.qHighCol = qHighCol
        self.clfPredictionCol = clfPredictionCol
        # Decision cut-off, always stored as a float.
        self.threshold = float(threshold)

    def _transform(self, dataset: DataFrame) -> DataFrame:
        """Return ``dataset`` with quantile, classifier and final prediction columns."""
        # Stage 1: each quantile model writes "prediction"; rename it right away
        # so the following model can use the same column name.
        with_low = self.lowerModel.transform(dataset)
        with_low = with_low.withColumnRenamed("prediction", self.qLowCol)
        with_bounds = self.upperModel.transform(with_low)
        with_bounds = with_bounds.withColumnRenamed("prediction", self.qHighCol)

        # Stage 2 scores every row; its output is only used for ambiguous rows.
        print("Classifying **all** points with stage 2 classifier...")
        scored = self.clfModel.transform(with_bounds)

        cutoff = lit(self.threshold)
        # The whole interval lies above the cut-off → positive.
        above_interval = cutoff < col(self.qLowCol)
        # The whole interval lies below the cut-off → negative.
        below_interval = cutoff > col(self.qHighCol)

        print("Classifying points with ambiguous predictions...")
        final_value = (
            when(above_interval, lit(1.0))
            .when(below_interval, lit(0.0))
            .otherwise(col(self.clfPredictionCol))
        )
        # Debug column recording which stage decided each row.
        source_tag = (
            when(above_interval, lit("quantile_high"))
            .when(below_interval, lit("quantile_low"))
            .otherwise(lit("classifier"))
        )

        labelled = scored.withColumn(self.predictionCol, final_value)
        return labelled.withColumn("decision_source", source_tag)


In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
#from pyspark.ml.classification import RandomForestClassifier
from xgboost.spark import SparkXGBRegressor, SparkXGBClassifier

from pyspark.sql.functions import col, lit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from mlflow.models.signature import infer_signature
import mlflow
import numpy as np
import math

# Grid search over (low, high) quantile pairs for the two-stage delay classifier.
# Every pair trains one pipeline inside its own MLflow run and logs overall,
# per-decision-source and count metrics for the validation split.
MODEL_NAME = "XGB_3m_2_STAGE_dev"
LABEL_COL = "DEP_DELAY_NEW"
DELAY_THRESHOLD = 15.0  # minutes; the positive class is label >= this value
DECISION_SOURCES = ("quantile_low", "quantile_high", "classifier")
beta = 2.0  # F-beta weight used for the per-source F2 score

# Hyperparameters from Phase 2. They are defined once so the values logged to
# MLflow are guaranteed to be the values passed to the estimators.
max_depth = 6
n_estimators = 100
learning_rate = 0.05
# NOTE(review): both n_estimators and num_round are passed, exactly as before.
# Confirm against the xgboost.spark documentation which of the two actually
# controls the number of boosting rounds.
num_round = 200
shared_xgb_params = dict(
    num_round=num_round,
    features_col="features",
    num_workers=2,
    max_depth=max_depth,
    n_estimators=n_estimators,
    learning_rate=learning_rate,
)

print("Starting MLflow autolog setup...")
mlflow.spark.autolog()

# Half-open grids built with linspace. np.arange with a fractional step rounds
# the length up, so arange(0.98, 1, 0.005) also yielded 1.000 and
# arange(0.24, 0.4, 0.02) also yielded 0.400.
# NOTE(review): add the stop values back explicitly if those runs were intended.
#high_quantiles = np.arange(0.75, 0.9, 0.025)
high_quantiles = np.linspace(0.98, 1.0, num=4, endpoint=False)  # 0.980 ... 0.995
low_quantiles = np.linspace(0.24, 0.4, num=8, endpoint=False)   # 0.24 ... 0.38


def with_binary_label(df, label_col="DEP_DELAY_NEW", out_col="label_bin", threshold=DELAY_THRESHOLD):
    """Return ``df`` with ``out_col`` = 1.0 where ``label_col >= threshold`` else 0.0."""
    return df.withColumn(out_col, (col(label_col) >= lit(threshold)).cast("double"))


def safe_div(num, den):
    """Return ``num / den``, or NaN when the denominator is not positive."""
    return float(num) / den if den > 0 else float("nan")


def f_beta_score(precision, recall, beta_weight):
    """Return the F-beta score, or NaN when it is undefined."""
    if math.isnan(precision) or math.isnan(recall):
        return float("nan")
    denom = (beta_weight**2) * precision + recall
    return ((1 + beta_weight**2) * precision * recall / denom) if denom > 0 else float("nan")


# The evaluators hold no per-run state, so they are built once for all runs.
print("Setting up metric evaluators...")
precision_eval = MulticlassClassificationEvaluator(
    labelCol="label_bin",
    predictionCol="final_prediction",
    metricName="precisionByLabel"
).setMetricLabel(1)

recall_eval = MulticlassClassificationEvaluator(
    labelCol="label_bin",
    predictionCol="final_prediction",
    metricName="recallByLabel"
).setMetricLabel(1)

f2_eval = MulticlassClassificationEvaluator(
    labelCol="label_bin",
    predictionCol="final_prediction",
    metricName="fMeasureByLabel"   # requires setting beta below
).setMetricLabel(1).setBeta(2.0)    # evaluate F2 for the positive class (label=1)

# NOTE(review): "rawPrediction" is the column written by the stage-2 classifier,
# which scores every row. This PR-AUC therefore appears to describe the
# classifier alone rather than "final_prediction" — confirm this is intended.
pr_auc_eval = BinaryClassificationEvaluator(
    labelCol="label_bin",
    rawPredictionCol="rawPrediction",   # or probabilityCol="probability"
    metricName="areaUnderPR"
)

for high_quantile in high_quantiles:
    for low_quantile in low_quantiles:
        run_name = f"XGB-3m_2_stage_dev_low_{low_quantile:.3f}_high_{high_quantile:.3f}"

        with mlflow.start_run(run_name=run_name):
            print(f"Starting MLflow run for {run_name}")
            print("Defining model parameters and estimators...")

            # use best hyperparameters from Phase 2
            xgb_regressor_high = SparkXGBRegressor(
                objective="reg:quantileerror",
                quantile_alpha=float(high_quantile),
                label_col=LABEL_COL,
                **shared_xgb_params,
            )

            xgb_regressor_low = SparkXGBRegressor(
                objective="reg:quantileerror",
                quantile_alpha=float(low_quantile),
                label_col=LABEL_COL,
                **shared_xgb_params,
            )

            xgb_classifier = SparkXGBClassifier(
                label_col="bin_label",
                prediction_col="clf_prediction",
                **shared_xgb_params,
            )

            print("Initializing interval classifier...")
            interval_clf = IntervalClassifier(
                lowerEstimator=xgb_regressor_low,
                upperEstimator=xgb_regressor_high,
                baseClassifier=xgb_classifier,
                labelCol=LABEL_COL,
                featuresCol="features",
                threshold=DELAY_THRESHOLD,
                predictionCol="final_prediction"
            )

            print("Building pipeline...")
            pipeline = Pipeline(stages=[
                carrier_indexer, origin_indexer, dest_indexer,
                carrier_encoder, origin_encoder, dest_encoder,
                assembler,
                interval_clf
            ])

            print("Training pipeline model...")
            model = pipeline.fit(train_df)
            print("Generating Training Predictions...")
            training_predictions = model.transform(train_df)
            print("Generating Validation Predictions...")
            validation_predictions = model.transform(validation_df)

            print("Creating binary label columns for evaluation...")
            training_predictions = with_binary_label(training_predictions)
            validation_predictions = with_binary_label(validation_predictions)

            print("Computing metrics...")
            precision_t = precision_eval.evaluate(training_predictions)
            precision_v = precision_eval.evaluate(validation_predictions)

            recall_t = recall_eval.evaluate(training_predictions)
            recall_v = recall_eval.evaluate(validation_predictions)

            f2_t = f2_eval.evaluate(training_predictions)
            f2_v = f2_eval.evaluate(validation_predictions)

            pr_auc_t = pr_auc_eval.evaluate(training_predictions)
            pr_auc_v = pr_auc_eval.evaluate(validation_predictions)

            # Classification Source Metrics
            # A single aggregation yields every (source, label, prediction) cell;
            # all per-source and total counts below are derived from it on the
            # driver instead of running one Spark job per count.
            confusion = {
                (row["decision_source"], row["label_bin"], row["final_prediction"]): row["count"]
                for row in (
                    validation_predictions
                    .groupBy("decision_source", "label_bin", "final_prediction")
                    .count()
                    .collect()
                )
            }

            for src in DECISION_SOURCES:
                seg_cells = {
                    (lbl, pred): cnt
                    for (cell_src, lbl, pred), cnt in confusion.items()
                    if cell_src == src
                }

                # basic counts
                mlflow.log_metric(f"val_{src}_count", sum(seg_cells.values()))

                # label distribution; a label that is absent is logged as 0 so
                # every run carries the same set of metrics
                for lbl_value in (0, 1):
                    mlflow.log_metric(
                        f"val_{src}_label_{lbl_value}_count",
                        sum(cnt for (lbl, _), cnt in seg_cells.items() if lbl == lbl_value),
                    )

                # ---- confusion-matrix counts for positive class (label = 1) ----
                tp = seg_cells.get((1.0, 1.0), 0)  # label=1, prediction=1
                fp = seg_cells.get((0.0, 1.0), 0)  # label=0, prediction=1
                fn = seg_cells.get((1.0, 0.0), 0)  # label=1, prediction=0

                # An empty segment has all-zero counts, so every metric below
                # comes out as NaN without a special case.
                pos_precision = safe_div(tp, tp + fp)  # P = TP / (TP + FP)
                pos_recall    = safe_div(tp, tp + fn)  # R = TP / (TP + FN)
                pos_f2 = f_beta_score(pos_precision, pos_recall, beta)

                mlflow.log_metric(f"val_{src}_pos_precision", pos_precision)
                mlflow.log_metric(f"val_{src}_pos_recall",    pos_recall)
                mlflow.log_metric(f"val_{src}_pos_f2",        pos_f2)

            print("Logging model and metrics to MLflow...")
            signature = infer_signature(train_df, training_predictions)

            # NOTE(review): this registers a model under the same name on every
            # grid iteration — confirm that is wanted.
            mlflow.spark.log_model(
                model,
                MODEL_NAME,
                input_example=train_df.limit(1).toPandas(),
                signature=signature,
                registered_model_name="flight_delay_classification_baseline"
            )

            total_pos = sum(cnt for (_, lbl, _), cnt in confusion.items() if lbl == 1)
            total_neg = sum(cnt for (_, lbl, _), cnt in confusion.items() if lbl == 0)

            mlflow.log_metric("val_total_pos", total_pos)
            mlflow.log_metric("val_total_neg", total_neg)
            mlflow.log_metric("val_total_count", total_pos + total_neg)

            mlflow.log_metric("train_precision", precision_t)
            mlflow.log_metric("validation_precision", precision_v)

            mlflow.log_metric("train_recall", recall_t)
            mlflow.log_metric("validation_recall", recall_v)

            mlflow.log_metric("train_f2", f2_t)
            mlflow.log_metric("validation_f2", f2_v)

            mlflow.log_metric("train_pr_auc", pr_auc_t)
            mlflow.log_metric("validation_pr_auc", pr_auc_v)

            mlflow.log_param("max_depth", max_depth)
            mlflow.log_param("n_estimators", n_estimators)
            mlflow.log_param("learning_rate", learning_rate)
            mlflow.log_param("num_round", num_round)
            # The quantile pair was only recoverable from the run name before.
            mlflow.log_param("low_quantile", float(low_quantile))
            mlflow.log_param("high_quantile", float(high_quantile))
            

In [0]:

"""
pos_prec_eval = MulticlassClassificationEvaluator(
    labelCol="label_bin",
    predictionCol="final_prediction",
    metricName="precisionByLabel"
).setMetricLabel(1)

pos_rec_eval = MulticlassClassificationEvaluator(
    labelCol="label_bin",
    predictionCol="final_prediction",
    metricName="recallByLabel"
).setMetricLabel(1)

pos_precision_v = pos_prec_eval.evaluate(validation_predictions)
pos_recall_v    = pos_rec_eval.evaluate(validation_predictions)
pos_f2_v        = f2_eval.evaluate(validation_predictions)

print("validation_pos_precision:", pos_precision_v)
print("validation_pos_recall:",    pos_recall_v)
print("validation_pos_f2:",        pos_f2_v)
"""


In [0]:
# Shows the validation predictions left in scope by the grid-search cell, i.e.
# those of the last (low, high) quantile pair that was trained.
display(validation_predictions)

In [0]:
"""

from pyspark.sql.functions import col, lit

DELAY_THRESHOLD = 15.0

validation_labeled = validation_df.withColumn(
    "label_bin", (col("DEP_DELAY_NEW") >= lit(DELAY_THRESHOLD)).cast("double")
)

validation_labeled.groupBy("label_bin").count().show()

"""


In [0]:

"""
pos_prec_eval = MulticlassClassificationEvaluator(
    labelCol="label_bin",
    predictionCol="final_prediction",
    metricName="precisionByLabel"
).setMetricLabel(1)

pos_rec_eval = MulticlassClassificationEvaluator(
    labelCol="label_bin",
    predictionCol="final_prediction",
    metricName="recallByLabel"
).setMetricLabel(1)

for src in ["quantile_low", "quantile_high", "classifier"]:
    print("=== Source:", src, "===")
    seg = validation_predictions.filter(col("decision_source") == src)
    print("count:", seg.count())
    seg.groupBy("label_bin").count().show()

    print("pos_precision:", pos_prec_eval.evaluate(seg))
    print("pos_recall:",    pos_rec_eval.evaluate(seg))
    print("pos_F2:",        f2_eval.evaluate(seg))

"""


In [0]:
import mlflow
from mlflow.tracking import MlflowClient
import pandas as pd

# Export every run of the experiment to a CSV file on DBFS.
client = MlflowClient()

experiment_name = "/Shared/team_2_2/mlflow-2-stage-dev"    # <-- change me
experiment = client.get_experiment_by_name(experiment_name)
if experiment is None:
    # Fail with a clear message instead of an AttributeError on the next line.
    raise ValueError(f"MLflow experiment not found: {experiment_name}")

# search_runs returns a pandas DataFrame with one row per run.
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Save all runs to CSV in DBFS
output_path = "/dbfs/FileStore/mlflow_dev_runs_1.csv"
runs.to_csv(output_path, index=False)

# Report the file that was actually written (the old message named a different file).
print(f"Saved {len(runs)} runs to {output_path}")