## Imports

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


import random

import mlflow
print(mlflow.__version__)

import os
# Runtime flags: set PYSPARK_PIN_THREAD for this process and set the
# spark.databricks.mlflow.trackMLlib.enabled conf on the active Spark session.
os.environ['PYSPARK_PIN_THREAD'] = 'false'
spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

RANDOM_SEED = 0
# Workspace path of the MLflow experiment that runs are logged to.
EXPERIMENT_NAME = "/Shared/team_2_2/mlflow-baseline"

# Look the experiment up first so the output says whether it was reused or created,
# then make it the active experiment.
try:
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is not None:
        print(f"Using existing experiment: {experiment.name}")
    else:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Created new experiment with ID: {experiment_id}")
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as e:
    print(f"Error with experiment setup: {e}")
    # Fall back to a "default" experiment under the current user's workspace folder.
    notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    mlflow.set_experiment(f"/Users/{notebook_context.userName().get()}/default")



## Helper Functions


In [0]:
def checkpoint_dataset(dataset, file_path, section="2", number="2"):
    """Write ``dataset`` as a Parquet checkpoint under the group's DBFS folder.

    The target is ``dbfs:/student-groups/Group_<section>_<number>/<file_path>.parquet``
    and is overwritten if it already exists.

    Args:
        dataset: Object exposing the Spark ``DataFrame.write`` API.
        file_path: Location relative to the group folder, without the
            ``.parquet`` suffix. May contain sub-directories; leading and
            trailing slashes are ignored.
        section: Section identifier used in the group folder name.
        number: Group number used in the group folder name.

    Returns:
        The full ``dbfs:/`` path the dataset was written to.
    """
    base_folder = f"dbfs:/student-groups/Group_{section}_{number}"
    # Strip stray slashes so the joined path never contains "//" or "/.parquet".
    relative_path = file_path.strip("/")
    full_path = f"{base_folder}/{relative_path}.parquet"

    # mkdirs also creates missing parent directories, so a single call on the
    # immediate parent covers both the base folder and any sub-folders.
    parent_folder = full_path.rsplit("/", 1)[0]
    dbutils.fs.mkdirs(parent_folder)

    # Save dataset as a parquet file
    dataset.write.mode("overwrite").parquet(full_path)
    print(f"Checkpointed {file_path}")
    return full_path

## Datasets

In [0]:
# Load the feature-engineered, custom-joined train / validation / test splits.
month_or_year = "1_year_custom_joined"
splits_dir = f"dbfs:/student-groups/Group_2_2/{month_or_year}/feature_eng/training_splits"
train_df, validation_df, test_df = (
    spark.read.parquet(f"{splits_dir}/{split_name}.parquet")
    for split_name in ("train", "validation", "test")
)

# Train + validation rows combined, keeping only rows whose CANCELLED value is not 1.
df = train_df.unionByName(validation_df).filter(F.col("CANCELLED") != 1)
print(df.count())
display(df.limit(10))

In [0]:
# Columns used by the baseline models, grouped by where they come from.
# The group order matters: concatenation must reproduce the original column order.
baselines_columns = (
    # Calendar fields
    ["QUARTER", "MONTH", "YEAR", "DAY_OF_MONTH", "DAY_OF_WEEK"]
    # Flight identifiers / schedule ("TAIL_NUM" is deliberately left out)
    + [
        "OP_CARRIER",
        "ORIGIN_AIRPORT_SEQ_ID",
        "DEST_AIRPORT_SEQ_ID",
        "CRS_ELAPSED_TIME",
        "DISTANCE",
        "DEP_DELAY_NEW",
        "utc_timestamp",
    ]
    # Engineered features
    + [
        "CRS_DEP_MINUTES",
        "prev_flight_delay_in_minutes",
        "prev_flight_delay",
        "origin_delays_4h",
        "delay_origin_7d",
        "delay_origin_carrier_7d",
        "delay_route_7d",
        "flight_count_24h",
        "LANDING_TIME_DIFF_MINUTES",
        "AVG_ARR_DELAY_ORIGIN",
        "AVG_TAXI_OUT_ORIGIN",
    ]
    # Hourly weather observations
    + [
        "HourlyDryBulbTemperature",
        "HourlyDewPointTemperature",
        "HourlyRelativeHumidity",
        "HourlyAltimeterSetting",
        "HourlyVisibility",
        "HourlyStationPressure",
        "HourlyWetBulbTemperature",
        "HourlyPrecipitation",
        "HourlyCloudCoverage",
        "HourlyCloudElevation",
        "HourlyWindSpeed",
    ]
)

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor

# Categorical encoding
carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="carrier_idx", handleInvalid="keep")
origin_indexer = StringIndexer(inputCol="ORIGIN_AIRPORT_SEQ_ID", outputCol="origin_idx", handleInvalid="keep")
dest_indexer = StringIndexer(inputCol="DEST_AIRPORT_SEQ_ID", outputCol="dest_idx", handleInvalid="keep")
tail_num_indexer = StringIndexer(inputCol="TAIL_NUM", outputCol="tail_num_idx", handleInvalid="keep")

carrier_encoder = OneHotEncoder(inputCol="carrier_idx", outputCol="carrier_vec")
origin_encoder = OneHotEncoder(inputCol="origin_idx", outputCol="origin_vec")
dest_encoder = OneHotEncoder(inputCol="dest_idx", outputCol="dest_vec")
tail_num_encoder = OneHotEncoder(inputCol="tail_num_idx", outputCol="tail_num_vec")



In [0]:
# Assemble every model input into a single "features" vector column.
# Groups are concatenated in the order the columns must appear in the vector.
assembler = VectorAssembler(
    inputCols=(
        # Calendar fields
        ["QUARTER", "MONTH", "YEAR", "DAY_OF_MONTH", "DAY_OF_WEEK"]
        # One-hot encoded categoricals ("tail_num_vec" is deliberately left out)
        + ["carrier_vec", "origin_vec", "dest_vec"]
        # Schedule fields
        + ["CRS_ELAPSED_TIME", "DISTANCE"]
        # Engineered features
        + [
            "CRS_DEP_MINUTES",
            "prev_flight_delay_in_minutes",
            "prev_flight_delay",
            "origin_delays_4h",
            "delay_origin_7d",
            "delay_origin_carrier_7d",
            "delay_route_7d",
            "flight_count_24h",
            "LANDING_TIME_DIFF_MINUTES",
            "AVG_ARR_DELAY_ORIGIN",
            "AVG_TAXI_OUT_ORIGIN",
        ]
        # Hourly weather observations
        + [
            "HourlyDryBulbTemperature",
            "HourlyDewPointTemperature",
            "HourlyRelativeHumidity",
            "HourlyAltimeterSetting",
            "HourlyVisibility",
            "HourlyStationPressure",
            "HourlyWetBulbTemperature",
            "HourlyPrecipitation",
            "HourlyCloudCoverage",
            "HourlyCloudElevation",
            "HourlyWindSpeed",
        ]
    ),
    outputCol="features",
)

In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from xgboost.spark import SparkXGBRegressor
from pyspark.ml import Pipeline
from mlflow.models import infer_signature
MODEL_NAME = "XGB_TUNED_CVD_FINAL"

# --- Model Estimators ---
# Index and one-hot encode the categorical columns, then assemble the feature vector.
# The TAIL_NUM indexer/encoder are not part of this list: the assembler does not
# consume "tail_num_vec", so fitting them would only add work to every fit/transform.
preprocessing_stages = [
    carrier_indexer, origin_indexer, dest_indexer,
    carrier_encoder, origin_encoder, dest_encoder,
    assembler
]

# A. XGBoost Regressor
xgb = SparkXGBRegressor(
    features_col="features",
    label_col="DEP_DELAY_NEW",
    num_workers=2,
    max_depth=6,
    n_estimators=100,
    learning_rate=0.1,
    objective="reg:squarederror"
)

# Create pipeline: shared preprocessing followed by the regressor.
pipeline = Pipeline(stages=preprocessing_stages + [xgb])

# NOTE(review): the model is fit on train_df, which has not had the
# CANCELLED != 1 filter applied (only `df` has) — confirm this is intended.
with mlflow.start_run(run_name="Baseline - XGBoost"):
    model = pipeline.fit(train_df)
    training_predictions = model.transform(train_df)
    test_predictions = model.transform(test_df)

    # Define DBFS paths
    train_pred_path = f"dbfs:/student-groups/Group_2_2/{month_or_year}/train_predictions"
    test_pred_path = f"dbfs:/student-groups/Group_2_2/{month_or_year}/test_predictions"

    # Save label/prediction pairs as Parquet to DBFS
    training_predictions.select("DEP_DELAY_NEW", "prediction").write.mode("overwrite").parquet(train_pred_path)
    test_predictions.select("DEP_DELAY_NEW", "prediction").write.mode("overwrite").parquet(test_pred_path)

    # Log artifacts. mlflow.log_artifacts takes a *local* directory, so the
    # "dbfs:/" URIs are translated to their "/dbfs/" local-mount equivalents.
    # NOTE(review): assumes the /dbfs mount is available on this cluster — confirm.
    mlflow.log_artifacts(train_pred_path.replace("dbfs:/", "/dbfs/", 1), "train_predictions")
    mlflow.log_artifacts(test_pred_path.replace("dbfs:/", "/dbfs/", 1), "test_predictions")

    mae_evaluator = RegressionEvaluator(
        labelCol="DEP_DELAY_NEW",
        predictionCol="prediction",
        metricName="mae"
    )

    rmse_evaluator = RegressionEvaluator(
        labelCol="DEP_DELAY_NEW",
        predictionCol="prediction",
        metricName="rmse"
    )

    # Calculate MAE
    mae_train = mae_evaluator.evaluate(training_predictions)
    mae_test = mae_evaluator.evaluate(test_predictions)
    # Calculate RMSE
    rmse_train = rmse_evaluator.evaluate(training_predictions)
    rmse_test = rmse_evaluator.evaluate(test_predictions)

    # Only "prediction" is the model's output; the transformed frame also carries
    # the input columns plus intermediate index/vector columns.
    signature = infer_signature(df, training_predictions.select("prediction"))

    mlflow.spark.log_model(
        model,
        MODEL_NAME,
        input_example=df.limit(1).toPandas(),
        signature=signature,
        registered_model_name="flight_delay_prediction_baseline"
    )

    # These metrics are computed on test_df, so they are logged under test_*
    # names rather than validation_*.
    mlflow.log_metric("train_mae", mae_train)
    mlflow.log_metric("test_mae", mae_test)
    mlflow.log_metric("train_rmse", rmse_train)
    mlflow.log_metric("test_rmse", rmse_test)


## Create performance summary table


In [0]:
import mlflow

# Address the logged pipeline by MLflow run ID and artifact path.
model_uri = "runs:/{run_id}/{artifact_path}".format(
    run_id="1f03617a78bb42efb5177d7927b819f5",
    artifact_path="XGB_TUNED_CVD_FINAL",
)

# Load through the Spark flavor so the result exposes .transform() on Spark DataFrames.
loaded_model = mlflow.spark.load_model(model_uri)



In [0]:
# One evaluator per metric; both compare "prediction" against the DEP_DELAY_NEW label.
mae_evaluator, rmse_evaluator = (
    RegressionEvaluator(
        labelCol="DEP_DELAY_NEW",
        predictionCol="prediction",
        metricName=metric_name,
    )
    for metric_name in ("mae", "rmse")
)

# Score every split with the loaded pipeline (train, then test, then validation).
train_predictions, test_predictions, validation_predictions = (
    loaded_model.transform(split_df)
    for split_df in (train_df, test_df, validation_df)
)


In [0]:
# MAE then RMSE for each split, evaluated in train -> test -> validation order.
train_mae, train_rmse = (
    evaluator.evaluate(train_predictions) for evaluator in (mae_evaluator, rmse_evaluator)
)
test_mae, test_rmse = (
    evaluator.evaluate(test_predictions) for evaluator in (mae_evaluator, rmse_evaluator)
)
validation_mae, validation_rmse = (
    evaluator.evaluate(validation_predictions) for evaluator in (mae_evaluator, rmse_evaluator)
)

In [0]:
import pandas as pd

# Build the column-oriented metrics dictionary by transposing one row per split.
metrics_dict = {
    column: list(values)
    for column, values in zip(
        ('Dataset', 'MAE', 'RMSE'),
        zip(
            ('Train', train_mae, train_rmse),
            ('Test', test_mae, test_rmse),
            ('Validation', validation_mae, validation_rmse),
        ),
    )
}

# Convert to a pandas DataFrame for rich display.
metrics_df = pd.DataFrame(metrics_dict)

# Display
metrics_df