# NN MLP model baseline - 3 month

## Imports

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from xgboost.spark import SparkXGBRegressor

from mlflow.models import infer_signature


import random
import numpy as np
import pandas as pd

import mlflow
print(mlflow.__version__)

import os

# Turn on Databricks' automatic MLflow tracking for MLlib.
spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

RANDOM_SEED = 0

# Shared workspace path of the MLflow experiment used by this notebook.
EXPERIMENT_NAME = "/Shared/team_2_2/mlflow-nn-classifier"

# Select the shared experiment, creating it on first use. Any failure is
# reported and the notebook falls back to the current user's default experiment.
try:
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is not None:
        print(f"Using existing experiment: {experiment.name}")
    else:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Created new experiment with ID: {experiment_id}")
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as e:
    print(f"Error with experiment setup: {e}")
    # Resolve the notebook user's name from the Databricks notebook context.
    current_user = (
        dbutils.notebook.entry_point.getDbutils()
        .notebook()
        .getContext()
        .userName()
        .get()
    )
    mlflow.set_experiment(f"/Users/{current_user}/default")



## Helper Functions


In [0]:
def checkpoint_dataset(dataset, file_path, section="2", number="2"):
    """
    Save ``dataset`` as a parquet checkpoint under the group's DBFS folder.

    The file is written to
    ``dbfs:/student-groups/Group_{section}_{number}/{file_path}.parquet``,
    overwriting any existing checkpoint at that location.

    Args:
        dataset: Object exposing the Spark writer API
            (``dataset.write.mode(...).parquet(...)``).
        file_path: Checkpoint path relative to the group folder, without the
            ``.parquet`` suffix. May contain ``/`` to target a subfolder.
        section: Section part of the group folder name (default ``"2"``).
        number: Group-number part of the group folder name (default ``"2"``).
    """
    base_folder = f"dbfs:/student-groups/Group_{section}_{number}"
    full_path = f"{base_folder}/{file_path}.parquet"
    # Everything before the final "/" is the folder that will hold the file.
    parent_folder = full_path.rsplit("/", 1)[0]

    # Make sure the group folder and any nested subfolder exist.
    dbutils.fs.mkdirs(base_folder)
    dbutils.fs.mkdirs(parent_folder)

    dataset.write.mode("overwrite").parquet(full_path)
    print(f"Checkpointed {file_path}")

## Datasets - custom join
- get checkpoint data
  - 3 month combined join, with feature engineering

In [0]:
# Browse the team's DBFS folders to confirm which checkpoints exist.
# Notebook-magic equivalent for a single folder:
# %fs ls dbfs:/student-groups/Group_2_2/1_year_custom_joined/feature_eng
_folders_to_list = [
    "dbfs:/student-groups/Group_2_2/",
    "dbfs:/student-groups/Group_2_2/3_month_custom_joined/",  # feature_eng and cv_splits
    "dbfs:/student-groups/Group_2_2/3_month_custom_joined/fe_graph_and_holiday/cv_splits/",
    "dbfs:/student-groups/Group_2_2/3_month_custom_joined/fe_graph_and_holiday/training_splits/",
    "dbfs:/student-groups/Group_2_2/models/",
]

# One listing per folder, rendered in the order above.
for _folder in _folders_to_list:
    display(dbutils.fs.ls(_folder))

# Feature Selection

In [0]:
# Columns kept for the baseline models, grouped by where they come from.

# Calendar, carrier, route and schedule fields.
# ("TAIL_NUM" is intentionally left out.)
_flight_columns = [
    "QUARTER", "MONTH", "YEAR", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "OP_CARRIER",
    "ORIGIN_AIRPORT_SEQ_ID", "DEST_AIRPORT_SEQ_ID",
    "CRS_ELAPSED_TIME", "DISTANCE",
]

# Departure-delay columns plus the UTC timestamp.
_delay_and_time_columns = ["DEP_DELAY_NEW", "DEP_DEL15", "utc_timestamp"]

# Feature-engineering outputs.
_engineered_columns = [
    "CRS_DEP_MINUTES",
    "prev_flight_delay_in_minutes", "prev_flight_delay",
    "origin_delays_4h",
    "delay_origin_7d", "delay_origin_carrier_7d", "delay_route_7d",
    "flight_count_24h",
    "LANDING_TIME_DIFF_MINUTES",
    "AVG_ARR_DELAY_ORIGIN", "AVG_TAXI_OUT_ORIGIN",
]

# Hourly weather observations.
_weather_columns = [
    "HourlyDryBulbTemperature", "HourlyDewPointTemperature",
    "HourlyRelativeHumidity", "HourlyAltimeterSetting",
    "HourlyVisibility", "HourlyStationPressure",
    "HourlyWetBulbTemperature", "HourlyPrecipitation",
    "HourlyCloudCoverage", "HourlyCloudElevation",
    "HourlyWindSpeed",
]

baselines_columns = (
    _flight_columns
    + _delay_and_time_columns
    + _engineered_columns
    + _weather_columns
)

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor

# Categorical encoding
carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="carrier_idx", handleInvalid="keep")
origin_indexer = StringIndexer(inputCol="ORIGIN_AIRPORT_SEQ_ID", outputCol="origin_idx", handleInvalid="keep")
dest_indexer = StringIndexer(inputCol="DEST_AIRPORT_SEQ_ID", outputCol="dest_idx", handleInvalid="keep")
tail_num_indexer = StringIndexer(inputCol="TAIL_NUM", outputCol="tail_num_idx", handleInvalid="keep")

carrier_encoder = OneHotEncoder(inputCol="carrier_idx", outputCol="carrier_vec")
origin_encoder = OneHotEncoder(inputCol="origin_idx", outputCol="origin_vec")
dest_encoder = OneHotEncoder(inputCol="dest_idx", outputCol="dest_vec")
tail_num_encoder = OneHotEncoder(inputCol="tail_num_idx", outputCol="tail_num_vec")



In [0]:
# Assemble every model input into a single "features" vector column.
# The groups below are concatenated in order, so the layout of the
# assembled vector follows this listing.
_calendar_inputs = ["QUARTER", "MONTH", "YEAR", "DAY_OF_MONTH", "DAY_OF_WEEK"]

# One-hot vectors produced by the categorical encoders.
# ("tail_num_vec" is intentionally not included.)
_encoded_inputs = ["carrier_vec", "origin_vec", "dest_vec"]

_schedule_inputs = ["CRS_ELAPSED_TIME", "DISTANCE"]

# Feature-engineering outputs.
_engineered_inputs = [
    "CRS_DEP_MINUTES",
    "prev_flight_delay_in_minutes",
    "prev_flight_delay",
    "origin_delays_4h",
    "delay_origin_7d",
    "delay_origin_carrier_7d",
    "delay_route_7d",
    "flight_count_24h",
    "LANDING_TIME_DIFF_MINUTES",
    "AVG_ARR_DELAY_ORIGIN",
    "AVG_TAXI_OUT_ORIGIN",
]

# Hourly weather observations.
_weather_inputs = [
    "HourlyDryBulbTemperature",
    "HourlyDewPointTemperature",
    "HourlyRelativeHumidity",
    "HourlyAltimeterSetting",
    "HourlyVisibility",
    "HourlyStationPressure",
    "HourlyWetBulbTemperature",
    "HourlyPrecipitation",
    "HourlyCloudCoverage",
    "HourlyCloudElevation",
    "HourlyWindSpeed",
]

assembler = VectorAssembler(
    inputCols=(
        _calendar_inputs
        + _encoded_inputs
        + _schedule_inputs
        + _engineered_inputs
        + _weather_inputs
    ),
    outputCol="features",
)

# Training with Best Hyperparameters

In [0]:
# --- Shared preprocessing stages ---
# Index -> one-hot encode -> assemble. The estimator (XGBoost or MLP) is
# created in the individual training cells and appended to these stages there.
preprocessing_stages = [
    carrier_indexer, origin_indexer, dest_indexer,
    carrier_encoder, origin_encoder, dest_encoder,
    assembler,
]


# --- Evaluators ---
def _delay_minutes_evaluator(metric_name):
    """Regression evaluator scoring "prediction" against DEP_DELAY_NEW."""
    return RegressionEvaluator(
        labelCol="DEP_DELAY_NEW",
        predictionCol="prediction",
        metricName=metric_name,
    )


def _delay_flag_evaluator(metric_name):
    """Multiclass evaluator scoring "prediction" against DEP_DEL15."""
    return MulticlassClassificationEvaluator(
        labelCol="DEP_DEL15",
        predictionCol="prediction",
        metricName=metric_name,
    )


# Regression metrics.
rmse_evaluator = _delay_minutes_evaluator("rmse")
mae_evaluator = _delay_minutes_evaluator("mae")

# Classification metrics.
precision_evaluator = _delay_flag_evaluator("weightedPrecision")
recall_evaluator = _delay_flag_evaluator("weightedRecall")
f1_evaluator = _delay_flag_evaluator("f1")
acc_evaluator = _delay_flag_evaluator("accuracy")

# F-beta with beta=2: once weighted across labels, once for label 1 only.
f2_evaluator = _delay_flag_evaluator("weightedFMeasure").setBeta(2.0)
f2_evaluator_label = (
    _delay_flag_evaluator("fMeasureByLabel").setMetricLabel(1).setBeta(2.0)
)

# Area under the ROC curve, computed from the raw prediction column.
auc_evaluator = BinaryClassificationEvaluator(
    labelCol="DEP_DEL15",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

In [0]:
def read_specific_fold(path: str, fold_id: int, split_type: str):
    """
    Read one fold/split from the partitioned cross-validation parquet data.

    The partition directory ``{path}/fold_id=<fold_id>/split_type=<split_type>``
    is read directly. If that read raises, the whole dataset at ``path`` is
    loaded instead and filtered on its ``fold_id`` and ``split_type`` columns.

    Args:
        path: Root folder of the partitioned parquet dataset.
        fold_id: Fold number to load.
        split_type: Split to load (the notebook uses "train" and "validation").

    Returns:
        The result of ``spark.read.parquet`` for the partition, or the
        filtered full dataset when the fallback is used.

    NOTE(review): the direct partition read presumably omits the
    ``fold_id`` / ``split_type`` partition columns, while the fallback keeps
    them - confirm before relying on either column downstream.
    """
    fold_path = f"{path}/fold_id={fold_id}/split_type={split_type}"

    try:
        # Try direct partition read
        return spark.read.parquet(fold_path)
    except Exception as exc:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit still stop the notebook instead of
        # silently starting a full-dataset scan.
        print(f"Direct read failed for fold {fold_id}, using filter method...")
        print(f"  reason: {type(exc).__name__}: {exc}")
        all_data = spark.read.parquet(path)
        return all_data.filter(
            (all_data.fold_id == fold_id) &
            (all_data.split_type == split_type)
        )

In [0]:
# Sanity check: row counts per DEP_DEL15 value in the fold-1 validation split.
month_or_year = "3_month_custom_joined"
cv_path = f"dbfs:/student-groups/Group_2_2/{month_or_year}/fe_graph_and_holiday/cv_splits"

# Not cached: the frame is used for a single aggregation, and `val_fold` is
# rebound by the training cells, so a cache created here would never be
# released through this name.
val_fold = read_specific_fold(cv_path, 1, "validation")
display(val_fold.groupby("DEP_DEL15").agg(F.count("*")))


In [0]:
# --- Training Loop for XGBoost with best hyperparams ---
# Fits the preprocessing + XGBoost pipeline on each CV fold, logs MAE/RMSE
# for the train and validation splits of every fold in a nested MLflow run,
# then logs the mean and standard deviation across folds on the parent run.
n_folds = 5
month_or_year = "3_month_custom_joined"
cv_path = f"dbfs:/student-groups/Group_2_2/{month_or_year}/fe_graph_and_holiday/cv_splits"

# Single source of truth for the tuned hyperparameters: the same dict
# configures the model and is logged to MLflow, so the two cannot drift apart.
xgb_params = {"max_depth": 6, "n_estimators": 100, "learning_rate": 0.05}

xgb = SparkXGBRegressor(
    features_col="features",
    label_col="DEP_DELAY_NEW",
    num_workers=12,
    tree_method="hist",
    **xgb_params,
)

with mlflow.start_run(run_name="XGB_BEST_HP_3_MNTH_CV") as run:

    # 1. Log Parameters
    mlflow.log_param("model", "XGBoost")
    mlflow.log_params(xgb_params)
    mlflow.log_param("dataset", month_or_year)

    # 2. Create Pipeline
    pipeline = Pipeline(stages=preprocessing_stages + [xgb])

    fold_metrics = {'train_mae': [], 'val_mae': [], 'train_rmse': [], 'val_rmse': []}

    # 3. CV Loop
    for fold_id in range(1, n_folds + 1):
        with mlflow.start_run(run_name=f"fold_{fold_id}", nested=True):
            print(f"  Processing Fold {fold_id}/{n_folds}...")

            # Load Data (the debugging display of the full validation frame
            # was removed: it triggered an extra job and output per fold).
            train_fold = read_specific_fold(cv_path, fold_id, "train").cache()
            val_fold = read_specific_fold(cv_path, fold_id, "validation").cache()

            try:
                # Fit & Predict
                model = pipeline.fit(train_fold)
                train_preds = model.transform(train_fold)
                val_preds = model.transform(val_fold)

                # Evaluate
                metrics = {
                    "train_mae": mae_evaluator.evaluate(train_preds),
                    "val_mae": mae_evaluator.evaluate(val_preds),
                    "train_rmse": rmse_evaluator.evaluate(train_preds),
                    "val_rmse": rmse_evaluator.evaluate(val_preds)
                }

                # Log & Print
                mlflow.log_metrics(metrics)
                print(f"    Fold {fold_id}: Val MAE={metrics['val_mae']:.4f}, Val RMSE={metrics['val_rmse']:.4f}")

                for k, v in metrics.items():
                    fold_metrics[k].append(v)
            finally:
                # Release the cached folds even when fitting or evaluation fails.
                train_fold.unpersist()
                val_fold.unpersist()

    # 4. Log Aggregates
    avg_metrics = {f"avg_{k}": np.mean(v) for k, v in fold_metrics.items()}
    std_metrics = {f"std_{k}": np.std(v) for k, v in fold_metrics.items()}

    mlflow.log_metrics({**avg_metrics, **std_metrics})

    print("\n" + "="*50)
    print(f"Average Val MAE:  {avg_metrics['avg_val_mae']:.4f} (+/- {std_metrics['std_val_mae']:.4f})")
    print(f"Average Val RMSE: {avg_metrics['avg_val_rmse']:.4f} (+/- {std_metrics['std_val_rmse']:.4f})")
    print("="*50)

In [0]:
# --- Training Loop for basic MLP ---
# For every CV fold: fit the feature pipeline on the training split, size the
# MLP input layer from the assembled vector, train a one-hidden-layer MLP on
# DEP_DEL15, and log train/validation classification metrics in a nested
# MLflow run. Means and standard deviations across folds go to the parent run.
n_folds = 5
month_or_year = "3_month_custom_joined"
cv_path = f"dbfs:/student-groups/Group_2_2/{month_or_year}/fe_graph_and_holiday/cv_splits"

preprocessing_pipeline = Pipeline(stages=preprocessing_stages)

# Evaluators applied to both splits of every fold, keyed by metric suffix.
mlp_evaluators = {
    "f2": f2_evaluator,
    "f2_label": f2_evaluator_label,
    "f1": f1_evaluator,
    "precision": precision_evaluator,
    "recall": recall_evaluator,
    "auc": auc_evaluator,
    "acc": acc_evaluator,
}

#with mlflow.start_run(run_name="MLP_BASELINE_3_MNTH_CV_64") as run:
with mlflow.start_run(run_name="DANIEL_TEST2_MLP_BASELINE_3_MNTH_CV_64") as run:

    mlflow.log_param("model", "MLP Classifier")
    mlflow.log_param("dataset", month_or_year)
    mlflow.log_param("folds", n_folds)
    mlflow.log_param("seed", RANDOM_SEED)

    # One list per "<split>_<metric>" key, filled fold by fold.
    fold_metrics = {
        f"{split}_{name}": []
        for name in mlp_evaluators
        for split in ("train", "val")
    }

    for fold_id in range(1, n_folds + 1):
        with mlflow.start_run(run_name=f"fold_{fold_id}", nested=True):
            print(f"  Processing Fold {fold_id}/{n_folds}...")

            # 1. Load Data
            train_fold = read_specific_fold(cv_path, fold_id, "train").cache()
            val_fold = read_specific_fold(cv_path, fold_id, "validation").cache()

            try:
                # 2. Fit Feature Engineering Pipeline FIRST
                # We need to do this to know the input vector size for the MLP layers
                feat_model = preprocessing_pipeline.fit(train_fold)

                train_vec = feat_model.transform(train_fold)
                val_vec = feat_model.transform(val_fold)

                # 3. Calculate Input Dimension dynamically
                # Grab one row to check the length of the 'features' vector
                input_dim = len(train_vec.select("features").first()[0])
                print(f"    Detected Input Dimension: {input_dim}")

                # 4. Define MLP Architecture
                # Layers: [Input, Hidden, Output]
                layers = [input_dim, 64, 2]

                # seed is set explicitly so runs are repeatable; RANDOM_SEED
                # was previously defined but never passed to the model.
                # NOTE(review): stepSize presumably only affects solver="gd";
                # no solver is set here, so it may have no effect - confirm.
                mlp = MultilayerPerceptronClassifier(
                    featuresCol="features",
                    labelCol="DEP_DEL15",
                    maxIter=100,
                    layers=layers,
                    blockSize=128,
                    stepSize=0.03,
                    seed=RANDOM_SEED
                )

                # 5. Train MLP
                mlp_model = mlp.fit(train_vec)

                # 6. Predict
                train_preds = mlp_model.transform(train_vec)
                val_preds = mlp_model.transform(val_vec)

                # 7. Evaluate every metric on both splits
                metrics = {}
                for name, evaluator in mlp_evaluators.items():
                    metrics[f"train_{name}"] = evaluator.evaluate(train_preds)
                    metrics[f"val_{name}"] = evaluator.evaluate(val_preds)

                # Log & Store
                mlflow.log_metrics(metrics)
                mlflow.log_param("input_dim", input_dim)
                mlflow.log_param("layers", str(layers))

                print(f"    Fold {fold_id}: Val AUC={metrics['val_auc']:.4f}, Val Acc={metrics['val_acc']:.4f}, Val Precision={metrics['val_precision']:.4f}, Val Recall={metrics['val_recall']:.4f}, Val F1={metrics['val_f1']:.4f}, Val F2={metrics['val_f2']:.4f}, Val F2_label={metrics['val_f2_label']:.4f}")

                for k, v in metrics.items():
                    fold_metrics[k].append(v)
            finally:
                # Release the cached folds even when training or evaluation fails.
                train_fold.unpersist()
                val_fold.unpersist()

    # 8. Log Averages
    avg_metrics = {f"avg_{k}": np.mean(v) for k, v in fold_metrics.items()}
    std_metrics = {f"std_{k}": np.std(v) for k, v in fold_metrics.items()}

    mlflow.log_metrics({**avg_metrics, **std_metrics})

    print("\n" + "="*50)
    print(f"Average Val AUC:      {avg_metrics['avg_val_auc']:.4f} (+/- {std_metrics['std_val_auc']:.4f})")
    print(f"Average Val Accuracy: {avg_metrics['avg_val_acc']:.4f} (+/- {std_metrics['std_val_acc']:.4f})")
    print(f"Average Val Precision:  {avg_metrics['avg_val_precision']:.4f} (+/- {std_metrics['std_val_precision']:.4f})")
    print(f"Average Val Recall:   {avg_metrics['avg_val_recall']:.4f} (+/- {std_metrics['std_val_recall']:.4f})")
    print(f"Average Val F1:       {avg_metrics['avg_val_f1']:.4f} (+/- {std_metrics['std_val_f1']:.4f})")
    print(f"Average Val F2:       {avg_metrics['avg_val_f2']:.4f} (+/- {std_metrics['std_val_f2']:.4f})")
    print(f"Average Val F2_label: {avg_metrics['avg_val_f2_label']:.4f} (+/- {std_metrics['std_val_f2_label']:.4f})")
    print("="*50)


# stacked approach
- build XGBoost model and do hyperparameter tuning to find the best hyperparams
- generate the XGBoost regression delay field and output it using the held out data
- use the XGBoost delay field as the input for the NN/MLP model while doing hyperparameter tuning

In [0]:
from pyspark.ml.tuning import ParamGridBuilder
import itertools

# --- 1. Define the Parameter Grid ---
# Each hyperparameter maps to the list of candidate values to try.
grid_search_params = {
    "max_depth": [6], # [4, 6]
    "n_estimators": [50, 100], # [20, 50, 100]
    "learning_rate": [0.05, 0.1]
}

# Expand the grid into one dict per combination (Cartesian product of the
# candidate lists, in the order the parameters are declared above).
keys = tuple(grid_search_params.keys())
values = tuple(grid_search_params.values())

param_combinations = []
for combo in itertools.product(*values):
    param_combinations.append(dict(zip(keys, combo)))

print(f"Total Parameter Combinations to Test: {len(param_combinations)}")

# --- 2. Run Tuning Loop ---
# For every parameter combination: fit the preprocessing + XGBoost pipeline
# on each CV fold, score validation MAE, and record the mean across folds in
# a nested MLflow run and in `results_list`.
n_folds = 5
month_or_year = "3_month_custom_joined"
cv_path = f"dbfs:/student-groups/Group_2_2/{month_or_year}/fe_graph_and_holiday/cv_splits"

# Store results to find the winner later
results_list = []

print(f"Starting Hyperparameter Tuning on {month_or_year}...")

with mlflow.start_run(run_name="XGB_GRID_SEARCH_3_MNTH") as parent_run:
    mlflow.log_param("n_combinations", len(param_combinations))

    for idx, params in enumerate(param_combinations):
        param_str = f"depth{params['max_depth']}_est{params['n_estimators']}_lr{params['learning_rate']}"
        print(f"\n--- Testing Combo {idx+1}/{len(param_combinations)}: {param_str} ---")

        # Define Model with CURRENT params
        current_xgb = SparkXGBRegressor(
            features_col="features",
            label_col="DEP_DELAY_NEW",
            num_workers=6,
            tree_method="hist",
            max_depth=params['max_depth'],
            n_estimators=params['n_estimators'],
            learning_rate=params['learning_rate']
        )

        # Create pipeline
        pipeline = Pipeline(stages=preprocessing_stages + [current_xgb])

        fold_maes = []

        with mlflow.start_run(run_name=f"combo_{idx}_{param_str}", nested=True) as child_run:
            # Log Params
            mlflow.log_params(params)

            # CV Loop for this combo
            for fold_id in range(1, n_folds + 1):
                # Load Data
                train_fold = read_specific_fold(cv_path, fold_id, "train").cache()
                val_fold = read_specific_fold(cv_path, fold_id, "validation").cache()

                try:
                    # Fit & Predict
                    model = pipeline.fit(train_fold)
                    val_preds = model.transform(val_fold)

                    # Metric
                    mae = mae_evaluator.evaluate(val_preds)
                    fold_maes.append(mae)
                    # Keep the per-fold value (step = fold number) so the
                    # spread across folds is visible, not just the mean.
                    mlflow.log_metric("val_mae", mae, step=fold_id)
                finally:
                    # Release the cached folds even when this combination
                    # fails, so a failed sweep does not pin executor memory.
                    train_fold.unpersist()
                    val_fold.unpersist()

            # Aggregate Results
            avg_mae = np.mean(fold_maes)
            mlflow.log_metric("avg_val_mae", avg_mae)
            mlflow.log_metric("std_val_mae", np.std(fold_maes))

            print(f"  Combo {idx+1} Result: Avg MAE = {avg_mae:.4f}")

            # Add to results list
            results_list.append({
                **params,
                "avg_val_mae": avg_mae
            })

# --- 3. Identify Best Parameters ---
# Tabulate the sweep and take the row with the lowest mean validation MAE.
results_df = pd.DataFrame(results_list)
_best_index = results_df['avg_val_mae'].idxmin()
best_row = results_df.loc[_best_index]

_banner = "="*50
print("\n" + _banner)
print(f"WINNER FOUND: {best_row.to_dict()}")
print(_banner)

# Winning hyperparameters as plain Python numbers, for the stacking step.
best_depth, best_estimators = int(best_row['max_depth']), int(best_row['n_estimators'])
best_lr = float(best_row['learning_rate'])

In [0]:
from pyspark.sql import DataFrame
from functools import reduce

# Build the stacked dataset: for each CV fold, fit the tuned XGBoost pipeline
# on the training split and predict the validation split. The predictions are
# stored as "xgb_predicted_delay" and the combined frame is written to parquet.
print(f"\nGenerating Stacked Features using Best Params: Depth={best_depth}, Est={best_estimators}, LR={best_lr}")

best_xgb = SparkXGBRegressor(
    features_col="features",
    label_col="DEP_DELAY_NEW",
    num_workers=6,
    tree_method="hist",
    max_depth=best_depth,
    n_estimators=best_estimators,
    learning_rate=best_lr
)

pipeline = Pipeline(stages=preprocessing_stages + [best_xgb])

out_of_fold_predictions = []

for fold_id in range(1, n_folds + 1):
    print(f"  Processing Fold {fold_id}/{n_folds} for Stacking...")

    train_fold = read_specific_fold(cv_path, fold_id, "train").cache()
    # The validation split is not cached: its predictions are only computed
    # when the stacked dataset is written below, so a cache released inside
    # this loop would never be populated.
    val_fold = read_specific_fold(cv_path, fold_id, "validation")

    try:
        model = pipeline.fit(train_fold)
    finally:
        # The training split is only needed for fitting; release it even if
        # the fit fails.
        train_fold.unpersist()

    val_preds = model.transform(val_fold)

    val_preds_clean = val_preds \
        .withColumnRenamed("prediction", "xgb_predicted_delay") \
        .withColumn("fold_id", F.lit(fold_id))  # <--- Force add fold_id
    out_of_fold_predictions.append(val_preds_clean)

# Combine by column name rather than position, so a fold whose columns come
# back in a different order cannot be silently misaligned.
stacked_dataset = reduce(DataFrame.unionByName, out_of_fold_predictions)

output_path = f"dbfs:/student-groups/Group_2_2/{month_or_year}/fe_graph_and_holiday/stacked_input_optimized"
# Drop the intermediate pipeline columns; the MLP step rebuilds them.
cols_to_drop = ["features", "carrier_vec", "origin_vec", "dest_vec", "carrier_idx", "origin_idx", "dest_idx"]
stacked_dataset.drop(*cols_to_drop).write.mode("overwrite").parquet(output_path)

print(f"SUCCESS: Optimized stacked dataset saved to: {output_path}")

In [0]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql import functions as F
import mlflow
import numpy as np
import pandas as pd

# --- 1. Load the stacked dataset ---
# The MLflow experiment and `month_or_year` are not set in this cell; it uses
# whatever earlier cells left in place.
stacked_input_path = (
    "dbfs:/student-groups/Group_2_2/"
    f"{month_or_year}/fe_graph_and_holiday/stacked_input_optimized"
)

# Read, spread over 12 partitions for parallelism, and keep in cache because
# every fold of every architecture reads it again.
_stacked_raw = spark.read.parquet(stacked_input_path)
stacked_df = _stacked_raw.repartition(12).cache()
_n_stacked_rows = stacked_df.count()
print(f"Loaded {_n_stacked_rows} rows for MLP tuning.")

# --- 2. Define Hyperparameter Grid (Hidden Layers Only) ---
# Each entry lists hidden-layer sizes only; the input and output layers are
# added around it in the tuning loop.
hidden_layer_grid = [
    [32],          # one narrow hidden layer
    [64, 32],      # two hidden layers
    [128, 64],     # two wider hidden layers
    [64, 32, 16],  # three hidden layers
    # [128, 64, 32]  # three wider hidden layers (disabled)
]

_n_architectures = len(hidden_layer_grid)
print(f"Testing {_n_architectures} different network architectures.")

# --- 3. Feature Pipeline Definition ---
# Same encoding as the earlier pipeline, rebuilt for the stacked dataset,
# with the XGBoost prediction added as an input and a scaling stage appended.
def _stacked_index_encode(input_col, prefix):
    """Build the (StringIndexer, OneHotEncoder) pair for one categorical column."""
    indexed_col = f"{prefix}_idx"
    return (
        StringIndexer(inputCol=input_col, outputCol=indexed_col, handleInvalid="keep"),
        OneHotEncoder(inputCol=indexed_col, outputCol=f"{prefix}_vec"),
    )


carrier_indexer, carrier_encoder = _stacked_index_encode("OP_CARRIER", "carrier")
origin_indexer, origin_encoder = _stacked_index_encode("ORIGIN_AIRPORT_SEQ_ID", "origin")
dest_indexer, dest_encoder = _stacked_index_encode("DEST_AIRPORT_SEQ_ID", "dest")

# Assembler inputs, in vector order.
_stacked_inputs = [
    "QUARTER", "MONTH", "YEAR", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "carrier_vec", "origin_vec", "dest_vec",
    "CRS_ELAPSED_TIME", "DISTANCE", "CRS_DEP_MINUTES",
    "prev_flight_delay_in_minutes", "prev_flight_delay", "origin_delays_4h",
    "delay_origin_7d", "delay_origin_carrier_7d", "delay_route_7d",
    "flight_count_24h", "LANDING_TIME_DIFF_MINUTES", "AVG_ARR_DELAY_ORIGIN",
    "AVG_TAXI_OUT_ORIGIN",
    "HourlyDryBulbTemperature", "HourlyDewPointTemperature",
    "HourlyRelativeHumidity", "HourlyAltimeterSetting", "HourlyVisibility",
    "HourlyStationPressure", "HourlyWetBulbTemperature", "HourlyPrecipitation",
    "HourlyCloudCoverage", "HourlyCloudElevation", "HourlyWindSpeed",
    "xgb_predicted_delay",  # stacked input: the XGBoost delay prediction
]

assembler = VectorAssembler(inputCols=_stacked_inputs, outputCol="raw_features")

# Scale to unit standard deviation only; withMean=False leaves the sparse
# one-hot vectors uncentered.
scaler = StandardScaler(
    inputCol="raw_features",
    outputCol="scaled_features",
    withStd=True,
    withMean=False,
)

# index -> encode -> assemble -> scale
preprocessing_pipeline = Pipeline(stages=[
    carrier_indexer, origin_indexer, dest_indexer,
    carrier_encoder, origin_encoder, dest_encoder,
    assembler,
    scaler,
])

# --- 4. Evaluators ---
# All evaluators score against the DEP_DEL15 label and read the default
# prediction columns.
def _dep_del15_metric(metric_name):
    """Multiclass evaluator for DEP_DEL15 reporting `metric_name`."""
    return MulticlassClassificationEvaluator(labelCol="DEP_DEL15", metricName=metric_name)


auc_evaluator = BinaryClassificationEvaluator(labelCol="DEP_DEL15", metricName="areaUnderROC")
acc_evaluator = _dep_del15_metric("accuracy")
precision_evaluator = _dep_del15_metric("weightedPrecision")
recall_evaluator = _dep_del15_metric("weightedRecall")
f1_evaluator = _dep_del15_metric("f1")
# Weighted F-measure with beta=2.
f2_evaluator = _dep_del15_metric("weightedFMeasure").setBeta(2.0)

# --- 5. Tuning Loop ---
# For each hidden-layer configuration, run cross-validation over the stacked
# dataset: rows whose fold_id differs from the current fold are used for
# training, rows with that fold_id for validation. The mean validation F2 of
# every configuration is logged and collected in `results_list`.
# NOTE(review): if the fold ids follow time order, training on all other
# folds may include data later than the validation fold - confirm this is
# acceptable for the stacked model.
n_folds = 5
results_list = []

print(f"Starting MLP Architecture Search...")

with mlflow.start_run(run_name="MLP_3M_ARCHITECTURE_SEARCH_F2") as parent_run:
    mlflow.log_param("seed", RANDOM_SEED)

    for idx, hidden_config in enumerate(hidden_layer_grid):

        config_str = "-".join(map(str, hidden_config))
        print(f"\n--- Testing Arch {idx+1}/{len(hidden_layer_grid)}: Hidden=[{config_str}] ---")

        fold_metrics = {'val_f2': []}

        with mlflow.start_run(run_name=f"arch_{config_str}", nested=True) as child_run:
            mlflow.log_param("hidden_layers", str(hidden_config))

            for fold_id in range(1, n_folds + 1):
                print(f"  Processing Fold {fold_id}/{n_folds}...")

                # A. Split Data
                train_fold = stacked_df.filter(F.col("fold_id") != fold_id)
                val_fold = stacked_df.filter(F.col("fold_id") == fold_id)

                # B. Fit Feature Pipeline (on the training rows only)
                feat_model = preprocessing_pipeline.fit(train_fold)
                train_vec = feat_model.transform(train_fold)
                val_vec = feat_model.transform(val_fold)

                # C. Dynamic Layer Construction
                # Input width is read from one assembled row.
                input_dim = len(train_vec.select("scaled_features").first()[0])
                output_dim = 2

                full_layers = [input_dim] + hidden_config + [output_dim]

                # D. Train MLP
                # A fixed seed keeps architecture comparisons repeatable;
                # RANDOM_SEED was previously defined but never passed in.
                # NOTE(review): stepSize presumably only affects solver="gd";
                # no solver is set here, so it may have no effect - confirm.
                mlp = MultilayerPerceptronClassifier(
                    featuresCol="scaled_features",
                    labelCol="DEP_DEL15",
                    layers=full_layers,
                    blockSize=128,
                    maxIter=100,
                    stepSize=0.03,
                    seed=RANDOM_SEED
                )

                mlp_model = mlp.fit(train_vec)

                # E. Evaluate
                val_preds = mlp_model.transform(val_vec)

                # Calculate F2
                f2 = f2_evaluator.evaluate(val_preds)
                fold_metrics['val_f2'].append(f2)
                # Keep the per-fold score (step = fold number), not just the mean.
                mlflow.log_metric("val_f2", f2, step=fold_id)

                print(f"    Fold {fold_id}: F2={f2:.4f}")

            # Aggregate Results
            avg_f2 = np.mean(fold_metrics['val_f2'])

            mlflow.log_metric("avg_val_f2", avg_f2)
            mlflow.log_metric("std_val_f2", np.std(fold_metrics['val_f2']))

            results_list.append({
                "hidden_config": str(hidden_config),
                "avg_val_f2": avg_f2
            })

            print(f"  Arch [{config_str}] Result: Avg F2 = {avg_f2:.4f}")

# --- 6. Select Best Architecture ---
# Tabulate the search results; the best configuration is the row with the
# highest mean validation F2.
results_df = pd.DataFrame(results_list)
_best_arch_index = results_df['avg_val_f2'].idxmax()
best_row = results_df.loc[_best_arch_index]

# Show the three highest-scoring configurations.
_ranked_archs = results_df.sort_values("avg_val_f2", ascending=False)
_rule = "="*50
print("\n" + _rule)
print("TOP 3 ARCHITECTURES (Sorted by F2):")
print(_ranked_archs.head(3))
print(_rule)

# The search is finished; release the cached stacked dataset.
stacked_df.unpersist()