## Imports

In [0]:
import os
import random

import mlflow
import numpy as np
import pandas as pd
import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql import Window
from pyspark.sql.functions import col

print(mlflow.__version__)

# Runtime switches: set the PYSPARK_PIN_THREAD environment variable and turn
# on the Databricks MLlib tracking flag in the Spark conf.
os.environ['PYSPARK_PIN_THREAD'] = 'false'
spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

RANDOM_SEED = 0

# Workspace path of the shared MLflow experiment used by this notebook.
EXPERIMENT_NAME = "/Shared/team_2_2/mlflow-baseline"

# Select the shared experiment, creating it first when it is missing.
# If anything in that sequence fails, report the error and fall back to a
# per-user "default" experiment instead of stopping the notebook.
try:
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is not None:
        print(f"Using existing experiment: {experiment.name}")
    else:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Created new experiment with ID: {experiment_id}")
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as e:
    print(f"Error with experiment setup: {e}")
    mlflow.set_experiment(f"/Users/{dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()}/default")



## Helper Functions


In [0]:
def checkpoint_dataset(dataset, file_path, section="2", number="2"):
    """Write ``dataset`` as parquet under the group's DBFS folder.

    The target is ``dbfs:/student-groups/Group_{section}_{number}/{file_path}.parquet``.
    The base folder and the parent folder of the target are created with
    ``dbutils.fs.mkdirs`` before writing, and any existing data at the target
    is overwritten.

    Args:
        dataset: Object exposing ``.write.mode(...).parquet(...)``
            (a Spark DataFrame in this notebook).
        file_path: Path relative to the group folder, without the
            ``.parquet`` suffix; may contain ``/`` to address subfolders.
        section: Section part of the group folder name (default ``"2"``).
        number: Group-number part of the group folder name (default ``"2"``).
    """
    base_folder = f"dbfs:/student-groups/Group_{section}_{number}"
    dbutils.fs.mkdirs(base_folder)

    # Everything before the last "/" is the folder that must exist.
    full_path = f"{base_folder}/{file_path}.parquet"
    subfolder = full_path.rsplit("/", 1)[0]
    dbutils.fs.mkdirs(subfolder)

    dataset.write.mode("overwrite").parquet(full_path)
    print(f"Checkpointed {file_path}")

## Datasets

In [0]:
# Load the feature-engineered train and validation splits of the custom-joined data.
month_or_year = "1_year_custom_joined"
train_df = spark.read.parquet(f"dbfs:/student-groups/Group_2_2/{month_or_year}/feature_eng/training_splits/train.parquet")
validation_df = spark.read.parquet(f"dbfs:/student-groups/Group_2_2/{month_or_year}/feature_eng/training_splits/validation.parquet")

# Stack both splits by column name and keep only rows whose CANCELLED value is not 1.
df = train_df.unionByName(validation_df).filter(F.col("CANCELLED") != 1)

print(df.count())
display(df.limit(10))

In [0]:
# TODO: XGBoostRegressor experiment placeholder. The bare name was never
# imported in this notebook, so evaluating it raised NameError on Run All.

In [0]:
# Combine the flight date with the scheduled departure time into one timestamp.
# CRS_DEP_TIME is cast to string and left-padded with "0" to four characters
# so it can be parsed with the "HHmm" pattern.
# NOTE(review): the column is named utc_timestamp, but no timezone conversion
# happens here — confirm the inputs are already in UTC.
dep_hhmm = F.lpad(F.col("CRS_DEP_TIME").cast("string"), 4, "0")
date_and_time = F.concat(F.col("FL_DATE"), F.lit(" "), dep_hhmm)

df = df.withColumn(
    "utc_timestamp",
    F.to_timestamp(date_and_time, "yyyy-MM-dd HHmm"),
)

# Create Splits for CV

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Number the distinct departure hours in chronological order.
# NOTE(review): the window has no partitionBy — confirm that ranking over the
# whole dataset in one window is acceptable for the data size.
window_spec = Window.orderBy("hour")

df_indexed = (
    df
    # Bucket each row into the hour of its timestamp.
    .withColumn("hour", F.date_trunc("hour", F.col("utc_timestamp")))
    # dense_rank gives every distinct hour a consecutive integer index.
    .withColumn("time_idx", F.dense_rank().over(window_spec))
)

df_indexed.display()

# 3-month splits config

In [0]:
# Highest hourly index present in the data; it bounds how many folds fit.
max_time_idx = df_indexed.select(F.max("time_idx")).first()[0]
print(f"  Max time index: {max_time_idx}")

# Rolling-window sizes, all expressed in time_idx units (hours).
train_size = 720      # 720 hours = 30 days of training data
gap_size = 2          # 2-hour gap between training and validation
val_size = 168        # 168 hours = 7 days of validation data
step_size = 85        # offset between fold starts (chosen by the author to target 10 folds)

# One fold spans train + gap + validation; folds start every step_size hours.
fold_window_size = train_size + gap_size + val_size
n_folds = (max_time_idx - fold_window_size) // step_size + 1
print(f"Step 2: Calculated {n_folds} folds")

# 1 Year splits config

In [0]:
# Highest hourly index present in the data; it bounds how many folds fit.
max_time_idx = df_indexed.select(F.max("time_idx")).first()[0]
print(f"  Max time index: {max_time_idx}")

# Rolling-window sizes in time_idx units (hours): the 3-month values scaled by 4.
train_size = 720*4      # 2880 hours = 120 days of training data
gap_size = 2            # 2-hour gap between training and validation
val_size = 168*4        # 672 hours = 28 days of validation data
step_size = 90*4        # 360 hours = 15 days between fold starts

# One fold spans train + gap + validation; folds start every step_size hours.
fold_window_size = train_size + gap_size + val_size
n_folds = (max_time_idx - fold_window_size) // step_size + 1
print(f"Step 2: Calculated {n_folds} folds")

In [0]:
# Build the lookup (time_idx -> fold_id, split_type) for the rolling-window CV
# and attach it to every row of df_indexed.

# With fewer than one fold the mapping would be empty; fail with a clear
# message instead of an opaque schema-inference error further down.
if n_folds < 1:
    raise ValueError(
        f"Not enough data for a single fold: max_time_idx={max_time_idx}, "
        f"fold_window_size={fold_window_size}, computed n_folds={n_folds}"
    )

fold_mapping = []

for fold_id in range(1, n_folds + 1):
    # Fold k starts (k - 1) * step_size hours after the first time index.
    fold_start = 1 + (fold_id - 1) * step_size
    train_end = fold_start + train_size
    gap_end = train_end + gap_size
    val_end = gap_end + val_size

    fold_mapping.extend((t, fold_id, "train") for t in range(fold_start, train_end))
    fold_mapping.extend((t, fold_id, "gap") for t in range(train_end, gap_end))
    fold_mapping.extend((t, fold_id, "validation") for t in range(gap_end, val_end))

# Explicit schema so column types do not depend on inference from the data.
fold_df = spark.createDataFrame(
    fold_mapping,
    schema="time_idx: bigint, fold_id: bigint, split_type: string",
)

# Inner join: rows whose time_idx belongs to no fold are dropped, and rows
# covered by several overlapping folds appear once per fold.
result = df_indexed.join(
    F.broadcast(fold_df),
    on='time_idx',
    how='inner'
    )


In [0]:
# Ask for explicit confirmation before replacing the stored CV splits.
overwrite_answer = input("Careful! About to overwrite splits. If you want to continue, type y")

if overwrite_answer == "y":
    # One directory per (fold_id, split_type) pair, replacing existing output.
    (
        result.write
        .partitionBy("fold_id", "split_type")
        .mode("overwrite")
        .parquet(f"dbfs:/student-groups/Group_2_2/{month_or_year}/cv_splits")
    )

## How to read the CV splits

In [0]:
def read_specific_fold(path: str, fold_id: int, split_type: str):
    """Load one fold/split partition directory of the CV parquet dataset.

    Reads ``{path}/fold_id={fold_id}/split_type={split_type}`` through
    ``spark.read.parquet`` and returns the result.

    Note: a function with the same name is defined again further down this
    notebook; that later definition replaces this one once its cell runs.
    """
    partition_dir = f"{path}/fold_id={fold_id}/split_type={split_type}"
    fold_data = spark.read.parquet(partition_dir)
    return fold_data


In [0]:
# Categorical encoding
carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="carrier_idx", handleInvalid="keep")
origin_indexer = StringIndexer(inputCol="ORIGIN_AIRPORT_SEQ_ID", outputCol="origin_idx", handleInvalid="keep")
dest_indexer = StringIndexer(inputCol="DEST_AIRPORT_SEQ_ID", outputCol="dest_idx", handleInvalid="keep")
tail_num_indexer = StringIndexer(inputCol="TAIL_NUM", outputCol="tail_num_idx", handleInvalid="keep")

carrier_encoder = OneHotEncoder(inputCol="carrier_idx", outputCol="carrier_vec")
origin_encoder = OneHotEncoder(inputCol="origin_idx", outputCol="origin_vec")
dest_encoder = OneHotEncoder(inputCol="dest_idx", outputCol="dest_vec")
tail_num_encoder = OneHotEncoder(inputCol="tail_num_idx", outputCol="tail_num_vec")


# Assemble all features
assembler = VectorAssembler(
    inputCols=[
        "QUARTER",
        "MONTH", 
        "YEAR",
        "DAY_OF_MONTH",
        "DAY_OF_WEEK",
        "carrier_vec",
        "origin_vec",
        "dest_vec",
        "tail_num_vec",
        "CRS_ELAPSED_TIME",
        "DISTANCE",
        'HourlyDryBulbTemperature',
        'HourlyDewPointTemperature',
        'HourlyRelativeHumidity',
        'HourlyAltimeterSetting',
        'HourlyVisibility',
        'HourlyStationPressure',
        'HourlyWetBulbTemperature',
        'HourlyPrecipitation',
        'HourlyCloudCoverage',
        'HourlyCloudElevation',
        'HourlyWindSpeed'  
    ],
    outputCol="features"
)

In [0]:
def read_specific_fold(path: str, fold_id: int, split_type: str):
    """Read one fold/split from the partitioned CV parquet data.

    First tries to read the partition directory
    ``{path}/fold_id={fold_id}/split_type={split_type}`` directly. If that
    read raises an ordinary exception, the whole dataset at ``path`` is read
    and filtered on ``fold_id`` and ``split_type`` instead.

    Args:
        path: Root folder of the partitioned CV splits.
        fold_id: Fold number to load.
        split_type: Split name inside the fold (e.g. "train", "validation").

    Returns:
        The DataFrame produced by ``spark.read.parquet`` (direct read) or by
        the filter on the full dataset (fallback).

    NOTE(review): the direct read targets the partition directory itself, so
    the ``fold_id`` / ``split_type`` columns may be absent from that result
    while the fallback keeps them — confirm callers do not rely on them.
    """
    fold_path = f"{path}/fold_id={fold_id}/split_type={split_type}"

    try:
        # Try direct partition read
        return spark.read.parquet(fold_path)
    except Exception as exc:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt and
        # SystemExit propagate so a cancelled cell actually stops.
        print(f"Direct read failed for fold {fold_id}, using filter method...")
        print(f"  Reason: {type(exc).__name__}: {exc}")
        all_data = spark.read.parquet(path)
        return all_data.filter(
            (all_data.fold_id == fold_id) &
            (all_data.split_type == split_type)
        )


# Cross-validated training of the linear-regression baseline.
def train_cv_models(
    n_folds=10,
    month_or_year="3_month_custom_joined",
    max_iter=10,
    reg_param=0.3,
    run_name="LR_CV_3_MONTH",
):
    """Fit and evaluate a linear-regression pipeline on each stored CV fold.

    For every fold the train and validation splits are read from
    ``dbfs:/student-groups/Group_2_2/{month_or_year}/cv_splits``, the pipeline
    (notebook-level indexers, encoders and assembler followed by a
    ``LinearRegression`` on ``DEP_DELAY_NEW``) is fitted on the train split,
    and MAE/RMSE are computed on both splits.

    MLflow logging: one parent run holds the parameters, the per-fold metrics
    (prefixed ``fold_{id}_``), their averages and standard deviations, and a
    ``cv_fold_results.json`` table; each fold also gets a nested child run
    with its own four metrics.

    Args:
        n_folds: Number of folds to train; fold ids run from 1 to ``n_folds``.
        month_or_year: Dataset folder name under the group directory.
        max_iter: ``maxIter`` passed to ``LinearRegression`` and logged.
        reg_param: ``regParam`` passed to ``LinearRegression`` and logged.
        run_name: Name of the parent MLflow run.

    Returns:
        Tuple ``(cv_results, cv_models)``: a list with one metrics dict per
        fold (keys ``fold``, ``train_mae``, ``val_mae``, ``train_rmse``,
        ``val_rmse``) and the list of fitted pipeline models in fold order.
    """
    cv_path = f"dbfs:/student-groups/Group_2_2/{month_or_year}/cv_splits"

    linear_reg = LinearRegression(
        featuresCol="features",
        labelCol="DEP_DELAY_NEW",
        maxIter=max_iter,
        regParam=reg_param,
    )

    pipeline = Pipeline(stages=[
        carrier_indexer, origin_indexer, dest_indexer, tail_num_indexer,
        carrier_encoder, origin_encoder, dest_encoder, tail_num_encoder,
        assembler,
        linear_reg,
    ])

    mae_evaluator = RegressionEvaluator(
        labelCol="DEP_DELAY_NEW",
        predictionCol="prediction",
        metricName="mae",
    )
    rmse_evaluator = RegressionEvaluator(
        labelCol="DEP_DELAY_NEW",
        predictionCol="prediction",
        metricName="rmse",
    )

    # Metrics are logged manually below, so Spark autologging is switched off.
    mlflow.spark.autolog(disable=True)

    cv_results = []
    cv_models = []

    with mlflow.start_run(run_name=run_name):
        # Log the same values that were passed to the estimator above.
        mlflow.log_params({
            "n_folds": n_folds,
            "maxIter": max_iter,
            "regParam": reg_param,
            "model_type": "LinearRegression",
            "dataset": month_or_year,
        })

        for fold_id in range(1, n_folds + 1):
            # Child run for this fold
            with mlflow.start_run(run_name=f"fold_{fold_id}", nested=True):
                fold_train = read_specific_fold(
                    path=cv_path, fold_id=fold_id, split_type="train"
                )
                fold_val = read_specific_fold(
                    path=cv_path, fold_id=fold_id, split_type="validation"
                )

                print(f"Fold {fold_id} - Train size: {fold_train.count()}, Val size: {fold_val.count()}")

                print(f"Training fold {fold_id}...")
                model = pipeline.fit(fold_train)

                print("Making predictions...")
                training_predictions = model.transform(fold_train)
                validation_predictions = model.transform(fold_val)

                fold_scores = {
                    "train_mae": mae_evaluator.evaluate(training_predictions),
                    "val_mae": mae_evaluator.evaluate(validation_predictions),
                    "train_rmse": rmse_evaluator.evaluate(training_predictions),
                    "val_rmse": rmse_evaluator.evaluate(validation_predictions),
                }

                # Log to child run
                mlflow.log_metrics(fold_scores)

                cv_results.append({"fold": fold_id, **fold_scores})
                cv_models.append(model)

                print(
                    f"Fold {fold_id} - Train MAE: {fold_scores['train_mae']:.4f}, "
                    f"Val MAE: {fold_scores['val_mae']:.4f}"
                )

            # The child run is closed here, so this logs to the parent run.
            mlflow.log_metrics({
                f"fold_{fold_id}_{metric}": value
                for metric, value in fold_scores.items()
            })
            print("=" * 120)

        train_mae = [row["train_mae"] for row in cv_results]
        val_mae = [row["val_mae"] for row in cv_results]
        train_rmse = [row["train_rmse"] for row in cv_results]
        val_rmse = [row["val_rmse"] for row in cv_results]

        mlflow.log_metrics({
            "avg_train_mae": np.mean(train_mae),
            "avg_val_mae": np.mean(val_mae),
            "std_val_mae": np.std(val_mae),
            "avg_train_rmse": np.mean(train_rmse),
            "avg_val_rmse": np.mean(val_rmse),
            "std_val_rmse": np.std(val_rmse),
        })

        # Create DataFrame from cv_results and log as table
        results_df = pd.DataFrame(cv_results)
        mlflow.log_table(data=results_df, artifact_file="cv_fold_results.json")

        print(f"\nCV Complete - Avg Val MAE: {np.mean(val_mae):.4f} ± {np.std(val_mae):.4f}")
        print(f"\nFold Results:\n{results_df.to_string()}")

    return cv_results, cv_models

In [0]:
# Run the cross-validated training and keep per-fold metrics and models.
# NOTE(review): this call uses the function defaults (n_folds=10,
# month_or_year="3_month_custom_joined"), not the `n_folds` / `month_or_year`
# values set earlier in this notebook — confirm that is intended.
cv_results, cv_models = train_cv_models()