# XGBOOST model baseline - 1 year - hyperparameter tuning testing
- run model from 1_year_combined data with feature engineering
  - TAIL_NUM causes OOM error, comment out for now
- feature engineering handled in https://dbc-fae72cab-cf59.cloud.databricks.com/editor/notebooks/1792055957780055?o=4021782157704243



## Imports

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from xgboost.spark import SparkXGBRegressor

from mlflow.models import infer_signature


import random
import numpy as np
import pandas as pd

import mlflow
print(mlflow.__version__)

import os

# Turn on the Databricks "trackMLlib" MLflow setting for this Spark session.
spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

RANDOM_SEED = 0

# Workspace path of the shared MLflow experiment used by this notebook.
EXPERIMENT_NAME = "/Shared/team_2_2/mlflow-baseline"

# Look the experiment up by name, create it when the lookup returns None,
# then make it the active experiment. Any failure along the way is reported
# and the notebook falls back to a per-user "default" experiment instead.
try:
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is not None:
        print(f"Using existing experiment: {experiment.name}")
    else:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Created new experiment with ID: {experiment_id}")
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as setup_error:
    print(f"Error with experiment setup: {setup_error}")
    # The user name is read from the notebook context exposed through dbutils.
    current_user = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()
    mlflow.set_experiment(f"/Users/{current_user}/default")



## Helper Functions


In [0]:
def checkpoint_dataset(dataset, file_path):
    """Save ``dataset`` as a parquet checkpoint in the group's DBFS folder.

    The target is ``dbfs:/student-groups/Group_2_2/<file_path>.parquet`` and
    is written with ``mode("overwrite")``.

    Args:
        dataset: Object exposing ``.write.mode(...).parquet(...)``
            (presumably a Spark DataFrame; confirm against callers).
        file_path: Path relative to the group folder, without the
            ``.parquet`` suffix. May contain ``/`` separators.
    """
    section_id, group_id = "2", "2"
    group_root = f"dbfs:/student-groups/Group_{section_id}_{group_id}"
    dbutils.fs.mkdirs(group_root)

    target = f"{group_root}/{file_path}.parquet"
    # Everything before the final "/" is the folder that has to exist when
    # file_path itself contains sub-directories.
    parent_dir = target.rsplit("/", 1)[0]
    dbutils.fs.mkdirs(parent_dir)

    dataset.write.mode("overwrite").parquet(target)
    print(f"Checkpointed {file_path}")

## Datasets - custom join
- get checkpoint data
  - 1 year combined join, with feature engineering

In [0]:
%fs ls dbfs:/student-groups/Group_2_2/5_year_custom_joined/feature_eng_ph3/training_splits

In [0]:
# Stephanie's latest features = 1_year_custom_joined/feature_eng_ph3/training_splits/
# Daniel's Graph features = 1_year_custom_joined/graph_feature_splits
# Raw splits = 3_month_custom_joined/raw_data/training_splits/

In [0]:
# Name of the dataset folder (under the group's DBFS root) to read from.
month_or_year_dataset = "5_year_custom_joined"

# Both feature families live under the same dataset folder; build each root
# once and append the split name per read.
graph_splits_root = f"dbfs:/student-groups/Group_2_2/{month_or_year_dataset}/graph_feature_splits"
engineered_splits_root = f"dbfs:/student-groups/Group_2_2/{month_or_year_dataset}/feature_eng_ph3/training_splits"

# Graph feature splits.
graph_features_train = spark.read.parquet(f"{graph_splits_root}/train/")
graph_features_validation = spark.read.parquet(f"{graph_splits_root}/validation/")
graph_features_test = spark.read.parquet(f"{graph_splits_root}/test/")

# Phase-3 feature-engineering splits (each split is stored as "<split>.parquet").
other_trained_features = spark.read.parquet(f"{engineered_splits_root}/train.parquet/")
other_validation_features = spark.read.parquet(f"{engineered_splits_root}/validation.parquet/")
other_test_features = spark.read.parquet(f"{engineered_splits_root}/test.parquet/")

In [0]:
def combine_datasets(graph_features, other_features):
    """Join the selected graph columns onto the other feature set.

    Only ``flight_uid`` and the columns listed below are kept from
    ``graph_features``; that projection is then inner-joined with
    ``other_features`` on ``flight_uid``.

    Args:
        graph_features: Object exposing ``select`` (presumably a Spark
            DataFrame holding the graph feature split).
        other_features: Object passed to ``join`` as the right-hand side.

    Returns:
        Whatever the ``join`` call returns (the combined dataset).
    """
    graph_columns = [
        "flight_uid",
        'page_rank',
        'out_degree',
        'in_degree',
        'weighted_out_degree',
        'weighted_in_degree',
        'N_RUNWAYS',
        'betweenness_unweighted',
        'closeness',
        'betweenness',
        'avg_origin_dep_delay',
        'avg_dest_arr_delay',
        'avg_daily_route_flights',
        'avg_route_delay',
        'avg_hourly_flights',
    ]
    graph_subset = graph_features.select(graph_columns)
    return graph_subset.join(other_features, on='flight_uid', how='inner')

In [0]:
# Combine graph features with the other features for each split, in
# train / validation / test order.
train_df, validation_df, test_df = (
    combine_datasets(graph_split, other_split)
    for graph_split, other_split in (
        (graph_features_train, other_trained_features),
        (graph_features_validation, other_validation_features),
        (graph_features_test, other_test_features),
    )
)



In [0]:
# Writing is gated behind an explicit confirmation: only the exact answer "y"
# triggers the three checkpoint writes.
confirmation = input("CAREFUL: You're about to write to DBFS. Type 'y' to continue.")
if confirmation == "y":
    for split_name, split_df in (
        ("train", train_df),
        ("validation", validation_df),
        ("test", test_df),
    ):
        checkpoint_dataset(split_df, f"{month_or_year_dataset}/fe_graph_and_holiday/training_splits/{split_name}")