# feature engineering sandbox

Feature eng ideas:
- Number of delayed flights in departure and arrival location (total or 4 hours before, 6 hours before, etc.)
- Number of delays in the route in the last 30 days
- Number of flights plane has flown that day
- Total number of flights plane has flown until a certain time

# Imports

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


import random

import mlflow
print(mlflow.__version__)

import os
# NOTE(review): this environment variable is set after `spark` already exists
# in the notebook — confirm it is still picked up at this point.
os.environ['PYSPARK_PIN_THREAD'] = 'false'
# Databricks-specific conf; presumably enables automatic MLflow tracking of
# MLlib runs — TODO confirm against the Databricks docs.
spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

RANDOM_SEED = 0

# Workspace path of the team's shared MLflow experiment.
EXPERIMENT_NAME = "/Shared/team_2_2/mlflow-baseline"

# Look the experiment up, create it when it is missing, then activate it.
# Any failure along the way falls back to a per-user default experiment.
try:
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is not None:
        print(f"Using existing experiment: {experiment.name}")
    else:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Created new experiment with ID: {experiment_id}")
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as e:
    print(f"Error with experiment setup: {e}")
    # The fallback path is built from the user name of the notebook context.
    mlflow.set_experiment(f"/Users/{dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()}/default")



# Data

In [0]:
# Root of the shared mount; list its contents for orientation.
data_BASE_DIR = "dbfs:/mnt/mids-w261/"
display(dbutils.fs.ls(data_BASE_DIR))

### Combined dataset

In [0]:
# Parquet locations of the custom-joined datasets (3m and 1y variants).
custom_join_3m_path = (
    "dbfs:/mnt/mids-w261/daniel_costa@berkeley.edu/Custom_Joins/V2/custom_join_v2_3m.parquet"
)
custom_join_1y_path = (
    "dbfs:/mnt/mids-w261/daniel_costa@berkeley.edu/Custom_Joins/V2/custom_join_v2_1y.parquet"
)

In [0]:
# Load the 3m joined dataset.
join_data_3m = spark.read.parquet(custom_join_3m_path)

# cache() hands back the same DataFrame, so the null filter is chained on it:
# rows without a flight_uid are discarded.
join_data_3m_df = join_data_3m.cache().dropna(subset=['flight_uid'])
display(join_data_3m_df)

In [0]:
# Load the 1y joined dataset.
join_data_1y = spark.read.parquet(custom_join_1y_path)

# cache() hands back the same DataFrame, so the null filter is chained on it:
# rows without a flight_uid are discarded.
join_data_1y_df = join_data_1y.cache().dropna(subset=['flight_uid'])
display(join_data_1y_df)

In [0]:
# Build the scheduled-departure timestamp from FL_DATE and CRS_DEP_TIME.
# NOTE(review): no time-zone conversion happens here despite the "utc" in the
# column name — confirm the inputs are already in UTC.
_dep_hhmm = F.lpad(F.col("CRS_DEP_TIME").cast("string"), 4, "0")  # left-pad to 4 chars
_dep_datetime_text = F.concat(F.col("FL_DATE"), F.lit(" "), _dep_hhmm)
join_data_3m_df = join_data_3m_df.withColumn(
    "crs_dep_utc_timestamp",
    F.to_timestamp(_dep_datetime_text, "yyyy-MM-dd HHmm"),
)

In [0]:
# Preview the 3m DataFrame in the notebook output.
display(join_data_3m_df)

In [0]:
# Build the scheduled-departure timestamp from FL_DATE and CRS_DEP_TIME.
# NOTE(review): no time-zone conversion happens here despite the "utc" in the
# column name — confirm the inputs are already in UTC.
_dep_hhmm = F.lpad(F.col("CRS_DEP_TIME").cast("string"), 4, "0")  # left-pad to 4 chars
_dep_datetime_text = F.concat(F.col("FL_DATE"), F.lit(" "), _dep_hhmm)
join_data_1y_df = join_data_1y_df.withColumn(
    "crs_dep_utc_timestamp",
    F.to_timestamp(_dep_datetime_text, "yyyy-MM-dd HHmm"),
)

## Drop rows with missing hourly weather fields (for now)

In [0]:
# Discard every row that has a null in any of the hourly weather columns below.
join_data_3m_df = join_data_3m_df.na.drop(
    how="any",
    subset=[
        'HourlyDryBulbTemperature',
        'HourlyDewPointTemperature',
        'HourlyRelativeHumidity',
        'HourlyAltimeterSetting',
        'HourlyVisibility',
        'HourlyStationPressure',
        'HourlyWetBulbTemperature',
        'HourlyPrecipitation',
        'HourlyCloudCoverage',
        'HourlyCloudElevation',
        'HourlyWindSpeed',
    ],
)

### Filter cancelled flights

In [0]:
# Keep only rows whose CANCELLED value is not 1, then preview ten of them.
_not_cancelled = F.col("CANCELLED") != 1
join_data_3m_df = join_data_3m_df.where(_not_cancelled)
display(join_data_3m_df.limit(10))

In [0]:
# check for nulls
# All per-column null counts come from ONE aggregation, so the data is scanned
# once instead of launching a separate count() job for every column.
# F.when(...) without otherwise() yields null for non-matching rows, and
# F.count skips nulls, so each aggregate counts exactly the null cells.
_null_counts = join_data_3m_df.select(
    [
        F.count(F.when(F.col(column_name).isNull(), 1)).alias(column_name)
        for column_name in join_data_3m_df.columns
    ]
).first()

# The result Row is positional, so pair it with the column list in order.
for column_name, null_count in zip(join_data_3m_df.columns, _null_counts):
    print(f"{column_name} ------> {null_count}")

Feature eng ideas:

- Number of delayed flights in departure location (total or 4 hours before, 6 hours before, etc.)
  - per airline?
- Number of delays in the route in the last 30 days
- Number of flights plane has flown that day

### Feature - total departure delay (sum of DEP_DELAY_NEW) at departure location over the last 7 days, excluding the most recent 4 hours

In [0]:
# NOTE: this import rebinds the builtin `sum` to the Spark aggregate for the
# rest of the notebook.
from pyspark.sql.functions import col, sum, asc

# Trailing window per ORIGIN over the departure timestamp (as epoch seconds):
# from 7 days before the current row up to 4 hours before it.
# NOTE(review): `window_4h` is bound to this same 7-day window; the name looks
# stale — confirm whether anything still uses it.
window_7d_origin = window_4h = (
    Window.partitionBy("ORIGIN")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
    .rangeBetween(-7 * 24 * 60 * 60, -4 * 60 * 60)
)

# Sum DEP_DELAY_NEW inside the window; an empty window yields null, which is
# replaced by 0.
join_data_3m_df = join_data_3m_df.withColumn(
    'delay_origin_7d',
    F.coalesce(F.sum('DEP_DELAY_NEW').over(window_7d_origin), F.lit(0)),
)

# display(join_data_3m_df)

In [0]:
# Same per-ORIGIN trailing sum of DEP_DELAY_NEW, now on the 1y data.
# An empty window yields null, which is replaced by 0.
_origin_delay_total = F.sum('DEP_DELAY_NEW').over(window_7d_origin)
join_data_1y_df = join_data_1y_df.withColumn(
    'delay_origin_7d',
    F.coalesce(_origin_delay_total, F.lit(0)),
)

display(join_data_1y_df)

### Feature - total departure delay (sum of DEP_DELAY_NEW) per departure location and carrier over the last 7 days, excluding the most recent 4 hours

In [0]:
# Trailing window per (ORIGIN, OP_UNIQUE_CARRIER) over the departure timestamp
# (as epoch seconds): from 7 days before the current row up to 4 hours before.
# NOTE(review): `window_4h` is bound to this same 7-day window; the name looks
# stale — confirm whether anything still uses it.
window_7d_origin_carrier = window_4h = (
    Window.partitionBy("ORIGIN", "OP_UNIQUE_CARRIER")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
    .rangeBetween(-7 * 24 * 60 * 60, -4 * 60 * 60)
)

# Sum DEP_DELAY_NEW inside the window; an empty window yields null, which is
# replaced by 0.
join_data_3m_df = join_data_3m_df.withColumn(
    'delay_origin_carrier_7d',
    F.coalesce(F.sum('DEP_DELAY_NEW').over(window_7d_origin_carrier), F.lit(0)),
)

# display(join_data_3m_df)

In [0]:
# Trailing sum of DEP_DELAY_NEW per (ORIGIN, OP_UNIQUE_CARRIER) on the 1y data.
# The aggregate is referenced as F.sum explicitly: a bare `sum(...)` only works
# while the builtin is shadowed by a pyspark import, and the window only needs
# to be evaluated once for this column.
join_data_1y_df = join_data_1y_df.withColumn(
    'delay_origin_carrier_7d_raw',
    F.sum('DEP_DELAY_NEW').over(window_7d_origin_carrier)
)

# Handle the nulls (rows whose window is empty) by coalescing the raw feature with 0
join_data_1y_df = join_data_1y_df.withColumn(
    'delay_origin_carrier_7d',
    F.coalesce(F.col('delay_origin_carrier_7d_raw'), F.lit(0))
).drop('delay_origin_carrier_7d_raw')

# display(join_data_1y_df)

### Feature - total departure delay (sum of DEP_DELAY_NEW) on the route over the last 7 days, excluding the most recent 4 hours
- route: origin to destination

In [0]:
# Route key: ORIGIN and DEST joined by "-". The same column expression is
# attached to both the 3m and the 1y DataFrames.
_route_label = F.concat(F.col("ORIGIN"), F.lit("-"), F.col("DEST"))

join_data_3m_df = join_data_3m_df.withColumn("route", _route_label)
join_data_1y_df = join_data_1y_df.withColumn("route", _route_label)

In [0]:
# Trailing window per route over the departure timestamp (as epoch seconds):
# from 7 days before the current row up to 4 hours before it.
# NOTE(review): `window_4h` is bound to this same 7-day window; the name looks
# stale — confirm whether anything still uses it.
window_7d_route = window_4h = (
    Window.partitionBy("route")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
    .rangeBetween(-7 * 24 * 60 * 60, -4 * 60 * 60)
)

# Sum DEP_DELAY_NEW inside the window; an empty window yields null, which is
# replaced by 0.
join_data_3m_df = join_data_3m_df.withColumn(
    'delay_route_7d',
    F.coalesce(F.sum('DEP_DELAY_NEW').over(window_7d_route), F.lit(0)),
)

# display(join_data_3m_df)


In [0]:
# Per-route trailing sum of DEP_DELAY_NEW on the 1y data.
# An empty window yields null, which is replaced by 0.
_route_delay_total = F.sum('DEP_DELAY_NEW').over(window_7d_route)
join_data_1y_df = join_data_1y_df.withColumn(
    'delay_route_7d',
    F.coalesce(_route_delay_total, F.lit(0)),
)

# display(join_data_1y_df)

### Feature - number of flights a plane has flown so far that day (running count per TAIL_NUM and FL_DATE)


In [0]:
# One partition per (TAIL_NUM, FL_DATE), ordered by departure timestamp.
# The window is ordered and has no explicit frame, so Spark's default frame
# (start of partition up to the current row) applies: the count grows through
# the day rather than being the day's total.
# NOTE(review): confirm the running count is what is intended.
window_flights_24h = (
    Window.partitionBy("TAIL_NUM", "FL_DATE")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
)

_flights_so_far = F.count("*").over(window_flights_24h)
join_data_3m_df = join_data_3m_df.withColumn('flight_count_24h', _flights_so_far)

# display(join_data_3m_df)

In [0]:
# Attach the per-(TAIL_NUM, FL_DATE) windowed flight count to the 1y data.
_flights_so_far = F.count("*").over(window_flights_24h)
join_data_1y_df = join_data_1y_df.withColumn('flight_count_24h', _flights_so_far)

display(join_data_1y_df)

In [0]:
# check for nulls
# All per-column null counts come from ONE aggregation, so the data is scanned
# once instead of launching a separate count() job for every column.
# F.when(...) without otherwise() yields null for non-matching rows, and
# F.count skips nulls, so each aggregate counts exactly the null cells.
_null_counts = join_data_3m_df.select(
    [
        F.count(F.when(F.col(column_name).isNull(), 1)).alias(column_name)
        for column_name in join_data_3m_df.columns
    ]
).first()

# The result Row is positional, so pair it with the column list in order.
for column_name, null_count in zip(join_data_3m_df.columns, _null_counts):
    print(f"{column_name} ------> {null_count}")

## Get Splits from checkpoint - 3 month data


In [0]:
# Location of the team checkpoint and, below it, the stored 3-month splits.
checkpoint_path = "dbfs:/student-groups/Group_2_2"
dataset_path = f"{checkpoint_path}/3_month_custom_joined/raw_data/training_splits"

# Load the three stored splits from parquet.
_load_parquet = spark.read.parquet
train_df = _load_parquet(f"{dataset_path}/train.parquet")
validation_df = _load_parquet(f"{dataset_path}/validation.parquet")
test_df = _load_parquet(f"{dataset_path}/test.parquet")

In [0]:
# Mark each split for caching and bind the *_3m_df working names.
train_3m_df, validation_3m_df, test_3m_df = (
    split.cache() for split in (train_df, validation_df, test_df)
)

### Build scheduled departure timestamp (column is named UTC, but no time-zone conversion is applied in this step)

In [0]:
# Build the scheduled-departure timestamp from FL_DATE and CRS_DEP_TIME and
# attach it to each of the three splits.
# NOTE(review): no time-zone conversion happens here despite the "utc" in the
# column name — confirm the inputs are already in UTC.
_dep_hhmm = F.lpad(F.col("CRS_DEP_TIME").cast("string"), 4, "0")  # left-pad to 4 chars
_scheduled_departure = F.to_timestamp(
    F.concat(F.col("FL_DATE"), F.lit(" "), _dep_hhmm),
    "yyyy-MM-dd HHmm",
)

train_3m_df, validation_3m_df, test_3m_df = (
    split.withColumn("crs_dep_utc_timestamp", _scheduled_departure)
    for split in (train_3m_df, validation_3m_df, test_3m_df)
)

### feature - total delay time for flights at departure locations over the past 7 days

In [0]:
# Trailing window per ORIGIN over the departure timestamp (as epoch seconds):
# from 7 days before the current row up to 4 hours before it.
window_7d_origin = (
    Window.partitionBy("ORIGIN")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
    .rangeBetween(-7 * 24 * 60 * 60, -4 * 60 * 60)
)

# Sum of DEP_DELAY_NEW inside the window; an empty window yields null -> 0.
_origin_delay_total = F.coalesce(
    F.sum('DEP_DELAY_NEW').over(window_7d_origin), F.lit(0)
)

# The window is evaluated on each split separately, so rows in one split never
# see history from another split.
train_3m_df, validation_3m_df, test_3m_df = (
    split.withColumn('delay_origin_7d', _origin_delay_total)
    for split in (train_3m_df, validation_3m_df, test_3m_df)
)



### Feature - total departure delay (sum of DEP_DELAY_NEW) per departure location and carrier over the last 7 days, excluding the most recent 4 hours

In [0]:
# Trailing window per (ORIGIN, OP_UNIQUE_CARRIER) over the departure timestamp
# (as epoch seconds): from 7 days before the current row up to 4 hours before.
window_7d_origin_carrier = (
    Window.partitionBy("ORIGIN", "OP_UNIQUE_CARRIER")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
    .rangeBetween(-7 * 24 * 60 * 60, -4 * 60 * 60)
)

# Sum of DEP_DELAY_NEW inside the window; an empty window yields null -> 0.
_origin_carrier_delay_total = F.coalesce(
    F.sum('DEP_DELAY_NEW').over(window_7d_origin_carrier), F.lit(0)
)

# The window is evaluated on each split separately, so rows in one split never
# see history from another split.
train_3m_df, validation_3m_df, test_3m_df = (
    split.withColumn('delay_origin_carrier_7d', _origin_carrier_delay_total)
    for split in (train_3m_df, validation_3m_df, test_3m_df)
)


### Feature - total departure delay (sum of DEP_DELAY_NEW) on the route over the last 7 days, excluding the most recent 4 hours
- route: origin to destination

In [0]:
# Route key: ORIGIN and DEST joined by "-", attached to each split.
_route_label = F.concat(F.col("ORIGIN"), F.lit("-"), F.col("DEST"))

train_3m_df, validation_3m_df, test_3m_df = (
    split.withColumn("route", _route_label)
    for split in (train_3m_df, validation_3m_df, test_3m_df)
)

In [0]:
# Trailing window per route over the departure timestamp (as epoch seconds):
# from 7 days before the current row up to 4 hours before it.
window_7d_route = (
    Window.partitionBy("route")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
    .rangeBetween(-7 * 24 * 60 * 60, -4 * 60 * 60)
)

# Sum of DEP_DELAY_NEW inside the window; an empty window yields null -> 0.
_route_delay_total = F.coalesce(
    F.sum('DEP_DELAY_NEW').over(window_7d_route), F.lit(0)
)

# The window is evaluated on each split separately, so rows in one split never
# see history from another split.
train_3m_df, validation_3m_df, test_3m_df = (
    split.withColumn('delay_route_7d', _route_delay_total)
    for split in (train_3m_df, validation_3m_df, test_3m_df)
)

### Feature - number of flights a plane has flown so far that day (running count per TAIL_NUM and FL_DATE)

In [0]:
# One partition per (TAIL_NUM, FL_DATE), ordered by departure timestamp.
# The window is ordered and has no explicit frame, so Spark's default frame
# (start of partition up to the current row) applies: the count grows through
# the day rather than being the day's total.
# NOTE(review): confirm the running count is what is intended.
window_flights_24h = (
    Window.partitionBy("TAIL_NUM", "FL_DATE")
    .orderBy(F.col("crs_dep_utc_timestamp").cast("long"))
)

_flights_so_far = F.count("*").over(window_flights_24h)

train_3m_df, validation_3m_df, test_3m_df = (
    split.withColumn('flight_count_24h', _flights_so_far)
    for split in (train_3m_df, validation_3m_df, test_3m_df)
)

In [0]:
# check for nulls
# All per-column null counts come from ONE aggregation, so the data is scanned
# once instead of launching a separate count() job for every column.
# F.when(...) without otherwise() yields null for non-matching rows, and
# F.count skips nulls, so each aggregate counts exactly the null cells.
_null_counts = train_3m_df.select(
    [
        F.count(F.when(F.col(column_name).isNull(), 1)).alias(column_name)
        for column_name in train_3m_df.columns
    ]
).first()

# The result Row is positional, so pair it with the column list in order.
for column_name, null_count in zip(train_3m_df.columns, _null_counts):
    print(f"{column_name} ------> {null_count}")

## Model

In [0]:
# Column names kept for the baseline, grouped by theme.
baselines_columns = [
    # calendar fields
    "QUARTER", "MONTH", "YEAR", "DAY_OF_MONTH", "DAY_OF_WEEK",
    # carrier / aircraft / airports
    "OP_CARRIER", "TAIL_NUM", "ORIGIN_AIRPORT_SEQ_ID", "DEST_AIRPORT_SEQ_ID",
    # currently disabled: "CRS_ELAPSED_TIME", "DISTANCE"
    # label and departure timestamp
    "DEP_DELAY_NEW", "crs_dep_utc_timestamp",
    # currently disabled: "prev_flight_delay_in_minutes", "prev_flight_delay",
    #                     "origin_delays_4h"
    # hourly weather fields
    "HourlyDryBulbTemperature",
    "HourlyDewPointTemperature",
    "HourlyRelativeHumidity",
    "HourlyAltimeterSetting",
    "HourlyVisibility",
    "HourlyStationPressure",
    "HourlyWetBulbTemperature",
    "HourlyPrecipitation",
    "HourlyCloudCoverage",
    "HourlyCloudElevation",
    "HourlyWindSpeed",
]

In [0]:
# String -> index stages. handleInvalid="keep" is set on every indexer.
carrier_indexer = StringIndexer(
    inputCol="OP_CARRIER",
    outputCol="carrier_idx",
    handleInvalid="keep",
)
origin_indexer = StringIndexer(
    inputCol="ORIGIN_AIRPORT_SEQ_ID",
    outputCol="origin_idx",
    handleInvalid="keep",
)
dest_indexer = StringIndexer(
    inputCol="DEST_AIRPORT_SEQ_ID",
    outputCol="dest_idx",
    handleInvalid="keep",
)
tail_num_indexer = StringIndexer(
    inputCol="TAIL_NUM",
    outputCol="tail_num_idx",
    handleInvalid="keep",
)

# Index -> one-hot vector stages, one per indexer above.
carrier_encoder = OneHotEncoder(inputCol="carrier_idx", outputCol="carrier_vec")
origin_encoder = OneHotEncoder(inputCol="origin_idx", outputCol="origin_vec")
dest_encoder = OneHotEncoder(inputCol="dest_idx", outputCol="dest_vec")
tail_num_encoder = OneHotEncoder(inputCol="tail_num_idx", outputCol="tail_num_vec")


In [0]:
# Columns that go into the single "features" vector, in this order:
# calendar fields, the one-hot vectors, two flight attributes, hourly weather.
_assembler_inputs = [
    "QUARTER", "MONTH", "YEAR", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "carrier_vec", "origin_vec", "dest_vec", "tail_num_vec",
    "CRS_ELAPSED_TIME", "DISTANCE",
    "HourlyDryBulbTemperature",
    "HourlyDewPointTemperature",
    "HourlyRelativeHumidity",
    "HourlyAltimeterSetting",
    "HourlyVisibility",
    "HourlyStationPressure",
    "HourlyWetBulbTemperature",
    "HourlyPrecipitation",
    "HourlyCloudCoverage",
    "HourlyCloudElevation",
    "HourlyWindSpeed",
]

assembler = VectorAssembler(inputCols=_assembler_inputs, outputCol="features")

In [0]:
# linear regression baseline
# Fits the encoding + assembler + LinearRegression pipeline on the 3m training
# split inside one MLflow run, evaluates MAE/RMSE on train and validation,
# and logs/registers the fitted pipeline together with the four metrics.
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from mlflow.models import infer_signature

mlflow.spark.autolog()
with mlflow.start_run(run_name="lr - weather baseline 3m"):
    # Used below as the artifact path of the logged model.
    MODEL_NAME = "LR_WEATHER_BASELINE_3M"

    linear_reg = LinearRegression(
        featuresCol="features",
        labelCol="DEP_DELAY_NEW",
        # Linear Regression has different parameters than Random Forest
        maxIter=10, 
        regParam=0.3
    )

    # rf = RandomForestRegressor(
    #     featuresCol="features",  
    #     labelCol="DEP_DELAY_NEW",   
    #     numTrees=20,
    #     maxDepth=10
    # )

    # Create pipeline
    # Stage order: string indexers, one-hot encoders, vector assembler, regressor.
    pipeline = Pipeline(stages=[
        carrier_indexer, origin_indexer, dest_indexer, tail_num_indexer,
        carrier_encoder, origin_encoder, dest_encoder, tail_num_encoder,
        assembler,
        linear_reg
        # rf
    ])

    # Fit on the training split, then score both training and validation.
    model = pipeline.fit(train_3m_df)
    training_predictions = model.transform(train_3m_df)
    validation_predictions = model.transform(validation_3m_df)

    mae_evaluator = RegressionEvaluator(
        labelCol="DEP_DELAY_NEW",      
        predictionCol="prediction", 
        metricName="mae"           
    )

    rmse_evaluator = RegressionEvaluator(
        labelCol="DEP_DELAY_NEW",      
        predictionCol="prediction", 
        metricName="rmse"
    )

    # Calculate MAE
    mae_t = mae_evaluator.evaluate(training_predictions)
    mae_v = mae_evaluator.evaluate(validation_predictions)
    # Calculate RMSE
    rmse_t = rmse_evaluator.evaluate(training_predictions)
    rmse_v = rmse_evaluator.evaluate(validation_predictions)

    # NOTE(review): the signature and the input example are built from
    # `train_df` (the split as read from parquet), while the pipeline was fit
    # on `train_3m_df`; the output side is the full transformed DataFrame, not
    # just the prediction column — confirm both are intended.
    signature = infer_signature(train_df, training_predictions)

    mlflow.spark.log_model(
        model, 
        MODEL_NAME,
        input_example=train_df.limit(1).toPandas(),
        signature=signature,
        registered_model_name="flight_delay_prediction_baseline"
        )

    # Record the evaluation metrics on the active run.
    mlflow.log_metric("train_mae", mae_t)
    mlflow.log_metric("validation_mae", mae_v)
    mlflow.log_metric("train_rmse", rmse_t)
    mlflow.log_metric("validation_rmse", rmse_v)

## Get test results

In [0]:
import mlflow
from pyspark.sql.functions import struct, col

# NOTE(review): this URI points at a fixed run/artifact whose name differs
# from the model logged in the training cell — confirm it is the model meant
# to be scored on the 3m test split.
model_uri = 'runs:/e9d6601ea33a42cb99baaa87eddc16bc/RF_1y_BASELINE_ALL_FEATS'

# Wrap the logged model as a Spark UDF. Override result_type if the model
# does not return double values.
loaded_model = mlflow.pyfunc.spark_udf(spark, model_uri=model_uri)

# Feed every column of the test split to the UDF as one struct and store the
# result in a new 'prediction' column.
_all_test_columns = struct(*[col(name) for name in test_3m_df.columns])
test_3m_df = test_3m_df.withColumn('prediction', loaded_model(_all_test_columns))

In [0]:
# MAE and RMSE evaluators comparing 'prediction' against DEP_DELAY_NEW.
mae_evaluator, rmse_evaluator = (
    RegressionEvaluator(
        labelCol="DEP_DELAY_NEW",
        predictionCol="prediction",
        metricName=metric,
    )
    for metric in ("mae", "rmse")
)



In [0]:
# `loaded_model` is the UDF returned by mlflow.pyfunc.spark_udf, not a
# PipelineModel, so it has no .transform(). The predictions were already
# attached to test_3m_df as the 'prediction' column, so that DataFrame is
# scored directly.
transformed_test = test_3m_df
# Calculate MAE
test = mae_evaluator.evaluate(transformed_test)
print(test)

In [0]:
# NOTE(review): calls the prediction UDF with no input columns and discards
# nothing useful into a variable; looks like leftover debugging — confirm it
# can be removed.
loaded_model()