# Feature Engineering Notebook

# Imports

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


import random

import mlflow
print(mlflow.__version__)

import os
# Process-level and Spark-session switches used by the rest of the notebook.
# NOTE(review): PYSPARK_PIN_THREAD is set after `spark` already exists; it may need to be
# set before the session/gateway starts to have any effect — confirm.
os.environ['PYSPARK_PIN_THREAD'] = 'false'
# Sets the Databricks conf key that, by its name, enables MLflow tracking of MLlib runs.
spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

# Seed value for the notebook (not referenced by any of the cells visible here).
RANDOM_SEED = 0

# Workspace path of the shared MLflow experiment.
EXPERIMENT_NAME = "/Shared/team_2_2/mlflow-baseline"

# Look the experiment up, create it when it is missing, then make it the active one.
try:
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is not None:
        print(f"Using existing experiment: {experiment.name}")
    else:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Created new experiment with ID: {experiment_id}")
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as e:
    # Best effort: report the failure and fall back to an experiment under the
    # current user's workspace folder instead of stopping the notebook.
    print(f"Error with experiment setup: {e}")
    mlflow.set_experiment(f"/Users/{dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()}/default")



# Data

In [0]:
# Root of the mounted storage; list its top-level entries to see what is available.
data_BASE_DIR = "dbfs:/mnt/mids-w261/"
root_listing = dbutils.fs.ls(data_BASE_DIR)
display(root_listing)

### Combined dataset

In [0]:
# Parquet locations of the two pre-joined datasets read below
# (the "3m" and "1y" variants of the V2 custom join).
custom_join_3m_path = "dbfs:/mnt/mids-w261/daniel_costa@berkeley.edu/Custom_Joins/V2/custom_join_v2_3m.parquet"
custom_join_1y_path ='dbfs:/mnt/mids-w261/daniel_costa@berkeley.edu/Custom_Joins/V2/custom_join_v2_1y.parquet'

In [0]:
# Load the 3-month join and keep only the rows that carry a flight_uid.
join_data_3m = spark.read.parquet(custom_join_3m_path)
join_data_3m_df = join_data_3m.na.drop(subset=["flight_uid"])
display(join_data_3m_df)

In [0]:
# Load the 1-year join and keep only the rows that carry a flight_uid.
join_data_1y = spark.read.parquet(custom_join_1y_path)
join_data_1y_df = join_data_1y.na.drop(subset=["flight_uid"])
display(join_data_1y_df)

In [0]:
# Build the scheduled-departure timestamp from FL_DATE plus the zero-padded
# CRS_DEP_TIME (HHmm) for the 3-month data.
# NOTE(review): no time-zone conversion happens here, so the "utc" in the column name is
# only accurate if FL_DATE / CRS_DEP_TIME are already expressed in UTC — confirm.
dep_hhmm_3m = F.lpad(F.col("CRS_DEP_TIME").cast("string"), 4, "0")
dep_datetime_text_3m = F.concat(F.col("FL_DATE"), F.lit(" "), dep_hhmm_3m)
join_data_3m_df = join_data_3m_df.withColumn(
    "crs_dep_utc_timestamp",
    F.to_timestamp(dep_datetime_text_3m, "yyyy-MM-dd HHmm"),
)

In [0]:
# Render the 3-month frame, which now includes crs_dep_utc_timestamp.
display(join_data_3m_df)

In [0]:
# Build the scheduled-departure timestamp from FL_DATE plus the zero-padded
# CRS_DEP_TIME (HHmm) for the 1-year data.
# NOTE(review): no time-zone conversion happens here, so the "utc" in the column name is
# only accurate if FL_DATE / CRS_DEP_TIME are already expressed in UTC — confirm.
dep_hhmm_1y = F.lpad(F.col("CRS_DEP_TIME").cast("string"), 4, "0")
dep_datetime_text_1y = F.concat(F.col("FL_DATE"), F.lit(" "), dep_hhmm_1y)
join_data_1y_df = join_data_1y_df.withColumn(
    "crs_dep_utc_timestamp",
    F.to_timestamp(dep_datetime_text_1y, "yyyy-MM-dd HHmm"),
)

In [0]:
# Print the column names and types of the 3-month frame
# (uncomment the next line to inspect the 1-year frame instead).
# join_data_1y_df.printSchema()
join_data_3m_df.printSchema()

## Feature Engineering

Feature list:

- Time between actual landing (wheels-on) and scheduled arrival, in minutes
- Average arrival delay by airport (by origin and by destination)
- Average taxi-out time by airport / flight

### Time between landed and scheduled flight

In [0]:
from pyspark.sql import functions as F

def hhmm_to_time_str(col):
    """Render an HHMM clock column as an ``HH:mm`` string column.

    Args:
        col: Name of a column holding a time of day encoded as HHMM
            (for example ``930`` or ``1545``).

    Returns:
        A Column of ``"HH:mm"`` strings. Because ``concat_ws`` skips null inputs,
        a null source value yields an empty string rather than null.
    """
    # Cast through int before stringifying: a float-typed value such as 930.0 would
    # otherwise become "930.0", and lpad(..., 4, "0") truncates strings longer than the
    # target length, producing "930." instead of the intended "0930".
    padded = F.lpad(F.col(col).cast("int").cast("string"), 4, "0")
    return F.concat_ws(":", padded.substr(1, 2), padded.substr(3, 2))



In [0]:
# Add "HH:mm" string versions of the scheduled-arrival and wheels-on HHMM columns.
# Each pair is (new string column, source HHMM column).
hhmm_string_columns = [
    ("CRS_ARR_TIME_STR", "CRS_ARR_TIME"),
    ("WHEELS_ON_STR", "WHEELS_ON"),
]

# 3 mo
df3m = join_data_3m_df
for str_name, hhmm_name in hhmm_string_columns:
    df3m = df3m.withColumn(str_name, hhmm_to_time_str(hhmm_name))

# 1 yr
df1y = join_data_1y_df
for str_name, hhmm_name in hhmm_string_columns:
    df1y = df1y.withColumn(str_name, hhmm_to_time_str(hhmm_name))

In [0]:
# Parse the "HH:mm" strings into timestamp columns.
# Only a time of day is parsed ("HH:mm"), so these timestamps carry no flight date.
# Each pair is (new timestamp column, source string column).
clock_timestamp_columns = [
    ("CRS_ARR_TIMESTAMP", "CRS_ARR_TIME_STR"),
    ("WHEELS_ON_TIMESTAMP", "WHEELS_ON_STR"),
]

# 3 mo
for ts_name, str_name in clock_timestamp_columns:
    df3m = df3m.withColumn(ts_name, F.to_timestamp(str_name, "HH:mm"))

# 1 yr
for ts_name, str_name in clock_timestamp_columns:
    df1y = df1y.withColumn(ts_name, F.to_timestamp(str_name, "HH:mm"))


In [0]:
# Minutes between actual wheels-on and scheduled arrival (positive = landed after schedule).
MINUTES_PER_DAY = 24 * 60
HALF_DAY_MINUTES = MINUTES_PER_DAY // 2


def landing_time_diff_minutes(actual_col="WHEELS_ON_TIMESTAMP", scheduled_col="CRS_ARR_TIMESTAMP"):
    """Return a Column with the wheels-on minus scheduled-arrival gap in minutes.

    Both inputs were parsed from "HH:mm" only, so they hold a time of day without a
    date. A plain subtraction is therefore off by a whole day whenever the two times
    fall on opposite sides of midnight (scheduled 23:50, wheels-on 00:10 gives -1420
    instead of +20). The raw difference is wrapped into [-720, 720) minutes to undo that.

    Assumes the true gap is shorter than 12 hours; larger gaps cannot be recovered
    from time-of-day values alone.

    Args:
        actual_col: Name of the timestamp column with the actual wheels-on time.
        scheduled_col: Name of the timestamp column with the scheduled arrival time.

    Returns:
        A double Column; null when either input timestamp is null.
    """
    raw_minutes = (
        F.col(actual_col).cast("long") - F.col(scheduled_col).cast("long")
    ) / 60
    # pmod always returns a non-negative remainder, unlike `%` on negative values.
    wrapped = F.pmod(raw_minutes + F.lit(HALF_DAY_MINUTES), F.lit(MINUTES_PER_DAY))
    return wrapped - F.lit(HALF_DAY_MINUTES)


# NOTE(review): WHEELS_ON is an actual (post-landing) value — confirm this feature is
# available at prediction time for the intended model.
# 3 mo
df3m = df3m.withColumn("LANDING_TIME_DIFF_MINUTES", landing_time_diff_minutes())

# 1 yr
df1y = df1y.withColumn("LANDING_TIME_DIFF_MINUTES", landing_time_diff_minutes())


In [0]:
# Spot-check the raw, string, timestamp and difference columns side by side.
landing_check_columns = [
    "CRS_ARR_TIME", "CRS_ARR_TIME_STR",
    "WHEELS_ON", "WHEELS_ON_STR",
    "CRS_ARR_TIMESTAMP", "WHEELS_ON_TIMESTAMP",
    "LANDING_TIME_DIFF_MINUTES",
]

# 1 yr (run the same select on df3m to inspect the 3-month data)
df1y.select(*landing_check_columns).show(20, False)


In [0]:
# Preview the first 10 rows of the 1-year frame (3-month variant left commented out).
#display(df3m.limit(10))
display(df1y.limit(10))

In [0]:
# Count null values per column of the 1-year frame: `when` without `otherwise` yields
# null for non-matching rows, and `count` only counts the non-null results.
null_count_exprs = [
    F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df1y.columns
]
null_counts = df1y.select(null_count_exprs)
display(null_counts)

### Average Delay Time by Airport (by Origin Airport and by Destination)

In [0]:
from pyspark.sql import functions as F

# Group by destination airport and compute average arrival delay
# 3 mo
avg_delay_by_airport = df3m.groupBy("DEST").agg(
    F.avg("ARR_DELAY").alias("AVG_ARR_DELAY")
)

# 1 yr
avg_delay_by_airport = df1y.groupBy("DEST").agg(
    F.avg("ARR_DELAY").alias("AVG_ARR_DELAY")
)


In [0]:
# 3 mo
df3m = df3m.join(avg_delay_by_airport, on="DEST", how="left")

# 1 yr
df1y = df1y.join(avg_delay_by_airport, on="DEST", how="left")

In [0]:
# Spot-check the per-destination average next to the individual arrival delays
# (1-year data; the 3-month variant is left commented out).
# df3m.select("DEST", "ARR_DELAY", "AVG_ARR_DELAY").show(20, False)
df1y.select("DEST", "ARR_DELAY", "AVG_ARR_DELAY").show(20, False)



In [0]:
# Average ARR_DELAY per ORIGIN airport, joined back onto every row of the same dataset.
# The aggregate expression is shared; each dataset is aggregated and joined in turn.
origin_delay_agg = F.avg("ARR_DELAY").alias("AVG_ARR_DELAY_ORIGIN")

# 3 mo
avg_delay_by_origin = df3m.groupBy("ORIGIN").agg(origin_delay_agg)
df3m = df3m.join(avg_delay_by_origin, on="ORIGIN", how="left")

# 1 yr
avg_delay_by_origin = df1y.groupBy("ORIGIN").agg(origin_delay_agg)
df1y = df1y.join(avg_delay_by_origin, on="ORIGIN", how="left")


In [0]:
# Spot-check the per-origin average next to the ORIGIN key it was computed for
# (the previous version showed DEST, which is not the grouping key of this average).
# df3m.select("ORIGIN", "ARR_DELAY", "AVG_ARR_DELAY_ORIGIN").show(20, False)

df1y.select("ORIGIN", "ARR_DELAY", "AVG_ARR_DELAY_ORIGIN").show(20, False)

In [0]:
# Preview the first 10 rows of the 1-year frame after the average-delay joins
# (3-month variant left commented out).
# display(df3m.limit(10))
display(df1y.limit(10))

In [0]:
# Per-column null counts for the 1-year frame after the average-delay joins
# (build the same expressions from df3m.columns to check the 3-month frame).
per_column_null_counts = [
    F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df1y.columns
]
null_counts = df1y.select(per_column_null_counts)
display(null_counts)

### Average taxi-out time by airport

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Average TAXI_OUT per origin airport, attached to every row through a window
# partitioned by ORIGIN (no ordering, so the average covers the whole partition).
w = Window.partitionBy("ORIGIN")
avg_taxi_out_by_origin = F.avg("TAXI_OUT").over(w)
taxi_check_columns = ["ORIGIN", "TAXI_OUT", "AVG_TAXI_OUT_ORIGIN"]

# 3 mo
df3m = df3m.withColumn("AVG_TAXI_OUT_ORIGIN", avg_taxi_out_by_origin)
df3m.select(*taxi_check_columns).show(20)

# 1 yr
df1y = df1y.withColumn("AVG_TAXI_OUT_ORIGIN", avg_taxi_out_by_origin)
df1y.select(*taxi_check_columns).show(20)


In [0]:
# Per-column null counts for the 1-year frame after adding AVG_TAXI_OUT_ORIGIN
# (build the same expressions from df3m.columns to check the 3-month frame).
taxi_stage_null_exprs = [
    F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df1y.columns
]
null_counts = df1y.select(taxi_stage_null_exprs)
display(null_counts)