In [23]:
import pandas as pd

# 1. Load flight data
csv_path = "dataset/archive/Combined_Flights_2021.csv"
flight_data = pd.read_csv(csv_path)

print("flight_data shape:", flight_data.shape)
flight_data.head()



flight_data shape: (6311871, 61)


Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,...,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings
0,2021-03-03,SkyWest Airlines Inc.,SGU,PHX,False,False,724,714.0,0.0,-10.0,...,724.0,813.0,5.0,843,-25.0,0.0,-2.0,0800-0859,2,0.0
1,2021-03-03,SkyWest Airlines Inc.,PHX,SGU,False,False,922,917.0,0.0,-5.0,...,940.0,1028.0,3.0,1040,-9.0,0.0,-1.0,1000-1059,2,0.0
2,2021-03-03,SkyWest Airlines Inc.,MHT,ORD,False,False,1330,1321.0,0.0,-9.0,...,1336.0,1445.0,16.0,1530,-29.0,0.0,-2.0,1500-1559,4,0.0
3,2021-03-03,SkyWest Airlines Inc.,DFW,TRI,False,False,1645,1636.0,0.0,-9.0,...,1703.0,1955.0,7.0,2010,-8.0,0.0,-1.0,2000-2059,4,0.0
4,2021-03-03,SkyWest Airlines Inc.,PHX,BFL,False,False,1844,1838.0,0.0,-6.0,...,1851.0,1900.0,3.0,1925,-22.0,0.0,-2.0,1900-1959,2,0.0


In [24]:
# Create TotalDelay = ActualElapsedTime - CRSElapsedTime
flight_data["TotalDelay"] = flight_data["ActualElapsedTime"] - flight_data["CRSElapsedTime"]
flight_data["TotalDelay"] = flight_data["TotalDelay"].fillna(0)

flight_data[["ActualElapsedTime", "CRSElapsedTime", "TotalDelay"]].head()


Unnamed: 0,ActualElapsedTime,CRSElapsedTime,TotalDelay
0,64.0,79.0,-15.0
1,74.0,78.0,-4.0
2,160.0,180.0,-20.0
3,146.0,145.0,1.0
4,85.0,101.0,-16.0


In [25]:
# Define narrow vs broad feature configurations

# Narrow baseline
candidate_features_narrow = [
    "DayOfWeek",             
    "CRSDepTime",
    "Distance",
    "Origin",
    "Dest",
    "IATA_Code_Marketing_Airline",      # if present
]

# Broader 
candidate_features_broad = [
    "Year",
    "Month",
    "DayofMonth",
    "DayOfWeek",             
    "CRSDepTime",
    "Distance",
    "Origin",
    "Dest",
    "IATA_Code_Marketing_Airline",
    "Flight_Number_Marketing_Airline"
]

target_col = "TotalDelay"

# Filter for just columns that exist in flight_data
config_A_features = [c for c in candidate_features_narrow if c in flight_data.columns]
config_B_features = [c for c in candidate_features_broad if c in flight_data.columns]

print("Config A (narrow) features:", config_A_features)
print("Config B (broad) features:", config_B_features)


Config A (narrow) features: ['DayOfWeek', 'CRSDepTime', 'Distance', 'Origin', 'Dest', 'IATA_Code_Marketing_Airline']
Config B (broad) features: ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'Distance', 'Origin', 'Dest', 'IATA_Code_Marketing_Airline', 'Flight_Number_Marketing_Airline']


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
import numpy as np

def run_experiment(feature_list, use_pca=False, label=""):
    # Drop rows with missing data in relevant columns
    df_model = flight_data[feature_list + [target_col]].dropna()

    # Subsample to lower training time
    if len(df_model) > 200_000:
        df_model = df_model.sample(n=200_000, random_state=42)

    X = df_model[feature_list]
    y = df_model[target_col]

    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42, shuffle=True
    )

    numeric_features = [col for col in feature_list if flight_data[col].dtype != "O"]
    categorical_features = [col for col in feature_list if flight_data[col].dtype == "O"]

    preprocess = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_features),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ]
    )

    rf = RandomForestRegressor(
        n_estimators=300,
        max_depth=None,
        random_state=42,
        n_jobs=-1
    )

    if use_pca:
        model = Pipeline(steps=[
            ("preprocess", preprocess),
            ("pca", PCA(n_components=50, random_state=42)),
            ("rf", rf),
        ])
    else:
        model = Pipeline(steps=[
            ("preprocess", preprocess),
            ("rf", rf),
        ])

    # Train
    model.fit(X_train, y_train)

    def eval_split(name, Xs, ys):
        yp = model.predict(Xs)
        mae = mean_absolute_error(ys, yp)
        rmse = np.sqrt(mean_squared_error(ys, yp))
        r2 = r2_score(ys, yp)
        print(f"{name:10s} MAE={mae:7.2f}  RMSE={rmse:7.2f}  R2={r2:6.3f}")

    print(f"\n=== {label} | PCA={use_pca} ===")
    eval_split("Train", X_train, y_train)
    eval_split("Val",   X_val,   y_val)
    eval_split("Test",  X_test,  y_test)

    return model


In [27]:
# Baseline: narrow inputs, no PCA
model_A = run_experiment(config_A_features, use_pca=False, label="Config A (narrow)")

# Broader inputs: no PCA
model_B = run_experiment(config_B_features, use_pca=False, label="Config B (broad)")

# Broader inputs: with PCA
model_B_pca = run_experiment(config_B_features, use_pca=True, label="Config B (broad) + PCA")



=== Config A (narrow) | PCA=False ===
Train      MAE=   4.01  RMSE=   5.91  R2= 0.798
Val        MAE=   9.76  RMSE=  13.76  R2=-0.120
Test       MAE=   9.53  RMSE=  13.42  R2=-0.083

=== Config B (broad) | PCA=False ===
Train      MAE=   3.27  RMSE=   4.68  R2= 0.874
Val        MAE=   8.89  RMSE=  12.63  R2= 0.058
Test       MAE=   8.80  RMSE=  12.48  R2= 0.063

=== Config B (broad) + PCA | PCA=True ===
Train      MAE=   3.34  RMSE=   4.74  R2= 0.870
Val        MAE=   9.04  RMSE=  12.76  R2= 0.038
Test       MAE=   8.98  RMSE=  12.62  R2= 0.042
