In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# 1. Load the dataset
df = pd.read_csv("simulated_data/TAS_Airport_Data_2024_final.csv")


# 2. Handle Missing Values
#
# Imputation is applied to the raw frame up front because step 3 derives new
# columns from these values: TotalDelay would be NaN wherever either of its
# two inputs is missing.

# a) Impute numerical features with the column mean
numerical_features = ["FlightDuration", "DepartureDelay", "ArrivalDelay", "PreviousFlightDelay",
                      "AircraftTurnaroundTime", "CheckInQueueTime", "SecurityWaitTime",
                      "BaggageClaimBelts", "CustomsOfficers", "ImmigrationOfficers"]

numerical_imputer = SimpleImputer(strategy="mean")
df[numerical_features] = numerical_imputer.fit_transform(df[numerical_features])

# b) Impute categorical features with the most frequent value
categorical_features = ["ReasonForDelay", "GateAvailability", "GateType", "LoungeUsage"]

categorical_imputer = SimpleImputer(strategy="most_frequent")
df[categorical_features] = categorical_imputer.fit_transform(df[categorical_features])

# Justification: Handling missing values is crucial to avoid errors in subsequent steps.
# We use imputation to fill in missing values with reasonable estimates.

# 3. Feature Engineering

# a) Create a new feature: TotalDelay (sum of the two already-imputed delay columns)
df["TotalDelay"] = df["DepartureDelay"] + df["ArrivalDelay"]

# b) Convert ScheduledTime and ScheduledArrivalTime to datetime objects
df["ScheduledTime"] = pd.to_datetime(df["ScheduledTime"])
df["ScheduledArrivalTime"] = pd.to_datetime(df["ScheduledArrivalTime"])

# c) Extract hour of the day from ScheduledTime and ScheduledArrivalTime.
#    Rows whose timestamp is NaT yield NaN here; those gaps are created *after*
#    step 2, so they are handled by the imputer inside the numerical pipeline
#    in step 4 rather than by `numerical_imputer` above.
df["ScheduledHour"] = df["ScheduledTime"].dt.hour
df["ScheduledArrivalHour"] = df["ScheduledArrivalTime"].dt.hour

# Justification: Feature engineering creates new features or transforms existing ones
# to improve model performance. We create a `TotalDelay` feature and extract
# meaningful information from the datetime columns.

# 4. Feature Scaling and Encoding

# a) Define features for scaling and encoding
numerical_features_to_scale = ["FlightDuration", "TotalDelay", "PreviousFlightDelay",
                               "AircraftTurnaroundTime", "CheckInQueueTime", "SecurityWaitTime",
                               "BaggageClaimBelts", "CustomsOfficers", "ImmigrationOfficers",
                               "ScheduledHour", "ScheduledArrivalHour"]

categorical_features_to_encode = ["Airline", "WeatherCondition", "ReasonForDelay",
                                   "GateAvailability", "GateType", "LoungeUsage"]

# b) Create transformers.
#    Each pipeline imputes before transforming so that columns which were not
#    covered by step 2 (the engineered hour columns, Airline, WeatherCondition)
#    cannot leak NaN into the final feature matrix. For columns that are
#    already complete the imputer is a no-op.
#    NOTE(review): mean-imputing an hour that is missing because the timestamp
#    itself is absent is only a placeholder strategy - confirm it suits the
#    downstream model.
numerical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# c) Create a ColumnTransformer to apply transformations.
#    Columns not listed in either group are dropped from the output
#    (ColumnTransformer's default `remainder="drop"`).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features_to_scale),
        ("cat", categorical_transformer, categorical_features_to_encode),
    ]
)

# d) Fit and transform the data.
#    The result is a 2-D feature matrix (scaled numeric columns followed by
#    one-hot columns); it may be a SciPy sparse matrix depending on density.
X_processed = preprocessor.fit_transform(df)

# Keep `df` as the final expression so the notebook cell still displays the
# cleaned / engineered frame.
df

Unnamed: 0,FlightID,Airline,DepartureAirport,ArrivalAirport,OriginAirport,Gate,ScheduledTime,ScheduledArrivalTime,FlightDuration,BaggageHandlers,...,CheckInQueueTime,SecurityWaitTime,LoungeUsage,OnTimePerformance,BaggageClaimBelts,CustomsOfficers,ImmigrationOfficers,TotalDelay,ScheduledHour,ScheduledArrivalHour
0,HY20240101_001D,HY,TAS,FRA,,C18,2024-01-01 22:11:00,NaT,135.0,4,...,7.0,27.0,No,Delayed,2.0,9.0,5.0,4.0,22.0,
1,KE20240101_002D,KE,TAS,KUL,,C3,2024-01-01 08:02:00,NaT,521.0,4,...,7.0,27.0,No,Delayed,2.0,9.0,5.0,123.0,8.0,
2,KE20240101_003D,KE,TAS,IST,,C3,2024-01-01 20:33:00,NaT,291.0,4,...,7.0,27.0,No,Delayed,2.0,9.0,5.0,5.0,20.0,
3,HY20240101_004D,HY,TAS,SIN,,C3,2024-01-01 20:03:00,NaT,462.0,2,...,7.0,27.0,No,Delayed,2.0,9.0,5.0,48.0,20.0,
4,HY20240101_005A,HY,,,FRA,C8,NaT,2024-01-01 11:50:00,192.0,4,...,7.0,27.0,No,Delayed,2.0,9.0,5.0,0.0,,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32935,TK20241231_086A,TK,,,DXB,B1,NaT,2024-12-31 16:50:00,119.0,6,...,7.0,27.0,No,Delayed,2.0,9.0,5.0,5.0,,16.0
32936,HY20241231_087A,HY,,,DXB,B16,NaT,2024-12-31 22:10:00,591.0,2,...,7.0,27.0,No,Delayed,2.0,9.0,5.0,5.0,,22.0
32937,HY20241231_088A,HY,,,LHR,C8,NaT,2024-12-31 19:04:00,103.0,4,...,7.0,27.0,No,Delayed,2.0,9.0,5.0,5.0,,19.0
32938,TK20241231_089A,TK,,,DEL,B20,NaT,2025-01-01 06:42:00,596.0,4,...,7.0,27.0,No,Delayed,2.0,9.0,5.0,0.0,,6.0
