# On-Time Performance

## Imports

In [16]:
import pandas as pd
import numpy as np
import pickle
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data


In [3]:
df = pd.read_csv("TAS_Airport_Data_2024_balanced.csv")
df

Unnamed: 0,FlightID,Airline,DepartureAirport,ArrivalAirport,OriginAirport,Gate,ScheduledTime,ScheduledArrivalTime,FlightDuration,BaggageHandlers,...,GateAvailability,GateType,AircraftTurnaroundTime,CheckInQueueTime,SecurityWaitTime,LoungeUsage,OnTimePerformance,BaggageClaimBelts,CustomsOfficers,ImmigrationOfficers
0,HY20240101_001D,HY,TAS,FRA,,C18,2024-01-01 22:11:00,,135,4,...,Available,International,57,7,27,No,Delayed,2,9,5
1,KE20240101_002D,KE,TAS,KUL,,C3,2024-01-01 08:02:00,,521,4,...,Available,International,57,7,27,No,Delayed,2,9,5
2,KE20240101_003D,KE,TAS,IST,,C3,2024-01-01 20:33:00,,291,4,...,Available,International,57,7,27,No,Delayed,2,9,5
3,HY20240101_004D,HY,TAS,SIN,,C3,2024-01-01 20:03:00,,462,2,...,Available,International,57,7,27,No,OnTime,2,9,5
4,HY20240101_005A,HY,,,FRA,C8,,2024-01-01 11:50:00,192,4,...,Available,International,57,7,27,No,OnTime,2,9,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32935,TK20241231_086A,TK,,,DXB,B1,,2024-12-31 16:50:00,119,6,...,Available,International,57,7,27,No,OnTime,2,9,5
32936,HY20241231_087A,HY,,,DXB,B16,,2024-12-31 22:10:00,591,2,...,Available,International,57,7,27,No,Delayed,2,9,5
32937,HY20241231_088A,HY,,,LHR,C8,,2024-12-31 19:04:00,103,4,...,Available,International,57,7,27,No,Delayed,2,9,5
32938,TK20241231_089A,TK,,,DEL,B20,,2025-01-01 06:42:00,596,4,...,Available,International,57,7,27,No,OnTime,2,9,5


## Feature Engineering

### Handle datetime 

In [4]:
df['ScheduledTime'] = pd.to_datetime(df['ScheduledTime'], errors='coerce')
df['DepartureHour'] = df['ScheduledTime'].dt.hour
df['DepartureDayOfWeek'] = df['ScheduledTime'].dt.dayofweek

### Fill Missing categorical 

In [5]:
categorical_cols = [
    "Airline", "DepartureAirport", "ArrivalAirport", "OriginAirport",
    "AircraftType", "WeatherCondition", "ReasonForDelay", "GateAvailability",
    "GateType", "LoungeUsage"
]
for col in categorical_cols:
    df[col] = df[col].fillna("Unknown")

### Fill numerical

In [6]:
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

### Label encode

In [7]:
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


ontime_encoder = LabelEncoder()
df['OnTimePerformance_encoded'] = ontime_encoder.fit_transform(df['OnTimePerformance'])


df['DelayImpact'] = df['RunwayDelay'] + df['DepartureDelay'] + df['ArrivalDelay']
df['ResourceLoad'] = df['GroundCrew'] + df['FuelTrucks'] + df['CleaningCrew'] + df['SecurityChecks']
df['SecurityEfficiency'] = df['SecurityWaitTime'] / (df['SecurityChecks'] + 1)


## Prepare fetures and target 

In [8]:
features = [
    "Airline", "DepartureAirport", "ArrivalAirport", "OriginAirport", "AircraftType",
    "FlightDuration", "Passengers", "WeatherCondition", "PreviousFlightDelay",
    "GateAvailability", "GateType", "AircraftTurnaroundTime", "CheckInQueueTime",
    "SecurityWaitTime", "DepartureHour", "DepartureDayOfWeek",
    "DelayImpact", "ResourceLoad", "SecurityEfficiency"
]

X = df[features]
y = df["OnTimePerformance_encoded"]


### Scaling

In [9]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Train-Test Split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y  
)

## Model Training 

In [11]:
model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42)
model.fit(X_train, y_train)

In [12]:
print("Classes in y_train:", np.unique(y_train, return_counts=True))
print("Classes in y_test:", np.unique(y_test, return_counts=True))

Classes in y_train: (array([0, 1]), array([14494, 11858]))
Classes in y_test: (array([0, 1]), array([3623, 2965]))


## Evaluvation

In [13]:
y_pred = model.predict(X_test)

print("\n--- On-Time Performance Model ---\n")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


--- On-Time Performance Model ---

              precision    recall  f1-score   support

           0       0.78      0.73      0.75      3623
           1       0.69      0.74      0.72      2965

    accuracy                           0.74      6588
   macro avg       0.74      0.74      0.74      6588
weighted avg       0.74      0.74      0.74      6588

[[2648  975]
 [ 759 2206]]


In [18]:
import joblib

joblib.dump(model, "on_time_performance.joblib")

['on_time_performance.joblib']

## Visualization

In [15]:
# 1. Confusion Matrix Plot
def plot_confusion_matrix(y_true, y_pred, labels, title="Confusion Matrix"):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(title)
    plt.show()

# 2. Feature Importance Plot
def plot_feature_importance(model, feature_names, top_n=15, title="Top Feature Importances"):
    importances = model.feature_importances_
    indices = np.argsort(importances)[-top_n:]  
    plt.figure(figsize=(10,6))
    plt.barh(range(len(indices)), importances[indices], align="center")
    plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
    plt.xlabel("Relative Importance")
    plt.title(title)
    plt.grid(True)
    plt.show()

print("\n--- Visualizations ---")

plot_confusion_matrix(y_test, y_pred, labels=target_encoder.classes_, title="On-Time Performance Confusion Matrix")
plot_feature_importance(model, features, top_n=10, title="Top 10 Features Influencing On-Time Performance")


--- Visualizations ---


NameError: name 'target_encoder' is not defined