In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from sklearn import metrics, linear_model
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import RandomOverSampler
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('aug2023.csv')
data.shape

(602987, 30)

In [3]:
data2 = data.copy()

In [4]:
columns_to_fill = ['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']
data2[columns_to_fill] = data2[columns_to_fill].fillna(0)

data2.isna().sum()

FL_DATE                       0
OP_UNIQUE_CARRIER             0
OP_CARRIER_FL_NUM             0
ORIGIN_AIRPORT_ID             0
ORIGIN_AIRPORT_SEQ_ID         0
ORIGIN_CITY_MARKET_ID         0
ORIGIN                        0
ORIGIN_CITY_NAME              0
ORIGIN_STATE_ABR              0
DEST_AIRPORT_ID               0
DEST_AIRPORT_SEQ_ID           0
DEST_CITY_MARKET_ID           0
DEST                          0
DEST_CITY_NAME                0
DEST_STATE_ABR                0
DEP_TIME                   8883
DEP_DELAY                  8886
DEP_DEL15                  8886
ARR_TIME                   9272
ARR_DELAY                 10845
ARR_DEL15                 10845
CANCELLED                     0
CANCELLATION_CODE        593815
DIVERTED                      0
FLIGHTS                       0
CARRIER_DELAY                 0
WEATHER_DELAY                 0
NAS_DELAY                     0
SECURITY_DELAY                0
LATE_AIRCRAFT_DELAY           0
dtype: int64

In [5]:
# convert the column to datetime
data2['FL_DATE'] = pd.to_datetime(data2['FL_DATE'])

# extract month and year
data2['FL_DATE_month'] = pd.to_datetime(data2['FL_DATE']).dt.month
data2['FL_DATE_year'] = pd.to_datetime(data2['FL_DATE']).dt.year


In [11]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602987 entries, 0 to 602986
Data columns (total 32 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   FL_DATE                602987 non-null  datetime64[ns]
 1   OP_UNIQUE_CARRIER      602987 non-null  object        
 2   OP_CARRIER_FL_NUM      602987 non-null  int64         
 3   ORIGIN_AIRPORT_ID      602987 non-null  int64         
 4   ORIGIN_AIRPORT_SEQ_ID  602987 non-null  int64         
 5   ORIGIN_CITY_MARKET_ID  602987 non-null  int64         
 6   ORIGIN                 602987 non-null  object        
 7   ORIGIN_CITY_NAME       602987 non-null  object        
 8   ORIGIN_STATE_ABR       602987 non-null  object        
 9   DEST_AIRPORT_ID        602987 non-null  int64         
 10  DEST_AIRPORT_SEQ_ID    602987 non-null  int64         
 11  DEST_CITY_MARKET_ID    602987 non-null  int64         
 12  DEST                   602987 non-null  obje

In [6]:
cols = ["FL_DATE_month", "OP_UNIQUE_CARRIER",
          "ORIGIN",
          "DEST",
          "DEP_TIME",
          "DEP_DELAY",
          "DEP_DEL15",
          "ARR_TIME",
          "ARR_DEL15", "CANCELLED"]
model_data = data2[cols].copy()

In [7]:
labelEncoder = LabelEncoder()
model_data["ORIGIN"] = labelEncoder.fit_transform(model_data["ORIGIN"])
model_data["DEST"] = labelEncoder.fit_transform(model_data["DEST"])
model_data["OP_UNIQUE_CARRIER"] = labelEncoder.fit_transform(model_data["OP_UNIQUE_CARRIER"])

In [8]:
features = model_data.drop("CANCELLED", axis=1)
labels = np.asarray(model_data["CANCELLED"])

In [9]:

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)


#Cancelled Analysis

In [14]:
# sampling the dataset for minority class
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)

X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

In [15]:


def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted')
    recall = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    return accuracy, precision, recall, f1

In [12]:
# ANN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train_resampled, y_train_resampled, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict(X_test)
y_pred_binary = [1 if pred >= 0.5 else 0 for pred in y_pred]

accuracy = accuracy_score(y_test, y_pred_binary)
print(f"Accuracy on the test set: {accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy on the test set: 0.9848256189986567


In [16]:
# histogram-based gradient boosting

parameters = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [1, 2, 4],
    'max_iter': [5, 10, 20],
}

gb = HistGradientBoostingClassifier()
grid_search = GridSearchCV(gb, parameters, cv=3, error_score='raise')
grid_search.fit(X_train_resampled, y_train_resampled)

best_gb = grid_search.best_estimator_
best_gb

In [17]:
accuracy_train, precision_train, recall_train, f1_train = evaluate_model(best_gb, X_train_resampled, y_train_resampled)
print(f"Accuracy for the training data: {accuracy_train}")
print(f"Precision for the training data: {precision_train}")
print(f"recall for the training data: {recall_train}")
print(f"f1-score for the training data: {recall_train}")

Accuracy for the training data: 0.9999768443964492
Precision for the training data: 0.9999768454687635
recall for the training data: 0.9999768443964492
f1-score for the training data: 0.9999768443964492


In [18]:
accuracy_test, precision_test, recall_test, f1_test = evaluate_model(best_gb, X_test, y_test)
print(f"Accuracy for the test data: {accuracy_test}")
print(f"Precision for the test data: {precision_test}")
print(f"recall for the test data: {recall_test}")
print(f"f1-score for the test data: {f1_test}")

Accuracy for the test data: 0.9994693112655267
Precision for the test data: 0.9994695970814602
recall for the test data: 0.9994693112655267
f1-score for the test data: 0.9994646602583612


#Delay Analysis

In [20]:
cols = ["FL_DATE_month", "OP_UNIQUE_CARRIER",
          "ORIGIN",
          "DEST",
          "DEP_TIME",
          "DEP_DELAY",
          "DEP_DEL15",
          "ARR_TIME",
          "ARR_DEL15"]
model_data = data2[cols].copy()

model_data = model_data.dropna()

labelEncoder = LabelEncoder()
model_data["ORIGIN"] = labelEncoder.fit_transform(model_data["ORIGIN"])
model_data["DEST"] = labelEncoder.fit_transform(model_data["DEST"])
model_data["OP_UNIQUE_CARRIER"] = labelEncoder.fit_transform(model_data["OP_UNIQUE_CARRIER"])

features = model_data.drop("ARR_DEL15", axis=1)
labels = np.asarray(model_data["ARR_DEL15"])

In [21]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [22]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted')
    recall = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    return accuracy, precision, recall, f1


# histogram-based gradient boosting
parameters = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [1, 2, 4],
    'max_iter': [5, 10, 20],
}

gb = HistGradientBoostingClassifier()
grid_search = GridSearchCV(gb, parameters, cv=3, error_score='raise')
grid_search.fit(X_train, y_train)

best_gb = grid_search.best_estimator_

best_gb

In [24]:
accuracy_train, precision_train, recall_train, f1_train = evaluate_model(best_gb, X_train, y_train)
print(f"Accuracy for the training data: {accuracy_train}")
print(f"Precision for the training data: {precision_train}")
print(f"recall for the training data: {recall_train}")
print(f"f1-score for the training data: {recall_train}")

accuracy_test, precision_test, recall_test, f1_test = evaluate_model(best_gb, X_test, y_test)
print(f"Accuracy for the test data: {accuracy_test}")
print(f"Precision for the test data: {precision_test}")
print(f"recall for the test data: {recall_test}")
print(f"f1-score for the test data: {f1_test}")

Accuracy for the training data: 0.9310025268464239
Precision for the training data: 0.9304116989424406
recall for the training data: 0.9310025268464239
f1-score for the training data: 0.9310025268464239
Accuracy for the test data: 0.9315539268253553
Precision for the test data: 0.9309176575269105
recall for the test data: 0.9315539268253553
f1-score for the test data: 0.9289328528265938


In [25]:
# ANN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict(X_test)
y_pred_binary = [1 if pred >= 0.5 else 0 for pred in y_pred]

accuracy = accuracy_score(y_test, y_pred_binary)
print(f"Accuracy on the test set: {accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy on the test set: 0.9310304064038369
