In [1]:
# Given the dataset full_data_flightdelay.csv,
# the task is to predict whether the delay of a flight will be larger than 15 minutes.
# The data set contains the following columns:
# MONTH:				Month
# DAY_OF_WEEK:			Day of Week
# DEP_DEL15: 			TARGET Binary of a departure delay over 15 minutes (1 is yes)
# DEP_TIME_BLK:			Departure time block
# DISTANCE_GROUP:			Distance group to be flown by departing aircraft
# SEGMENT_NUMBER:			The segment that this tail number is on for the day
# CONCURRENT_FLIGHTS:		Concurrent flights leaving from the airport in the same departure block
# NUMBER_OF_SEATS:		Number of seats on the aircraft
# CARRIER_NAME:			Carrier
# AIRPORT_FLIGHTS_MONTH:		Avg Airport Flights per Month
# AIRLINE_FLIGHTS_MONTH:		Avg Airline Flights per Month
# AIRLINE_AIRPORT_FLIGHTS_MONTH:	Avg Flights per month for Airline AND Airport
# AVG_MONTHLY_PASS_AIRPORT:	Avg Passengers for the departing airport for the month
# AVG_MONTHLY_PASS_AIRLINE:	Avg Passengers for airline for month
# FLT_ATTENDANTS_PER_PASS:	Flight attendants per passenger for airline
# GROUND_SERV_PER_PASS:		Ground service employees (service desk) per passenger for airline
# PLANE_AGE:			Age of departing aircraft
# DEPARTING_AIRPORT:		Departing Airport
# LATITUDE:			Latitude of departing airport
# LONGITUDE:			Longitude of departing airport
# PREVIOUS_AIRPORT:		Previous airport that aircraft departed from
# PRCP:				Inches of precipitation for day
# SNOW:				Inches of snowfall for day
# SNWD:				Inches of snow on ground for day
# TMAX:				Max temperature for day
# AWND:				Max wind speed for day
#
# Final input datapoints provided by the user will consist of:
# MONTH, DAY_OF_WEEK, DEP_TIME_BLK, DISTANCE_GROUP, CARRIER_NAME, DEPARTING_AIRPORT, PRCP, SNOW
# PRCP and SNOW will be converted into binary (0/1) flags indicating whether there is any rainfall or snowfall at all.
# DISTANCE_GROUP will be replaced by SHORT_FLIGHT (0/1) to indicate whether this is a short flight or not.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Location of the flight-delay CSV, given relative to the notebook's working
# directory; it is handed to pd.read_csv when the dataset is loaded.
path = '../../full_data_flightdelay.csv'

In [3]:
# Read the complete CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_TIME_BLK,DISTANCE_GROUP,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,CARRIER_NAME,AIRPORT_FLIGHTS_MONTH,...,PLANE_AGE,DEPARTING_AIRPORT,LATITUDE,LONGITUDE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND
0,1,7,0,0800-0859,2,1,25,143,Southwest Airlines Co.,13056,...,8,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
1,1,7,0,0700-0759,7,1,29,191,Delta Air Lines Inc.,13056,...,3,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
2,1,7,0,0600-0659,7,1,27,199,Delta Air Lines Inc.,13056,...,18,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
3,1,7,0,0600-0659,9,1,27,180,Delta Air Lines Inc.,13056,...,2,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
4,1,7,0,0001-0559,7,1,10,182,Spirit Air Lines,13056,...,1,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91


In [4]:
# Discard, in place, every column that is not needed for the final model input.
columns_to_delete = [
    'SEGMENT_NUMBER',
    'CONCURRENT_FLIGHTS',
    'NUMBER_OF_SEATS',
    'AIRPORT_FLIGHTS_MONTH',
    'AIRLINE_FLIGHTS_MONTH',
    'AIRLINE_AIRPORT_FLIGHTS_MONTH',
    'AVG_MONTHLY_PASS_AIRPORT',
    'AVG_MONTHLY_PASS_AIRLINE',
    'FLT_ATTENDANTS_PER_PASS',
    'GROUND_SERV_PER_PASS',
    'PLANE_AGE',
    'LATITUDE',
    'LONGITUDE',
    'PREVIOUS_AIRPORT',
    'SNWD',
    'TMAX',
    'AWND',
]

df.drop(columns_to_delete, axis='columns', inplace=True)
df.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_TIME_BLK,DISTANCE_GROUP,CARRIER_NAME,DEPARTING_AIRPORT,PRCP,SNOW
0,1,7,0,0800-0859,2,Southwest Airlines Co.,McCarran International,0.0,0.0
1,1,7,0,0700-0759,7,Delta Air Lines Inc.,McCarran International,0.0,0.0
2,1,7,0,0600-0659,7,Delta Air Lines Inc.,McCarran International,0.0,0.0
3,1,7,0,0600-0659,9,Delta Air Lines Inc.,McCarran International,0.0,0.0
4,1,7,0,0001-0559,7,Spirit Air Lines,McCarran International,0.0,0.0


In [5]:
# encode the categorical data
from sklearn.preprocessing import LabelEncoder
# Shared encoder instance; it is re-fitted for every column, so after the loop
# it only remembers the classes of the last column that was encoded.
le = LabelEncoder()
# column name -> {original value: integer code}; filled by clean_labels_encoder
mapping_dict = {}

def clean_labels_encoder(list_of_labels, df):
    """Replace the given columns of ``df`` with integer label codes.

    Each column is encoded with the module-level ``LabelEncoder`` and the
    resulting ``{original value: code}`` table is stored in ``mapping_dict``
    under the column name, so user-supplied values can later be translated
    into the codes the model was trained on.

    Args:
        list_of_labels: Names of the columns to encode.
        df: DataFrame holding those columns; it is modified in place.

    Returns:
        The same ``df`` object, with the listed columns encoded.
    """
    for label in list_of_labels:
        # Re-run guard: if this column was already encoded (a mapping exists and
        # the values are integers), encoding it again would overwrite the
        # original-label mapping with a meaningless int -> int table.
        if label in mapping_dict and pd.api.types.is_integer_dtype(df[label]):
            continue
        df[label] = le.fit_transform(df[label])
        # classes_ is sorted and the code of each class is its position, so
        # enumerate() yields the mapping directly, with plain Python ints
        # (stable repr across NumPy versions and JSON-serialisable).
        mapping_dict[label] = {original: code for code, original in enumerate(le.classes_)}
    return df

# Integer-encode the three text columns
list_of_labels = ['DEP_TIME_BLK', 'CARRIER_NAME', 'DEPARTING_AIRPORT']
df = clean_labels_encoder(list_of_labels, df)

# For every encoded column, list each original value next to the code it received
for label in mapping_dict:
    mapping = mapping_dict[label]
    print(f"Column: {label}")
    print(mapping)

# Preview the encoded frame
df.head()

Column: DEP_TIME_BLK
{'0001-0559': 0, '0600-0659': 1, '0700-0759': 2, '0800-0859': 3, '0900-0959': 4, '1000-1059': 5, '1100-1159': 6, '1200-1259': 7, '1300-1359': 8, '1400-1459': 9, '1500-1559': 10, '1600-1659': 11, '1700-1759': 12, '1800-1859': 13, '1900-1959': 14, '2000-2059': 15, '2100-2159': 16, '2200-2259': 17, '2300-2359': 18}
Column: CARRIER_NAME
{'Alaska Airlines Inc.': 0, 'Allegiant Air': 1, 'American Airlines Inc.': 2, 'American Eagle Airlines Inc.': 3, 'Atlantic Southeast Airlines': 4, 'Comair Inc.': 5, 'Delta Air Lines Inc.': 6, 'Endeavor Air Inc.': 7, 'Frontier Airlines Inc.': 8, 'Hawaiian Airlines Inc.': 9, 'JetBlue Airways': 10, 'Mesa Airlines Inc.': 11, 'Midwest Airline, Inc.': 12, 'SkyWest Airlines Inc.': 13, 'Southwest Airlines Co.': 14, 'Spirit Air Lines': 15, 'United Air Lines Inc.': 16}
Column: DEPARTING_AIRPORT
{'Adams Field': 0, 'Albany International': 1, 'Albuquerque International Sunport': 2, 'Anchorage International': 3, 'Atlanta Municipal': 4, 'Austin - Bergs

Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_TIME_BLK,DISTANCE_GROUP,CARRIER_NAME,DEPARTING_AIRPORT,PRCP,SNOW
0,1,7,0,3,2,14,44,0.0,0.0
1,1,7,0,2,7,6,44,0.0,0.0
2,1,7,0,1,7,6,44,0.0,0.0
3,1,7,0,1,9,6,44,0.0,0.0
4,1,7,0,0,7,15,44,0.0,0.0


In [6]:
# Impute missing entries column by column with that column's arithmetic mean.
# NOTE(review): df.mean() spans every column present at this point, which
# includes the target DEP_DEL15 and the label-encoded columns; a gap there
# would be filled with a fractional value - confirm those columns have no NaNs.
column_means = df.mean()
df.fillna(value=column_means, inplace=True)

In [7]:
# Collapse three detailed columns into binary (0/1) indicator columns:
#   SHORT_FLIGHT - 1 when DISTANCE_GROUP is 3 or lower
#   RAINS        - 1 when PRCP is greater than 0
#   SNOWS        - 1 when SNOW is greater than 0
df['SHORT_FLIGHT'] = df['DISTANCE_GROUP'].le(3).astype(int)
df['RAINS'] = df['PRCP'].gt(0).astype(int)
df['SNOWS'] = df['SNOW'].gt(0).astype(int)

# The source columns are dropped once the indicators exist
columns_to_delete = ['DISTANCE_GROUP', 'PRCP', 'SNOW']
df.drop(columns_to_delete, axis='columns', inplace=True)

df.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_TIME_BLK,CARRIER_NAME,DEPARTING_AIRPORT,SHORT_FLIGHT,RAINS,SNOWS
0,1,7,0,3,14,44,1,0,0
1,1,7,0,2,6,44,0,0,0
2,1,7,0,1,6,44,0,0,0
3,1,7,0,1,6,44,0,0,0
4,1,7,0,0,15,44,0,0,0


In [23]:
# Verify column datatypes: print the frame summary (row count, column names,
# dtypes and memory usage) to check that every remaining column is numeric
# before the data is handed to the model.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6489062 entries, 0 to 6489061
Data columns (total 9 columns):
 #   Column             Dtype
---  ------             -----
 0   MONTH              int64
 1   DAY_OF_WEEK        int64
 2   DEP_DEL15          int64
 3   DEP_TIME_BLK       int32
 4   CARRIER_NAME       int32
 5   DEPARTING_AIRPORT  int32
 6   SHORT_FLIGHT       int32
 7   RAINS              int32
 8   SNOWS              int32
dtypes: int32(6), int64(3)
memory usage: 297.0 MB


In [52]:
# Randomly permute all rows (fixed seed, so the order is repeatable) and
# renumber them from zero, discarding the pre-shuffle index.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_TIME_BLK,CARRIER_NAME,DEPARTING_AIRPORT,SHORT_FLIGHT,RAINS,SNOWS
0,3,1,0,7,16,5,0,0,0
1,11,1,0,13,14,79,0,0,0
2,5,6,0,1,2,48,1,0,0
3,4,6,0,15,2,62,1,0,0
4,5,3,0,2,13,86,1,1,0


In [36]:
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

# Prepare the inputs for the Long Short-Term Memory model (built with keras)

# Target is DEP_DEL15; every other column is a feature
y = df['DEP_DEL15']
X = df.drop(columns=['DEP_DEL15'])

# Positional 80/20 split: the first 80 % of the rows train, the rest test
train_size = int(len(X) * 0.8)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

# Min-max scale the features; the scaler learns its ranges from the training rows only
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# LSTM layers expect (samples, timesteps, features): insert a timestep axis of length 1
X_train = np.expand_dims(X_train, axis=1)
X_test = np.expand_dims(X_test, axis=1)

# Seed the Python, NumPy and backend random generators so that weight
# initialisation and dropout masks are repeatable between runs.
keras.utils.set_random_seed(42)

# Define the model architecture: six stacked LSTM layers of shrinking width
# with dropout in between, ending in a single sigmoid unit for the binary target.
model = Sequential()
# Declare the input shape with an explicit Input layer; passing `input_shape`
# to the first layer of a Sequential model is deprecated and emits a warning.
model.add(keras.Input(shape=(1, X_train.shape[2])))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(32, return_sequences=True))
model.add(Dropout(0.1))
# The last LSTM returns only its final output so the Dense layer receives a 2-D tensor
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))

# Compile the model for binary classification.
# The metric order fixes the positions in the list returned by model.evaluate:
# loss first, then accuracy, precision, AUC and recall.
tracked_metrics = [
    'accuracy',
    keras.metrics.Precision(),
    keras.metrics.AUC(),
    keras.metrics.Recall(),
]
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=tracked_metrics)

# Train the model.
# NOTE(review): the test split doubles as validation data here, so the val_*
# figures and the final test scores are computed on the same rows.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=2,
    batch_size=512,
    shuffle=False,
    verbose=2,
)

# Score the trained model on the held-out rows and print each value;
# positions follow the compile order: loss, accuracy, precision, AUC, recall.
score = model.evaluate(X_test, y_test, verbose=0)
score_labels = ('Test loss:', 'Test accuracy:', 'Test precision:', 'Test AUC_ROC:', 'Test recall:')
for position, score_label in enumerate(score_labels):
    print(score_label, score[position])

  super().__init__(**kwargs)


Epoch 1/2
10140/10140 - 257s - 25ms/step - accuracy: 0.8107 - auc_3: 0.6422 - loss: 0.4656 - precision_3: 0.2738 - recall_3: 1.6697e-04 - val_accuracy: 0.8111 - val_auc_3: 0.6536 - val_loss: 0.4614 - val_precision_3: 0.0000e+00 - val_recall_3: 0.0000e+00
Epoch 2/2
10140/10140 - 250s - 25ms/step - accuracy: 0.8108 - auc_3: 0.6559 - loss: 0.4611 - precision_3: 0.5092 - recall_3: 0.0044 - val_accuracy: 0.8113 - val_auc_3: 0.6616 - val_loss: 0.4589 - val_precision_3: 0.5401 - val_recall_3: 0.0081
Test loss: 0.45892372727394104
Test accuracy: 0.8113449215888977
Test precision: 0.5401419997215271
Test AUC_ROC: 0.6615602970123291
Test recall: 0.00806908868253231


In [35]:
# Results after 10 epochs:
#Test loss: 0.4490787982940674
#Test accuracy: 0.8135802149772644
#Test precision: 0.5773495435714722
#Test AUC_ROC: 0.6869732141494751
#Test recall: 0.04864298179745674

In [53]:
import os

import joblib

# Make sure the output directory exists; both writes below fail on a fresh
# checkout if './utils' is missing.
os.makedirs('./utils', exist_ok=True)

# Save the ML model
model.save("./utils/test_prediction_model.keras")

# Save the scaler, so inference can apply exactly the scaling used in training
joblib.dump(scaler, './utils/scaler.pkl')

['./utils/scaler.pkl']

In [54]:
import joblib
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, TFSMLayer
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

# Load saved model and scaler
# (joblib is imported above so this cell also runs on a fresh kernel)
reconstructed_model = keras.models.load_model("./utils/test_prediction_model.keras")
scaler_loaded = joblib.load('./utils/scaler.pkl')

# Prepare the data so that the model can make a prediction.
# The sample is a single-row DataFrame whose columns carry the training feature
# names in training order. The scaler was fitted on a DataFrame, so named input
# avoids its feature-name warning and lets it reject a wrong column set/order.
# Categorical values are the integer codes printed by the encoding cell
# (e.g. DEP_TIME_BLK 10 -> '1500-1559', CARRIER_NAME 6 -> 'Delta Air Lines Inc.').
sample = {
    'MONTH': 10,
    'DAY_OF_WEEK': 2,
    'DEP_TIME_BLK': 10,
    'CARRIER_NAME': 6,
    'DEPARTING_AIRPORT': 11,
    'SHORT_FLIGHT': 1,
    'RAINS': 1,
    'SNOWS': 0,
}
my_dataset = pd.DataFrame([sample])
my_dataset = scaler_loaded.transform(my_dataset)
# Add the length-1 timestep axis the LSTM expects: (samples, 1, features)
x = my_dataset.reshape(-1, 1, my_dataset.shape[1])
print(x)

# Predict flight delay for the given dataset.
# prediction > 0.5 -> flight predicted to be delayed by more than 15 minutes
# prediction < 0.5 -> flight predicted not to be delayed by more than 15 minutes
prediction = reconstructed_model.predict(x)
print(prediction)



[[[0.81818182 0.16666667 0.55555556 0.375      0.11578947 1.
   1.         0.        ]]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[[0.20788525]]
