In [12]:
# GivEN the dataset full_data_flightdelay.csv,
# the task is to predict whether the delay of a flight will be larger than 15 minutes.
# The data set contains the following columns:
# MONTH:				Month
# DAY_OF_WEEK:			Day of Week
# DEP_DEL15: 			TARGET Binary of a departure delay over 15 minutes (1 is yes)
# DEP_TIME_BLK:			Departure time block
# DISTANCE_GROUP:			Distance group to be flown by departing aircraft
# SEGMENT_NUMBER:			The segment that this tail number is on for the day
# CONCURRENT_FLIGHTS:		Concurrent flights leaving from the airport in the same departure block
# NUMBER_OF_SEATS:		Number of seats on the aircraft
# CARRIER_NAME:			Carrier
# AIRPORT_FLIGHTS_MONTH:		Avg Airport Flights per Month
# AIRLINE_FLIGHTS_MONTH:		Avg Airline Flights per Month
# AIRLINE_AIRPORT_FLIGHTS_MONTH:	Avg Flights per month for Airline AND Airport
# AVG_MONTHLY_PASS_AIRPORT:	Avg Passengers for the departing airport for the month
# AVG_MONTHLY_PASS_AIRLINE:	Avg Passengers for airline for month
# FLT_ATTENDANTS_PER_PASS:	Flight attendants per passenger for airline
# GROUND_SERV_PER_PASS:		Ground service employees (service desk) per passenger for airline
# PLANE_AGE:			Age of departing aircraft
# DEPARTING_AIRPORT:		Departing Airport
# LATITUDE:			Latitude of departing airport
# LONGITUDE:			Longitude of departing airport
# PREVIOUS_AIRPORT:		Previous airport that aircraft departed from
# PRCP:				Inches of precipitation for day
# SNOW:				Inches of snowfall for day
# SNWD:				Inches of snow on ground for day
# TMAX:				Max temperature for day
# AWND:				Max wind speed for day
#
# Final input datapoints provided by the user will consist of:
# MONTH, DAY_OF_WEEK, DEP_TIME_BLK, CARRIER_NAME, DEPARTING_AIRPORT, PRCP, SNOW
# PRCP and SNOW will be converted into [0, 1] values indicating wheter there is rainfall or snowfall at all.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Define the path to the dataset
path = '../../full_data_flightdelay.csv'

In [2]:
# importing the dataset
df = pd.read_csv(path)

# checking the dataset
df.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_TIME_BLK,DISTANCE_GROUP,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,CARRIER_NAME,AIRPORT_FLIGHTS_MONTH,...,PLANE_AGE,DEPARTING_AIRPORT,LATITUDE,LONGITUDE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND
0,1,7,0,0800-0859,2,1,25,143,Southwest Airlines Co.,13056,...,8,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
1,1,7,0,0700-0759,7,1,29,191,Delta Air Lines Inc.,13056,...,3,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
2,1,7,0,0600-0659,7,1,27,199,Delta Air Lines Inc.,13056,...,18,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
3,1,7,0,0600-0659,9,1,27,180,Delta Air Lines Inc.,13056,...,2,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
4,1,7,0,0001-0559,7,1,10,182,Spirit Air Lines,13056,...,1,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91


In [22]:
pd.Series(print(df['SNOW'].unique())).sort_values()
pd.Series(print(df['PRCP'].unique())).sort_values()
pd.Series(print(df['CARRIER_NAME'].unique())).sort_values()
pd.Series(print(df['DEPARTING_AIRPORT'].unique())).sort_values()
pd.Series(print(df['DEP_TIME_BLK'].unique())).sort_values()

[ 0.   0.2  2.1  1.   4.4  2.2  0.1  0.5  0.4  7.8  0.8  1.6  4.7  2.9
  0.7  0.3  3.   3.7  0.9  5.8  2.7  1.9  1.7  6.9  0.6  1.8  4.5  2.5
  7.7  1.1  4.9 13.6  3.1  6.2  1.4  1.5  4.6  4.8  1.3  1.2  5.6  6.4
  2.6 17.2  3.9  4.   2.8  5.   2.3  3.6  3.3  3.8  5.1  3.5  2.   7.4
  2.4  8.3  6.1  4.1  3.2  5.4  6.8  9.9  7.1  4.2  4.3  3.4  6.6  8.5
  8.7  5.5  8.2  8.4  8.8]
[0.000e+00 1.000e-02 6.200e-01 2.200e-01 3.200e-01 4.700e-01 1.600e-01
 3.400e-01 7.000e-02 1.000e-01 1.700e-01 2.160e+00 6.100e-01 4.800e-01
 7.300e-01 1.100e-01 1.480e+00 1.000e+00 1.240e+00 6.700e-01 8.000e-01
 5.400e-01 1.200e-01 3.300e-01 1.540e+00 7.600e-01 8.600e-01 2.450e+00
 6.500e-01 5.700e-01 3.800e-01 4.500e-01 6.900e-01 4.000e-02 1.800e-01
 8.200e-01 3.000e-02 1.170e+00 4.100e-01 8.000e-02 9.500e-01 8.800e-01
 2.800e-01 2.400e-01 1.080e+00 5.600e-01 1.330e+00 3.700e-01 1.710e+00
 7.000e-01 9.000e-02 9.800e-01 8.100e-01 2.320e+00 1.030e+00 4.600e-01
 1.900e-01 2.000e-01 5.000e-02 1.070e+00 6.000e-02

Series([], dtype: object)

In [3]:
# encode the categorical data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

def clean_labels_encoder(list_of_labels, df):
    for label in list_of_labels:
        df[label] = le.fit_transform(df[label])
    return df

# clean the labels
list_of_labels = ['CARRIER_NAME', 'DEPARTING_AIRPORT', 'DEP_TIME_BLK']
df = clean_labels_encoder(list_of_labels, df)

# show head of the dataset
df.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_TIME_BLK,DISTANCE_GROUP,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,CARRIER_NAME,AIRPORT_FLIGHTS_MONTH,...,PLANE_AGE,DEPARTING_AIRPORT,LATITUDE,LONGITUDE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND
0,1,7,0,3,2,1,25,143,14,13056,...,8,44,36.08,-115.152,216,0.0,0.0,0.0,65.0,2.91
1,1,7,0,2,7,1,29,191,6,13056,...,3,44,36.08,-115.152,216,0.0,0.0,0.0,65.0,2.91
2,1,7,0,1,7,1,27,199,6,13056,...,18,44,36.08,-115.152,216,0.0,0.0,0.0,65.0,2.91
3,1,7,0,1,9,1,27,180,6,13056,...,2,44,36.08,-115.152,216,0.0,0.0,0.0,65.0,2.91
4,1,7,0,0,7,1,10,182,15,13056,...,1,44,36.08,-115.152,216,0.0,0.0,0.0,65.0,2.91


In [9]:
# fill the missing values with mean
df.fillna(df.mean(), inplace=True)

In [None]:
# split the data into features and target
# delete unwanted features
# leave only MONTH, DAY_OF_WEEK, DEP_TIME_BLK, CARRIER_NAME, DEPARTING_AIRPORT, PRCP, SNOW
# target is DEP_DEL15

X = df.iloc[:, [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]].values
y = df.iloc[:, 2].values

X

In [None]:
# covert the data into 0 - 1 range using minmax scaler

from sklearn.preprocessing import MinMaxScaler

def scale_data(X):
    scaler = MinMaxScaler()
    X_scaler = scaler.fit_transform(X)
    return X_scaler

X_scaler = scale_data(X)
df = pd.DataFrame(X_scaler)

df.head()

In [None]:
# split the data into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.3, random_state=42)

X_train.shape

In [None]:
# Build a classification model using various supervised machine 
# learning models and check which model gives you the best accuracy

# use the following models
# 1. Logistic Regression
# 2. Decision Tree
# 3. GaussianNB
# 4. MLPClassifier

In [None]:
# create a function to train the models
# check the accuracy of the model
from sklearn.metrics import confusion_matrix, classification_report

def separator(count = 50):
    print('-'*count)

def train_model_and_print_accuracy(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)

    # score train and test sets
    scoreTest = model.score(X_train, y_train)
    scoreTrain = model.score(X_test, y_test)

    # predict the test data
    predict_test = model.predict(X_test)

    cm_result = confusion_matrix(y_test, predict_test)
    cr_result = classification_report(y_test,predict_test)

    model_name = str(model).split('(')[0]

    # print model name in blue color
    print('\033[1m' + model_name + '\033[0m')
    # print -----------------------------------
    separator()
    print('Train Score for '+str(model_name)+': ', (scoreTest))
    separator()
    print('Test Score for '+str(model_name)+': ', (scoreTrain))
    separator()
    print('Confusion Matrix for '+str(model_name)+' for test : \n', (cm_result))
    separator()
    print('Classification Report for '+str(model_name)+' for test : \n', str(cr_result))
    separator()


In [None]:
# using logistic regression
# 1. Logistic Regression
# 2. Decision Tree
# 3. GaussianNB
# 4. MLPClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

log_reg = LogisticRegression()
dt = DecisionTreeClassifier(max_depth=3)
gnb = GaussianNB()
mlp = MLPClassifier(random_state=1, max_iter=300)

# train the models and print the accuracy
models = [log_reg, dt, gnb, mlp]

for model in models:
    train_model_and_print_accuracy(model, X_train, y_train, X_test, y_test)

In [None]:
# conclusion before PCA - principal component analysis

# show that best modal is MLPClassifier with 0.816 accuracy (approx)

In [None]:
# Now use PCA to reduce the dimensionality of the data and
# retrain the models to see what impacts it has on your model in terms of accuracy.
# keep in mind that many times doing PCA can actually decrease the accuracy of your model
# but computation is much lighter and that's trade off you need to consider while build models in real life

In [None]:
# use PCA to reduce the dimensionality of the data

from sklearn.decomposition import PCA

pca = PCA(0.95)

x_pca = pca.fit_transform(X_scaler)

# Show the number of components
x_pca.shape

In [None]:
# split the data into training and testing sets using the new data

X_train, X_test, y_train, y_test = train_test_split(x_pca, y, test_size=0.3, random_state=42)

In [None]:
# now retrain the models
print('*'*25, 'PCA', '*'*25)
for model in models:
    train_model_and_print_accuracy(model, X_train, y_train, X_test, y_test)

In [None]:
# conclusion before PCA - principal component analysis

# show that best modal is MLPClassifier with 0.811 accuracy (approx)

In [None]:
# conclusion before PCA - principal component analysis
# However, it's important to note that accuracy alone is not always the best metric 
# for evaluating the performance of AI models. Depending on the task at hand, 
# other metrics such as precision, recall, F1-score, or AUC-ROC might be more appropriate.

# there LogisticRegression, DecisionTreeClassifier and GaussianNB are the best to use after PCA

In [None]:
# importing the dataset
df = pd.read_csv(path)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# encode the categorical data
# clean the labels
list_of_labels = ['CARRIER_NAME', 'DEPARTING_AIRPORT', 'PREVIOUS_AIRPORT', 'DEP_TIME_BLK']
df = clean_labels_encoder(list_of_labels, df)

# split the data into features and target
# target is DEP_DEL15

X = df.iloc[:, [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]].values
y = df.iloc[:, 2].values

# covert the data into 0 - 1 range using minmax scaler

X_scaler = scale_data(X)
df = pd.DataFrame(X_scaler)

# split the data into training and testing sets

# split the data into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.3, random_state=42)

X_train.shape

# checking the dataset
df.head()

In [None]:
# using some deep learning models

# 1. using Long Short-Term Memory - using keras

import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout


path = '../input/2019-airline-delays-and-cancellations/full_data_flightdelay.csv'
# Load the dataset and preprocess the data
df = pd.read_csv(path)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# encode the categorical data
# clean the labels
list_of_labels = ['CARRIER_NAME', 'DEPARTING_AIRPORT', 'PREVIOUS_AIRPORT', 'DEP_TIME_BLK']
df = clean_labels_encoder(list_of_labels, df)

# Define the input and target variables
X = df.drop(columns=['DEP_DEL15'])
y = df['DEP_DEL15']

# Split the data into training and testing sets
train_size = int(len(X) * 0.7)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Scale the data using a MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reshape the data for input to the LSTM model
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Define the model architecture
model = Sequential()
model.add(LSTM(64, input_shape=(1, X_train.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(32, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=2, shuffle=False)

# Evaluate the model on the test set
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [None]:
# conclusion so far shows that using Long Short-Term Memory - using keras is a good model
# with 0.83 accuracy (approx) however, it's important to note that accuracy alone is not always the best metric
# and LSTM too a lot of time to train.
