In [None]:
# Given the data set full_data_flightdelay.csv in the archive,
# the task is to predict whether a flight departs more than 15 minutes late (binary target DEP_DEL15).
# The data set contains the following columns:
# MONTH:				Month
# DAY_OF_WEEK:			Day of Week
# DEP_DEL15: 			TARGET Binary of a departure delay over 15 minutes (1 is yes)
# DEP_TIME_BLK:			Departure time block
# DISTANCE_GROUP:			Distance group to be flown by departing aircraft
# SEGMENT_NUMBER:			The segment that this tail number is on for the day
# CONCURRENT_FLIGHTS:		Concurrent flights leaving from the airport in the same departure block
# NUMBER_OF_SEATS:		Number of seats on the aircraft
# CARRIER_NAME:			Carrier
# AIRPORT_FLIGHTS_MONTH:		Avg Airport Flights per Month
# AIRLINE_FLIGHTS_MONTH:		Avg Airline Flights per Month
# AIRLINE_AIRPORT_FLIGHTS_MONTH:	Avg Flights per month for Airline AND Airport
# AVG_MONTHLY_PASS_AIRPORT:	Avg Passengers for the departing airport for the month
# AVG_MONTHLY_PASS_AIRLINE:	Avg Passengers for airline for month
# FLT_ATTENDANTS_PER_PASS:	Flight attendants per passenger for airline
# GROUND_SERV_PER_PASS:		Ground service employees (service desk) per passenger for airline
# PLANE_AGE:			Age of departing aircraft
# DEPARTING_AIRPORT:		Departing Airport
# LATITUDE:			Latitude of departing airport
# LONGITUDE:			Longitude of departing airport
# PREVIOUS_AIRPORT:		Previous airport that aircraft departed from
# PRCP:				Inches of precipitation for day
# SNOW:				Inches of snowfall for day
# SNWD:				Inches of snow on ground for day
# TMAX:				Max temperature for day
# AWND:				Max wind speed for day



# Build a classification model using various supervised and unsupervised machine learning models
# and check which model gives you the best accuracy

# use the following models for supervised machine learning
# 1. Logistic Regression
# 2. Decision Tree
# 3. GaussianNB
# 4. MLPClassifier

# using some deep learning models
# 1. Long Short-Term Memory (LSTM)


# using hyperparameter tuning to improve the accuracy of the models


In [None]:
# Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Define the path to the data set
# '../input/2019-airline-delays-and-cancellations/full_data_flightdelay.csv'
path = '../input/2019-airline-delays-and-cancellations/full_data_flightdelay.csv'

In [None]:
# importing the dataset
df = pd.read_csv(path)

# checking the dataset
df.head()

In [None]:
# show the shape of the dataset and the number of rows and columns
df.shape

In [None]:
# check the data types of the columns
df.info()

In [None]:
# encode the categorical data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

def clean_labels_encoder(list_of_labels, df):
    """Replace the given columns of ``df`` with integer label codes.

    Each column named in ``list_of_labels`` is overwritten with the result of
    ``LabelEncoder.fit_transform`` on that column.

    Parameters
    ----------
    list_of_labels : iterable of str
        Names of the columns in ``df`` to encode.
    df : pandas.DataFrame
        Frame holding those columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the columns encoded.
    """
    for label in list_of_labels:
        # A fresh encoder per column: the function no longer depends on (or
        # re-fits) a shared module-level encoder object.
        df[label] = LabelEncoder().fit_transform(df[label])
    return df

# clean the labels
list_of_labels = ['CARRIER_NAME', 'DEPARTING_AIRPORT', 'PREVIOUS_AIRPORT', 'DEP_TIME_BLK']
df = clean_labels_encoder(list_of_labels, df)

# show head of the dataset
df.head()

In [None]:
# check the data types of the columns
df.info()

In [None]:
# describe the dataset
df.describe()

In [None]:
# fill the missing values with the mean of their column
# (rebinding instead of inplace=True keeps this cell safe to re-run and
# avoids silently mutating a frame that earlier cells already displayed)
df = df.fillna(df.mean())

# show correlation
df.corr()

In [None]:
# show correlation in a heatmap

import matplotlib.pyplot as plt
import seaborn as sns

# show the correlation in a plt figure
def show_correlation(df):
    """Draw an annotated heatmap of the pairwise correlations of ``df``'s columns.

    Opens a new 20x10 matplotlib figure, plots ``df.corr()`` with seaborn
    using the 'coolwarm' colour map, and shows it. Returns nothing.
    """
    plt.figure(figsize=(20, 10))
    sns.set(style='whitegrid', context='notebook')
    sns.heatmap(df.corr(), annot=True, square=False, cmap='coolwarm')
    plt.show()

# show the correlation
show_correlation(df)


In [None]:
# split the data into features and target
# target is DEP_DEL15

# Features are the columns at positions 0-22 except position 2, which is the target.
# NOTE(review): columns at position 23 and beyond, if the CSV has any, are not
# selected here - confirm that leaving them out is intended.
X = df.iloc[:, [position for position in range(23) if position != 2]].values
y = df.iloc[:, 2].values

X

In [None]:
# convert the data into the 0 - 1 range using a min-max scaler

from sklearn.preprocessing import MinMaxScaler

def scale_data(X):
    """Return ``X`` rescaled by a ``MinMaxScaler`` that is fit on ``X`` itself.

    A new scaler is created on every call and is not returned, so only the
    transformed array is available to the caller.
    """
    return MinMaxScaler().fit_transform(X)

# NOTE(review): scale_data fits the scaler on the whole feature matrix X, i.e.
# before the train/test split done in the next cell, so the scaling ranges
# are computed with the rows that later end up in the test set as well.
X_scaler = scale_data(X)
# Rebinding `df` here replaces the encoded frame with the scaled features only;
# X was taken via `.values`, so the new frame has positional column labels and
# no target column.
df = pd.DataFrame(X_scaler)

df.head()

In [None]:
# split the data into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.3, random_state=42)

X_train.shape

In [None]:
# Build a classification model using various supervised machine 
# learning models and check which model gives you the best accuracy

# use the following models
# 1. Logistic Regression
# 2. Decision Tree
# 3. GaussianNB
# 4. MLPClassifier

In [None]:
# create a function to train the models
# check the accuracy of the model
from sklearn.metrics import confusion_matrix, classification_report

def separator(count = 50):
    """Print one line consisting of ``count`` dash characters."""
    rule = "".join(["-"] * count)
    print(rule)

def train_model_and_print_accuracy(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` and print its train/test scores and test-set diagnostics.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)``, ``score(X, y)`` and ``predict(X)``.
        It is fitted in place on the training data.
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Held-out features and labels used for the test score, the confusion
        matrix and the classification report.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    model.fit(X_train, y_train)

    # Score on each split. The names now match the data they are computed on;
    # previously the two local names were swapped relative to their contents.
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    # Predictions on the held-out data feed both diagnostics below
    predict_test = model.predict(X_test)

    cm_result = confusion_matrix(y_test, predict_test)
    cr_result = classification_report(y_test, predict_test)

    # Use the text before the first '(' of the estimator's repr as its display name
    model_name = str(model).split('(')[0]

    # Print the model name in bold (ANSI code 1), then reset the formatting (code 0)
    print('\033[1m' + model_name + '\033[0m')
    separator()
    print(f'Train Score for {model_name}: ', train_score)
    separator()
    print(f'Test Score for {model_name}: ', test_score)
    separator()
    print(f'Confusion Matrix for {model_name} for test : \n', cm_result)
    separator()
    print(f'Classification Report for {model_name} for test : \n', str(cr_result))
    separator()


In [None]:
# using logistic regression
# 1. Logistic Regression
# 2. Decision Tree
# 3. GaussianNB
# 4. MLPClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# One estimator per algorithm under comparison
log_reg, dt, gnb, mlp = (
    LogisticRegression(),
    DecisionTreeClassifier(max_depth=3),
    GaussianNB(),
    MLPClassifier(random_state=1, max_iter=300),
)

# Fit every estimator on the same split and print its report
models = [log_reg, dt, gnb, mlp]

for model in models:
    train_model_and_print_accuracy(
        model=model,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
    )

In [None]:
# conclusion before PCA - principal component analysis

# the results show that the best model is MLPClassifier with 0.816 accuracy (approx)

In [None]:
# Now use PCA to reduce the dimensionality of the data and
# retrain the models to see what impacts it has on your model in terms of accuracy.
# keep in mind that many times doing PCA can actually decrease the accuracy of your model
# but computation is much lighter and that's trade off you need to consider while build models in real life

In [None]:
# use PCA to reduce the dimensionality of the data

from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share (here 95 %) of the variance.
pca = PCA(n_components=0.95)

# NOTE(review): the projection is fit on every row of X_scaler, i.e. before the
# train/test split in the next cell.
x_pca = pca.fit_transform(X_scaler)

# Show the number of rows and retained components
x_pca.shape

In [None]:
# split the data into training and testing sets using the new data

X_train, X_test, y_train, y_test = train_test_split(x_pca, y, test_size=0.3, random_state=42)

In [None]:
# now retrain the models
print('*'*25, 'PCA', '*'*25)
for model in models:
    train_model_and_print_accuracy(model, X_train, y_train, X_test, y_test)

In [None]:
# conclusion after PCA - principal component analysis

# the results show that the best model is MLPClassifier with 0.811 accuracy (approx)

In [None]:
# conclusion after PCA - principal component analysis
# However, it's important to note that accuracy alone is not always the best metric 
# for evaluating the performance of AI models. Depending on the task at hand, 
# other metrics such as precision, recall, F1-score, or AUC-ROC might be more appropriate.

# there LogisticRegression, DecisionTreeClassifier and GaussianNB are the best to use after PCA

In [None]:
# Reload the raw CSV: `df` was replaced by the scaled feature frame earlier on
df = pd.read_csv(path)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Integer-encode the string-valued columns
list_of_labels = ['CARRIER_NAME', 'DEPARTING_AIRPORT', 'PREVIOUS_AIRPORT', 'DEP_TIME_BLK']
df = clean_labels_encoder(list_of_labels, df)

# Features are the columns at positions 0-22 except position 2 (DEP_DEL15), the target.
# NOTE(review): unlike the first pass through the data, this cell does not
# mean-fill missing values before building X - confirm that is acceptable.
X = df.iloc[:, [position for position in range(23) if position != 2]].values
y = df.iloc[:, 2].values

# Convert the features into the 0 - 1 range with the min-max scaler
X_scaler = scale_data(X)
df = pd.DataFrame(X_scaler)

# Split the data into training and testing sets (70 % / 30 %)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaler,
    y,
    test_size=0.3,
    random_state=42,
)

X_train.shape

# Preview the scaled features
df.head()

In [None]:
# using some deep learning models

# 1. using Long Short-Term Memory - using keras

import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout


path = '../input/2019-airline-delays-and-cancellations/full_data_flightdelay.csv'
# Read the raw CSV again so this cell starts from the unprocessed data
df = pd.read_csv(path)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Integer-encode the string-valued columns
list_of_labels = ['CARRIER_NAME', 'DEPARTING_AIRPORT', 'PREVIOUS_AIRPORT', 'DEP_TIME_BLK']
df = clean_labels_encoder(list_of_labels, df)

# Every column except the target is used as a feature here.
# NOTE(review): the scikit-learn cells pick features by position (0-1 and 3-22);
# if the CSV has more than 23 columns the two feature sets differ - confirm.
X = df.drop(columns=['DEP_DEL15'])
y = df['DEP_DEL15']

# Positional split: the first 70 % of the rows train the model, the rest test it.
# NOTE(review): the scikit-learn models were evaluated on a random
# train_test_split(random_state=42), so the two test sets are not the same rows.
train_size = int(len(X) * 0.7)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Fit the scaler on the training rows only, then apply it to both parts
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Insert a length-1 time axis: (samples, features) -> (samples, 1, features)
X_train = X_train[:, np.newaxis, :]
X_test = X_test[:, np.newaxis, :]

# Three stacked LSTM layers with dropout, followed by a sigmoid output unit
model = Sequential([
    LSTM(64, input_shape=(1, X_train.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(32, return_sequences=True),
    Dropout(0.2),
    LSTM(16),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train; the test split is also passed as validation data for per-epoch reporting
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test, y_test),
    verbose=2,
    shuffle=False,
)

# Report loss and accuracy on the test split
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [None]:
# The conclusion so far is that the Keras Long Short-Term Memory (LSTM) network is a good model
# with 0.83 accuracy (approx). However, it's important to note that accuracy alone is not always the best metric,
# and the LSTM took a lot of time to train.
