# Flight Delay Prediction Airline Wise

The flight delay data set is large (over 10 million rows), which made operating on new variables computationally slow. It has therefore been split by airline for data analysis and exploration.

In [1]:
#Imported the necessary python libraries
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pylab as plt 

  import pandas.util.testing as tm


In [2]:
# Load the Allegiant Air records from the airline-specific CSV file into a DataFrame.
# The path is relative, so the file is looked up in the current working directory.
df=pd.read_csv('AirlineG4.csv')

# Handling Missing values

In [3]:
# Remove the 'Unnamed: 0' column (presumably the index that was written out when
# the per-airline CSV was saved — confirm against the export step).
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,CARRIER,FL_NUM,Route,ORIGIN,...,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,PASSENGERS,EMPFULL,EMPPART,EMPTOTAL,EMPFTE,NET_INCOME,OP_REVENUES
0,2018,1,1,3,3,2018-01-03,G4,1775,5,ABE,...,43.0,0.0,0.0,1345.0,3541.0,415.0,3956.0,3749.0,71091.85,412680.86
1,2018,1,1,6,6,2018-01-06,G4,1775,5,ABE,...,0.0,0.0,11.0,1345.0,3541.0,415.0,3956.0,3749.0,71091.85,412680.86
2,2018,1,1,10,3,2018-01-10,G4,1775,5,ABE,...,,,,1345.0,3541.0,415.0,3956.0,3749.0,71091.85,412680.86
3,2018,1,1,13,6,2018-01-13,G4,1775,5,ABE,...,15.0,0.0,0.0,1345.0,3541.0,415.0,3956.0,3749.0,71091.85,412680.86
4,2018,1,1,17,3,2018-01-17,G4,1775,5,ABE,...,32.0,0.0,89.0,1345.0,3541.0,415.0,3956.0,3749.0,71091.85,412680.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150346,2019,2,6,24,1,2019-06-24,G4,1976,6678,XNA,...,,,,2705.0,3729.0,462.0,4191.0,3960.0,91340.63,466845.71
150347,2019,2,6,25,2,2019-06-25,G4,1976,6678,XNA,...,,,,2705.0,3729.0,462.0,4191.0,3960.0,91340.63,466845.71
150348,2019,2,6,26,3,2019-06-26,G4,1913,6678,XNA,...,,,,2705.0,3729.0,462.0,4191.0,3960.0,91340.63,466845.71
150349,2019,2,6,28,5,2019-06-28,G4,1976,6678,XNA,...,,,,2705.0,3729.0,462.0,4191.0,3960.0,91340.63,466845.71


In [4]:
# Tally the missing entries in each column
df.isnull().sum()

YEAR                        0
QUARTER                     0
MONTH                       0
DAY_OF_MONTH                0
DAY_OF_WEEK                 0
FL_DATE                     0
CARRIER                     0
FL_NUM                      0
Route                       0
ORIGIN                      0
DEST                        0
DEST_CITY                   0
DEST_STATE                  0
CRS_DEP_TIME                0
DEP_TIME                 1017
DEP_DELAY                1050
DEP_DELAY_NEW            1050
DEP_DEL15                1050
DEP_DELAY_GROUP          1050
DEP_TIME_BLK                0
TAXI_OUT                 1050
WHEELS_OFF               1050
WHEELS_ON                1145
TAXI_IN                  1145
CRS_ARR_TIME                0
ARR_TIME                 1145
ARR_DELAY                1503
ARR_DELAY_NEW            1503
ARR_DEL15                1503
ARR_DELAY_GROUP          1503
ARR_TIME_BLK                0
CANCELED                    0
CANCELLATION_CODE      149265
DIVERTED  

In [5]:
# Discard every row whose ARR_DEL15 value is missing (rows are dropped, not columns)
df = df.dropna(axis=0, how='any', subset=['ARR_DEL15'])

In [6]:
# Replace every NaN still left in the frame with 0; the remaining nulls are
# taken to mean "no delay".
# NOTE(review): this applies to all columns, including non-delay ones such as
# CANCELLATION_CODE — confirm 0 is an acceptable filler there.
df = df.fillna(value=0)

In [7]:
# Count the distinct FL_NUM values in the loaded Allegiant Air data
# (unique flight numbers, not the number of individual flights/rows)
df['FL_NUM'].nunique()

2621

# Categorical encoding of Nominal Variables

In [8]:
import category_encoders as ce

# Binary-encode the nominal columns: each listed column is replaced by a set of
# 0/1 columns named <COLUMN>_0, <COLUMN>_1, ...
encoder = ce.BinaryEncoder(cols=['CARRIER', 'ORIGIN', 'DEST', 'FL_NUM'])
df = encoder.fit(df).transform(df)

# Preview the first rows of the encoded frame
df.head(5)

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,CARRIER_0,FL_NUM_0,FL_NUM_1,FL_NUM_2,...,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,PASSENGERS,EMPFULL,EMPPART,EMPTOTAL,EMPFTE,NET_INCOME,OP_REVENUES
0,2018,1,1,3,3,2018-01-03,1,0,0,0,...,43.0,0.0,0.0,1345.0,3541.0,415.0,3956.0,3749.0,71091.85,412680.86
1,2018,1,1,6,6,2018-01-06,1,0,0,0,...,0.0,0.0,11.0,1345.0,3541.0,415.0,3956.0,3749.0,71091.85,412680.86
2,2018,1,1,10,3,2018-01-10,1,0,0,0,...,0.0,0.0,0.0,1345.0,3541.0,415.0,3956.0,3749.0,71091.85,412680.86
3,2018,1,1,13,6,2018-01-13,1,0,0,0,...,15.0,0.0,0.0,1345.0,3541.0,415.0,3956.0,3749.0,71091.85,412680.86
4,2018,1,1,17,3,2018-01-17,1,0,0,0,...,32.0,0.0,89.0,1345.0,3541.0,415.0,3956.0,3749.0,71091.85,412680.86


In [9]:
# Display the column index to confirm which columns the frame holds at this point
# (useful for verifying the <COLUMN>_<n> columns created by binary encoding)
df.columns

Index(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE',
       'CARRIER_0', 'FL_NUM_0', 'FL_NUM_1', 'FL_NUM_2', 'FL_NUM_3', 'FL_NUM_4',
       'FL_NUM_5', 'FL_NUM_6', 'FL_NUM_7', 'FL_NUM_8', 'FL_NUM_9', 'FL_NUM_10',
       'FL_NUM_11', 'FL_NUM_12', 'Route', 'ORIGIN_0', 'ORIGIN_1', 'ORIGIN_2',
       'ORIGIN_3', 'ORIGIN_4', 'ORIGIN_5', 'ORIGIN_6', 'ORIGIN_7', 'DEST_0',
       'DEST_1', 'DEST_2', 'DEST_3', 'DEST_4', 'DEST_5', 'DEST_6', 'DEST_7',
       'DEST_CITY', 'DEST_STATE', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY',
       'DEP_DELAY_NEW', 'DEP_DEL15', 'DEP_DELAY_GROUP', 'DEP_TIME_BLK',
       'TAXI_OUT', 'WHEELS_OFF', 'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME',
       'ARR_TIME', 'ARR_DELAY', 'ARR_DELAY_NEW', 'ARR_DEL15',
       'ARR_DELAY_GROUP', 'ARR_TIME_BLK', 'CANCELED', 'CANCELLATION_CODE',
       'DIVERTED', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME',
       'DISTANCE', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY',
       'SECURITY_DELAY', 'LATE_AIR

In [10]:
# Inspect the raw values stored in the CRS_ARR_TIME column
df['CRS_ARR_TIME'].values

array([1215, 1215, 1215, ..., 1819, 1714, 1500], dtype=int64)

In [30]:
# Pairwise Pearson correlation of CRS_ARR_TIME with ARR_DELAY, used to guide
# feature selection
df.loc[:, ['CRS_ARR_TIME', 'ARR_DELAY']].corr(method='pearson')

Unnamed: 0,CRS_ARR_TIME,ARR_DELAY
CRS_ARR_TIME,1.0,0.117043
ARR_DELAY,0.117043,1.0


In [48]:
# Pairwise Pearson correlation of PASSENGERS with ARR_DELAY
df.loc[:, ['PASSENGERS', 'ARR_DELAY']].corr(method='pearson')

Unnamed: 0,PASSENGERS,ARR_DELAY
PASSENGERS,1.0,-0.004537
ARR_DELAY,-0.004537,1.0


# Categorical Encoding for Ordinal Variables

In [11]:
# Print a summary of the DataFrame: index range, column names,
# non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148848 entries, 0 to 150350
Data columns (total 76 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   YEAR                 148848 non-null  int64  
 1   QUARTER              148848 non-null  int64  
 2   MONTH                148848 non-null  int64  
 3   DAY_OF_MONTH         148848 non-null  int64  
 4   DAY_OF_WEEK          148848 non-null  int64  
 5   FL_DATE              148848 non-null  object 
 6   CARRIER_0            148848 non-null  int64  
 7   FL_NUM_0             148848 non-null  int64  
 8   FL_NUM_1             148848 non-null  int64  
 9   FL_NUM_2             148848 non-null  int64  
 10  FL_NUM_3             148848 non-null  int64  
 11  FL_NUM_4             148848 non-null  int64  
 12  FL_NUM_5             148848 non-null  int64  
 13  FL_NUM_6             148848 non-null  int64  
 14  FL_NUM_7             148848 non-null  int64  
 15  FL_NUM_8         

In [12]:
# Label-encode selected columns (each distinct value is replaced by an integer code).
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

# The columns are addressed by name rather than by position. The original code
# used positions 39 and 46-50, which map to the names below in the column order
# printed by df.columns; positional indexing silently targets different columns
# whenever the binary encoder emits a different number of columns.
# NOTE(review): these columns are already numeric, so label encoding turns them
# into rank codes — confirm this is the intended transformation.
label_encoded_columns = ['CRS_DEP_TIME', 'TAXI_OUT', 'WHEELS_OFF',
                         'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME']
for column_name in label_encoded_columns:
    # The encoder is refit per column, so after the loop it holds the classes
    # of the last column only (same as the original statement sequence).
    df[column_name] = labelencoder.fit_transform(df[column_name].values)

In [13]:
# Display the column index again to confirm the column names and order are
# unchanged after label encoding
df.columns

Index(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE',
       'CARRIER_0', 'FL_NUM_0', 'FL_NUM_1', 'FL_NUM_2', 'FL_NUM_3', 'FL_NUM_4',
       'FL_NUM_5', 'FL_NUM_6', 'FL_NUM_7', 'FL_NUM_8', 'FL_NUM_9', 'FL_NUM_10',
       'FL_NUM_11', 'FL_NUM_12', 'Route', 'ORIGIN_0', 'ORIGIN_1', 'ORIGIN_2',
       'ORIGIN_3', 'ORIGIN_4', 'ORIGIN_5', 'ORIGIN_6', 'ORIGIN_7', 'DEST_0',
       'DEST_1', 'DEST_2', 'DEST_3', 'DEST_4', 'DEST_5', 'DEST_6', 'DEST_7',
       'DEST_CITY', 'DEST_STATE', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY',
       'DEP_DELAY_NEW', 'DEP_DEL15', 'DEP_DELAY_GROUP', 'DEP_TIME_BLK',
       'TAXI_OUT', 'WHEELS_OFF', 'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME',
       'ARR_TIME', 'ARR_DELAY', 'ARR_DELAY_NEW', 'ARR_DEL15',
       'ARR_DELAY_GROUP', 'ARR_TIME_BLK', 'CANCELED', 'CANCELLATION_CODE',
       'DIVERTED', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME',
       'DISTANCE', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY',
       'SECURITY_DELAY', 'LATE_AIR

# Converting features to numerical values to feed the algorithm

In [16]:
# numpy is used to turn the selected columns into plain arrays
import numpy as np

# Names of the encoded/numeric columns fed to the model, kept for later use
# (e.g. labelling feature importances).
# NOTE(review): WHEELS_OFF, TAXI_OUT and AIR_TIME look like values only known
# once the flight is under way — confirm they are available at prediction time.
feature_list = [
    'YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK',
    'CARRIER_0',
    'FL_NUM_0', 'FL_NUM_1', 'FL_NUM_2', 'FL_NUM_3', 'FL_NUM_4', 'FL_NUM_5',
    'FL_NUM_6', 'FL_NUM_7', 'FL_NUM_8', 'FL_NUM_9', 'FL_NUM_10', 'FL_NUM_11',
    'FL_NUM_12',
    'ORIGIN_0', 'ORIGIN_1', 'ORIGIN_2', 'ORIGIN_3', 'ORIGIN_4', 'ORIGIN_5',
    'ORIGIN_6', 'ORIGIN_7',
    'DEST_0', 'DEST_1', 'DEST_2', 'DEST_3', 'DEST_4', 'DEST_5', 'DEST_6',
    'DEST_7',
    'CRS_ARR_TIME', 'CRS_DEP_TIME', 'WHEELS_OFF', 'TAXI_OUT', 'DISTANCE',
    'PASSENGERS', 'EMPTOTAL', 'NET_INCOME', 'OP_REVENUES', 'AIR_TIME',
]

# Target to predict: the ARR_DELAY column, as a numpy array
labels = np.array(df['ARR_DELAY'])

# Feature matrix: the selected columns, in the order listed, as a numpy array
features = np.array(df[feature_list])

In [35]:
# Inspect the values currently stored in the WHEELS_ON column
df['WHEELS_ON'].values

array([ 709,  695,  666, ..., 1030,  969,  817], dtype=int64)

In [36]:
# Display the saved feature names.
# NOTE(review): the stored output of this cell lists names (e.g. FL_DATE,
# Route_0, WHEELS_ON) that the feature-selection cell in this notebook does not
# select — it appears to come from an earlier run; re-run to refresh.
feature_list

['YEAR',
 'QUARTER',
 'MONTH',
 'DAY_OF_MONTH',
 'DAY_OF_WEEK',
 'FL_DATE',
 'CARRIER_0',
 'FL_NUM_0',
 'FL_NUM_1',
 'FL_NUM_2',
 'FL_NUM_3',
 'FL_NUM_4',
 'FL_NUM_5',
 'FL_NUM_6',
 'FL_NUM_7',
 'FL_NUM_8',
 'FL_NUM_9',
 'FL_NUM_10',
 'FL_NUM_11',
 'FL_NUM_12',
 'Route_0',
 'Route_1',
 'Route_2',
 'Route_3',
 'Route_4',
 'Route_5',
 'Route_6',
 'Route_7',
 'Route_8',
 'Route_9',
 'Route_10',
 'ORIGIN_0',
 'ORIGIN_1',
 'ORIGIN_2',
 'ORIGIN_3',
 'ORIGIN_4',
 'ORIGIN_5',
 'ORIGIN_6',
 'ORIGIN_7',
 'DEST_0',
 'DEST_1',
 'DEST_2',
 'DEST_3',
 'DEST_4',
 'DEST_5',
 'DEST_6',
 'DEST_7',
 'CRS_ARR_TIME',
 'WHEELS_ON',
 'DISTANCE',
 'PASSENGERS',
 'EMPTOTAL',
 'NET_INCOME',
 'OP_REVENUES',
 'AIR_TIME']

In [37]:
# Display the target array (ARR_DELAY values)
labels

array([ 43.,  27.,  -5., ...,   3.,   4., -14.])

# Split data into training and testing sets

In [18]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import KFold
from sklearn import metrics

In [24]:
# scikit-learn helper that partitions arrays into train and test subsets
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [43]:
# Report the dimensions of each array produced by the train/test split
for caption, split_array in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, split_array.shape)

Training Features Shape: (111636, 54)
Training Labels Shape: (111636,)
Testing Features Shape: (37212, 54)
Testing Labels Shape: (37212,)


# Using RandomForestRegressor model to fit the data

In [19]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split —
# and therefore every score computed from it — reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [20]:
import time
from sklearn.ensemble import RandomForestRegressor
start = time.time()
# Instantiate a 40-tree random forest. random_state is fixed so that the fitted
# model, its scores and its feature importances are reproducible across runs.
rf_reg = RandomForestRegressor(n_estimators=40, random_state=42)
# Fit on the training split
rf_reg.fit(x_train,y_train)

# Report the wall-clock time spent instantiating and fitting the model
end = time.time()
print(f"{end-start} seconds")

153.45375990867615 seconds


In [21]:
# Predictions for the held-out split
y_pred = rf_reg.predict(x_test)

# Report R^2 on both splits, labelled so the two numbers can be told apart
rsqure_score = rf_reg.score(x_train,y_train)
print('Training R^2: {}'.format(rsqure_score))
rsqure_testscore = rf_reg.score(x_test,y_test)
print('Test R^2: {}'.format(rsqure_testscore))

R^2: 0.9846752350482677
R^2: 0.9160519716494869


# Evaluate the model using K-Fold cross-validation

In [30]:
# 5-fold cross-validated R^2 of the random forest over the full feature matrix;
# the estimator is refit for each fold
cv_r2_scores_rf = cross_val_score(estimator=rf_reg, X=features, y=labels, scoring='r2', cv=5)
print(cv_r2_scores_rf)

# Average of the per-fold scores
mean_cv_r2 = np.mean(cv_r2_scores_rf)
print("Mean 5-Fold R Squared: {}".format(mean_cv_r2))

[0.95177432 0.96333277 0.88836994 0.7851214  0.88600509]
Mean 5-Fold R Squared: 0.8949207044783802


In [31]:
# Get numerical feature importances from the fitted rf_reg model
importances = list(rf_reg.feature_importances_)

# zip() silently truncates to the shorter input, which would attach importances
# to the wrong names if feature_list is stale relative to the fitted model.
assert len(feature_list) == len(importances), (
    'feature_list has {} names but the model reports {} importances'.format(
        len(feature_list), len(importances)))

# List of (variable, importance) tuples, importance rounded to 5 decimals
feature_importances = [(feature, round(importance, 5)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances (a plain loop, rather than a list
# comprehension used only for its print side effect)
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: CRS_DEP_TIME         Importance: 0.56331
Variable: WHEELS_OFF           Importance: 0.3555
Variable: TAXI_OUT             Importance: 0.01879
Variable: CRS_ARR_TIME         Importance: 0.01441
Variable: AIR_TIME             Importance: 0.00993
Variable: PASSENGERS           Importance: 0.00684
Variable: DISTANCE             Importance: 0.00439
Variable: DAY_OF_MONTH         Importance: 0.00346
Variable: DAY_OF_WEEK          Importance: 0.0033
Variable: MONTH                Importance: 0.00244
Variable: EMPTOTAL             Importance: 0.00203
Variable: OP_REVENUES          Importance: 0.00136
Variable: NET_INCOME           Importance: 0.0013
Variable: DEST_3               Importance: 0.00091
Variable: QUARTER              Importance: 0.00077
Variable: ORIGIN_5             Importance: 0.00068
Variable: DEST_6               Importance: 0.00068
Variable: ORIGIN_1             Importance: 0.00067
Variable: FL_NUM_3             Importance: 0.00065
Variable: FL_NUM_11            Imp

In [25]:
import time
from sklearn.ensemble import RandomForestRegressor

start = time.time()

# Seeded 50-tree forest with bootstrap sampling, so an out-of-bag score is
# computed during fitting
forest_options = dict(
    n_estimators=50,
    random_state=42,
    oob_score=True,
    bootstrap=True,
)
rf = RandomForestRegressor(**forest_options)

# Fit on the training split
rf.fit(train_features, train_labels)

# Report elapsed wall-clock time for instantiation plus fitting
end = time.time()
print(f"{end-start} seconds")

149.51950788497925 seconds


# Evaluating the R² score on training and test data

In [26]:
# Gather the three scores first, then print them in a single formatted message:
# R^2 on the training split, the out-of-bag score, and R^2 on the held-out split
train_r2 = rf.score(train_features, train_labels)
oob_r2 = rf.oob_score_
validation_r2 = rf.score(test_features, test_labels)
print('R^2 Training Score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(train_r2, oob_r2, validation_r2))

R^2 Training Score: 0.99 
OOB Score: 0.89 
R^2 Validation Score: 0.91


# Calculating Mean Absolute Errors

In [27]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Absolute difference between predicted and actual ARR_DELAY for each test row
errors = abs(predictions - test_labels)

# Print out the mean absolute error (MAE). The target is an arrival delay, so
# the unit label is minutes; the previous label "degrees" was incorrect.
# NOTE(review): assumes ARR_DELAY is recorded in minutes — confirm against the
# data dictionary.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'minutes.')

Mean Absolute Error: 6.66 degrees.


# Feature Importance

In [29]:
# Get numerical feature importances from the fitted rf model
importances = list(rf.feature_importances_)

# zip() silently truncates to the shorter input, which would attach importances
# to the wrong names if feature_list is stale relative to the fitted model.
assert len(feature_list) == len(importances), (
    'feature_list has {} names but the model reports {} importances'.format(
        len(feature_list), len(importances)))

# List of (variable, importance) tuples, importance rounded to 5 decimals
feature_importances = [(feature, round(importance, 5)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances (a plain loop, rather than a list
# comprehension used only for its print side effect)
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: CRS_DEP_TIME         Importance: 0.55323
Variable: WHEELS_OFF           Importance: 0.35868
Variable: CRS_ARR_TIME         Importance: 0.01826
Variable: TAXI_OUT             Importance: 0.01825
Variable: AIR_TIME             Importance: 0.00983
Variable: PASSENGERS           Importance: 0.00668
Variable: DISTANCE             Importance: 0.00435
Variable: DAY_OF_MONTH         Importance: 0.00353
Variable: DAY_OF_WEEK          Importance: 0.00267
Variable: EMPTOTAL             Importance: 0.00218
Variable: MONTH                Importance: 0.00189
Variable: NET_INCOME           Importance: 0.00174
Variable: ORIGIN_3             Importance: 0.00116
Variable: QUARTER              Importance: 0.00105
Variable: ORIGIN_4             Importance: 0.00105
Variable: FL_NUM_9             Importance: 0.00099
Variable: OP_REVENUES          Importance: 0.00097
Variable: ORIGIN_2             Importance: 0.00094
Variable: FL_NUM_7             Importance: 0.0009
Variable: FL_NUM_1             I