# Flight Delay Prediction Airline Wise

The large amount of data within the flight delay data set (over 10 million rows) made operating on new variables computationally slow. Thus, it has been split airline wise for data anaylsis and exploration. 

In [1]:
#Imported the necessary python libraries
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pylab as plt 

  import pandas.util.testing as tm


In [3]:
#Loaded Mesa Airlines dataset to a dataframe from airline specific csv file
df=pd.read_csv('AirlineYV.csv')

# Handling Missing values

In [4]:
df=df.drop(columns=['Unnamed: 0'])
df

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,CARRIER,FL_NUM,Route,ORIGIN,...,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,PASSENGERS,EMPFULL,EMPPART,EMPTOTAL,EMPFTE,NET_INCOME,OP_REVENUES
0,2018,1,1,1,1,2018-01-01,YV,5804,20,ABQ,...,,,,6432.0,3224.0,13.0,3237.0,3231.0,1831.89,13963.87
1,2018,1,1,1,1,2018-01-01,YV,5726,20,ABQ,...,,,,6432.0,3224.0,13.0,3237.0,3231.0,1831.89,13963.87
2,2018,1,1,1,1,2018-01-01,YV,5744,20,ABQ,...,,,,6432.0,3224.0,13.0,3237.0,3231.0,1831.89,13963.87
3,2018,1,1,2,2,2018-01-02,YV,5929,20,ABQ,...,,,,6432.0,3224.0,13.0,3237.0,3231.0,1831.89,13963.87
4,2018,1,1,2,2,2018-01-02,YV,5804,20,ABQ,...,,,,6432.0,3224.0,13.0,3237.0,3231.0,1831.89,13963.87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326523,2019,2,6,1,6,2019-06-01,YV,5894,6684,YUM,...,,,,238.0,3491.0,8.0,3499.0,3495.0,4269.34,167434.86
326524,2019,2,6,2,7,2019-06-02,YV,5894,6684,YUM,...,,,,238.0,3491.0,8.0,3499.0,3495.0,4269.34,167434.86
326525,2019,2,6,3,1,2019-06-03,YV,5894,6684,YUM,...,,,,238.0,3491.0,8.0,3499.0,3495.0,4269.34,167434.86
326526,2019,2,6,4,2,2019-06-04,YV,5894,6684,YUM,...,15.0,0.0,57.0,238.0,3491.0,8.0,3499.0,3495.0,4269.34,167434.86


In [5]:
#checking for null values
df.isna().sum()

YEAR                        0
QUARTER                     0
MONTH                       0
DAY_OF_MONTH                0
DAY_OF_WEEK                 0
FL_DATE                     0
CARRIER                     0
FL_NUM                      0
Route                       0
ORIGIN                      0
DEST                        0
DEST_CITY                   0
DEST_STATE                  0
CRS_DEP_TIME                0
DEP_TIME                 9194
DEP_DELAY                9197
DEP_DELAY_NEW            9197
DEP_DEL15                9197
DEP_DELAY_GROUP          9197
DEP_TIME_BLK                0
TAXI_OUT                 9414
WHEELS_OFF               9414
WHEELS_ON                9568
TAXI_IN                  9568
CRS_ARR_TIME                0
ARR_TIME                 9568
ARR_DELAY               10342
ARR_DELAY_NEW           10342
ARR_DEL15               10342
ARR_DELAY_GROUP         10342
ARR_TIME_BLK                0
CANCELED                    0
CANCELLATION_CODE      317005
DIVERTED  

In [6]:
#dropping null values from columns within the dataset
df=df.dropna(subset=['ARR_DEL15'])

In [7]:
#Imputing the missing values with 0 since the null values represent no delay
df=df.fillna(0)

In [8]:
#checking number of Allegiant Air Flights for the year 2018 and 2019
df['FL_NUM'].nunique()

657

In [9]:
#Percentage of delayed AA flights
df.groupby('ARR_DEL15').size()

ARR_DEL15
0.0    250329
1.0     65857
dtype: int64

# Categorical encoding of Nominal Variables

In [26]:
import category_encoders as ce

encoder = ce.BinaryEncoder(cols=['CARRIER','ORIGIN','DEST','Route','FL_NUM'])
df = encoder.fit_transform(df)

df.head()

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,CARRIER_0,FL_NUM_0,FL_NUM_1,FL_NUM_2,...,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,PASSENGERS,EMPFULL,EMPPART,EMPTOTAL,EMPFTE,NET_INCOME,OP_REVENUES
0,2018,1,1,1,1,2018-01-01,1,0,0,0,...,0.0,0.0,0.0,6432.0,3224.0,13.0,3237.0,3231.0,1831.89,13963.87
1,2018,1,1,1,1,2018-01-01,1,0,0,0,...,0.0,0.0,0.0,6432.0,3224.0,13.0,3237.0,3231.0,1831.89,13963.87
2,2018,1,1,1,1,2018-01-01,1,0,0,0,...,0.0,0.0,0.0,6432.0,3224.0,13.0,3237.0,3231.0,1831.89,13963.87
3,2018,1,1,2,2,2018-01-02,1,0,0,0,...,0.0,0.0,0.0,6432.0,3224.0,13.0,3237.0,3231.0,1831.89,13963.87
4,2018,1,1,2,2,2018-01-02,1,0,0,0,...,0.0,0.0,0.0,6432.0,3224.0,13.0,3237.0,3231.0,1831.89,13963.87


In [11]:
#checking columns for consistency
df.columns

Index(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE',
       'CARRIER_0', 'FL_NUM_0', 'FL_NUM_1', 'FL_NUM_2', 'FL_NUM_3', 'FL_NUM_4',
       'FL_NUM_5', 'FL_NUM_6', 'FL_NUM_7', 'FL_NUM_8', 'FL_NUM_9', 'FL_NUM_10',
       'Route_0', 'Route_1', 'Route_2', 'Route_3', 'Route_4', 'Route_5',
       'Route_6', 'Route_7', 'Route_8', 'Route_9', 'Route_10', 'ORIGIN_0',
       'ORIGIN_1', 'ORIGIN_2', 'ORIGIN_3', 'ORIGIN_4', 'ORIGIN_5', 'ORIGIN_6',
       'DEST_0', 'DEST_1', 'DEST_2', 'DEST_3', 'DEST_4', 'DEST_5', 'DEST_6',
       'DEST_CITY', 'DEST_STATE', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY',
       'DEP_DELAY_NEW', 'DEP_DEL15', 'DEP_DELAY_GROUP', 'DEP_TIME_BLK',
       'TAXI_OUT', 'WHEELS_OFF', 'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME',
       'ARR_TIME', 'ARR_DELAY', 'ARR_DELAY_NEW', 'ARR_DEL15',
       'ARR_DELAY_GROUP', 'ARR_TIME_BLK', 'CANCELED', 'CANCELLATION_CODE',
       'DIVERTED', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME',
       'DISTANCE', 'CARRIER_

In [10]:
df['CRS_ARR_TIME'].values

array([1326, 1436, 2106, ..., 1250, 1250, 1250], dtype=int64)

In [11]:
#checking correlation among variables to steer better feature selection
df[['CRS_ARR_TIME','ARR_DELAY']].corr()

Unnamed: 0,CRS_ARR_TIME,ARR_DELAY
CRS_ARR_TIME,1.0,0.065519
ARR_DELAY,0.065519,1.0


In [27]:
df[['ORIGIN_3','Route_7']].corr()

Unnamed: 0,ORIGIN_3,Route_7
ORIGIN_3,1.0,-0.03112
Route_7,-0.03112,1.0


# Categorical Encoding for Ordinal Variables

In [28]:
#Info method to get an overall overview of the dataframe 
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 316186 entries, 0 to 326527
Data columns (total 85 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   YEAR                 316186 non-null  int64  
 1   QUARTER              316186 non-null  int64  
 2   MONTH                316186 non-null  int64  
 3   DAY_OF_MONTH         316186 non-null  int64  
 4   DAY_OF_WEEK          316186 non-null  int64  
 5   FL_DATE              316186 non-null  object 
 6   CARRIER_0            316186 non-null  int64  
 7   FL_NUM_0             316186 non-null  int64  
 8   FL_NUM_1             316186 non-null  int64  
 9   FL_NUM_2             316186 non-null  int64  
 10  FL_NUM_3             316186 non-null  int64  
 11  FL_NUM_4             316186 non-null  int64  
 12  FL_NUM_5             316186 non-null  int64  
 13  FL_NUM_6             316186 non-null  int64  
 14  FL_NUM_7             316186 non-null  int64  
 15  FL_NUM_8         

In [29]:
#Used label encoding
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
df.iloc[:,48] = labelencoder.fit_transform(df.iloc[:,48].values)
df.iloc[:,57] = labelencoder.fit_transform(df.iloc[:,57].values)
df.iloc[:,58] = labelencoder.fit_transform(df.iloc[:,58].values)
df.iloc[:,59] = labelencoder.fit_transform(df.iloc[:,59].values)
df.iloc[:,55] = labelencoder.fit_transform(df.iloc[:,55].values)
df.iloc[:,56] = labelencoder.fit_transform(df.iloc[:,56].values)

In [30]:
#checking the columns for consistency
df.columns

Index(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE',
       'CARRIER_0', 'FL_NUM_0', 'FL_NUM_1', 'FL_NUM_2', 'FL_NUM_3', 'FL_NUM_4',
       'FL_NUM_5', 'FL_NUM_6', 'FL_NUM_7', 'FL_NUM_8', 'FL_NUM_9', 'FL_NUM_10',
       'Route_0', 'Route_1', 'Route_2', 'Route_3', 'Route_4', 'Route_5',
       'Route_6', 'Route_7', 'Route_8', 'Route_9', 'ORIGIN_0', 'ORIGIN_1',
       'ORIGIN_2', 'ORIGIN_3', 'ORIGIN_4', 'ORIGIN_5', 'ORIGIN_6', 'ORIGIN_7',
       'ORIGIN_8', 'DEST_0', 'DEST_1', 'DEST_2', 'DEST_3', 'DEST_4', 'DEST_5',
       'DEST_6', 'DEST_7', 'DEST_8', 'DEST_CITY', 'DEST_STATE', 'CRS_DEP_TIME',
       'DEP_TIME', 'DEP_DELAY', 'DEP_DELAY_NEW', 'DEP_DEL15',
       'DEP_DELAY_GROUP', 'DEP_TIME_BLK', 'TAXI_OUT', 'WHEELS_OFF',
       'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'ARR_DELAY_NEW', 'ARR_DEL15', 'ARR_DELAY_GROUP', 'ARR_TIME_BLK',
       'CANCELED', 'CANCELLATION_CODE', 'DIVERTED', 'CRS_ELAPSED_TIME',
       'ACTUAL_ELAPSED_TIME', '

# Changing numerical features to category datatype to reduce memory and  computational time

In [20]:
#convert object and int64 type to category for less computational time
df['CARRIER_0'] = df['CARRIER_0'].astype('category')
df['ORIGIN_0'] = df['ORIGIN_0'].astype('category')         
df['ORIGIN_1'] = df['ORIGIN_1'].astype('category')   
df['ORIGIN_2'] = df['ORIGIN_2'].astype('category')   
df['ORIGIN_3'] = df['ORIGIN_3'].astype('category')   
df['ORIGIN_4'] = df['ORIGIN_4'].astype('category')   
df['ORIGIN_5'] = df['ORIGIN_5'].astype('category')   
df['ORIGIN_6'] = df['ORIGIN_6'].astype('category')   
df['ORIGIN_7'] = df['ORIGIN_7'].astype('category')   

df['DEST_0'] = df['DEST_0'].astype('category')   
df['DEST_1'] = df['DEST_1'].astype('category')     
df['DEST_2'] = df['DEST_2'].astype('category')     
df['DEST_3'] = df['DEST_3'].astype('category')     
df['DEST_4'] = df['DEST_4'].astype('category')     
df['DEST_5'] = df['DEST_5'].astype('category')     
df['DEST_6'] = df['DEST_6'].astype('category')     
df['DEST_7'] = df['DEST_7'].astype('category')     

df['QUARTER'] = df['QUARTER'].astype('category')
df['MONTH'] = df['MONTH'].astype('category')
df['YEAR'] = df['YEAR'].astype('category')
df['DAY_OF_MONTH'] = df['DAY_OF_MONTH'].astype('category')                                                              
df['DAY_OF_WEEK'] = df['DAY_OF_WEEK'].astype('category')

df['FL_NUM_0'] = df['FL_NUM_0'].astype('category')    
df['FL_NUM_1'] = df['FL_NUM_1'].astype('category')    
df['FL_NUM_2'] = df['FL_NUM_2'].astype('category')    
df['FL_NUM_3'] = df['FL_NUM_3'].astype('category')    
df['FL_NUM_4'] = df['FL_NUM_4'].astype('category')    
df['FL_NUM_5'] = df['FL_NUM_5'].astype('category')    
df['FL_NUM_6'] = df['FL_NUM_6'].astype('category')    
df['FL_NUM_7'] = df['FL_NUM_7'].astype('category')    
df['FL_NUM_8'] = df['FL_NUM_8'].astype('category')    
df['FL_NUM_9'] = df['FL_NUM_9'].astype('category')    
df['FL_NUM_10'] = df['FL_NUM_10'].astype('category')    
df['FL_NUM_11'] = df['FL_NUM_11'].astype('category')    
df['FL_NUM_12'] = df['FL_NUM_12'].astype('category')    


df['Route_0'] = df['Route_0'].astype('category')
df['Route_1'] = df['Route_1'].astype('category')
df['Route_2'] = df['Route_2'].astype('category')
df['Route_3'] = df['Route_3'].astype('category')
df['Route_4'] = df['Route_4'].astype('category')
df['Route_5'] = df['Route_5'].astype('category')
df['Route_6'] = df['Route_6'].astype('category')
df['Route_7'] = df['Route_7'].astype('category')
df['Route_8'] = df['Route_8'].astype('category')
df['Route_9'] = df['Route_9'].astype('category')
df['Route_10'] = df['Route_10'].astype('category')

df['CRS_DEP_TIME'] = df['CRS_DEP_TIME'].astype('category')    
df['CRS_ARR_TIME'] = df['CRS_ARR_TIME'].astype('category')                                                              
df['TAXI_IN'] = df['TAXI_IN'].astype('category')
df['WHEELS_OFF'] = df['WHEELS_OFF'].astype('category')                                                              
df['WHEELS_ON'] = df['WHEELS_ON'].astype('category')
df['DISTANCE'] = df['DISTANCE'].astype('category')                                                              
df['PASSENGERS'] = df['PASSENGERS'].astype('category')
df['AIR_TIME'] = df['AIR_TIME'].astype('category')                                                              
df['NET_INCOME'] = df['NET_INCOME'].astype('category')
df['EMPTOTAL'] = df['EMPTOTAL'].astype('category')                                                              
df['EMPFTE'] = df['EMPFTE'].astype('category')
df['OP_REVENUES'] = df['OP_REVENUES'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1365780 entries, 0 to 1397725
Data columns (total 86 columns):
 #   Column               Non-Null Count    Dtype   
---  ------               --------------    -----   
 0   YEAR                 1365780 non-null  category
 1   QUARTER              1365780 non-null  category
 2   MONTH                1365780 non-null  category
 3   DAY_OF_MONTH         1365780 non-null  category
 4   DAY_OF_WEEK          1365780 non-null  category
 5   FL_DATE              1365780 non-null  object  
 6   CARRIER_0            1365780 non-null  category
 7   FL_NUM_0             1365780 non-null  category
 8   FL_NUM_1             1365780 non-null  category
 9   FL_NUM_2             1365780 non-null  category
 10  FL_NUM_3             1365780 non-null  category
 11  FL_NUM_4             1365780 non-null  category
 12  FL_NUM_5             1365780 non-null  category
 13  FL_NUM_6             1365780 non-null  category
 14  FL_NUM_7             1365780 non-n

# Converting features to numerical values to feed the algorithm

In [34]:
# Use numpy to convert to arrays
import numpy as np

# Labels are the values we want to predict, which is ARRIVAL DELAY here
labels = np.array(df['ARR_DELAY'])

# Selecting the encoded variables as features 
features= df[['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK',
       'CARRIER_0', 'FL_NUM_0', 'FL_NUM_1', 'FL_NUM_2', 'FL_NUM_3', 'FL_NUM_4',
       'FL_NUM_5', 'FL_NUM_6', 'FL_NUM_7', 'FL_NUM_8', 'FL_NUM_9', 'FL_NUM_10',
       'Route_0', 'Route_1', 'Route_2', 'Route_3', 'Route_4', 'Route_5',
       'Route_6', 'Route_7', 'Route_8', 'Route_9', 'ORIGIN_0', 'ORIGIN_1',
       'ORIGIN_2', 'ORIGIN_3', 'ORIGIN_4', 'ORIGIN_5', 'ORIGIN_6', 'ORIGIN_7',
       'ORIGIN_8', 'DEST_0', 'DEST_1', 'DEST_2', 'DEST_3', 'DEST_4', 'DEST_5',
       'DEST_6', 'DEST_7', 'DEST_8', 'CRS_ARR_TIME','CRS_DEP_TIME', 'TAXI_OUT','WHEELS_OFF','DISTANCE', 'PASSENGERS', 'EMPTOTAL', 'NET_INCOME', 'OP_REVENUES','AIR_TIME']]

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

In [41]:
df['WHEELS_ON'].values

[474, 553, 722, 1038, 1143, ..., 1056, 1029, 1143, 633, 1126]
Length: 1365780
Categories (1440, int64): [0, 1, 2, 3, ..., 1436, 1437, 1438, 1439]

In [30]:
feature_list

['YEAR',
 'QUARTER',
 'MONTH',
 'DAY_OF_MONTH',
 'DAY_OF_WEEK',
 'CARRIER_0',
 'FL_NUM_0',
 'FL_NUM_1',
 'FL_NUM_2',
 'FL_NUM_3',
 'FL_NUM_4',
 'FL_NUM_5',
 'FL_NUM_6',
 'FL_NUM_7',
 'FL_NUM_8',
 'FL_NUM_9',
 'FL_NUM_10',
 'FL_NUM_11',
 'FL_NUM_12',
 'Route_0',
 'Route_1',
 'Route_2',
 'Route_3',
 'Route_4',
 'Route_5',
 'Route_6',
 'Route_7',
 'Route_8',
 'Route_9',
 'Route_10',
 'ORIGIN_0',
 'ORIGIN_1',
 'ORIGIN_2',
 'ORIGIN_3',
 'ORIGIN_4',
 'ORIGIN_5',
 'ORIGIN_6',
 'ORIGIN_7',
 'DEST_0',
 'DEST_1',
 'DEST_2',
 'DEST_3',
 'DEST_4',
 'DEST_5',
 'DEST_6',
 'DEST_7',
 'CRS_ARR_TIME',
 'WHEELS_ON',
 'DISTANCE',
 'PASSENGERS',
 'EMPTOTAL',
 'NET_INCOME',
 'OP_REVENUES',
 'AIR_TIME']

In [24]:
labels

array([  0., -14.,  65., ..., -25.,  59., -25.])

# Split data into training and testing sets

In [35]:
# Using Scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25,
                                                                           random_state = 42)

In [47]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (203349, 50)
Training Labels Shape: (203349,)
Testing Features Shape: (67784, 50)
Testing Labels Shape: (67784,)


# Using RandomForestRegressor model to fit the data

In [36]:
import time
from sklearn.ensemble import RandomForestRegressor
start = time.time()
# Instantiate model 
rf = RandomForestRegressor(n_estimators= 20, random_state=42,
                           bootstrap = True)

# Train the model on training data
rf.fit(train_features, train_labels)

end = time.time()
print(f"{end-start} seconds")

208.08326268196106 seconds


# Evaluating the accuracy score on training and test data

In [39]:
y_pred = rf.predict(test_features)

rsqure_score = rf.score(train_features,train_labels)
rsqure_testscore = rf.score(test_features,test_labels)
print('R^2: {}'.format(rsqure_score))
print('R^2: {}'.format(rsqure_testscore))

R^2: 0.9890158474530054
R^2: 0.9111130869376716


# Perform the training using KFolds cross validation method

In [41]:
from sklearn.model_selection import cross_val_score, cross_val_predict
cv_r2_scores_rf = cross_val_score(rf, features, labels, cv=5,scoring='r2')
print(cv_r2_scores_rf)
print("Mean 5-Fold R Squared: {}".format(np.mean(cv_r2_scores_rf)))

[0.91745969 0.95401313 0.93622906 0.83432648 0.92340475]
Mean 5-Fold R Squared: 0.9130866229355872


# Calculating Mean Absolute Errors

In [42]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 5.6 degrees.


# Feature Importance

In [43]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 5)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: CRS_DEP_TIME         Importance: 0.67327
Variable: WHEELS_OFF           Importance: 0.2274
Variable: TAXI_OUT             Importance: 0.02758
Variable: CRS_ARR_TIME         Importance: 0.02217
Variable: AIR_TIME             Importance: 0.01178
Variable: PASSENGERS           Importance: 0.00516
Variable: DAY_OF_MONTH         Importance: 0.00472
Variable: DISTANCE             Importance: 0.00351
Variable: EMPTOTAL             Importance: 0.00291
Variable: DAY_OF_WEEK          Importance: 0.00263
Variable: MONTH                Importance: 0.00262
Variable: ORIGIN_6             Importance: 0.001
Variable: FL_NUM_4             Importance: 0.00084
Variable: OP_REVENUES          Importance: 0.00082
Variable: ORIGIN_5             Importance: 0.00078
Variable: Route_4              Importance: 0.00072
Variable: DEST_8               Importance: 0.00071
Variable: QUARTER              Importance: 0.0007
Variable: NET_INCOME           Importance: 0.00064
Variable: FL_NUM_7             Impo

# Store the trained model using pickle

In [44]:
import pickle

In [50]:
# save the model to disk
filename = 'modelYV.sav'
pickle.dump(rf, open(filename, 'wb'))

# Loading Quarter 3 July month data for testing

In [51]:
#Loaded American Airlines dataset to a dataframe from airline specific csv file
df=pd.read_csv('445660410_T_ONTIME_REPORTING.csv')

In [53]:
df=df.drop(columns=['Unnamed: 22'])

In [55]:
df=df.fillna(0)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 605979 entries, 0 to 605978
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   YEAR               605979 non-null  int64  
 1   QUARTER            605979 non-null  int64  
 2   MONTH              605979 non-null  int64  
 3   DAY_OF_MONTH       605979 non-null  int64  
 4   DAY_OF_WEEK        605979 non-null  int64  
 5   OP_UNIQUE_CARRIER  605979 non-null  object 
 6   OP_CARRIER         605979 non-null  object 
 7   TAIL_NUM           605979 non-null  object 
 8   OP_CARRIER_FL_NUM  605979 non-null  int64  
 9   ORIGIN_AIRPORT_ID  605979 non-null  int64  
 10  ORIGIN             605979 non-null  object 
 11  DEST_AIRPORT_ID    605979 non-null  int64  
 12  DEST               605979 non-null  object 
 13  CRS_DEP_TIME       605979 non-null  int64  
 14  TAXI_OUT           605979 non-null  float64
 15  WHEELS_OFF         605979 non-null  float64
 16  CR

In [58]:
import category_encoders as ce

encoder = ce.BinaryEncoder(cols=['OP_CARRIER','ORIGIN','DEST','OP_CARRIER_FL_NUM'])
df = encoder.fit_transform(df)

df.head()

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_0,OP_CARRIER_1,OP_CARRIER_2,OP_CARRIER_3,...,DEST_9,CRS_DEP_TIME,TAXI_OUT,WHEELS_OFF,CRS_ARR_TIME,ARR_DELAY,CANCELLED,DIVERTED,AIR_TIME,DISTANCE
0,2019,3,9,3,2,WN,0,0,0,0,...,1,605,18.0,624.0,735,-3.0,0.0,0.0,65.0,430.0
1,2019,3,9,3,2,WN,0,0,0,0,...,1,1635,6.0,1646.0,1800,-5.0,0.0,0.0,66.0,430.0
2,2019,3,9,3,2,WN,0,0,0,0,...,1,1040,12.0,1050.0,1205,-5.0,0.0,0.0,67.0,430.0
3,2019,3,9,3,2,WN,0,0,0,0,...,0,1935,9.0,1944.0,2025,-8.0,0.0,0.0,146.0,1069.0
4,2019,3,9,3,2,WN,0,0,0,0,...,1,1725,8.0,1729.0,1735,-1.0,0.0,0.0,48.0,249.0


In [74]:
encoder = ce.BinaryEncoder(cols=['OP_UNIQUE_CARRIER'])
df = encoder.fit_transform(df)

df.head()

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER_0,OP_UNIQUE_CARRIER_1,OP_UNIQUE_CARRIER_2,OP_UNIQUE_CARRIER_3,OP_UNIQUE_CARRIER_4,...,DEST_9,CRS_DEP_TIME,TAXI_OUT,WHEELS_OFF,CRS_ARR_TIME,ARR_DELAY,CANCELLED,DIVERTED,AIR_TIME,DISTANCE
0,2019,3,9,3,2,0,0,0,0,0,...,1,605,18.0,624.0,735,-3.0,0.0,0.0,65.0,430.0
1,2019,3,9,3,2,0,0,0,0,0,...,1,1635,6.0,1646.0,1800,-5.0,0.0,0.0,66.0,430.0
2,2019,3,9,3,2,0,0,0,0,0,...,1,1040,12.0,1050.0,1205,-5.0,0.0,0.0,67.0,430.0
3,2019,3,9,3,2,0,0,0,0,0,...,0,1935,9.0,1944.0,2025,-8.0,0.0,0.0,146.0,1069.0
4,2019,3,9,3,2,0,0,0,0,0,...,1,1725,8.0,1729.0,1735,-1.0,0.0,0.0,48.0,249.0


In [59]:
#Used label encoding
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
df.iloc[:,13] = labelencoder.fit_transform(df.iloc[:,13].values)
df.iloc[:,14] = labelencoder.fit_transform(df.iloc[:,14].values)
df.iloc[:,15] = labelencoder.fit_transform(df.iloc[:,15].values)
df.iloc[:,16] = labelencoder.fit_transform(df.iloc[:,16].values)

In [75]:
df.columns

Index(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK',
       'OP_UNIQUE_CARRIER_0', 'OP_UNIQUE_CARRIER_1', 'OP_UNIQUE_CARRIER_2',
       'OP_UNIQUE_CARRIER_3', 'OP_UNIQUE_CARRIER_4', 'OP_UNIQUE_CARRIER_5',
       'OP_CARRIER_0', 'OP_CARRIER_1', 'OP_CARRIER_2', 'OP_CARRIER_3',
       'OP_CARRIER_4', 'OP_CARRIER_5', 'TAIL_NUM', 'OP_CARRIER_FL_NUM_0',
       'OP_CARRIER_FL_NUM_1', 'OP_CARRIER_FL_NUM_2', 'OP_CARRIER_FL_NUM_3',
       'OP_CARRIER_FL_NUM_4', 'OP_CARRIER_FL_NUM_5', 'OP_CARRIER_FL_NUM_6',
       'OP_CARRIER_FL_NUM_7', 'OP_CARRIER_FL_NUM_8', 'OP_CARRIER_FL_NUM_9',
       'OP_CARRIER_FL_NUM_10', 'OP_CARRIER_FL_NUM_11', 'OP_CARRIER_FL_NUM_12',
       'OP_CARRIER_FL_NUM_13', 'ORIGIN_AIRPORT_ID', 'ORIGIN_0', 'ORIGIN_1',
       'ORIGIN_2', 'ORIGIN_3', 'ORIGIN_4', 'ORIGIN_5', 'ORIGIN_6', 'ORIGIN_7',
       'ORIGIN_8', 'ORIGIN_9', 'DEST_AIRPORT_ID', 'DEST_0', 'DEST_1', 'DEST_2',
       'DEST_3', 'DEST_4', 'DEST_5', 'DEST_6', 'DEST_7', 'DEST_8', 'DEST_9',
       'CRS_DEP_TI

In [76]:
# Use numpy to convert to arrays
import numpy as np

# Labels are the values we want to predict, which is ARRIVAL DELAY here
labels = np.array(df['ARR_DELAY'])

# Selecting the encoded variables as features 
features= df[['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK',
        'OP_CARRIER_0', 'OP_CARRIER_1', 'OP_CARRIER_2','OP_UNIQUE_CARRIER_0', 'OP_UNIQUE_CARRIER_1',
       'OP_CARRIER_3', 'OP_CARRIER_4', 'OP_CARRIER_5',
       'OP_CARRIER_FL_NUM_0', 'OP_CARRIER_FL_NUM_1', 'OP_CARRIER_FL_NUM_2',
       'OP_CARRIER_FL_NUM_3', 'OP_CARRIER_FL_NUM_4', 'OP_CARRIER_FL_NUM_5',
       'OP_CARRIER_FL_NUM_6', 'OP_CARRIER_FL_NUM_7', 'OP_CARRIER_FL_NUM_8',
       'OP_CARRIER_FL_NUM_9', 'OP_CARRIER_FL_NUM_10', 'OP_CARRIER_FL_NUM_11',
       'OP_CARRIER_FL_NUM_12', 'OP_CARRIER_FL_NUM_13',
       'ORIGIN_0', 'ORIGIN_1', 'ORIGIN_2', 'ORIGIN_3', 'ORIGIN_4', 'ORIGIN_5',
       'ORIGIN_6', 'ORIGIN_7', 'ORIGIN_8', 'ORIGIN_9', 
       'DEST_0', 'DEST_1', 'DEST_2', 'DEST_3', 'DEST_4', 'DEST_5', 'DEST_6',
       'DEST_7', 'DEST_8', 'DEST_9', 'CRS_DEP_TIME', 'TAXI_OUT', 'WHEELS_OFF',
       'CRS_ARR_TIME', 'CANCELLED', 'DIVERTED', 'AIR_TIME',
       'DISTANCE']]

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

In [83]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [84]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)

-8.274714737446542
