<a href="https://www.kaggle.com/code/samithsachidanandan/airline-flight-delay-prediction?scriptVersionId=271049550" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

### Importing Libraries 

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

import xgboost as xgb 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, roc_curve 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2


import warnings 

warnings.filterwarnings("ignore")

### Import Data 

In [2]:
df = pd.read_csv('/kaggle/input/flight-delay-and-causes/Flight_delay.csv')

In [3]:
pd.set_option('display.max_columns',None)
df.head()

Unnamed: 0,DayOfWeek,Date,DepTime,ArrTime,CRSArrTime,UniqueCarrier,Airline,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Org_Airport,Dest,Dest_Airport,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,4,03-01-2019,1829,1959,1925,WN,Southwest Airlines Co.,3920,N464WN,90,90,77,34,34,IND,Indianapolis International Airport,BWI,Baltimore-Washington International Airport,515,3,10,0,N,0,2,0,0,0,32
1,4,03-01-2019,1937,2037,1940,WN,Southwest Airlines Co.,509,N763SW,240,250,230,57,67,IND,Indianapolis International Airport,LAS,McCarran International Airport,1591,3,7,0,N,0,10,0,0,0,47
2,4,03-01-2019,1644,1845,1725,WN,Southwest Airlines Co.,1333,N334SW,121,135,107,80,94,IND,Indianapolis International Airport,MCO,Orlando International Airport,828,6,8,0,N,0,8,0,0,0,72
3,4,03-01-2019,1452,1640,1625,WN,Southwest Airlines Co.,675,N286WN,228,240,213,15,27,IND,Indianapolis International Airport,PHX,Phoenix Sky Harbor International Airport,1489,7,8,0,N,0,3,0,0,0,12
4,4,03-01-2019,1323,1526,1510,WN,Southwest Airlines Co.,4,N674AA,123,135,110,16,28,IND,Indianapolis International Airport,TPA,Tampa International Airport,838,4,9,0,N,0,0,0,0,0,16


### Basic Stats

In [4]:
df.shape

(484551, 29)

In [5]:
df['CarrierDelay'].value_counts()

CarrierDelay
0       226046
6        10034
7         9409
1         8899
2         8855
         ...  
559          1
898          1
903          1
666          1
1707         1
Name: count, Length: 708, dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 484551 entries, 0 to 484550
Data columns (total 29 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   DayOfWeek          484551 non-null  int64 
 1   Date               484551 non-null  object
 2   DepTime            484551 non-null  int64 
 3   ArrTime            484551 non-null  int64 
 4   CRSArrTime         484551 non-null  int64 
 5   UniqueCarrier      484551 non-null  object
 6   Airline            484551 non-null  object
 7   FlightNum          484551 non-null  int64 
 8   TailNum            484551 non-null  object
 9   ActualElapsedTime  484551 non-null  int64 
 10  CRSElapsedTime     484551 non-null  int64 
 11  AirTime            484551 non-null  int64 
 12  ArrDelay           484551 non-null  int64 
 13  DepDelay           484551 non-null  int64 
 14  Origin             484551 non-null  object
 15  Org_Airport        483374 non-null  object
 16  Dest               4

In [7]:
df.describe()

Unnamed: 0,DayOfWeek,DepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
count,484551.0,484551.0,484551.0,484551.0,484551.0,484551.0,484551.0,484551.0,484551.0,484551.0,484551.0,484551.0,484551.0,484551.0,484551.0,484551.0,484551.0,484551.0,484551.0,484551.0
mean,3.991605,1564.477865,1617.784438,1652.129929,2139.207386,134.810422,131.400761,108.877134,60.907764,57.498086,752.142689,6.782413,19.150876,0.0,0.0,17.41944,3.153284,13.599421,0.082033,26.653587
std,1.971466,452.235219,583.63766,466.096216,1812.677071,74.070374,71.542531,70.113513,56.97542,55.991012,571.631124,5.555816,15.309747,0.0,0.0,39.417893,19.503657,31.454655,1.884774,40.535994
min,1.0,1.0,1.0,1.0,1.0,15.0,-21.0,0.0,15.0,6.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,1234.0,1327.0,1339.0,629.0,80.0,79.0,57.0,25.0,23.0,331.0,4.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,1620.0,1737.0,1723.0,1514.0,116.0,114.0,90.0,42.0,40.0,599.0,5.0,15.0,0.0,0.0,2.0,0.0,1.0,0.0,13.0
75%,6.0,1928.0,2049.0,2025.0,3683.0,168.0,162.0,139.0,76.0,72.0,992.0,8.0,22.0,0.0,0.0,19.0,0.0,13.0,0.0,36.0
max,7.0,2400.0,2400.0,2359.0,8403.0,727.0,602.0,609.0,1707.0,1710.0,4502.0,207.0,383.0,0.0,0.0,1707.0,1148.0,1357.0,392.0,1254.0


### Data Cleaning & Preprocessing

In [8]:
df.duplicated().sum()

2

In [9]:
df.drop_duplicates(inplace=True)

In [10]:
df.isnull().sum()

DayOfWeek               0
Date                    0
DepTime                 0
ArrTime                 0
CRSArrTime              0
UniqueCarrier           0
Airline                 0
FlightNum               0
TailNum                 0
ActualElapsedTime       0
CRSElapsedTime          0
AirTime                 0
ArrDelay                0
DepDelay                0
Origin                  0
Org_Airport          1177
Dest                    0
Dest_Airport         1479
Distance                0
TaxiIn                  0
TaxiOut                 0
Cancelled               0
CancellationCode        0
Diverted                0
CarrierDelay            0
WeatherDelay            0
NASDelay                0
SecurityDelay           0
LateAircraftDelay       0
dtype: int64

In [11]:
df.columns

Index(['DayOfWeek', 'Date', 'DepTime', 'ArrTime', 'CRSArrTime',
       'UniqueCarrier', 'Airline', 'FlightNum', 'TailNum', 'ActualElapsedTime',
       'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Origin',
       'Org_Airport', 'Dest', 'Dest_Airport', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],
      dtype='object')

### Feature Engineering 

In [12]:
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

In [13]:
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

In [14]:
df= df.drop(columns=['Date','FlightNum','UniqueCarrier','CancellationCode','TailNum','Origin','Dest','Org_Airport', 'Dest_Airport' ])

In [15]:
df['CarrierDelay'] = df['CarrierDelay'].apply(lambda x: 1 if x != 0 else 0)

In [16]:
df['CarrierDelay'].value_counts()


CarrierDelay
1    258503
0    226046
Name: count, dtype: int64

In [17]:
categorical_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
categorical_cols

['Airline']

In [18]:
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [col for col in numeric_cols if col != 'CarrierDelay']
numeric_cols

['DayOfWeek',
 'DepTime',
 'ArrTime',
 'CRSArrTime',
 'ActualElapsedTime',
 'CRSElapsedTime',
 'AirTime',
 'ArrDelay',
 'DepDelay',
 'Distance',
 'TaxiIn',
 'TaxiOut',
 'Cancelled',
 'Diverted',
 'WeatherDelay',
 'NASDelay',
 'SecurityDelay',
 'LateAircraftDelay']

### Train Test Split 

In [19]:

X = df.drop('CarrierDelay', axis=1)  
y = df['CarrierDelay']


X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [20]:
X_train.head()

Unnamed: 0,DayOfWeek,DepTime,ArrTime,CRSArrTime,Airline,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,Month,Day
408657,6,1442,1705,1640,Southwest Airlines Co.,143,155,131,25,37,1011,6,6,0,0,0,0,0,23,6,7
181987,7,1737,1848,1715,Southwest Airlines Co.,71,70,60,93,92,288,3,8,0,0,0,1,0,90,3,9
434881,1,1113,1224,1205,Skywest Airlines Inc.,71,69,50,19,17,316,4,17,0,0,0,2,0,0,6,30
261133,2,2250,150,55,JetBlue Airways,180,180,154,55,55,1069,4,22,0,0,0,0,0,55,3,4
117330,1,955,1041,1026,Skywest Airlines Inc.,46,43,22,15,12,86,3,21,0,0,0,0,0,0,2,25


In [21]:
X_train.columns

Index(['DayOfWeek', 'DepTime', 'ArrTime', 'CRSArrTime', 'Airline',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'Diverted',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'Month', 'Day'],
      dtype='object')

In [22]:
from sklearn import set_config
set_config(display='diagram')

In [23]:
preprocessor = ColumnTransformer([
    ('ohe_airline', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), ['Airline']),
    ('scale_numeric', MinMaxScaler(), numeric_cols)
])

In [24]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_select', SelectKBest(score_func=chi2, k=10)),
    ('classifier', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

In [25]:
pipe.fit(X_train, y_train)


In [26]:
y_pred = pipe.predict(X_test)

In [27]:
y_pred

array([0, 0, 0, ..., 0, 1, 0])

In [28]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test,y_pred)

0.7791662367144774

### Cross Validation using Pipeline 

In [29]:
from sklearn.model_selection import cross_val_score

cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.7808295808872932

### Grid Search using Pipeline 

In [30]:
params = {
    'classifier__max_depth': [3, 5, 7],
    'classifier__n_estimators': [50, 100]
}

In [31]:
grid = GridSearchCV(pipe,params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

In [32]:
grid.best_score_

0.7812449167319497

In [33]:
grid.best_params_ 

{'classifier__max_depth': 7, 'classifier__n_estimators': 50}