In [1]:
# Author: Roi Yehoshua <roiyeho@gmail.com>
# License: MIT

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Seed NumPy's legacy global RNG so any NumPy-driven randomness is repeatable.
np.random.seed(0)

In [2]:
# data source: https://www.kaggle.com/datasets/jimschacko/airlines-dataset-to-predict-a-delay
# Read the flights table and use its `id` column as the row index.
df = pd.read_csv(
    'data/airlines.csv',
    index_col='id',
)
df.head()

Unnamed: 0_level_0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,CO,269,SFO,IAH,3,15,205,1
2,US,1558,PHX,CLT,3,15,222,1
3,AA,2400,LAX,DFW,3,20,165,1
4,AA,2466,SFO,DFW,3,20,195,1
5,AS,108,ANC,SEA,3,30,202,0


Exploratory Data Analysis (EDA)

In [3]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 539383 entries, 1 to 539383
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Airline      539383 non-null  object
 1   Flight       539383 non-null  int64 
 2   AirportFrom  539383 non-null  object
 3   AirportTo    539383 non-null  object
 4   DayOfWeek    539383 non-null  int64 
 5   Time         539383 non-null  int64 
 6   Length       539383 non-null  int64 
 7   Delay        539383 non-null  int64 
dtypes: int64(5), object(3)
memory usage: 37.0+ MB


In [4]:
# Class balance of the `Delay` column (used below as the target).
df['Delay'].value_counts()

0    299119
1    240264
Name: Delay, dtype: int64

Data Preparation

In [5]:
# Separate the target column from the remaining feature columns.
y = df['Delay']
X = df.drop(columns=['Delay'])

In [6]:
# Mark the string-valued columns as pandas categoricals, one column at a time.
for col in ('Airline', 'AirportFrom', 'AirportTo'):
    X[col] = X[col].astype('category')

In [7]:
# Confirm the three string columns are now `category` dtype.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 539383 entries, 1 to 539383
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype   
---  ------       --------------   -----   
 0   Airline      539383 non-null  category
 1   Flight       539383 non-null  int64   
 2   AirportFrom  539383 non-null  category
 3   AirportTo    539383 non-null  category
 4   DayOfWeek    539383 non-null  int64   
 5   Time         539383 non-null  int64   
 6   Length       539383 non-null  int64   
dtypes: category(3), int64(4)
memory usage: 23.2 MB


In [9]:
from sklearn.model_selection import train_test_split

# Shuffle and hold out a test set; no sizes are passed, so scikit-learn's
# default split proportions apply. The fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [18]:
from xgboost import XGBClassifier

# Gradient-boosted trees that consume the pandas `category` columns directly
# (enable_categorical=True) rather than going through a manual encoding step.
# NOTE(review): 'gpu_hist' presumably needs a CUDA-enabled XGBoost build, and
# newer XGBoost releases may expect tree_method='hist' with device='cuda'
# instead -- confirm against the installed version.
xgb_params = dict(
    tree_method='gpu_hist',
    enable_categorical=True,
    random_state=0,
)
xgb_clf = XGBClassifier(**xgb_params)
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=True, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=0, ...)

In [11]:
# Score the XGBoost model on both splits, then report to four decimals.
xgb_train_acc = xgb_clf.score(X_train, y_train)
xgb_test_acc = xgb_clf.score(X_test, y_test)
print(f'Train accuracy: {xgb_train_acc:.4f}')
print(f'Test accuracy: {xgb_test_acc:.4f}')

Train accuracy: 0.7019
Test accuracy: 0.6697


In [12]:
# Number of flights per origin airport, most frequent first.
X['AirportFrom'].value_counts()

ATL    34449
ORD    24822
DFW    22154
DEN    19843
LAX    16657
       ...  
MMH       16
SJT       15
GUM       10
ADK        9
ABR        2
Name: AirportFrom, Length: 293, dtype: int64

In [13]:
import inspect

from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import HistGradientBoostingClassifier

# OneHotEncoder's dense-output flag was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was later removed.
# Pick whichever spelling the installed version accepts so this cell runs
# on both old and new releases.
dense_flag = (
    'sparse_output'
    if 'sparse_output' in inspect.signature(OneHotEncoder).parameters
    else 'sparse'
)

# One-hot encode every `category`-dtype column (categories not seen during
# fit are ignored rather than raising) and pass the other columns through
# unchanged.
one_hot_encoder = make_column_transformer(
    (
        OneHotEncoder(handle_unknown='ignore', **{dense_flag: False}),
        make_column_selector(dtype_include='category'),
    ),
    remainder='passthrough',
)

# Pipeline: one-hot preprocessing followed by histogram-based gradient boosting.
hist_gboost_clf = make_pipeline(
    one_hot_encoder, HistGradientBoostingClassifier(random_state=0)
)

In [14]:
# Fit the encoder and the boosting model on the training split.
hist_gboost_clf.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001C2D93B7760>)])),
                ('histgradientboostingclassifier',
                 HistGradientBoostingClassifier(random_state=0))])

In [75]:
# Score the one-hot + HistGradientBoosting pipeline on both splits.
hgb_train_acc = hist_gboost_clf.score(X_train, y_train)
hgb_test_acc = hist_gboost_clf.score(X_test, y_test)
print(f'Train accuracy: {hgb_train_acc:.4f}')
print(f'Test accuracy: {hgb_test_acc:.4f}')

Train accuracy: 0.6637
Test accuracy: 0.6624


In [76]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

# Use a fresh, unfitted copy of the encoder: make_pipeline stores the object
# it is given, so passing `one_hot_encoder` itself would make this pipeline
# and `hist_gboost_clf` share (and refit) a single transformer instance.
rf_clf = make_pipeline(
    clone(one_hot_encoder), RandomForestClassifier(random_state=0, n_jobs=-1)
)

rf_clf.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001D387DF0730>)])),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=0))])

In [77]:
# Score the one-hot + random forest pipeline on both splits.
rf_train_acc = rf_clf.score(X_train, y_train)
rf_test_acc = rf_clf.score(X_test, y_test)
print(f'Train accuracy: {rf_train_acc:.4f}')
print(f'Test accuracy: {rf_test_acc:.4f}')

Train accuracy: 0.8305
Test accuracy: 0.6193
