<img src="../images/airplane-symbol.jpg" style="float: left; margin: 20px;" width="50" height="50"> 
#  Predicting Flight Delays (<i>a Proof-of-Concept</i>)

Author: Solomon Heng

---

# 7c. Classification Modeling PHL

## Processes covered in this notebook:
1. [Importing dataset](#(1)-Importing-dataset)
2. [Setting X_train, X_test, y_train & y_test](#(2)-Setting-X_train,-X_test,-y_train-&-y_test)
3. [Model (I): Logistic Regression](#(3)-Model-(I):-Logistic-Regression)
4. [Model (II): RandomForest](#(4)-Model-(II):-RandomForest)
5. [Model (III): XGBoost](#(5)-Model-(III):-XGBoost)
6. [Model (IV): Neural Networks](#(6)-Model-(IV):-Neural-Networks)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras import backend as K

import xgboost as xgb

import pickle

sns.set()

Using TensorFlow backend.


---
### (1) Importing dataset

---

In [2]:
df = pd.read_csv('../datasets/combined_data_class_phl.csv')
df_test = pd.read_csv('../datasets/combined_data_class_test_phl.csv')

In [3]:
pd.set_option('display.max_columns', 100)
df.head()

Unnamed: 0,DEPARTURE_DELAY,LATE_AIRCRAFT_DELAY,QNH,dew_point,temp,visibility,NUM_ARR_AVG_3HOUR,SCHEDULED_ARRIVAL_HOUR_sin,SCHEDULED_ARRIVAL_HOUR_cos,AIRLINE_CODE_DL,AIRLINE_CODE_F9,AIRLINE_CODE_NK,AIRLINE_CODE_US,AIRLINE_CODE_WN,DELAY
0,-0.360432,-0.229425,-0.811446,-0.020557,-0.15768,0.354861,0.027284,-1.052649,-0.190386,-1.081797,-0.197911,-0.165112,-0.387541,-0.403795,0
1,1.679034,2.445886,-0.285887,0.654788,0.640921,0.354861,-3.223399,0.483358,1.706147,-1.081797,5.052783,-0.165112,-0.387541,-0.403795,2
2,-0.406783,-0.229425,0.239673,0.847744,0.869092,0.354861,0.230452,-1.142971,0.18844,0.924388,-0.197911,-0.165112,-0.387541,-0.403795,1
3,-0.383608,-0.229425,-0.285887,0.55831,0.526835,0.354861,0.749658,1.512112,-0.816698,-1.081797,-0.197911,6.056497,-0.387541,-0.403795,0
4,-0.290905,-0.229425,1.487877,-0.985336,-1.070366,0.354861,-0.130735,-1.052649,-0.190386,0.924388,-0.197911,-0.165112,-0.387541,-0.403795,0


In [4]:
df_test.head()

Unnamed: 0,DEPARTURE_DELAY,LATE_AIRCRAFT_DELAY,QNH,dew_point,temp,visibility,NUM_ARR_AVG_3HOUR,SCHEDULED_ARRIVAL_HOUR_sin,SCHEDULED_ARRIVAL_HOUR_cos,AIRLINE_CODE_DL,AIRLINE_CODE_F9,AIRLINE_CODE_NK,AIRLINE_CODE_US,AIRLINE_CODE_WN,DELAY
0,-0.337256,-0.229425,0.239673,-0.117035,0.526835,0.354861,-0.130735,2.019364,-0.190386,0.924388,-0.197911,-0.165112,-0.387541,-0.403795,0
1,-0.244553,-0.229425,1.487877,-0.599425,-0.728109,0.354861,0.2756,-0.707999,1.255134,-1.081797,-0.197911,-0.165112,-0.387541,-0.403795,1
2,-0.244553,-0.229425,-0.942836,0.461832,0.298663,0.354861,0.862529,-0.545397,-0.816698,0.924388,-0.197911,-0.165112,-0.387541,-0.403795,0
3,0.358016,-0.229425,2.538996,-1.853638,-0.95628,0.354861,-2.252709,0.043554,1.653462,-1.081797,-0.197911,-0.165112,-0.387541,-0.403795,0
4,-0.267729,-0.229425,-0.614361,0.268877,0.983178,0.354861,0.885104,-0.707999,1.255134,0.924388,-0.197911,-0.165112,-0.387541,-0.403795,0


---
### (2) Setting X_train, X_test, y_train & y_test

---

In [5]:
X_train = df.drop('DELAY', axis=1)
y_train = df['DELAY']
X_test = df_test.drop('DELAY', axis=1)
y_test = df_test['DELAY']

In [6]:
X_train.shape

(16668, 14)

In [7]:
X_test.shape

(1300, 14)

In [8]:
X_train.columns

Index(['DEPARTURE_DELAY', 'LATE_AIRCRAFT_DELAY', 'QNH', 'dew_point', 'temp',
       'visibility', 'NUM_ARR_AVG_3HOUR', 'SCHEDULED_ARRIVAL_HOUR_sin',
       'SCHEDULED_ARRIVAL_HOUR_cos', 'AIRLINE_CODE_DL', 'AIRLINE_CODE_F9',
       'AIRLINE_CODE_NK', 'AIRLINE_CODE_US', 'AIRLINE_CODE_WN'],
      dtype='object')

In [9]:
len(X_train.columns)

14

---
### (3) Model (I): Logistic Regression

---

In [10]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
cross_val_score(lr, X_train, y_train).mean()

0.8362133513891281

In [12]:
lr.score(X_test, y_test)

0.8353846153846154

In [13]:
target_names = ['<15mins', '15mins to 1hr', '1 to 3hrs', '>3hrs']
print(classification_report(y_test, lr.predict(X_test),target_names=target_names))

               precision    recall  f1-score   support

      <15mins       0.94      0.89      0.92      1041
15mins to 1hr       0.43      0.53      0.48       175
    1 to 3hrs       0.63      0.74      0.68        70
        >3hrs       0.68      0.93      0.79        14

     accuracy                           0.84      1300
    macro avg       0.67      0.77      0.71      1300
 weighted avg       0.85      0.84      0.84      1300



In [14]:
# Exporting model

# lr_filename = '../models/classification_logreg_model.sav'
# pickle.dump(lr, open(lr_filename, 'wb'))

---
### (4) Model (II): RandomForest

---

In [15]:
rf = RandomForestClassifier()

In [16]:
rf_params = {
  "n_estimators":[50,100,200],
  "min_samples_split":[5,10,20],
  "max_depth":[5,10,20]
}

rf_cv = RandomizedSearchCV(rf, param_distributions=rf_params, scoring='f1_micro', n_iter=2, n_jobs=4, verbose=2)

In [17]:
rf_cv.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    8.2s finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
            

In [18]:
rf_cv.best_params_

{'n_estimators': 100, 'min_samples_split': 5, 'max_depth': 20}

In [19]:
opt_rf = RandomForestClassifier(n_estimators=50, min_samples_split=20, max_depth=10)

In [20]:
opt_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
cross_val_score(opt_rf, X_train, y_train).mean()

0.9023889191558627

In [22]:
cross_val_score(opt_rf, X_train, y_train, scoring='f1_macro').mean()

0.9043908448857729

In [23]:
opt_rf.score(X_test, y_test)

0.8815384615384615

In [24]:
target_names = ['<15mins', '15mins to 1hr', '1 to 3hrs', '>3hrs']
print(classification_report(y_test, opt_rf.predict(X_test),target_names=target_names))

               precision    recall  f1-score   support

      <15mins       0.93      0.96      0.94      1041
15mins to 1hr       0.61      0.50      0.55       175
    1 to 3hrs       0.69      0.76      0.72        70
        >3hrs       0.73      0.79      0.76        14

     accuracy                           0.88      1300
    macro avg       0.74      0.75      0.74      1300
 weighted avg       0.87      0.88      0.88      1300



In [25]:
# Exporting model

rf_filename = '../models/classification_rf_model_phl.sav'
pickle.dump(opt_rf, open(rf_filename, 'wb'))

---
### (5) Model (III): XGBoost

---

In [26]:
xgbc = xgb.XGBClassifier()

In [27]:
xgbc_params = {
  "learning_rate":[0.01, 0.1, 0.2, 0.3],
  "max_depth":[3, 5, 10]
}

xgbc_cv = RandomizedSearchCV(xgbc, param_distributions=xgbc_params, scoring='f1_micro', n_iter=2, n_jobs=4, verbose=2)

In [28]:
xgbc_cv.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   42.9s finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=0, reg_alpha=0,
                                           reg_lambda=1, scale_pos_weight=1,
                                           seed=None, silent=None, subsample=1,
                                           verbosity=1),
                   iid=

In [29]:
xgbc_cv.best_params_

{'max_depth': 10, 'learning_rate': 0.3}

In [30]:
opt_xgb = xgb.XGBClassifier(max_depth=5, learning_rate=0.3)

In [31]:
opt_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.3, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [32]:
cross_val_score(opt_xgb, X_train, y_train).mean()

0.9604049847096288

In [33]:
opt_xgb.score(X_test, y_test)

0.91

In [34]:
target_names = ['<15mins', '15mins to 1hr', '1 to 3hrs', '>3hrs']
print(classification_report(y_test, opt_xgb.predict(X_test),target_names=target_names))

               precision    recall  f1-score   support

      <15mins       0.93      0.98      0.96      1041
15mins to 1hr       0.75      0.55      0.63       175
    1 to 3hrs       0.83      0.77      0.80        70
        >3hrs       0.87      0.93      0.90        14

     accuracy                           0.91      1300
    macro avg       0.85      0.81      0.82      1300
 weighted avg       0.90      0.91      0.90      1300



In [35]:
# Exporting model

# xgb_filename = '../models/classification_xgb_model.sav'
# pickle.dump(opt_xgb, open(xgb_filename, 'wb'))

---
### (6) Model (IV): Neural Networks

---

In [36]:
from keras.utils import to_categorical

In [37]:
y_train_enc = to_categorical(y_train)
y_test_enc = to_categorical(y_test)

In [43]:
model = Sequential()

In [44]:
model.add(Dense(10, 
                input_dim=14, 
                activation='relu'))

# Dropout did not help improve accuracy in this case
# model.add(Dropout(0.5))

model.add(Dense(10, 
                activation='relu'))

model.add(Dense(4, 
                activation='softmax'))

In [45]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 10)                150       
_________________________________________________________________
dense_4 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_5 (Dense)              (None, 4)                 44        
Total params: 304
Trainable params: 304
Non-trainable params: 0
_________________________________________________________________


In [46]:
es = EarlyStopping(monitor='val_loss', patience=3)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [47]:
model.fit(X_train, y_train_enc, batch_size=8, epochs=200, validation_split=0.2, callbacks=[es])

Train on 13334 samples, validate on 3334 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200


<keras.callbacks.callbacks.History at 0x244252c7e88>

In [48]:
model.evaluate(X_test, y_test_enc)



[0.4594224968323341, 0.8361538648605347]

In [49]:
y_nn_pred = model.predict(X_test).argmax(axis=-1)
target_names = ['<1hr', '1 to 2hrs', '2 to 3hrs', '>3hrs']

print(classification_report(y_test, y_nn_pred, target_names=target_names))

              precision    recall  f1-score   support

        <1hr       0.95      0.88      0.91      1041
   1 to 2hrs       0.44      0.62      0.52       175
   2 to 3hrs       0.69      0.73      0.71        70
       >3hrs       0.81      0.93      0.87        14

    accuracy                           0.84      1300
   macro avg       0.72      0.79      0.75      1300
weighted avg       0.87      0.84      0.85      1300



In [50]:
# Exporting model

# nn_filename = '../models/classification_nn_model.sav'
# pickle.dump(model, open(nn_filename, 'wb'))