<img src="../images/airplane-symbol.jpg" style="float: left; margin: 20px;" width="50" height="50"> 
#  Predicting Flight Delays (<i>a Proof-of-Concept</i>)

Author: Solomon Heng

---

# 7e. Classification Modeling MCO

## Processes covered in this notebook:
1. [Importing dataset](#(1)-Importing-dataset)
2. [Setting X_train, X_test, y_train & y_test](#(2)-Setting-X_train,-X_test,-y_train-&-y_test)
3. [Model (I): Logistic Regression](#(3)-Model-(I):-Logistic-Regression)
4. [Model (II): RandomForest](#(4)-Model-(II):-RandomForest)
5. [Model (III): XGBoost](#(5)-Model-(III):-XGBoost)
6. [Model (IV): Neural Networks](#(6)-Model-(IV):-Neural-Networks)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras import backend as K

import xgboost as xgb

import pickle

sns.set()

Using TensorFlow backend.


---
### (1) Importing dataset

---

In [2]:
df = pd.read_csv('../datasets/combined_data_class_mco.csv')
df_test = pd.read_csv('../datasets/combined_data_class_test_mco.csv')

In [3]:
pd.set_option('display.max_columns', 100)
df.head()

Unnamed: 0,DEPARTURE_DELAY,SCHEDULED_TIME,LATE_AIRCRAFT_DELAY,QNH,dew_point,temp,visibility,windspd,SCHEDULED_ARRIVAL_MONTH_sin,SCHEDULED_ARRIVAL_MONTH_cos,SCHEDULED_ARRIVAL_HOUR_sin,SCHEDULED_ARRIVAL_HOUR_cos,AIRLINE_CODE_EV,AIRLINE_CODE_F9,AIRLINE_CODE_NK,AIRLINE_CODE_WN,DELAY
0,-0.325555,-0.829324,-0.19734,-0.114255,0.868599,0.685551,0.355234,-1.662574,-0.101151,-1.304766,0.911083,-1.017175,-0.012411,-0.196758,-0.195912,1.593847,0
1,-0.325555,0.130607,-0.19734,-0.30895,0.300318,0.230538,0.355234,-0.873113,-0.101151,1.458055,-0.46556,1.595281,-0.012411,-0.196758,-0.195912,-0.627413,0
2,-0.382248,-0.829324,-0.19734,-1.15263,0.679172,0.571798,0.355234,-0.083652,-0.101151,-1.304766,0.517827,-1.125521,-0.012411,5.082379,-0.195912,-0.627413,0
3,-0.070436,0.542006,-0.19734,-0.828138,0.395031,-0.110722,-2.880781,-0.083652,-0.101151,1.458055,1.538784,-0.515027,-0.012411,-0.196758,-0.195912,-0.627413,0
4,-0.297209,-0.829324,-0.19734,0.340035,0.963313,0.799304,0.355234,-0.873113,-0.831951,-1.119693,0.517827,-1.125521,-0.012411,-0.196758,-0.195912,-0.627413,0


In [4]:
df_test.head()

Unnamed: 0,DEPARTURE_DELAY,SCHEDULED_TIME,LATE_AIRCRAFT_DELAY,QNH,dew_point,temp,visibility,windspd,SCHEDULED_ARRIVAL_MONTH_sin,SCHEDULED_ARRIVAL_MONTH_cos,SCHEDULED_ARRIVAL_HOUR_sin,SCHEDULED_ARRIVAL_HOUR_cos,AIRLINE_CODE_EV,AIRLINE_CODE_F9,AIRLINE_CODE_NK,AIRLINE_CODE_WN,DELAY
0,-0.183823,0.953405,-0.19734,-0.049356,-0.17325,-0.338229,0.355234,0.968963,-0.831951,1.272981,-1.154643,0.635367,-0.012411,-0.196758,-0.195912,-0.627413,0
1,1.290198,0.130607,1.6463,0.210238,0.679172,0.458044,-2.880781,0.179502,-0.101151,1.458055,-0.782361,1.341846,-0.012411,-0.196758,-0.195912,-0.627413,1
2,0.241376,0.404873,-0.19734,-1.412224,-0.078537,0.116784,0.355234,-1.662574,1.164632,-0.614061,-0.28383,-1.017175,-0.012411,-0.196758,-0.195912,-0.627413,0
3,-0.325555,1.776202,-0.19734,0.859223,-0.078537,-0.565736,-2.797807,0.442656,1.360449,0.076644,-0.782361,1.341846,-0.012411,-0.196758,-0.195912,-0.627413,0
4,0.836653,-0.143659,1.199357,-0.568544,0.300318,-0.110722,0.355234,-1.662574,-0.101151,1.458055,-1.017878,1.013764,-0.012411,-0.196758,-0.195912,1.593847,1


---
### (2) Setting X_train, X_test, y_train & y_test

---

In [5]:
X_train = df.drop('DELAY', axis=1)
y_train = df['DELAY']
X_test = df_test.drop('DELAY', axis=1)
y_test = df_test['DELAY']

In [6]:
X_train.shape

(22044, 16)

In [7]:
X_test.shape

(1624, 16)

In [8]:
X_train.columns

Index(['DEPARTURE_DELAY', 'SCHEDULED_TIME', 'LATE_AIRCRAFT_DELAY', 'QNH',
       'dew_point', 'temp', 'visibility', 'windspd',
       'SCHEDULED_ARRIVAL_MONTH_sin', 'SCHEDULED_ARRIVAL_MONTH_cos',
       'SCHEDULED_ARRIVAL_HOUR_sin', 'SCHEDULED_ARRIVAL_HOUR_cos',
       'AIRLINE_CODE_EV', 'AIRLINE_CODE_F9', 'AIRLINE_CODE_NK',
       'AIRLINE_CODE_WN'],
      dtype='object')

In [9]:
len(X_train.columns)

16

---
### (3) Model (I): Logistic Regression

---

In [10]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
cross_val_score(lr, X_train, y_train).mean()

0.9001547527557682

In [12]:
lr.score(X_test, y_test)

0.9057881773399015

In [13]:
target_names = ['<15mins', '15mins to 1hr', '1 to 3hrs', '>3hrs']
print(classification_report(y_test, lr.predict(X_test),target_names=target_names))

               precision    recall  f1-score   support

      <15mins       0.97      0.94      0.96      1378
15mins to 1hr       0.54      0.69      0.61       167
    1 to 3hrs       0.74      0.76      0.75        66
        >3hrs       0.93      1.00      0.96        13

     accuracy                           0.91      1624
    macro avg       0.79      0.85      0.82      1624
 weighted avg       0.92      0.91      0.91      1624



In [14]:
# Exporting model

# lr_filename = '../models/classification_logreg_model.sav'
# pickle.dump(lr, open(lr_filename, 'wb'))

---
### (4) Model (II): RandomForest

---

In [15]:
rf = RandomForestClassifier()

In [16]:
rf_params = {
  "n_estimators":[50,100,200],
  "min_samples_split":[10,20,30],
  "max_depth":[5,10,20]
}

rf_cv = RandomizedSearchCV(rf, param_distributions=rf_params, scoring='f1_micro', n_iter=2, n_jobs=4, verbose=2)

In [17]:
rf_cv.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    7.1s finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
            

In [18]:
rf_cv.best_params_

{'n_estimators': 50, 'min_samples_split': 10, 'max_depth': 20}

In [19]:
opt_rf = RandomForestClassifier(n_estimators=100, min_samples_split=30, max_depth=20)

In [20]:
opt_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=30,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
cross_val_score(opt_rf, X_train, y_train).mean()

0.967746646337573

In [22]:
cross_val_score(opt_rf, X_train, y_train, scoring='f1_macro').mean()

0.9685587561491307

In [23]:
opt_rf.score(X_test, y_test)

0.9519704433497537

In [24]:
target_names = ['<15mins', '15mins to 1hr', '1 to 3hrs', '>3hrs']
print(classification_report(y_test, opt_rf.predict(X_test),target_names=target_names))

               precision    recall  f1-score   support

      <15mins       0.97      0.99      0.98      1378
15mins to 1hr       0.80      0.72      0.76       167
    1 to 3hrs       0.85      0.76      0.80        66
        >3hrs       1.00      1.00      1.00        13

     accuracy                           0.95      1624
    macro avg       0.91      0.87      0.89      1624
 weighted avg       0.95      0.95      0.95      1624



In [25]:
# Exporting model

rf_filename = '../models/classification_rf_model_mco.sav'
pickle.dump(opt_rf, open(rf_filename, 'wb'))

---
### (5) Model (III): XGBoost

---

In [26]:
xgbc = xgb.XGBClassifier()

In [27]:
xgbc_params = {
  "learning_rate":[0.01, 0.1, 0.2, 0.3],
  "max_depth":[3, 5, 10]
}

xgbc_cv = RandomizedSearchCV(xgbc, param_distributions=xgbc_params, scoring='f1_micro', n_iter=2, n_jobs=4, verbose=2)

In [28]:
xgbc_cv.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.1min finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=0, reg_alpha=0,
                                           reg_lambda=1, scale_pos_weight=1,
                                           seed=None, silent=None, subsample=1,
                                           verbosity=1),
                   iid=

In [29]:
xgbc_cv.best_params_

{'max_depth': 10, 'learning_rate': 0.1}

In [30]:
opt_xgb = xgb.XGBClassifier(max_depth=10, learning_rate=0.2)

In [31]:
opt_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.2, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [32]:
cross_val_score(opt_xgb, X_train, y_train).mean()

0.9850303310461731

In [33]:
opt_xgb.score(X_test, y_test)

0.9538177339901478

In [34]:
target_names = ['<15mins', '15mins to 1hr', '1 to 3hrs', '>3hrs']
print(classification_report(y_test, opt_xgb.predict(X_test),target_names=target_names))

               precision    recall  f1-score   support

      <15mins       0.97      0.99      0.98      1378
15mins to 1hr       0.84      0.70      0.76       167
    1 to 3hrs       0.85      0.76      0.80        66
        >3hrs       1.00      0.92      0.96        13

     accuracy                           0.95      1624
    macro avg       0.91      0.84      0.88      1624
 weighted avg       0.95      0.95      0.95      1624



In [35]:
# Exporting model

# xgb_filename = '../models/classification_xgb_model.sav'
# pickle.dump(opt_xgb, open(xgb_filename, 'wb'))

---
### (6) Model (IV): Neural Networks

---

In [36]:
from keras.utils import to_categorical

In [37]:
y_train_enc = to_categorical(y_train)
y_test_enc = to_categorical(y_test)

In [43]:
model = Sequential()

In [44]:
model.add(Dense(10, 
                input_dim=16, 
                activation='relu'))

# Dropout did not help improve accuracy in this case
# model.add(Dropout(0.5))

model.add(Dense(4, 
                activation='softmax'))

In [45]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 10)                170       
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 44        
Total params: 214
Trainable params: 214
Non-trainable params: 0
_________________________________________________________________


In [46]:
es = EarlyStopping(monitor='val_loss', patience=3)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [47]:
model.fit(X_train, y_train_enc, batch_size=8, epochs=200, validation_split=0.2, callbacks=[es])

Train on 17635 samples, validate on 4409 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200


<keras.callbacks.callbacks.History at 0x1bf0cba8588>

In [48]:
model.evaluate(X_test, y_test_enc)



[0.2916400658673254, 0.9113300442695618]

In [49]:
y_nn_pred = model.predict(X_test).argmax(axis=-1)
target_names = ['<1hr', '1 to 2hrs', '2 to 3hrs', '>3hrs']

print(classification_report(y_test, y_nn_pred, target_names=target_names))

              precision    recall  f1-score   support

        <1hr       0.98      0.93      0.96      1378
   1 to 2hrs       0.55      0.79      0.65       167
   2 to 3hrs       0.83      0.76      0.79        66
       >3hrs       0.93      1.00      0.96        13

    accuracy                           0.91      1624
   macro avg       0.82      0.87      0.84      1624
weighted avg       0.93      0.91      0.92      1624



In [50]:
# Exporting model

# nn_filename = '../models/classification_nn_model.sav'
# pickle.dump(model, open(nn_filename, 'wb'))