<img src="../images/airplane-symbol.jpg" style="float: left; margin: 20px;" width="50" height="50"> 
#  Predicting Flight Delays (<i>a Proof-of-Concept</i>)

Author: Solomon Heng

---

# 7d. Classification Modeling DFW

## Processes covered in this notebook:
1. [Importing dataset](#(1)-Importing-dataset)
2. [Setting X_train, X_test, y_train & y_test](#(2)-Setting-X_train,-X_test,-y_train-&-y_test)
3. [Model (I): Logistic Regression](#(3)-Model-(I):-Logistic-Regression)
4. [Model (II): RandomForest](#(4)-Model-(II):-RandomForest)
5. [Model (III): XGBoost](#(5)-Model-(III):-XGBoost)
6. [Model (IV): Neural Networks](#(6)-Model-(IV):-Neural-Networks)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras import backend as K

import xgboost as xgb

import pickle

sns.set()

Using TensorFlow backend.


---
### (1) Importing dataset

---

In [2]:
df = pd.read_csv('../datasets/combined_data_class_dfw.csv')
df_test = pd.read_csv('../datasets/combined_data_class_test_dfw.csv')

In [3]:
pd.set_option('display.max_columns', 100)
df.head()

Unnamed: 0,DEPARTURE_DELAY,LATE_AIRCRAFT_DELAY,dew_point,squall,temp,visibility,NUM_ARR_AVG_3HOUR,crosswind_comp,SCHEDULED_ARRIVAL_MONTH_sin,SCHEDULED_ARRIVAL_MONTH_cos,SCHEDULED_ARRIVAL_HOUR_sin,SCHEDULED_ARRIVAL_HOUR_cos,DELAY
0,-0.366729,-0.200019,0.040071,-0.724109,0.809347,0.366635,0.821866,0.433515,0.580484,-1.122557,-1.069125,0.545816,0
1,-0.366729,-0.200019,0.87629,1.381008,0.809347,0.366635,-0.271767,-1.108409,-1.400841,-0.61033,1.039344,-1.119966,0
2,1.438444,-0.200019,-0.796147,-0.724109,-1.069316,0.366635,-0.140531,-0.148886,0.580484,1.301324,-0.999508,-0.205142,1
3,1.562086,-0.200019,0.87629,-0.724109,1.140876,0.366635,0.450031,0.247536,-0.869946,-1.122557,-1.069125,0.545816,1
4,-0.218358,-0.200019,0.690463,-0.724109,0.477818,0.366635,-0.271767,0.090995,0.580484,-1.122557,1.039344,-1.119966,0


In [4]:
df_test.head()

Unnamed: 0,DEPARTURE_DELAY,LATE_AIRCRAFT_DELAY,dew_point,squall,temp,visibility,NUM_ARR_AVG_3HOUR,crosswind_comp,SCHEDULED_ARRIVAL_MONTH_sin,SCHEDULED_ARRIVAL_MONTH_cos,SCHEDULED_ARRIVAL_HOUR_sin,SCHEDULED_ARRIVAL_HOUR_cos,DELAY
0,-0.292544,-0.200019,0.690463,-0.724109,0.698838,0.366635,0.36254,0.457301,-0.869946,-1.122557,-1.104261,0.166807,0
1,-0.366729,-0.200019,0.969203,1.381008,0.809347,0.366635,0.340667,-0.388767,-1.595161,0.089384,0.524477,-1.119966,0
2,-0.292544,-0.200019,4.035337,-0.724109,-0.406258,0.366635,-0.118658,-1.108409,1.111378,-0.61033,1.039344,-1.119966,0
3,-0.243087,-0.200019,0.318811,-0.724109,0.477818,0.366635,0.931229,0.927014,0.580484,-1.122557,-0.762635,-0.542444,0
4,-0.342,-0.200019,0.87629,-0.724109,0.698838,0.366635,1.34681,-0.890902,-0.869946,-1.122557,-0.762635,-0.542444,0


---
### (2) Setting X_train, X_test, y_train & y_test

---

In [5]:
X_train = df.drop('DELAY', axis=1)
y_train = df['DELAY']
X_test = df_test.drop('DELAY', axis=1)
y_test = df_test['DELAY']

In [6]:
X_train.shape

(18140, 12)

In [7]:
X_test.shape

(1387, 12)

In [8]:
X_train.columns

Index(['DEPARTURE_DELAY', 'LATE_AIRCRAFT_DELAY', 'dew_point', 'squall', 'temp',
       'visibility', 'NUM_ARR_AVG_3HOUR', 'crosswind_comp',
       'SCHEDULED_ARRIVAL_MONTH_sin', 'SCHEDULED_ARRIVAL_MONTH_cos',
       'SCHEDULED_ARRIVAL_HOUR_sin', 'SCHEDULED_ARRIVAL_HOUR_cos'],
      dtype='object')

In [9]:
len(X_train.columns)

12

---
### (3) Model (I): Logistic Regression

---

In [10]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
cross_val_score(lr, X_train, y_train).mean()

0.8767916207276736

In [13]:
lr.score(X_test, y_test)

0.8781542898341744

In [15]:
target_names = ['<15mins', '15mins to 1hr', '1 to 3hrs', '>3hrs']
print(classification_report(y_test, lr.predict(X_test),target_names=target_names))

               precision    recall  f1-score   support

      <15mins       0.96      0.92      0.94      1135
15mins to 1hr       0.50      0.63      0.56       163
    1 to 3hrs       0.67      0.74      0.70        72
        >3hrs       0.77      1.00      0.87        17

     accuracy                           0.88      1387
    macro avg       0.73      0.82      0.77      1387
 weighted avg       0.89      0.88      0.88      1387



In [20]:
# Exporting model

# lr_filename = '../models/classification_logreg_model.sav'
# pickle.dump(lr, open(lr_filename, 'wb'))

---
### (4) Model (II): RandomForest

---

In [21]:
rf = RandomForestClassifier()

In [22]:
rf_params = {
  "n_estimators":[50,100,200],
  "min_samples_split":[10,20,30],
  "max_depth":[10,15]
}

rf_cv = RandomizedSearchCV(rf, param_distributions=rf_params, scoring='f1_micro', n_iter=2, n_jobs=4, verbose=2)

In [23]:
rf_cv.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   12.9s finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
            

In [24]:
rf_cv.best_params_

{'n_estimators': 50, 'min_samples_split': 10, 'max_depth': 15}

In [25]:
opt_rf = RandomForestClassifier(n_estimators=100, min_samples_split=20, max_depth=15)

In [26]:
opt_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [27]:
cross_val_score(opt_rf, X_train, y_train).mean()

0.9564498346196251

In [28]:
cross_val_score(opt_rf, X_train, y_train, scoring='f1_macro').mean()

0.9565093616082653

In [29]:
opt_rf.score(X_test, y_test)

0.9178082191780822

In [31]:
target_names = ['<15mins', '15mins to 1hr', '1 to 3hrs', '>3hrs']
print(classification_report(y_test, opt_rf.predict(X_test),target_names=target_names))

               precision    recall  f1-score   support

      <15mins       0.96      0.97      0.96      1135
15mins to 1hr       0.68      0.64      0.66       163
    1 to 3hrs       0.79      0.74      0.76        72
        >3hrs       0.81      1.00      0.89        17

     accuracy                           0.92      1387
    macro avg       0.81      0.84      0.82      1387
 weighted avg       0.92      0.92      0.92      1387



In [35]:
# Exporting model

rf_filename = '../models/classification_rf_model_dfw.sav'
pickle.dump(opt_rf, open(rf_filename, 'wb'))

---
### (5) Model (III): XGBoost

---

In [36]:
xgbc = xgb.XGBClassifier()

In [37]:
xgbc_params = {
  "learning_rate":[0.01, 0.1, 0.2, 0.3],
  "max_depth":[3, 5, 10]
}

xgbc_cv = RandomizedSearchCV(xgbc, param_distributions=xgbc_params, scoring='f1_micro', n_iter=2, n_jobs=4, verbose=2)

In [38]:
xgbc_cv.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.1min finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=0, reg_alpha=0,
                                           reg_lambda=1, scale_pos_weight=1,
                                           seed=None, silent=None, subsample=1,
                                           verbosity=1),
                   iid=

In [39]:
xgbc_cv.best_params_

{'max_depth': 10, 'learning_rate': 0.2}

In [40]:
opt_xgb = xgb.XGBClassifier(max_depth=3, learning_rate=0.2)

In [41]:
opt_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.2, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [42]:
cross_val_score(opt_xgb, X_train, y_train).mean()

0.9443219404630652

In [44]:
opt_xgb.score(X_test, y_test)

0.9170872386445565

In [46]:
target_names = ['<15mins', '15mins to 1hr', '1 to 3hrs', '>3hrs']
print(classification_report(y_test, opt_xgb.predict(X_test),target_names=target_names))

               precision    recall  f1-score   support

      <15mins       0.96      0.97      0.96      1135
15mins to 1hr       0.68      0.65      0.66       163
    1 to 3hrs       0.80      0.74      0.77        72
        >3hrs       0.81      1.00      0.89        17

     accuracy                           0.92      1387
    macro avg       0.81      0.84      0.82      1387
 weighted avg       0.92      0.92      0.92      1387



In [64]:
# Exporting model

# xgb_filename = '../models/classification_xgb_model.sav'
# pickle.dump(opt_xgb, open(xgb_filename, 'wb'))

---
### (6) Model (IV): Neural Networks

---

In [85]:
from keras.utils import to_categorical

In [86]:
y_train_enc = to_categorical(y_train)
y_test_enc = to_categorical(y_test)

In [92]:
model = Sequential()

In [93]:
model.add(Dense(5, 
                input_dim=12, 
                activation='relu'))

# Dropout did not help improve accuracy in this case
# model.add(Dropout(0.5))

model.add(Dense(5, 
                activation='relu'))

model.add(Dense(4, 
                activation='softmax'))

In [94]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 5)                 65        
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 30        
_________________________________________________________________
dense_5 (Dense)              (None, 4)                 24        
Total params: 119
Trainable params: 119
Non-trainable params: 0
_________________________________________________________________


In [95]:
es = EarlyStopping(monitor='val_loss', patience=3)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [96]:
model.fit(X_train, y_train_enc, batch_size=8, epochs=200, validation_split=0.2, callbacks=[es])

Train on 14512 samples, validate on 3628 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200


<keras.callbacks.callbacks.History at 0x2dec065c948>

In [97]:
model.evaluate(X_test, y_test_enc)



[0.43439035551661453, 0.8529199957847595]

In [98]:
y_nn_pred = model.predict(X_test).argmax(axis=-1)
target_names = ['<1hr', '1 to 2hrs', '2 to 3hrs', '>3hrs']

print(classification_report(y_test, y_nn_pred, target_names=target_names))

              precision    recall  f1-score   support

        <1hr       0.97      0.88      0.92      1135
   1 to 2hrs       0.44      0.69      0.54       163
   2 to 3hrs       0.69      0.75      0.72        72
       >3hrs       0.89      1.00      0.94        17

    accuracy                           0.85      1387
   macro avg       0.75      0.83      0.78      1387
weighted avg       0.89      0.85      0.87      1387



In [99]:
# Exporting model

# nn_filename = '../models/classification_nn_model.sav'
# pickle.dump(model, open(nn_filename, 'wb'))