<img src="../images/airplane-symbol.jpg" style="float: left; margin: 20px;" width="50" height="50"> 
#  Predicting Flight Delays (<i>a Proof-of-Concept</i>)

Author: Solomon Heng

---

# 7b. Classification Modeling LGA

## Processes covered in this notebook:
1. [Importing dataset](#(1)-Importing-dataset)
2. [Setting X_train, X_test, y_train & y_test](#(2)-Setting-X_train,-X_test,-y_train-&-y_test)
3. [Model (I): Logistic Regression](#(3)-Model-(I):-Logistic-Regression)
4. [Model (II): RandomForest](#(4)-Model-(II):-RandomForest)
5. [Model (III): XGBoost](#(5)-Model-(III):-XGBoost)
6. [Model (IV): Neural Networks](#(6)-Model-(IV):-Neural-Networks)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras import backend as K

import xgboost as xgb

import pickle

sns.set()

Using TensorFlow backend.


---
### (1) Importing dataset

---

In [2]:
df = pd.read_csv('../datasets/combined_data_class_lga.csv')
df_test = pd.read_csv('../datasets/combined_data_class_test_lga.csv')

In [3]:
pd.set_option('display.max_columns', 100)
df.head()

Unnamed: 0,DEPARTURE_DELAY,SCHEDULED_TIME,LATE_AIRCRAFT_DELAY,QNH,dew_point,squall,visibility,winddir,windspd,windgust,NUM_ARR_AVG_3HOUR,crosswind_comp,SCHEDULED_ARRIVAL_MONTH_sin,SCHEDULED_ARRIVAL_MONTH_cos,SCHEDULED_ARRIVAL_HOUR_sin,SCHEDULED_ARRIVAL_HOUR_cos,AIRLINE_CODE_F9,AIRLINE_CODE_MQ,AIRLINE_CODE_WN,DELAY
0,-0.518875,-1.458929,-0.245583,1.019847,-0.104704,-0.733113,0.352208,-0.923471,-0.630803,-0.265583,-0.875782,-0.888842,0.624847,-1.062664,0.343113,-1.106962,6.196072,-0.379121,-0.492746,0
1,0.004073,-0.639261,-0.245583,-0.232773,0.470217,-0.733113,0.352208,1.341418,0.434165,-0.265583,-1.335951,1.26664,-0.107049,-1.248873,0.107194,1.55658,-0.161393,-0.379121,-0.492746,0
2,-0.330614,-1.224738,-0.245583,0.887993,0.182756,-0.733113,0.352208,-0.313694,-0.364561,-0.265583,0.211888,0.259683,0.624847,-1.062664,1.286872,-1.006787,-0.161393,-0.379121,-0.492746,0
3,0.066827,-2.981169,-0.245583,-0.034991,0.182756,-0.733113,0.352208,-1.620361,-1.695771,-0.265583,1.006725,-1.108275,0.624847,-1.062664,2.02584,-0.542507,-0.161393,2.637677,-0.492746,0
4,-0.226024,-0.40507,-0.245583,2.668032,-2.212746,-0.733113,0.352208,-0.749249,-0.364561,-0.265583,-0.352864,-0.833984,0.624847,1.344693,0.343113,-1.106962,-0.161393,2.637677,-0.492746,0


In [4]:
df_test.head()

Unnamed: 0,DEPARTURE_DELAY,SCHEDULED_TIME,LATE_AIRCRAFT_DELAY,QNH,dew_point,squall,visibility,winddir,windspd,windgust,NUM_ARR_AVG_3HOUR,crosswind_comp,SCHEDULED_ARRIVAL_MONTH_sin,SCHEDULED_ARRIVAL_MONTH_cos,SCHEDULED_ARRIVAL_HOUR_sin,SCHEDULED_ARRIVAL_HOUR_cos,AIRLINE_CODE_F9,AIRLINE_CODE_MQ,AIRLINE_CODE_WN,DELAY
0,-0.288778,0.648788,-0.245583,-0.430555,-0.871265,-0.733113,0.352208,0.383196,-0.098319,-0.265583,0.400139,0.110128,0.624847,1.344693,-0.858819,-0.542507,-0.161393,-0.379121,-0.492746,0
1,0.171417,0.297502,-0.245583,0.624283,0.757677,-0.733113,0.352208,0.992974,-0.630803,-0.265583,0.023638,-0.476442,-0.107049,-1.248873,-1.145026,0.521131,-0.161393,2.637677,-0.492746,0
2,-0.309696,0.180407,-0.245583,-0.100918,-0.104704,-0.733113,0.352208,1.428529,0.966649,-0.265583,0.211888,2.002895,1.356743,0.141015,-0.858819,-0.542507,-0.161393,-0.379121,-0.492746,0
3,-0.05868,1.11717,-0.245583,0.756138,-1.542006,-0.733113,0.352208,-0.139471,0.167923,-0.265583,-2.507288,1.069544,1.160632,0.835959,0.107194,1.55658,-0.161393,-0.379121,-0.492746,0
4,-0.037762,-0.873452,-0.245583,-1.353539,0.853497,-0.733113,0.352208,1.167196,-0.364561,-0.265583,-0.018196,0.101755,-0.838946,-1.062664,0.343113,-1.106962,6.196072,-0.379121,-0.492746,0


---
### (2) Setting X_train, X_test, y_train & y_test

---

In [5]:
X_train = df.drop('DELAY', axis=1)
y_train = df['DELAY']
X_test = df_test.drop('DELAY', axis=1)
y_test = df_test['DELAY']

In [6]:
X_train.shape

(20068, 19)

In [7]:
X_test.shape

(1586, 19)

In [8]:
X_train.columns

Index(['DEPARTURE_DELAY', 'SCHEDULED_TIME', 'LATE_AIRCRAFT_DELAY', 'QNH',
       'dew_point', 'squall', 'visibility', 'winddir', 'windspd', 'windgust',
       'NUM_ARR_AVG_3HOUR', 'crosswind_comp', 'SCHEDULED_ARRIVAL_MONTH_sin',
       'SCHEDULED_ARRIVAL_MONTH_cos', 'SCHEDULED_ARRIVAL_HOUR_sin',
       'SCHEDULED_ARRIVAL_HOUR_cos', 'AIRLINE_CODE_F9', 'AIRLINE_CODE_MQ',
       'AIRLINE_CODE_WN'],
      dtype='object')

In [9]:
len(X_train.columns)

19

---
### (3) Model (I): Logistic Regression

---

In [10]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
cross_val_score(lr, X_train, y_train).mean()

0.8585315586824137

In [13]:
lr.score(X_test, y_test)

0.8776796973518285

In [15]:
target_names = ['<15mins', '15mins to 1hr', '1 to 3hrs', '>3hrs']
print(classification_report(y_test, lr.predict(X_test),target_names=target_names))

               precision    recall  f1-score   support

      <15mins       0.95      0.93      0.94      1254
15mins to 1hr       0.57      0.62      0.59       206
    1 to 3hrs       0.72      0.78      0.75       101
        >3hrs       0.68      0.84      0.75        25

     accuracy                           0.88      1586
    macro avg       0.73      0.79      0.76      1586
 weighted avg       0.88      0.88      0.88      1586



In [20]:
# Exporting model

# lr_filename = '../models/classification_logreg_model.sav'
# pickle.dump(lr, open(lr_filename, 'wb'))

---
### (4) Model (II): RandomForest

---

In [21]:
rf = RandomForestClassifier()

In [22]:
rf_params = {
  "n_estimators":[100,200,300],
  "min_samples_split":[10,20,30],
  "max_depth":[10,20]
}

rf_cv = RandomizedSearchCV(rf, param_distributions=rf_params, scoring='f1_micro', n_iter=2, n_jobs=4, verbose=2)

In [23]:
rf_cv.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   31.0s finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
            

In [24]:
rf_cv.best_params_

{'n_estimators': 200, 'min_samples_split': 20, 'max_depth': 20}

In [25]:
opt_rf = RandomForestClassifier(n_estimators=300, min_samples_split=10, max_depth=20)

In [26]:
opt_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [27]:
cross_val_score(opt_rf, X_train, y_train).mean()

0.9659173580233945

In [28]:
cross_val_score(opt_rf, X_train, y_train, scoring='f1_macro').mean()

0.9656494882311346

In [29]:
opt_rf.score(X_test, y_test)

0.9155107187894073

In [31]:
target_names = ['<15mins', '15mins to 1hr', '1 to 3hrs', '>3hrs']
print(classification_report(y_test, opt_rf.predict(X_test),target_names=target_names))

               precision    recall  f1-score   support

      <15mins       0.94      0.99      0.96      1254
15mins to 1hr       0.82      0.55      0.66       206
    1 to 3hrs       0.78      0.78      0.78       101
        >3hrs       0.77      0.92      0.84        25

     accuracy                           0.92      1586
    macro avg       0.83      0.81      0.81      1586
 weighted avg       0.91      0.92      0.91      1586



In [35]:
# Exporting model

rf_filename = '../models/classification_rf_model_lga.sav'
pickle.dump(opt_rf, open(rf_filename, 'wb'))

---
### (5) Model (III): XGBoost

---

In [36]:
xgbc = xgb.XGBClassifier()

In [37]:
xgbc_params = {
  "learning_rate":[0.01, 0.1, 0.2, 0.3],
  "max_depth":[3, 5, 10]
}

xgbc_cv = RandomizedSearchCV(xgbc, param_distributions=xgbc_params, scoring='f1_micro', n_iter=2, n_jobs=4, verbose=2)

In [38]:
xgbc_cv.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.4min finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=0, reg_alpha=0,
                                           reg_lambda=1, scale_pos_weight=1,
                                           seed=None, silent=None, subsample=1,
                                           verbosity=1),
                   iid=

In [39]:
xgbc_cv.best_params_

{'max_depth': 10, 'learning_rate': 0.2}

In [40]:
opt_xgb = xgb.XGBClassifier(max_depth=10, learning_rate=0.2)

In [41]:
opt_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.2, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [42]:
cross_val_score(opt_xgb, X_train, y_train).mean()

0.9710002283311672

In [44]:
opt_xgb.score(X_test, y_test)

0.9186633039092056

In [46]:
target_names = ['<15mins', '15mins to 1hr', '1 to 3hrs', '>3hrs']
print(classification_report(y_test, opt_xgb.predict(X_test),target_names=target_names))

               precision    recall  f1-score   support

      <15mins       0.95      0.99      0.97      1254
15mins to 1hr       0.80      0.59      0.68       206
    1 to 3hrs       0.80      0.75      0.78       101
        >3hrs       0.74      0.92      0.82        25

     accuracy                           0.92      1586
    macro avg       0.82      0.81      0.81      1586
 weighted avg       0.91      0.92      0.91      1586



In [64]:
# Exporting model

# xgb_filename = '../models/classification_xgb_model.sav'
# pickle.dump(opt_xgb, open(xgb_filename, 'wb'))

---
### (6) Model (IV): Neural Networks

---

In [85]:
from keras.utils import to_categorical

In [86]:
y_train_enc = to_categorical(y_train)
y_test_enc = to_categorical(y_test)

In [87]:
model = Sequential()

In [88]:
model.add(Dense(15, 
                input_dim=19, 
                activation='relu'))

# Dropout did not help improve accuracy in this case
# model.add(Dropout(0.5))

model.add(Dense(15, 
                activation='relu'))

In [94]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 15)                300       
_________________________________________________________________
dense_2 (Dense)              (None, 15)                240       
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 64        
Total params: 604
Trainable params: 604
Non-trainable params: 0
_________________________________________________________________


In [89]:
es = EarlyStopping(monitor='val_loss', patience=3)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [90]:
model.fit(X_train, y_train_enc, batch_size=8, epochs=200, validation_split=0.2, callbacks=[es])

Train on 16054 samples, validate on 4014 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200


<keras.callbacks.callbacks.History at 0x23b07ff4648>

In [91]:
model.evaluate(X_test, y_test_enc)



[0.48938196506656584, 0.812736451625824]

In [92]:
y_nn_pred = model.predict(X_test).argmax(axis=-1)
target_names = ['<1hr', '1 to 2hrs', '2 to 3hrs', '>3hrs']

print(classification_report(y_test, y_nn_pred, target_names=target_names))

              precision    recall  f1-score   support

        <1hr       0.95      0.84      0.89      1254
   1 to 2hrs       0.39      0.64      0.49       206
   2 to 3hrs       0.71      0.78      0.74       101
       >3hrs       0.80      0.80      0.80        25

    accuracy                           0.81      1586
   macro avg       0.71      0.77      0.73      1586
weighted avg       0.86      0.81      0.83      1586



In [93]:
# Exporting model

# nn_filename = '../models/classification_nn_model.sav'
# pickle.dump(model, open(nn_filename, 'wb'))