<img src="../../images/airplane-symbol.jpg" style="float: left; margin: 20px;" width="50" height="50"> 
#  Predicting Flight Delays (<i>a Proof-of-Concept</i>)

Author: Solomon Heng

---

# Binary Classification Modeling (all routes)

## Processes covered in this notebook:
1. [Importing dataset](#(1)-Importing-dataset)
2. [Setting X_train, X_test, y_train & y_test](#(2)-Setting-X_train,-X_test,-y_train-&-y_test)
3. [Model (I): Logistic Regression](#(3)-Model-(I):-Logistic-Regression)
4. [Model (II): RandomForest](#(4)-Model-(II):-RandomForest)
5. [Model (III): XGBoost](#(5)-Model-(III):-XGBoost)
6. [Model (IV): Neural Networks](#(6)-Model-(IV):-Neural-Networks)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras import backend as K

import xgboost as xgb

import pickle

sns.set()

Using TensorFlow backend.


---
### (1) Importing dataset

---

In [2]:
df = pd.read_csv('../datasets/combined_data_classification.csv')
df_test = pd.read_csv('../datasets/combined_data_classification_test.csv')

In [3]:
pd.set_option('display.max_columns', 100)
df.head()

Unnamed: 0,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,LATE_AIRCRAFT_DELAY,QNH,dew_point,snow,squall,temp,visibility,winddir,windspd,windgust,NUM_ARR_AVG_3HOUR,crosswind_comp,SCHEDULED_ARRIVAL_MONTH_11,SCHEDULED_ARRIVAL_MONTH_12,SCHEDULED_ARRIVAL_MONTH_2,SCHEDULED_ARRIVAL_MONTH_3,SCHEDULED_ARRIVAL_MONTH_4,SCHEDULED_ARRIVAL_MONTH_5,SCHEDULED_ARRIVAL_MONTH_6,SCHEDULED_ARRIVAL_MONTH_7,SCHEDULED_ARRIVAL_MONTH_8,SCHEDULED_ARRIVAL_MONTH_9,SCHEDULED_ARRIVAL_HOUR_1,SCHEDULED_ARRIVAL_HOUR_10,SCHEDULED_ARRIVAL_HOUR_11,SCHEDULED_ARRIVAL_HOUR_12,SCHEDULED_ARRIVAL_HOUR_13,SCHEDULED_ARRIVAL_HOUR_14,SCHEDULED_ARRIVAL_HOUR_15,SCHEDULED_ARRIVAL_HOUR_16,SCHEDULED_ARRIVAL_HOUR_17,SCHEDULED_ARRIVAL_HOUR_18,SCHEDULED_ARRIVAL_HOUR_19,SCHEDULED_ARRIVAL_HOUR_2,SCHEDULED_ARRIVAL_HOUR_20,SCHEDULED_ARRIVAL_HOUR_21,SCHEDULED_ARRIVAL_HOUR_22,SCHEDULED_ARRIVAL_HOUR_23,SCHEDULED_ARRIVAL_HOUR_3,SCHEDULED_ARRIVAL_HOUR_4,SCHEDULED_ARRIVAL_HOUR_5,SCHEDULED_ARRIVAL_HOUR_6,SCHEDULED_ARRIVAL_HOUR_7,SCHEDULED_ARRIVAL_HOUR_8,SCHEDULED_ARRIVAL_HOUR_9,AIRLINE_CODE_AS,AIRLINE_CODE_DL,...,ORIGIN_AIRPORT_ORF,ORIGIN_AIRPORT_PBI,ORIGIN_AIRPORT_PDX,ORIGIN_AIRPORT_PHF,ORIGIN_AIRPORT_PHL,ORIGIN_AIRPORT_PHX,ORIGIN_AIRPORT_PIA,ORIGIN_AIRPORT_PIT,ORIGIN_AIRPORT_PNS,ORIGIN_AIRPORT_PVD,ORIGIN_AIRPORT_PWM,ORIGIN_AIRPORT_RAP,ORIGIN_AIRPORT_RDU,ORIGIN_AIRPORT_RIC,ORIGIN_AIRPORT_ROA,ORIGIN_AIRPORT_ROC,ORIGIN_AIRPORT_RST,ORIGIN_AIRPORT_RSW,ORIGIN_AIRPORT_SAN,ORIGIN_AIRPORT_SAT,ORIGIN_AIRPORT_SAV,ORIGIN_AIRPORT_SBN,ORIGIN_AIRPORT_SCE,ORIGIN_AIRPORT_SDF,ORIGIN_AIRPORT_SEA,ORIGIN_AIRPORT_SFO,ORIGIN_AIRPORT_SGF,ORIGIN_AIRPORT_SHV,ORIGIN_AIRPORT_SJC,ORIGIN_AIRPORT_SJU,ORIGIN_AIRPORT_SLC,ORIGIN_AIRPORT_SMF,ORIGIN_AIRPORT_SNA,ORIGIN_AIRPORT_SRQ,ORIGIN_AIRPORT_STL,ORIGIN_AIRPORT_STT,ORIGIN_AIRPORT_STX,ORIGIN_AIRPORT_SYR,ORIGIN_AIRPORT_TLH,ORIGIN_AIRPORT_TPA,ORIGIN_AIRPORT_TRI,ORIGIN_AIRPORT_TTN,ORIGIN_AIRPORT_TUL,ORIGIN_AIRPORT_TUS,ORIGIN_AIRPORT_TVC,ORIGIN_AIRPORT_TYS,ORIGIN_AIRPORT_VLD,ORIGIN_AIRPORT_VPS,ORIGIN_AIRPORT_XNA,ARRIVAL_DELAY/NO_DELAY
0,2.535036,-0.704554,-0.900164,4.664765,-1.013092,0.960476,0.221437,-0.760361,1.16425,0.359164,0.394454,0.197798,-0.251815,0.113949,0.336492,-0.315111,-0.308782,-0.289299,-0.325409,-0.316162,-0.321104,3.11863,-0.329611,-0.328894,-0.317112,-0.017317,-0.225468,-0.249137,-0.229945,-0.250615,-0.289551,-0.278578,-0.251898,-0.206389,-0.285744,3.489136,-0.004684,-0.252423,-0.239934,-0.149888,-0.105059,-0.001912,-0.018343,-0.066715,-0.149275,-0.224706,-0.302439,-0.282686,-0.036252,0.748215,...,-0.088277,-0.111016,-0.05626,-0.059286,-0.139334,-0.102631,-0.050217,-0.097774,-0.081459,-0.052925,-0.041533,-0.006894,-0.117806,-0.101497,-0.065431,-0.053718,-0.030667,-0.107313,-0.076369,-0.091562,-0.095986,-0.057744,-0.010647,-0.088696,-0.089489,-0.095577,-0.061,-0.076684,-0.028565,-0.063486,-0.086963,-0.040414,-0.052786,-0.075321,-0.101681,-0.042147,-0.010647,-0.05334,-0.078094,-0.14398,-0.074804,-0.030003,-0.066908,-0.04219,-0.009561,-0.070376,-0.051943,-0.080957,-0.067101,1
1,-0.182679,0.076068,-0.137569,-0.191011,1.608863,-0.094469,0.221437,1.315165,-0.330909,0.359164,-0.81635,-0.337629,-0.251815,0.766444,-1.098015,3.173483,-0.308782,-0.289299,-0.325409,-0.316162,-0.321104,-0.320654,-0.329611,-0.328894,-0.317112,-0.017317,-0.225468,-0.249137,-0.229945,-0.250615,-0.289551,-0.278578,-0.251898,-0.206389,-0.285744,-0.286604,-0.004684,-0.252423,-0.239934,-0.149888,-0.105059,-0.001912,-0.018343,-0.066715,-0.149275,-0.224706,3.306454,-0.282686,-0.036252,-1.336515,...,-0.088277,-0.111016,-0.05626,-0.059286,-0.139334,-0.102631,-0.050217,-0.097774,-0.081459,-0.052925,-0.041533,-0.006894,-0.117806,-0.101497,-0.065431,-0.053718,-0.030667,-0.107313,-0.076369,-0.091562,-0.095986,-0.057744,-0.010647,-0.088696,-0.089489,-0.095577,-0.061,-0.076684,-0.028565,-0.063486,-0.086963,-0.040414,-0.052786,-0.075321,-0.101681,-0.042147,-0.010647,-0.05334,-0.078094,-0.14398,-0.074804,-0.030003,-0.066908,-0.04219,-0.009561,-0.070376,-0.051943,-0.080957,-0.067101,0
2,1.12585,-0.54843,-0.693763,2.054133,-0.488701,-0.190373,0.221437,-0.760361,-0.560933,-2.848461,0.04851,0.197798,-0.251815,0.838944,1.099777,-0.315111,-0.308782,-0.289299,3.073057,-0.316162,-0.321104,-0.320654,-0.329611,-0.328894,-0.317112,-0.017317,-0.225468,-0.249137,-0.229945,-0.250615,-0.289551,3.589662,-0.251898,-0.206389,-0.285744,-0.286604,-0.004684,-0.252423,-0.239934,-0.149888,-0.105059,-0.001912,-0.018343,-0.066715,-0.149275,-0.224706,-0.302439,-0.282686,-0.036252,0.748215,...,-0.088277,-0.111016,-0.05626,-0.059286,-0.139334,-0.102631,-0.050217,-0.097774,-0.081459,-0.052925,-0.041533,-0.006894,-0.117806,-0.101497,-0.065431,-0.053718,-0.030667,-0.107313,-0.076369,-0.091562,-0.095986,-0.057744,-0.010647,11.274414,-0.089489,-0.095577,-0.061,-0.076684,-0.028565,-0.063486,-0.086963,-0.040414,-0.052786,-0.075321,-0.101681,-0.042147,-0.010647,-0.05334,-0.078094,-0.14398,-0.074804,-0.030003,-0.066908,-0.04219,-0.009561,-0.070376,-0.051943,-0.080957,-0.067101,1
3,-0.258171,-0.02151,-0.346142,-0.191011,-0.619799,-1.533031,0.221437,-0.760361,-0.675945,0.359164,0.48094,-0.069916,-0.251815,0.669778,-0.141573,-0.315111,-0.308782,-0.289299,-0.325409,-0.316162,-0.321104,-0.320654,-0.329611,-0.328894,-0.317112,-0.017317,-0.225468,-0.249137,-0.229945,-0.250615,-0.289551,-0.278578,-0.251898,-0.206389,-0.285744,-0.286604,-0.004684,-0.252423,-0.239934,-0.149888,-0.105059,-0.001912,-0.018343,-0.066715,-0.149275,-0.224706,-0.302439,3.53749,-0.036252,0.748215,...,-0.088277,-0.111016,-0.05626,-0.059286,-0.139334,-0.102631,-0.050217,-0.097774,-0.081459,-0.052925,-0.041533,-0.006894,-0.117806,9.852461,-0.065431,-0.053718,-0.030667,-0.107313,-0.076369,-0.091562,-0.095986,-0.057744,-0.010647,-0.088696,-0.089489,-0.095577,-0.061,-0.076684,-0.028565,-0.063486,-0.086963,-0.040414,-0.052786,-0.075321,-0.101681,-0.042147,-0.010647,-0.05334,-0.078094,-0.14398,-0.074804,-0.030003,-0.066908,-0.04219,-0.009561,-0.070376,-0.051943,-0.080957,-0.067101,0
4,-0.233007,-1.036318,-1.034867,-0.191011,-0.816446,0.864571,0.221437,-0.760361,0.819214,0.359164,0.91337,-0.873056,-0.251815,-0.079383,-0.770893,-0.315111,-0.308782,-0.289299,-0.325409,-0.316162,-0.321104,-0.320654,3.033883,-0.328894,-0.317112,-0.017317,-0.225468,-0.249137,-0.229945,-0.250615,-0.289551,-0.278578,3.969862,-0.206389,-0.285744,-0.286604,-0.004684,-0.252423,-0.239934,-0.149888,-0.105059,-0.001912,-0.018343,-0.066715,-0.149275,-0.224706,-0.302439,-0.282686,-0.036252,-1.336515,...,-0.088277,-0.111016,-0.05626,-0.059286,-0.139334,-0.102631,-0.050217,-0.097774,-0.081459,-0.052925,-0.041533,-0.006894,-0.117806,-0.101497,-0.065431,-0.053718,-0.030667,-0.107313,-0.076369,-0.091562,-0.095986,-0.057744,-0.010647,-0.088696,-0.089489,-0.095577,-0.061,-0.076684,-0.028565,-0.063486,-0.086963,-0.040414,-0.052786,-0.075321,-0.101681,-0.042147,-0.010647,-0.05334,-0.078094,-0.14398,-0.074804,-0.030003,-0.066908,-0.04219,-0.009561,-0.070376,-0.051943,-0.080957,-0.067101,0


---
### (2) Setting X_train, X_test, y_train & y_test

---

In [4]:
X_train = df.drop('ARRIVAL_DELAY/NO_DELAY', axis=1)
y_train = df['ARRIVAL_DELAY/NO_DELAY']
X_test = df_test.drop('ARRIVAL_DELAY/NO_DELAY', axis=1)
y_test = df_test['ARRIVAL_DELAY/NO_DELAY']

In [5]:
X_train.shape

(467036, 225)

In [6]:
X_test.shape

(68380, 225)

---
### (3) Model (I): Logistic Regression

---

In [51]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [52]:
cross_val_score(lr, X_train, y_train).mean()

0.9119682891109514

In [53]:
cross_val_score(lr, X_train, y_train, scoring='roc_auc').mean()

0.9649387867524801

In [54]:
lr.score(X_test, y_test)

0.9392073705761919

In [55]:
roc_auc_score(y_test, lr.predict(X_test))

0.8883398317552992

In [56]:
tn, fp, fn, tp = confusion_matrix(y_test, lr.predict(X_test)).ravel()

In [57]:
sensitivity = tp/(tp+fn)
sensitivity

0.8167576308874065

In [58]:
specificity = tn/(tn+fp)
specificity

0.9599220326231919

In [59]:
for i in zip(X_train.columns, np.exp(lr.coef_.tolist()[0])):
    print (i)

('DEPARTURE_DELAY', 173.860225339537)
('SCHEDULED_TIME', 0.4980094812477027)
('DISTANCE', 1.4917183321815626)
('LATE_AIRCRAFT_DELAY', 2078836.711925626)
('QNH', 0.9570570259377531)
('dew_point', 1.2255152457943181)
('snow', 0.9454328329664006)
('squall', 0.8166666537688139)
('temp', 1.3044874586925084)
('visibility', 0.9513926291374382)
('winddir', 0.8382584145216566)
('windspd', 1.3087448937672501)
('windgust', 0.879681786506036)
('NUM_ARR_AVG_3HOUR', 0.8042711808076419)
('crosswind_comp', 0.9273326097998859)
('SCHEDULED_ARRIVAL_MONTH_11', 0.9645442027371647)
('SCHEDULED_ARRIVAL_MONTH_12', 0.9688394319030015)
('SCHEDULED_ARRIVAL_MONTH_2', 1.003896150239684)
('SCHEDULED_ARRIVAL_MONTH_3', 0.9149058833555364)
('SCHEDULED_ARRIVAL_MONTH_4', 0.8297069129972954)
('SCHEDULED_ARRIVAL_MONTH_5', 0.7392226247774558)
('SCHEDULED_ARRIVAL_MONTH_6', 0.7783563418444365)
('SCHEDULED_ARRIVAL_MONTH_7', 0.7244970102535785)
('SCHEDULED_ARRIVAL_MONTH_8', 0.829321332336363)
('SCHEDULED_ARRIVAL_MONTH_9', 0.79

In [60]:
# Exporting model

# lr_filename = '../models/classification_logreg_model.sav'
# pickle.dump(lr, open(lr_filename, 'wb'))

---
### (4) Model (II): RandomForest

---

In [16]:
rf = RandomForestClassifier()

In [17]:
rf_params = {
  "n_estimators":[50,100],
  "min_samples_split":[10,20],
  "max_depth":[5,10]
}

rf_cv = RandomizedSearchCV(rf, param_distributions=rf_params, scoring='f1', n_iter=2, n_jobs=4, verbose=2)

In [18]:
rf_cv.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  2.0min finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
            

In [19]:
rf_cv.best_params_

{'n_estimators': 100, 'min_samples_split': 10, 'max_depth': 5}

In [40]:
opt_rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, max_depth=10)

In [41]:
opt_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [42]:
cross_val_score(opt_rf, X_train, y_train).mean()

0.9208841433251665

In [43]:
cross_val_score(opt_rf, X_train, y_train, scoring='roc_auc').mean()

0.9775418640071127

In [44]:
opt_rf.score(X_test, y_test)

0.9393682363264112

In [45]:
roc_auc_score(y_test, opt_rf.predict(X_test))

0.8834374631325267

In [46]:
tn, fp, fn, tp = confusion_matrix(y_test, opt_rf.predict(X_test)).ravel()

In [47]:
sensitivity = tp/(tp+fn)
sensitivity

0.8047301394784718

In [48]:
specificity = tn/(tn+fp)
specificity

0.9621447867865814

In [63]:
# Exporting model

# rf_filename = '../models/classification_rf_model.sav'
# pickle.dump(opt_rf, open(rf_filename, 'wb'))

---
### (5) Model (III): XGBoost

---

In [29]:
xgbc = xgb.XGBClassifier()

In [32]:
xgbc_params = {
  "learning_rate":[0.01, 0.1, 0.2, 0.3],
  "max_depth":[3, 5, 10]
}

xgbc_cv = RandomizedSearchCV(xgbc, param_distributions=xgbc_params, scoring='f1', n_iter=2, n_jobs=4, verbose=2)

In [33]:
xgbc_cv.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed: 53.9min finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=0, reg_alpha=0,
                                           reg_lambda=1, scale_pos_weight=1,
                                           seed=None, silent=None, subsample=1,
                                           verbosity=1),
                   iid=

In [34]:
xgbc_cv.best_params_

{'max_depth': 3, 'learning_rate': 0.2}

In [16]:
opt_xgb = xgb.XGBClassifier(max_depth=3, learning_rate=0.2)

In [17]:
opt_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.2, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [18]:
cross_val_score(opt_xgb, X_train, y_train).mean()

0.9631315014089618

In [19]:
cross_val_score(opt_xgb, X_train, y_train, scoring='roc_auc').mean()

0.9878688763525808

In [20]:
opt_xgb.score(X_test, y_test)

0.958160280783855

In [21]:
roc_auc_score(y_test, opt_xgb.predict(X_test))

0.8802315193169472

In [22]:
tn, fp, fn, tp = confusion_matrix(y_test, opt_xgb.predict(X_test)).ravel()

In [23]:
sensitivity = tp/(tp+fn)
sensitivity

0.7705680210228422

In [24]:
specificity = tn/(tn+fp)
specificity

0.9898950176110523

In [64]:
# Exporting model

# xgb_filename = '../models/classification_xgb_model.sav'
# pickle.dump(opt_xgb, open(xgb_filename, 'wb'))

---
### (6) Model (IV): Neural Networks

---

In [33]:
model = Sequential()

In [34]:
model.add(Dense(150, 
                input_dim=225, 
                activation='relu'))

model.add(Dropout(0.5))

model.add(Dense(150, 
                activation='relu'))

model.add(Dropout(0.5))

model.add(Dense(150, 
                activation='relu'))

model.add(Dense(1, 
                activation='sigmoid'))

In [35]:
es = EarlyStopping(monitor='val_loss', patience=3)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [38]:
model.fit(X_train, y_train, batch_size=256, epochs=200, validation_split=0.2, callbacks=[es])

Train on 373628 samples, validate on 93408 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200


<keras.callbacks.callbacks.History at 0x266f9132308>

In [39]:
model.evaluate(X_test, y_test)



[0.15233314213530647, 0.9535682797431946]

In [66]:
# Exporting model

# nn_filename = '../models/classification_nn_model.sav'
# pickle.dump(model, open(nn_filename, 'wb'))