# Predicting Flight Delay

Add network features

In [1]:
# Imports
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns
sns.set_style('white')

from sklearn.linear_model import LogisticRegression
from sklearn. metrics import roc_auc_score

# Load data

In [2]:
# Import custom code
from flightdelay.fld import io as flio
airlines_df, airports_df, flights_df = flio.load_data()

In [3]:
# Filter data to keys of interest
keys = ['MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AIRPORT',
       'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME',
       'DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'SCHEDULED_TIME',
       'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'WHEELS_ON', 'TAXI_IN',
       'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY']
flights_df = flights_df[keys]

# Remove airports with less than a certain number of flights

In [4]:
all_airports, airport_inverse, airport_count = np.unique(flights_df['ORIGIN_AIRPORT'],return_counts=True,return_inverse=True)

In [5]:
# Determine number of flights for the origin airport
Nflights_orig = np.zeros(len(airport_inverse))
for i in range(len(all_airports)):
    Nflights_orig[np.where(airport_inverse==i)] = airport_count[i]

In [6]:
flights_df = flights_df.loc[flights_df.index[Nflights_orig>=7300]]

# Remove cancelled flights

In [7]:
flights_df = flights_df.dropna()
flights_df.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,...,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY
0,1,1,4,AS,98,N407AS,ANC,SEA,5,2354.0,...,15.0,205.0,194.0,169.0,1448,404.0,4.0,430,408.0,-22.0
1,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,2.0,...,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0
2,1,1,4,US,840,N171US,SFO,CLT,20,18.0,...,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0
3,1,1,4,AA,258,N3HYAA,LAX,MIA,20,15.0,...,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0
4,1,1,4,AS,135,N527AS,SEA,ANC,25,24.0,...,35.0,235.0,215.0,199.0,1448,254.0,5.0,320,259.0,-21.0


# Compute features
* one-hot day of week
* one-hot month
* one-hot airline
* one-hot airport
* one-hot departure hour

### 0. Original feat

In [8]:
def make_onehot_feat_dict_from_vals(df, feat_key, feat_name, feat_cutoffs):
    
    # Create keys
    N_feat = len(feat_cutoffs) - 1
    keys = [0]*N_feat
    for i in range(N_feat):
        keys[i] = 'f_'+feat_name+'_'+ str(feat_cutoffs[i])
    # Find the indices for each class
    feat_dict = {}
    for i in range(N_feat):
        feat_dict[keys[i]] = np.transpose(
                    np.logical_and(df[feat_key].values>= feat_cutoffs[i],
                                df[feat_key].values<feat_cutoffs[i+1]))
    return feat_dict

In [9]:
cutoffs = np.arange(60,780,60)
original_feat = make_onehot_feat_dict_from_vals(flights_df,
                    'SCHEDULED_TIME', 'dur', cutoffs)

### 1. Day of week

In [10]:
def make_onehot_feat_dict(df, feat_key, feat_name):
    # Create features for each day of the week
    feat_vals = df[feat_key].values
    all_vals = np.unique(feat_vals)
    N_vals = len(all_vals)
    N_feat = N_vals - 1

    # Create keys
    keys = [0]*N_feat
    for i in range(N_feat):
        keys[i] = 'f_'+feat_name+'_'+ str(all_vals[i])

    # Create value for each training example in dict
    feat_dict = {}
    for i, k in enumerate(keys):
        this_day = all_vals[i]
        feat_dict[k] = feat_vals == this_day
    return feat_dict

In [11]:
daysfeat_dict = make_onehot_feat_dict(flights_df, 'DAY_OF_WEEK', 'day')

### 2. Month

In [12]:
monthsfeat_dict = make_onehot_feat_dict(flights_df, 'MONTH', 'month') 

### 3. Departing airport

In [13]:
dapfeat_dict = make_onehot_feat_dict(flights_df, 'ORIGIN_AIRPORT', 'dap') 

### 4. Airline

In [14]:
alfeat_dict = make_onehot_feat_dict(flights_df, 'AIRLINE', 'al') 

### 5. Departure hour

In [15]:
# Add departure hour as a feature
flights_df['HOUR_DEPARTURE'] = np.floor(flights_df['SCHEDULED_DEPARTURE'].values/100).astype(int)
hrfeat_dict = make_onehot_feat_dict(flights_df, 'HOUR_DEPARTURE', 'hr')

# 6. Add network features

In [24]:
# Load hub and network info
airlines_df = pd.read_json('/gh/data/flightdelay/new_airlines.json')
airports_df = pd.read_json('/gh/data/flightdelay/new_airports.json')

airline_hub = airlines_df[['IATA_CODE', 'HUB']].set_index('IATA_CODE').to_dict()['HUB']
airline_hubness =  airlines_df[['IATA_CODE', 'HUBNESS']].set_index('IATA_CODE').to_dict()['HUBNESS']

airport_cen = airports_df[['IATA_CODE', 'CEN']].set_index('IATA_CODE').to_dict()['CEN']
airport_clo_cen = airports_df[['IATA_CODE', 'CLO_CEN']].set_index('IATA_CODE').to_dict()['CLO_CEN']
airport_deg = airports_df[['IATA_CODE', 'DEG']].set_index('IATA_CODE').to_dict()['DEG']
airport_ne_deg = airports_df[['IATA_CODE', 'NE_DEG']].set_index('IATA_CODE').to_dict()['NE_DEG']

In [38]:
# Calculate features for airline
h = np.array([1 if airline_hub[dat[1]] == dat[0] else 0 for dat in flights_df[['ORIGIN_AIRPORT', 'AIRLINE']].values])
hn = np.array([airline_hubness[line] for line in flights_df.AIRLINE.values])

# Calculate features for airport
ce = np.array([airport_cen[port] for port in flights_df.ORIGIN_AIRPORT.values])
cc = np.array([airport_clo_cen[port] for port in flights_df.ORIGIN_AIRPORT.values])
de = np.array([airport_deg[port] for port in flights_df.ORIGIN_AIRPORT.values])
nd = np.array([airport_ne_deg[port] for port in flights_df.ORIGIN_AIRPORT.values])

In [40]:
# Make dictionary of network features
network_dict = {'ishub': h,
                'hubfrac' : hn,
                'degree' : de,
                'degree_neighbor' : nd,
                'centrality' : ce,
                'centrality2' : cc}

# Save new feature matrix

In [41]:
all_dicts = [network_dict, original_feat, daysfeat_dict, monthsfeat_dict, dapfeat_dict, alfeat_dict, hrfeat_dict]
feat_dict = all_dicts[0].copy()
for d in all_dicts[1:]:
    feat_dict.update(d)

In [42]:
df_feat = pd.DataFrame.from_dict(feat_dict)
df_feat.head()

Unnamed: 0,centrality,centrality2,degree,degree_neighbor,f_al_AA,f_al_AS,f_al_B6,f_al_DL,f_al_EV,f_al_F9,...,f_month_2,f_month_3,f_month_4,f_month_5,f_month_6,f_month_7,f_month_8,f_month_9,hubfrac,ishub
0,0.185567,0.551136,18,58.777778,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,0.316559,0
1,0.690722,0.76378,67,40.119403,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0.201875,0
2,0.618557,0.723881,60,41.35,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0.2233,0
3,0.690722,0.76378,67,40.119403,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0.201875,0
4,0.628866,0.729323,61,41.967213,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,0.316559,1


# Make train val and test sets

In [45]:
N_flights = len(flights_df)
N_train = int(N_flights*.7)
N_test = N_flights - N_train

In [46]:
# Shuffle data
np.random.seed(0)
flight_shuff_idx = np.random.permutation(df_feat.index)
df_shuffle = df_feat.loc[flight_shuff_idx]

labels_preshuffle = flights_df['DEPARTURE_DELAY'].values
labels_shuffle = labels_preshuffle[flight_shuff_idx]
df_shuffle = df_shuffle.reset_index(drop=True)
df_shuffle.head()

Unnamed: 0,centrality,centrality2,degree,degree_neighbor,f_al_AA,f_al_AS,f_al_B6,f_al_DL,f_al_EV,f_al_F9,...,f_month_2,f_month_3,f_month_4,f_month_5,f_month_6,f_month_7,f_month_8,f_month_9,hubfrac,ishub
0,0.412371,0.62987,40,54.75,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,0.066252,0
1,0.257732,0.573964,25,51.4,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,0.316559,0
2,0.484536,0.659864,47,46.978723,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,0.127441,0
3,0.42268,0.633987,41,53.02439,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,0.066252,0
4,0.886598,0.898148,86,35.05814,False,False,False,False,True,False,...,False,False,False,False,True,False,False,False,0.110249,0


In [47]:
delay_cutoff = 15

y_train = labels_shuffle[:N_train] > delay_cutoff
y_test = labels_shuffle[N_train:N_train+N_test] > delay_cutoff

In [48]:
# # zscore all features
# from scipy.stats import zscore
# df_shuffle = df_shuffle.apply(zscore)

In [49]:
X_train = df_shuffle.loc[np.arange(N_train)]
X_test = df_shuffle.loc[np.arange(N_train,N_train+N_test)]

# 1. Train whole model

In [50]:
%%time
# Run model for all individual feature sets

models = LogisticRegression(C=1)
models.fit(X_train, y_train)

# Evaluate model
train_aucs = roc_auc_score(y_train, models.predict_proba(X_train)[:,1])
test_aucs = roc_auc_score(y_test, models.predict_proba(X_test)[:,1])

CPU times: user 1min 51s, sys: 49.3 s, total: 2min 41s
Wall time: 2min 48s


In [51]:
print(train_aucs)
print(test_aucs)

0.671730281773
0.672052160674


In [52]:
from sklearn.metrics import confusion_matrix
all_confuse = confusion_matrix(y_test, models.predict(X_test))
print('Precision: ', all_confuse[1,1]/(all_confuse[1,1]+all_confuse[0,1]))
print('Recall: ', all_confuse[1,1]/(all_confuse[1,1]+all_confuse[1,0]))

print(all_confuse)
print((all_confuse[0,0]+all_confuse[1,1])/np.sum(all_confuse))
print((all_confuse[0,0]+all_confuse[0,1])/np.sum(all_confuse))
print('Sensitivity: ', all_confuse[0,0]/(all_confuse[0,0]+all_confuse[0,1]))


Precision:  0.535384615385
Recall:  0.000613075418847
[[1290892     151]
 [ 283641     174]]
0.819798356423
0.819783751932
Sensitivity:  0.999883040302


# Interpret model coefficients

In [53]:
print(X_train.keys()[:10])
print(models.coef_[:10])

Index(['centrality', 'centrality2', 'degree', 'degree_neighbor', 'f_al_AA',
       'f_al_AS', 'f_al_B6', 'f_al_DL', 'f_al_EV', 'f_al_F9'],
      dtype='object')
[[  8.28969008e-07  -2.99055833e-01   8.04023134e-05  -4.33518651e-03
   -2.49945985e-01  -6.60562235e-01   9.08755007e-02  -3.55622183e-01
   -2.49443372e-01   1.87535055e-01  -4.71185216e-01  -3.35785721e-02
    3.51940212e-01  -2.15149926e-01   1.10160443e-01  -4.65538904e-01
   -1.65030487e-01   1.55062525e-02  -2.36935163e-01  -8.28249750e-02
    7.67633501e-02  -5.41494693e-02  -1.30818539e-01   2.60718330e-02
   -1.70407532e-02  -3.67943861e-02   1.00654362e-02   5.55977363e-01
   -1.76028756e-01  -3.60917549e-01   1.91248708e-01  -9.39536455e-02
    1.45458506e-01  -1.37390329e-01   9.67921113e-02  -3.93425449e-02
    3.77051426e-01  -6.43858992e-02   1.37770684e-01  -1.27154371e-02
   -1.59016585e-01   8.57527235e-02   2.39223618e-01   1.32065864e-01
   -7.86675557e-02  -7.01441919e-02   2.12150035e-01   3.20069240e-01