# Predicting Flight Delay

Problem Set-up:
We define a delayed flight to be one that is delayed by >= 15 minutes. 
The prediction problem is to train a model that can classify flights, to predict if they will or will not be delayed.

Use case:
- The idea is that this model would be useful to choosing airlines, flightpaths, airports, at the time of booking, relatively in advance of the scheduled departure (days, weeks, months ahead of time). Therefore, the prediction problem will focus on features that can be known in advance, rather than predicting using day-off features like weather and previous flights from that day. 

Notes:
- We restrict the analysis to relatively large airport, those with more than 20 (domestic) flights a day

# Create separate models to predict for each airport 

Motivation:
The weights that should eb put on features (e.g. airlines) may differ depending on the airport (see e.g. NB 4B) since different airports can have different environments (e.g. San Diego and Chicago winters are very different; American Airlines is better in Tuscon than in Anchorage.

In [1]:
# Imports
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns
sns.set_style('white')

from sklearn.linear_model import LogisticRegression
from sklearn. metrics import roc_auc_score

# Load data

In [2]:
# Import custom code
from flightdelay.fld import io as flio
airlines_df, airports_df, flights_df = flio.load_data()

In [3]:
# Filter data to keys of interest
keys = ['MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AIRPORT',
       'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME',
       'DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'SCHEDULED_TIME',
       'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'WHEELS_ON', 'TAXI_IN',
       'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY']
flights_df = flights_df[keys]

# Remove airports with less than a certain number of flights

In [4]:
all_airports, airport_inverse, airport_count = np.unique(flights_df['ORIGIN_AIRPORT'],return_counts=True,return_inverse=True)

In [5]:
# Determine number of flights for the origin airport
Nflights_orig = np.zeros(len(airport_inverse))
for i in range(len(all_airports)):
    Nflights_orig[np.where(airport_inverse==i)] = airport_count[i]

In [6]:
flights_df = flights_df.loc[flights_df.index[Nflights_orig>=7300]]

# Remove cancelled flights

In [7]:
flights_df = flights_df.dropna()
flights_df.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,...,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY
0,1,1,4,AS,98,N407AS,ANC,SEA,5,2354.0,...,15.0,205.0,194.0,169.0,1448,404.0,4.0,430,408.0,-22.0
1,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,2.0,...,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0
2,1,1,4,US,840,N171US,SFO,CLT,20,18.0,...,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0
3,1,1,4,AA,258,N3HYAA,LAX,MIA,20,15.0,...,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0
4,1,1,4,AS,135,N527AS,SEA,ANC,25,24.0,...,35.0,235.0,215.0,199.0,1448,254.0,5.0,320,259.0,-21.0


# Compute features
* one-hot day of week
* one-hot month
* one-hot airline
* one-hot airport
* one-hot departure hour

### 0. Original feat

In [8]:
def make_onehot_feat_dict_from_vals(df, feat_key, feat_name, feat_cutoffs):
    
    # Create keys
    N_feat = len(feat_cutoffs) - 1
    keys = [0]*N_feat
    for i in range(N_feat):
        keys[i] = 'f_'+feat_name+'_'+ str(feat_cutoffs[i])
    # Find the indices for each class
    feat_dict = {}
    for i in range(N_feat):
        feat_dict[keys[i]] = np.transpose(
                    np.logical_and(df[feat_key].values>= feat_cutoffs[i],
                                df[feat_key].values<feat_cutoffs[i+1]))
    return feat_dict

In [9]:
cutoffs = np.arange(60,780,60)
original_feat = make_onehot_feat_dict_from_vals(flights_df,
                    'SCHEDULED_TIME', 'dur', cutoffs)

### 1. Day of week

In [10]:
def make_onehot_feat_dict(df, feat_key, feat_name):
    # Create features for each day of the week
    feat_vals = df[feat_key].values
    all_vals = np.unique(feat_vals)
    N_vals = len(all_vals)
    N_feat = N_vals - 1

    # Create keys
    keys = [0]*N_feat
    for i in range(N_feat):
        keys[i] = 'f_'+feat_name+'_'+ str(all_vals[i])

    # Create value for each training example in dict
    feat_dict = {}
    for i, k in enumerate(keys):
        this_day = all_vals[i]
        feat_dict[k] = feat_vals == this_day
    return feat_dict

In [11]:
daysfeat_dict = make_onehot_feat_dict(flights_df, 'DAY_OF_WEEK', 'day')

### 2. Month

In [12]:
monthsfeat_dict = make_onehot_feat_dict(flights_df, 'MONTH', 'month') 

### 3. Departing airport

In [13]:
dapfeat_dict = make_onehot_feat_dict(flights_df, 'ORIGIN_AIRPORT', 'dap') 

### 4. Airline

In [14]:
alfeat_dict = make_onehot_feat_dict(flights_df, 'AIRLINE', 'al') 

### 5. Departure hour

In [15]:
# Add departure hour as a feature
flights_df['HOUR_DEPARTURE'] = np.floor(flights_df['SCHEDULED_DEPARTURE'].values/100).astype(int)
hrfeat_dict = make_onehot_feat_dict(flights_df, 'HOUR_DEPARTURE', 'hr')

# Save new feature matrix

In [16]:
all_dicts = [original_feat, daysfeat_dict, monthsfeat_dict, dapfeat_dict, alfeat_dict, hrfeat_dict]
feat_dict = all_dicts[0].copy()
for d in all_dicts[1:]:
    feat_dict.update(d)

In [17]:
df_feat = pd.DataFrame.from_dict(feat_dict)
df_feat.head()

Unnamed: 0,f_al_AA,f_al_AS,f_al_B6,f_al_DL,f_al_EV,f_al_F9,f_al_HA,f_al_MQ,f_al_NK,f_al_OO,...,f_month_10,f_month_11,f_month_2,f_month_3,f_month_4,f_month_5,f_month_6,f_month_7,f_month_8,f_month_9
0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Make train val and test sets

In [18]:
N_flights = len(flights_df)
N_train = int(N_flights*.7)
N_test = N_flights - N_train

In [19]:
# Shuffle data
np.random.seed(0)
flight_shuff_idx = np.random.permutation(df_feat.index)
df_shuffle = df_feat.loc[flight_shuff_idx]

labels_preshuffle = flights_df['DEPARTURE_DELAY'].values
labels_shuffle = labels_preshuffle[flight_shuff_idx]
df_shuffle = df_shuffle.reset_index(drop=True)
df_shuffle.head()

Unnamed: 0,f_al_AA,f_al_AS,f_al_B6,f_al_DL,f_al_EV,f_al_F9,f_al_HA,f_al_MQ,f_al_NK,f_al_OO,...,f_month_10,f_month_11,f_month_2,f_month_3,f_month_4,f_month_5,f_month_6,f_month_7,f_month_8,f_month_9
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [20]:
delay_cutoff = 15

y_train = labels_shuffle[:N_train] > delay_cutoff
y_test = labels_shuffle[N_train:N_train+N_test] > delay_cutoff

In [21]:
# # zscore all features
# from scipy.stats import zscore
# df_shuffle = df_shuffle.apply(zscore)

In [22]:
X_train = df_shuffle.loc[np.arange(N_train)]
X_test = df_shuffle.loc[np.arange(N_train,N_train+N_test)]

# 1. Train whole model

In [97]:
%%time
# Run model for all individual feature sets

models = LogisticRegression(C=1)
models.fit(X_train, y_train)

# Evaluate model
train_aucs = roc_auc_score(y_train, models.predict_proba(X_train)[:,1])
test_aucs = roc_auc_score(y_test, models.predict_proba(X_test)[:,1])

CPU times: user 54.3 s, sys: 6.53 s, total: 1min
Wall time: 59.1 s


In [98]:
print(train_aucs)
print(test_aucs)

0.671633430506
0.671929005322


In [104]:
from sklearn.metrics import confusion_matrix
all_confuse = confusion_matrix(y_test, models.predict(X_test))
print('Precision: ', all_confuse[1,1]/(all_confuse[1,1]+all_confuse[0,1]))
print('Recall: ', all_confuse[1,1]/(all_confuse[1,1]+all_confuse[1,0]))

print(all_confuse)
print((all_confuse[0,0]+all_confuse[1,1])/np.sum(all_confuse))
print((all_confuse[0,0]+all_confuse[0,1])/np.sum(all_confuse))
print('Sensitivity: ', all_confuse[0,0]/(all_confuse[0,0]+all_confuse[0,1]))


Precision:  0.53813559322
Recall:  0.000447474587319
[[1290934     109]
 [ 283688     127]]
0.819795181534
0.819783751932
Sensitivity:  0.999915572138


# Interpret model coefficients

In [59]:
als_idx = np.arange(0,13)
all_als = np.array(list(X_train.keys()))[als_idx]

al_max_idx = np.argmax(models.coef_[0][als_idx])
print(X_train.keys()[al_max_idx])
print(models.coef_[0][al_max_idx])

al_min_idx = np.argmin(models.coef_[0][als_idx])
print(X_train.keys()[al_min_idx])
print(models.coef_[0][al_min_idx])

f_al_NK
0.324596033373
f_al_AS
-0.838896002258


In [75]:
als_idx = np.arange(13,110)
all_als = np.array(list(X_train.keys()))[als_idx]

al_max_idx = np.argmax(models.coef_[0][als_idx]) + 13
print(X_train.keys()[al_max_idx])
print(models.coef_[0][al_max_idx])

al_min_idx = np.argmin(models.coef_[0][als_idx]) + 13
print(X_train.keys()[al_min_idx])
print(models.coef_[0][al_min_idx])

f_dap_BTR
0.168258355915
f_dap_LIH
-0.797502799085


In [74]:
als_idx = np.arange(110,116)
all_als = np.array(list(X_train.keys()))[als_idx]
print(all_als)
al_max_idx = np.argmax(models.coef_[0][als_idx]) + 110
print(X_train.keys()[al_max_idx])
print(models.coef_[0][al_max_idx])

al_min_idx = np.argmin(models.coef_[0][als_idx]) + 110
print(X_train.keys()[al_min_idx])
print(models.coef_[0][al_min_idx])

['f_day_1' 'f_day_2' 'f_day_3' 'f_day_4' 'f_day_5' 'f_day_6']
f_day_1
0.105992697892
f_day_6
-0.101191776475


In [78]:
als_idx = np.arange(116,127)
all_als = np.array(list(X_train.keys()))[als_idx]
print(all_als)
al_max_idx = np.argmax(models.coef_[0][als_idx]) + 116
print(X_train.keys()[al_max_idx])
print(models.coef_[0][al_max_idx])

al_min_idx = np.argmin(models.coef_[0][als_idx]) + 116
print(X_train.keys()[al_min_idx])
print(models.coef_[0][al_min_idx])

['f_dur_120' 'f_dur_180' 'f_dur_240' 'f_dur_300' 'f_dur_360' 'f_dur_420'
 'f_dur_480' 'f_dur_540' 'f_dur_60' 'f_dur_600' 'f_dur_660']
f_dur_480
0.931193278409
f_dur_60
0.0895921811875


In [83]:
als_idx = np.arange(127,150)
all_als = np.array(list(X_train.keys()))[als_idx]
print(all_als)
al_max_idx = np.argmax(models.coef_[0][als_idx]) + 127
print(X_train.keys()[al_max_idx])
print(models.coef_[0][al_max_idx])

al_min_idx = np.argmin(models.coef_[0][als_idx]) + 127
print(X_train.keys()[al_min_idx])
print(models.coef_[0][al_min_idx])

['f_hr_0' 'f_hr_1' 'f_hr_10' 'f_hr_11' 'f_hr_12' 'f_hr_13' 'f_hr_14'
 'f_hr_15' 'f_hr_16' 'f_hr_17' 'f_hr_18' 'f_hr_19' 'f_hr_2' 'f_hr_20'
 'f_hr_21' 'f_hr_22' 'f_hr_3' 'f_hr_4' 'f_hr_5' 'f_hr_6' 'f_hr_7' 'f_hr_8'
 'f_hr_9']
f_hr_3
0.602734617711
f_hr_5
-1.4564623433


In [86]:
als_idx = np.arange(150,161)
all_als = np.array(list(X_train.keys()))[als_idx]
print(all_als)
al_max_idx = np.argmax(models.coef_[0][als_idx]) + 150
print(X_train.keys()[al_max_idx])
print(models.coef_[0][al_max_idx])

al_min_idx = np.argmin(models.coef_[0][als_idx]) + 150
print(X_train.keys()[al_min_idx])
print(models.coef_[0][al_min_idx])

['f_month_1' 'f_month_10' 'f_month_11' 'f_month_2' 'f_month_3' 'f_month_4'
 'f_month_5' 'f_month_6' 'f_month_7' 'f_month_8' 'f_month_9']
f_month_6
0.197257662881
f_month_10
-0.640531211346


# 2. Train models with individual onehot feature sets

In [26]:
feat_sets = [0]*len(all_dicts)
for i in range(len(all_dicts)):
    feat_sets[i] = list(all_dicts[i].keys())

In [91]:
# Run model for all individual feature sets
models = np.zeros(len(feat_sets),dtype=object)
train_aucs = np.zeros(len(feat_sets))
test_aucs = np.zeros(len(feat_sets))
test_confuse = np.zeros(len(feat_sets),dtype=list)
for i, ks in enumerate(feat_sets):
    print(i)
    # Train model
    models[i] = LogisticRegression(C=1)
    models[i].fit(X_train[ks], y_train)

    # Evaluate model
    train_aucs[i] = roc_auc_score(y_train, models[i].predict_proba(X_train[ks])[:,1])
    test_aucs[i] = roc_auc_score(y_test, models[i].predict_proba(X_test[ks])[:,1])
    
    test_confuse[i] = confusion_matrix(y_test, models[i].predict(X_test[ks]))

0
1
2
3
4
5


In [28]:
print(feat_sets[0][0])
print(feat_sets[1][0])
print(feat_sets[2][0])
print(feat_sets[3][0])
print(feat_sets[4][0])
print(feat_sets[5][0])

f_dur_60
f_day_1
f_month_1
f_dap_ABQ
f_al_AA
f_hr_0


In [29]:
print(train_aucs)
print(test_aucs)

[ 0.51792379  0.5177713   0.56500726  0.56018308  0.56829332  0.6284955 ]
[ 0.51777684  0.51760399  0.56546363  0.55982328  0.56793727  0.62854951]


In [93]:
for i, ks in enumerate(feat_sets):
    print(ks[0])
    print(test_confuse)
    print('Precision: ', test_confuse[i][1,1]/(test_confuse[i][1,1]+test_confuse[i][0,1]))
    print('Recall: ', test_confuse[i][1,1]/(test_confuse[i][1,1]+test_confuse[i][1,0]))

f_dur_60
Precision:  nan
Recall:  0.0
f_day_1
Precision:  nan
Recall:  0.0
f_month_1
Precision:  nan
Recall:  0.0
f_dap_ABQ
Precision:  nan
Recall:  0.0
f_al_AA
Precision:  nan
Recall:  0.0
f_hr_0
Precision:  nan
Recall:  0.0


  app.launch_new_instance()


# 3. Train models with removing 1 feature set

In [94]:
feat_sets = np.zeros(len(all_dicts),dtype=list)
for i in range(len(all_dicts)):
    feat_sets[i] = list(all_dicts[i].keys())

In [95]:
# Run model for all individual feature sets
models = np.zeros(len(feat_sets),dtype=object)
train_aucs = np.zeros(len(feat_sets))
test_aucs = np.zeros(len(feat_sets))
for i, ks in enumerate(feat_sets):
    print(i)
    # Choose feats
    fis = np.delete(np.arange(len(feat_sets)),i)
    ks2 = np.hstack(feat_sets[fis])

    # Train model
    models[i] = LogisticRegression(C=1)
    models[i].fit(X_train[ks2], y_train)

    # Evaluate model
    train_aucs[i] = roc_auc_score(y_train, models[i].predict_proba(X_train[ks2])[:,1])
    test_aucs[i] = roc_auc_score(y_test, models[i].predict_proba(X_test[ks2])[:,1])

0
1
2
3
4
5


In [32]:
print(train_aucs)
print(test_aucs)

[ 0.6708183   0.67062881  0.6581474   0.66732485  0.66153361  0.61135848]
[ 0.67108736  0.67097393  0.65820497  0.66757322  0.6619307   0.61134654]


In [96]:
for i, ks in enumerate(feat_sets):
    print(ks[0])
    print(test_confuse)
    print('Precision: ', test_confuse[i][1,1]/(test_confuse[i][1,1]+test_confuse[i][0,1]))
    print('Recall: ', test_confuse[i][1,1]/(test_confuse[i][1,1]+test_confuse[i][1,0]))

f_dur_60
Precision:  nan
Recall:  0.0
f_day_1
Precision:  nan
Recall:  0.0
f_month_1
Precision:  nan
Recall:  0.0
f_dap_ABQ
Precision:  nan
Recall:  0.0
f_al_AA
Precision:  nan
Recall:  0.0
f_hr_0
Precision:  nan
Recall:  0.0


  app.launch_new_instance()
