# Predicting Flight Delay

Problem Set-up:
We define a delayed flight to be one that is delayed by >= 15 minutes. 
The prediction problem is to train a model that can classify flights, to predict if they will or will not be delayed.

Use case:
- The idea is that this model would be useful to choosing airlines, flightpaths, airports, at the time of booking, relatively in advance of the scheduled departure (days, weeks, months ahead of time). Therefore, the prediction problem will focus on features that can be known in advance, rather than predicting using day-off features like weather and previous flights from that day. 

Notes:
- We restrict the analysis to relatively large airport, those with more than 20 (domestic) flights a day

# This notebook: more features
* one-hot day of week
* one-hot month
* one-hot airline
* one-hot airport
* one-hot departure hour

# Create separate models to predict for each airport 

Motivation:
The weights that should eb put on features (e.g. airlines) may differ depending on the airport (see e.g. NB 4B) since different airports can have different environments (e.g. San Diego and Chicago winters are very different; American Airlines is better in Tuscon than in Anchorage.

In [1]:
# Imports
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns
sns.set_style('white')

from sklearn.linear_model import LogisticRegression

In [2]:
# Import custom code
from flightdelay.fld import io as flio

# Load data

In [3]:
airlines_df, airports_df, flights_df = flio.load_data()

In [4]:
# Filter data to keys of interest
keys = ['MONTH', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AIRPORT',
       'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY', 'SCHEDULED_TIME', 'DISTANCE',
       'SCHEDULED_ARRIVAL', 'ARRIVAL_DELAY']
flights_df = flights_df[keys]

# Remove airports with less than a certain number of flights

In [5]:
all_airports, airport_inverse, airport_count = np.unique(flights_df['ORIGIN_AIRPORT'],return_counts=True,return_inverse=True)

In [6]:
# Determine number of flights for the origin airport
Nflights_orig = np.zeros(len(airport_inverse))
for i in range(len(all_airports)):
    Nflights_orig[np.where(airport_inverse==i)] = airport_count[i]

In [7]:
flights_df = flights_df.loc[flights_df.index[Nflights_orig>=7300]]

# Remove cancelled flights

In [8]:
flights_df = flights_df.dropna()
flights_df.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY
0,1,4,AS,98,N407AS,ANC,SEA,5,-11.0,205.0,1448,430,-22.0
1,1,4,AA,2336,N3KUAA,LAX,PBI,10,-8.0,280.0,2330,750,-9.0
2,1,4,US,840,N171US,SFO,CLT,20,-2.0,286.0,2296,806,5.0
3,1,4,AA,258,N3HYAA,LAX,MIA,20,-5.0,285.0,2342,805,-9.0
4,1,4,AS,135,N527AS,SEA,ANC,25,-1.0,235.0,1448,320,-21.0


# Make train and val set

In [9]:
N_flights = 10000
N_train = int(N_flights*.7)
np.random.seed(0)
flight_take_idx = np.random.permutation(flights_df.index)[:N_flights]
df_trainval = flights_df.loc[flight_take_idx]
df_trainval = df_trainval.reset_index(drop=True)

In [10]:
df_trainval.head(5)

Unnamed: 0,MONTH,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY
0,8,7,WN,285,N7738A,MCI,DEN,1410,-2.0,105.0,533,1455,-12.0
1,12,6,AS,335,N520AS,SJC,SEA,1550,-16.0,120.0,697,1750,-1.0
2,12,2,UA,1750,N76254,SAN,IAH,1314,40.0,188.0,1303,1822,20.0
3,5,7,WN,396,N8318F,AUS,DEN,1935,64.0,140.0,775,2055,52.0
4,6,5,EV,4899,N133EV,ATL,MYR,1454,15.0,85.0,317,1619,7.0


# Prepare dataframe

In [48]:
# Declare DFs
labels = df_trainval['DEPARTURE_DELAY'].values>15
df_trainval['label'] = labels*1
df_train = df_trainval.loc[:N_train]
df_test = df_trainval.loc[N_train:]

# Set up tensorflow model

In [31]:
import tensorflow as tf

In [32]:
CATEGORICAL_COLUMNS = ["MONTH","DAY_OF_WEEK","AIRLINE"]
CONTINUOUS_COLUMNS = ["DISTANCE"]
LABEL_COLUMN = "label"

In [49]:
df_train['label'].dtype

dtype('int64')

In [33]:
df_train.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,HOUR_DEPARTURE,HOUR_ARRIVAL,label
0,8,7,WN,285,N7738A,MCI,DEN,1410,-2.0,105.0,533,1455,-12.0,14,14,0
1,12,6,AS,335,N520AS,SJC,SEA,1550,-16.0,120.0,697,1750,-1.0,15,17,0
2,12,2,UA,1750,N76254,SAN,IAH,1314,40.0,188.0,1303,1822,20.0,13,18,1
3,5,7,WN,396,N8318F,AUS,DEN,1935,64.0,140.0,775,2055,52.0,19,20,1
4,6,5,EV,4899,N133EV,ATL,MYR,1454,15.0,85.0,317,1619,7.0,14,16,0


In [34]:
def input_fn(df):
    # Creates a dictionary mapping from each continuous feature column name (k) to
    # the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values)
                     for k in CONTINUOUS_COLUMNS}
    # Creates a dictionary mapping from each categorical feature column name (k)
    # to the values of that column stored in a tf.SparseTensor.
    categorical_cols = {k: tf.SparseTensor(
      indices=[[i, 0] for i in range(df[k].size)],
      values=df[k].values,
      dense_shape=[df[k].size, 1])
                      for k in CATEGORICAL_COLUMNS}
    # Merges the two dictionaries into one.
    feature_cols = {**continuous_cols,**categorical_cols}#dict(continuous_cols.items() + categorical_cols.items())
    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label

def train_input_fn():
    return input_fn(df_train)

def eval_input_fn():
    return input_fn(df_test)

### Set up categorical vars

In [35]:
month = tf.contrib.layers.sparse_column_with_keys(column_name="MONTH", keys=np.unique(df_train['MONTH']))
day = tf.contrib.layers.sparse_column_with_keys(column_name="DAY_OF_WEEK", keys=np.unique(df_train['DAY_OF_WEEK']))
airline = tf.contrib.layers.sparse_column_with_keys(column_name="AIRLINE", keys=np.unique(df_train['AIRLINE']))



In [36]:
distance = tf.contrib.layers.real_valued_column("DISTANCE")

# Model

In [39]:
import tempfile
model_dir = tempfile.mkdtemp()
m = tf.contrib.learn.LinearClassifier(feature_columns=[month, day, airline, distance],
  model_dir=model_dir)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x135c94da0>, '_master': '', '_num_ps_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000}


In [42]:
results = m.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))

NotFittedError: Couldn't find trained model at /var/folders/91/b8j_vd6d0nn6s4kpqql3ypk00000gn/T/tmpbt7a907g.

# Evaluate model

In [None]:
preds_train = model.predict(X_train)
preds_val = model.predict(X_val)
print(model.score(X_train, y_train))
print(sum(model.predict(X_train)))
print(model.score(X_val, y_val))
print(sum(model.predict(X_val)))

In [None]:
from sklearn.metrics import confusion_matrix
cmat = confusion_matrix(y_train, preds_train)
print(cmat)

In [None]:
print('True negative rate:', cmat[1,1]/sum(cmat[1]))
print('True positive rate:', cmat[0,0]/sum(cmat[0]))

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
fpr, tpr, th = roc_curve(y_train, model.predict_proba(X_train)[:,1])
print(roc_auc_score(y_train, model.predict_proba(X_train)[:,1]))

plt.figure(figsize=(6,6))
plt.plot(fpr,tpr,'r')
plt.plot([0,1],[0,1],'k--')
plt.xlim((0,1))
plt.ylim((0,1))

fpr, tpr, th = roc_curve(y_val, model.predict_proba(X_val)[:,1])
print(roc_auc_score(y_val, model.predict_proba(X_val)[:,1]))

plt.figure(figsize=(6,6))
plt.plot(fpr,tpr,'r')
plt.plot([0,1],[0,1],'k--')
plt.xlim((0,1))
plt.ylim((0,1))