## Machine Learning

We want to build a predictive model to predict whether or not a flight will be delayed based on several different parameters. Rather than collect weather data for every airport in our dataset, we will only be using the data from the following cities:
* LA
* Chicago
* Boston
* Phoenix
* Denver


In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
import datetime as dt

In [2]:
### Two helper functions that will be used later

# Round a date to the nearest hour, then convert it to a datetime object
def round_date(year, month, day, time):
    return dt.datetime(int(year), int(month), int(day), int(time[:2]))

# Round a time to the nearest 15 min, then convert it to a time object
def round_quarter_hour(timestr):
    hour = int(timestr[:2])
    minute = int(timestr[2:])
    if minute < 7.5:
        minute = 0
    elif minute >= 7.5 and minute < 22.5:
        minute = 15
    elif minute >= 22.5 and minute < 37.5:
        minute = 30
    elif minute >= 37.5 and minute < 52.5:
        minute = 45
    else:
        minute = 0
        
    return dt.time(hour, minute, 0)

In [3]:
### The list of airports in our five main cities
airports = pd.read_csv("../data/airport_names.csv")

# Split airports by city
la = airports[airports.Description.str.contains("Los Angeles, CA")]
chi = airports[airports.Description.str.contains("Chicago, IL")]
bos = airports[airports.Description.str.contains("Boston, MA")]
pho = airports[airports.Description.str.contains("Phoenix, AZ")]
den = airports[airports.Description.str.contains("Denver, CO")]

In [4]:
### Load weather data
lat_lon = pd.read_csv("../data/weather/city_attributes.csv")
humidity = pd.read_csv("../data/weather/humidity.csv")
pressure = pd.read_csv("../data/weather/pressure.csv")
temp = pd.read_csv("../data/weather/temperature.csv")
desc = pd.read_csv("../data/weather/weather_description.csv")
wind_dir = pd.read_csv("../data/weather/wind_direction.csv")
wind_spe = pd.read_csv("../data/weather/wind_speed.csv")

In [5]:
### Because each of the weather tables have the city names for each column, we need
### to rename the columns to reflect the table that they come from.
humidity = humidity[["datetime","Los Angeles", "Chicago", "Boston", "Phoenix", "Denver"]]
humidity.columns = ("datetimeh","LA_h","CHI_h","BOS_h","PHO_h","DEN_h")
pressure = pressure[["datetime","Los Angeles", "Chicago", "Boston", "Phoenix", "Denver"]]
pressure.columns = ("datetimep","LA_p","CHI_p","BOS_p","PHO_p","DEN_p")
temp = temp[["datetime","Los Angeles", "Chicago", "Boston", "Phoenix", "Denver"]]
temp.columns = ("datetimet","LA_t","CHI_t","BOS_t","PHO_t","DEN_t")
desc = desc[["datetime","Los Angeles", "Chicago", "Boston", "Phoenix", "Denver"]]
desc.columns = ("datetimed","LA_d","CHI_d","BOS_d","PHO_d","DEN_d")
wind_dir = wind_dir[["datetime","Los Angeles", "Chicago", "Boston", "Phoenix", "Denver"]]
wind_dir.columns = ("datetimewd","LA_wd","CHI_wd","BOS_wd","PHO_wd","DEN_wd")
wind_spe = wind_spe[["datetime","Los Angeles", "Chicago", "Boston", "Phoenix", "Denver"]]
wind_spe.columns = ("datetimews","LA_ws","CHI_ws","BOS_ws","PHO_ws","DEN_ws")

In [6]:
# Convert datetime column to datetime object
humidity.datetimeh = pd.to_datetime(humidity.datetimeh, format="%Y-%m-%d %H:%M:%S")
pressure.datetimep = pd.to_datetime(pressure.datetimep, format="%Y-%m-%d %H:%M:%S")
temp.datetimet = pd.to_datetime(temp.datetimet, format="%Y-%m-%d %H:%M:%S")
desc.datetimed = pd.to_datetime(desc.datetimed, format="%Y-%m-%d %H:%M:%S")
wind_dir.datetimewd = pd.to_datetime(wind_dir.datetimewd, format="%Y-%m-%d %H:%M:%S")
wind_spe.datetimews = pd.to_datetime(wind_spe.datetimews, format="%Y-%m-%d %H:%M:%S")

In [7]:
# Load in flights from August 2017
flights = pd.read_csv("../data/flights2.csv", dtype={'CRS_DEP_TIME':str})

# Keep select columns
flights = flights[['FL_DATE','OP_CARRIER_AIRLINE_ID','OP_CARRIER_FL_NUM','ORIGIN','DEST','CRS_DEP_TIME','DEP_DELAY_NEW','CRS_ELAPSED_TIME']]
flights.columns = ('date','airline_id','flight_id','origin','dest','dep_time','delay','flight_time')

In [8]:
flights.head()

Unnamed: 0,date,airline_id,flight_id,origin,dest,dep_time,delay,flight_time
0,2017-08-01,19805,1150,DFW,LGA,635,0.0,206.0
1,2017-08-01,19805,1152,DFW,JAC,1830,0.0,168.0
2,2017-08-01,19805,1153,CLT,STT,1150,0.0,233.0
3,2017-08-01,19805,1153,JFK,CLT,815,54.0,126.0
4,2017-08-01,19805,1155,LAX,LAS,1400,11.0,83.0


In [9]:
# Add three columns for date
flights['year'] = flights.date.str[:4]
flights['month'] = flights.date.str[5:7]
flights['day'] = flights.date.str[8:]
flights.drop(columns=['date'], inplace=True)

In [10]:
# Create Denver flights dataframe
flights_den = flights.merge(den, left_on="origin", right_on="Code")
for idx,row in flights_den.iterrows():
    # Use round_date to create datetime object for the time of the flight
    # This will help us match the weather info to the flight info
    flights_den.at[idx, 'time'] = round_date(row.year, row.month, row.day, row.dep_time)
    # Convert delay to a binary target. If the delay time > 0, 
    # then the flight was delayed.
    if row.delay > 0:
        flights_den.at[idx, 'delay'] = 1

# Merge new dataframe with matching weather data
flights_den = flights_den.merge(
    humidity[['datetimeh','DEN_h']], left_on='time', right_on='datetimeh').merge(
    pressure[['datetimep','DEN_p']], left_on='time', right_on='datetimep').merge(
    temp[['datetimet','DEN_t']], left_on='time', right_on='datetimet').merge(
    desc[['datetimed','DEN_d']], left_on='time', right_on='datetimed').merge(
    wind_dir[['datetimewd','DEN_wd']], left_on='time', right_on='datetimewd').merge(
    wind_spe[['datetimews','DEN_ws']], left_on='time', right_on='datetimews')

# Drop unneeded columns
flights_den.drop(columns=['Code','Description','datetimeh','datetimep','datetimet','datetimed','datetimewd','datetimews','time'], inplace=True)

In [11]:
# Create Los Angeles flights dataframe
flights_la = flights.merge(la, left_on="origin", right_on="Code")
for idx,row in flights_la.iterrows():
    # Use round_date to create datetime object for the time of the flight
    # This will help us match the weather info to the flight info
    flights_la.at[idx, 'time'] = round_date(row.year, row.month, row.day, row.dep_time)
    # Convert delay to a binary target. If the delay time > 0, 
    # then the flight was delayed.
    if row.delay > 0:
        flights_la.at[idx, 'delay'] = 1
    
# Merge new dataframe with matching weather data
flights_la = flights_la.merge(
    humidity[['datetimeh','LA_h']], left_on='time', right_on='datetimeh').merge(
    pressure[['datetimep','LA_p']], left_on='time', right_on='datetimep').merge(
    temp[['datetimet','LA_t']], left_on='time', right_on='datetimet').merge(
    desc[['datetimed','LA_d']], left_on='time', right_on='datetimed').merge(
    wind_dir[['datetimewd','LA_wd']], left_on='time', right_on='datetimewd').merge(
    wind_spe[['datetimews','LA_ws']], left_on='time', right_on='datetimews')

# Drop unneeded columns
flights_la.drop(columns=['Code','Description','datetimeh','datetimep','datetimet','datetimed','datetimewd','datetimews','time'], inplace=True)

In [12]:
# Create Boston flights dataframe
flights_bos = flights.merge(bos, left_on="origin", right_on="Code")
for idx,row in flights_bos.iterrows():
    # Use round_date to create datetime object for the time of the flight
    # This will help us match the weather info to the flight info
    flights_bos.at[idx, 'time'] = round_date(row.year, row.month, row.day, row.dep_time)
    # Convert delay to a binary target. If the delay time > 0, 
    # then the flight was delayed.
    if row.delay > 0:
        flights_bos.at[idx, 'delay'] = 1
    
# Merge new dataframe with matching weather data
flights_bos = flights_bos.merge(
    humidity[['datetimeh','BOS_h']], left_on='time', right_on='datetimeh').merge(
    pressure[['datetimep','BOS_p']], left_on='time', right_on='datetimep').merge(
    temp[['datetimet','BOS_t']], left_on='time', right_on='datetimet').merge(
    desc[['datetimed','BOS_d']], left_on='time', right_on='datetimed').merge(
    wind_dir[['datetimewd','BOS_wd']], left_on='time', right_on='datetimewd').merge(
    wind_spe[['datetimews','BOS_ws']], left_on='time', right_on='datetimews')

# Drop unneeded columns
flights_bos.drop(columns=['Code','Description','datetimeh','datetimep','datetimet','datetimed','datetimewd','datetimews','time'], inplace=True)

In [13]:
# Create Chicago flights dataframe
flights_chi = flights.merge(chi, left_on="origin", right_on="Code")
for idx,row in flights_chi.iterrows():
    # Use round_date to create datetime object for the time of the flight
    # This will help us match the weather info to the flight info
    flights_chi.at[idx, 'time'] = round_date(row.year, row.month, row.day, row.dep_time)
    # Convert delay to a binary target. If the delay time > 0, 
    # then the flight was delayed.
    if row.delay > 0:
        flights_chi.at[idx, 'delay'] = 1
    
# Merge new dataframe with matching weather data
flights_chi = flights_chi.merge(
    humidity[['datetimeh','CHI_h']], left_on='time', right_on='datetimeh').merge(
    pressure[['datetimep','CHI_p']], left_on='time', right_on='datetimep').merge(
    temp[['datetimet','CHI_t']], left_on='time', right_on='datetimet').merge(
    desc[['datetimed','CHI_d']], left_on='time', right_on='datetimed').merge(
    wind_dir[['datetimewd','CHI_wd']], left_on='time', right_on='datetimewd').merge(
    wind_spe[['datetimews','CHI_ws']], left_on='time', right_on='datetimews')

# Drop unneeded columns
flights_chi.drop(columns=['Code','Description','datetimeh','datetimep','datetimet','datetimed','datetimewd','datetimews','time'], inplace=True)

In [14]:
# Create Phoenix flights dataframe
flights_pho = flights.merge(pho, left_on="origin", right_on="Code")
for idx,row in flights_pho.iterrows():
    # Use round_date to create datetime object for the time of the flight
    # This will help us match the weather info to the flight info
    flights_pho.at[idx, 'time'] = round_date(row.year, row.month, row.day, row.dep_time)
    # Convert delay to a binary target. If the delay time > 0, 
    # then the flight was delayed.
    if row.delay > 0:
        flights_pho.at[idx, 'delay'] = 1
    
# Merge new dataframe with matching weather data
flights_pho = flights_pho.merge(
    humidity[['datetimeh','PHO_h']], left_on='time', right_on='datetimeh').merge(
    pressure[['datetimep','PHO_p']], left_on='time', right_on='datetimep').merge(
    temp[['datetimet','PHO_t']], left_on='time', right_on='datetimet').merge(
    desc[['datetimed','PHO_d']], left_on='time', right_on='datetimed').merge(
    wind_dir[['datetimewd','PHO_wd']], left_on='time', right_on='datetimewd').merge(
    wind_spe[['datetimews','PHO_ws']], left_on='time', right_on='datetimews')

# Drop unneeded columns
flights_pho.drop(columns=['Code','Description','datetimeh','datetimep','datetimet','datetimed','datetimewd','datetimews','time'], inplace=True)

In [15]:
# Rename columns to facilitate concatenating the dataframes together
flights_den.columns = ['airline_id','flight_id', 'origin', 'dest', 'dep_time','delay', 'flight_time','year','month','day', 'h', 'p','t', 'd', 'wd', 'ws']
flights_la.columns = ['airline_id', 'flight_id', 'origin', 'dest', 'dep_time','delay', 'flight_time','year','month','day', 'h', 'p','t', 'd', 'wd', 'ws']
flights_bos.columns = ['airline_id', 'flight_id', 'origin', 'dest', 'dep_time','delay', 'flight_time','year','month','day', 'h', 'p','t', 'd', 'wd', 'ws']
flights_chi.columns = ['airline_id', 'flight_id', 'origin', 'dest', 'dep_time','delay', 'flight_time','year','month','day', 'h', 'p','t', 'd', 'wd', 'ws']
flights_pho.columns = ['airline_id', 'flight_id', 'origin', 'dest', 'dep_time','delay', 'flight_time','year','month','day', 'h', 'p','t', 'd', 'wd', 'ws']

In [16]:
# Concatenate dataframes
all_flights = flights_den.append(
    flights_la, ignore_index=True).append(
    flights_bos, ignore_index=True).append(
    flights_chi, ignore_index=True).append(
    flights_pho, ignore_index=True)
# Rename target variable (delay) to y
all_flights = all_flights.rename(columns={'delay':'y'})

In [17]:
### Convert AirlineID to string, this will help when we create dummies
### since AirlineID is a categorical variable
all_flights.airline_id = all_flights.airline_id.astype(str)

In [18]:
### Use round_quarter_hour to create time object for the departure time
### If we fail to do this, we end up adding ~1400 dummy variables. To reduce
### this number, we round each flight time to the nearest quarter hour
for idx, row in all_flights.iterrows():
    all_flights.at[idx,'dep_time_dt'] = round_quarter_hour(row.dep_time)

In [19]:
### Some values in our target variable (delay) are null. If the delay is
### null, we are assuming there was no delay
all_flights = all_flights.fillna(0)

In [20]:
# Create target variable dataframe
all_flights_y = all_flights['y']

In [21]:
# Create numpy arrays of training parameters and target parameter
y = all_flights_y.to_numpy()
x = all_flights.drop(columns=['y','dep_time', 'flight_id', 'year', 'month', 'day'])
# Create dummy variables
x = pd.get_dummies(x).to_numpy()

In [22]:
# Split training and testing data
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=.4)

### Model 1: Random Forest

In [23]:
### Use RandomizedSearchCV to discover the best set of parameters to
### use while training the model
rf = RandomForestClassifier()
random_grid = {
    'n_estimators': [x for x in range(10,101,10)],
    'criterion': ['gini','entropy'],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1,2,4],
    'bootstrap': [True, False],
    'max_depth': [None, 10, 50, 100, 150]
}

rf_grid = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=50, cv=3, verbose=4, n_jobs=-1)

rf_grid.fit(train_x, train_y)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 11.0min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'criterion': ['gini', 'entropy'],
                                        'max_depth': [None, 10, 50, 100, 150],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'n_estimators': [10, 20, 30, 40, 50, 60,
                                                         70, 80, 90, 100]},
                   verbose=4)

In [24]:
# Best parameters
best = rf_grid.best_params_

In [25]:
best

{'n_estimators': 70,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 150,
 'criterion': 'gini',
 'bootstrap': False}

In [26]:
# Train model based on best parameters
rf = RandomForestClassifier(n_estimators=best['n_estimators'], 
                            max_depth=best['max_depth'],
                            criterion=best['criterion'], 
                            min_samples_leaf=best['min_samples_leaf'],
                            max_features=best['max_features'],
                            bootstrap=best['bootstrap'],
                           random_state=47906)
rf.fit(train_x, train_y)

# Get predictions and probabilities
pred_rf = rf.predict(test_x)
pred_prob_rf = rf.predict_proba(test_x)

In [34]:
# Accuracy score
acc_rf = accuracy_score(test_y, pred_rf)
print("Accuracy for {} forest with {} trees and a max depth of {}: {}%\n".format(best['criterion'], best['n_estimators'], best['max_depth'], round(acc_rf*100,2)))
for i in range(10):
    print("Probabilities: {} -----> Actual: {}".format(pred_prob_rf[i], test_y[i]))

Accuracy for gini forest with 70 trees and a max depth of 150: 69.24%

Probabilities: [0.65620065 0.34379935] -----> Actual: 1.0
Probabilities: [0.42668702 0.57331298] -----> Actual: 1.0
Probabilities: [0.62439403 0.37560597] -----> Actual: 1.0
Probabilities: [0.77071774 0.22928226] -----> Actual: 0.0
Probabilities: [0.68565656 0.31434344] -----> Actual: 0.0
Probabilities: [0.63994167 0.36005833] -----> Actual: 0.0
Probabilities: [0.53170801 0.46829199] -----> Actual: 0.0
Probabilities: [0.50491681 0.49508319] -----> Actual: 0.0
Probabilities: [0.44902036 0.55097964] -----> Actual: 1.0
Probabilities: [0.78282686 0.21717314] -----> Actual: 0.0


### Model 2: Neural Network

In [35]:
nn = MLPClassifier(hidden_layer_sizes=(20,50,20), verbose=False, random_state=47906)
nn.fit(train_x, train_y)

pred_nn = nn.predict(test_x)

acc = accuracy_score(test_y, pred_nn)
print("Accuracy for NN model: {}%".format(round(acc*100,2)))

Accuracy for NN model: 65.88%
