## Imports

In [111]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import class_weight

## Data Loading and Preprocessing

In [112]:
DATA_PATH = os.path.join('resources', '2018.csv')

In [113]:
def load_data():
    original_df = pd.read_csv(DATA_PATH)
    return original_df

original_df = load_data()

original_df.tail(10)

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
7213436,2018-12-31,AA,1812,PDX,PHX,1058,1100.0,2.0,17.0,1117.0,...,152.0,142.0,121.0,1009.0,,,,,,
7213437,2018-12-31,AA,1812,PHX,PDX,825,821.0,-4.0,15.0,836.0,...,172.0,175.0,158.0,1009.0,,,,,,
7213438,2018-12-31,AA,1813,CLT,ATL,2100,2100.0,0.0,12.0,2112.0,...,73.0,67.0,45.0,226.0,,,,,,
7213439,2018-12-31,AA,1814,DFW,PHL,1955,2026.0,31.0,12.0,2038.0,...,182.0,160.0,142.0,1303.0,,,,,,
7213440,2018-12-31,AA,1815,CLT,DCA,1321,1320.0,-1.0,12.0,1332.0,...,84.0,62.0,46.0,331.0,,,,,,
7213441,2018-12-31,AA,1815,DCA,CLT,1534,1530.0,-4.0,20.0,1550.0,...,100.0,99.0,72.0,331.0,,,,,,
7213442,2018-12-31,AA,1816,CLT,DFW,1751,1757.0,6.0,18.0,1815.0,...,181.0,176.0,148.0,936.0,,,,,,
7213443,2018-12-31,AA,1817,CLT,MEM,2015,2010.0,-5.0,36.0,2046.0,...,112.0,128.0,88.0,511.0,,,,,,
7213444,2018-12-31,AA,1818,CLT,RDU,1300,1323.0,23.0,11.0,1334.0,...,50.0,41.0,26.0,130.0,,,,,,
7213445,2018-12-31,AA,1818,RDU,CLT,1435,1443.0,8.0,8.0,1451.0,...,71.0,59.0,44.0,130.0,,,,,,


In [114]:
original_df.shape

(7213446, 28)

In [115]:
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7213446 entries, 0 to 7213445
Data columns (total 28 columns):
 #   Column               Dtype  
---  ------               -----  
 0   FL_DATE              object 
 1   OP_CARRIER           object 
 2   OP_CARRIER_FL_NUM    int64  
 3   ORIGIN               object 
 4   DEST                 object 
 5   CRS_DEP_TIME         int64  
 6   DEP_TIME             float64
 7   DEP_DELAY            float64
 8   TAXI_OUT             float64
 9   WHEELS_OFF           float64
 10  WHEELS_ON            float64
 11  TAXI_IN              float64
 12  CRS_ARR_TIME         int64  
 13  ARR_TIME             float64
 14  ARR_DELAY            float64
 15  CANCELLED            float64
 16  CANCELLATION_CODE    object 
 17  DIVERTED             float64
 18  CRS_ELAPSED_TIME     float64
 19  ACTUAL_ELAPSED_TIME  float64
 20  AIR_TIME             float64
 21  DISTANCE             float64
 22  CARRIER_DELAY        float64
 23  WEATHER_DELAY        float64
 24

In [117]:
# Determine the top 5 airports so we can narrow the scope of the analysis. 
original_df["ORIGIN"].value_counts()

ORIGIN
ATL    390046
ORD    332953
DFW    279298
DEN    235989
CLT    233317
        ...  
AKN        63
CYS        58
IFP        45
ART        25
YNG         2
Name: count, Length: 358, dtype: int64

In [118]:
# Determine the top 5 airports so we can narrow the scope of the analysis. 
original_df["DEST"].value_counts()

DEST
ATL    390079
ORD    332942
DFW    279272
DEN    236020
CLT    233309
        ...  
AKN        63
CYS        58
IFP        45
ART        26
YNG         2
Name: count, Length: 358, dtype: int64

In [119]:
# Determine the top 5 airlines so we can narrow the scope of the analysis. 
original_df["OP_CARRIER"].value_counts()

OP_CARRIER
WN    1352552
DL     949283
AA     916818
OO     774137
UA     621565
YX     316090
B6     305010
MQ     296001
OH     278457
9E     245917
AS     245761
YV     215138
EV     202890
NK     176178
F9     120035
G4      96221
HA      83723
VX      17670
Name: count, dtype: int64

In [120]:
# Create lists to represent the top 5 airports and airlines. 
top_airports = ["ATL","ORD","DFW","CLT","DEN"]
top_airlines = ["WN", "DL", "AA", "OO", "UA"]

In [121]:
# Filter the DataFrame to include only the top 5 airports and top 5 airlines.
original_df_top5 = original_df[
    (original_df["ORIGIN"].isin(top_airports)) &
    (original_df["DEST"].isin(top_airports)) &
    (original_df["OP_CARRIER"].isin(top_airlines))
]

original_df_top5 = original_df_top5.reset_index(drop=True)

original_df_top5.shape

(100170, 28)

In [122]:
# Convert dates
original_df_top5["FL_DATE"] = pd.to_datetime(original_df_top5["FL_DATE"])

In [131]:
# Drop columns
original_df_top5 = original_df_top5.drop(["FL_DATE", "OP_CARRIER_FL_NUM", "CRS_DEP_TIME", "DEP_TIME", "TAXI_OUT", "WHEELS_OFF", "WHEELS_ON", "TAXI_IN","CRS_ARR_TIME","ARR_TIME", "CANCELLATION_CODE", "Unnamed: 27"],axis=1)
original_df_top5.head()

Unnamed: 0,OP_CARRIER,ORIGIN,DEST,DEP_DELAY,ARR_DELAY,CANCELLED,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,YEAR,MONTH,DAY,WEEKDAY
0,UA,DEN,ORD,-6.0,-7.0,0.0,0.0,147.0,146.0,115.0,888.0,,,,,,2018,1,1,0
1,UA,ORD,CLT,26.0,0.0,0.0,0.0,122.0,96.0,76.0,599.0,,,,,,2018,1,1,0
2,UA,ORD,DEN,-3.0,-8.0,0.0,0.0,162.0,157.0,131.0,888.0,,,,,,2018,1,1,0
3,UA,ORD,DEN,1.0,-18.0,0.0,0.0,165.0,146.0,122.0,888.0,,,,,,2018,1,1,0
4,UA,DFW,DEN,-8.0,-14.0,0.0,0.0,130.0,124.0,95.0,641.0,,,,,,2018,1,1,0


In [205]:
# handle null values
original_df_top5 = original_df_top5.fillna(0)

## Feature Engineering

In [123]:
# Extract month, day, weekday features
original_df_top5["YEAR"]=original_df_top5["FL_DATE"].dt.year
original_df_top5["MONTH"]=original_df_top5["FL_DATE"].dt.month
original_df_top5["DAY"]=original_df_top5["FL_DATE"].dt.day
original_df_top5["WEEKDAY"]=original_df_top5["FL_DATE"].dt.dayofweek

In [133]:
original_df_top5.head()

Unnamed: 0,OP_CARRIER,ORIGIN,DEST,DEP_DELAY,ARR_DELAY,CANCELLED,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,YEAR,MONTH,DAY,WEEKDAY
0,UA,DEN,ORD,-6.0,-7.0,0.0,0.0,147.0,146.0,115.0,888.0,0.0,0.0,0.0,0.0,0.0,2018,1,1,0
1,UA,ORD,CLT,26.0,0.0,0.0,0.0,122.0,96.0,76.0,599.0,0.0,0.0,0.0,0.0,0.0,2018,1,1,0
2,UA,ORD,DEN,-3.0,-8.0,0.0,0.0,162.0,157.0,131.0,888.0,0.0,0.0,0.0,0.0,0.0,2018,1,1,0
3,UA,ORD,DEN,1.0,-18.0,0.0,0.0,165.0,146.0,122.0,888.0,0.0,0.0,0.0,0.0,0.0,2018,1,1,0
4,UA,DFW,DEN,-8.0,-14.0,0.0,0.0,130.0,124.0,95.0,641.0,0.0,0.0,0.0,0.0,0.0,2018,1,1,0


In [134]:
original_df_top5['DELAY'] = np.where((original_df_top5.ARR_DELAY > 0), 1, 0)
original_df_top5.head()

Unnamed: 0,OP_CARRIER,ORIGIN,DEST,DEP_DELAY,ARR_DELAY,CANCELLED,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,...,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,YEAR,MONTH,DAY,WEEKDAY,DELAY
0,UA,DEN,ORD,-6.0,-7.0,0.0,0.0,147.0,146.0,115.0,...,0.0,0.0,0.0,0.0,0.0,2018,1,1,0,0
1,UA,ORD,CLT,26.0,0.0,0.0,0.0,122.0,96.0,76.0,...,0.0,0.0,0.0,0.0,0.0,2018,1,1,0,0
2,UA,ORD,DEN,-3.0,-8.0,0.0,0.0,162.0,157.0,131.0,...,0.0,0.0,0.0,0.0,0.0,2018,1,1,0,0
3,UA,ORD,DEN,1.0,-18.0,0.0,0.0,165.0,146.0,122.0,...,0.0,0.0,0.0,0.0,0.0,2018,1,1,0,0
4,UA,DFW,DEN,-8.0,-14.0,0.0,0.0,130.0,124.0,95.0,...,0.0,0.0,0.0,0.0,0.0,2018,1,1,0,0


In [135]:
original_df_top5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100170 entries, 0 to 100169
Data columns (total 21 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   OP_CARRIER           100170 non-null  object 
 1   ORIGIN               100170 non-null  object 
 2   DEST                 100170 non-null  object 
 3   DEP_DELAY            100170 non-null  float64
 4   ARR_DELAY            100170 non-null  float64
 5   CANCELLED            100170 non-null  float64
 6   DIVERTED             100170 non-null  float64
 7   CRS_ELAPSED_TIME     100170 non-null  float64
 8   ACTUAL_ELAPSED_TIME  100170 non-null  float64
 9   AIR_TIME             100170 non-null  float64
 10  DISTANCE             100170 non-null  float64
 11  CARRIER_DELAY        100170 non-null  float64
 12  WEATHER_DELAY        100170 non-null  float64
 13  NAS_DELAY            100170 non-null  float64
 14  SECURITY_DELAY       100170 non-null  float64
 15  LATE_AIRCRAFT_DEL

In [137]:
# Columns to drop target and other columns that contain delay-related values
target_cols = ['CANCELLED', 'DIVERTED', 'DELAY']
delay_cols = ['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']
leakage_cols = ['ARR_DELAY', 'DEP_DELAY']

# Drop columns
X = original_df_top5.drop(columns=target_cols + delay_cols + leakage_cols)

In [138]:
# Convert categorical data to numeric with `pd.get_dummies`
X = pd.get_dummies(X)
X.head()

Unnamed: 0,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,YEAR,MONTH,DAY,WEEKDAY,OP_CARRIER_AA,OP_CARRIER_DL,...,ORIGIN_ATL,ORIGIN_CLT,ORIGIN_DEN,ORIGIN_DFW,ORIGIN_ORD,DEST_ATL,DEST_CLT,DEST_DEN,DEST_DFW,DEST_ORD
0,147.0,146.0,115.0,888.0,2018,1,1,0,False,False,...,False,False,True,False,False,False,False,False,False,True
1,122.0,96.0,76.0,599.0,2018,1,1,0,False,False,...,False,False,False,False,True,False,True,False,False,False
2,162.0,157.0,131.0,888.0,2018,1,1,0,False,False,...,False,False,False,False,True,False,False,True,False,False
3,165.0,146.0,122.0,888.0,2018,1,1,0,False,False,...,False,False,False,False,True,False,False,True,False,False
4,130.0,124.0,95.0,641.0,2018,1,1,0,False,False,...,False,False,False,True,False,False,False,True,False,False


In [139]:
list(X.columns)

['CRS_ELAPSED_TIME',
 'ACTUAL_ELAPSED_TIME',
 'AIR_TIME',
 'DISTANCE',
 'YEAR',
 'MONTH',
 'DAY',
 'WEEKDAY',
 'OP_CARRIER_AA',
 'OP_CARRIER_DL',
 'OP_CARRIER_OO',
 'OP_CARRIER_UA',
 'OP_CARRIER_WN',
 'ORIGIN_ATL',
 'ORIGIN_CLT',
 'ORIGIN_DEN',
 'ORIGIN_DFW',
 'ORIGIN_ORD',
 'DEST_ATL',
 'DEST_CLT',
 'DEST_DEN',
 'DEST_DFW',
 'DEST_ORD']

In [140]:
# Split out target variable
y = original_df_top5['DELAY']

# Create StandardScaler instance
scaler = StandardScaler()

In [161]:
# Split data into initial train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Create scaler and fit it only on training data 
X_train_scaled = scaler.fit_transform(X_train)

# Now split scaled training data into train-val sets
X_train, X_val, y_train, y_val = train_test_split(X_train_scaled, y_train, test_size=0.25, random_state=0) 

# Apply same scaler to test data
X_test_scaled = scaler.transform(X_test)

## Modeling & Evaluation 3 Decision Tree Models - Baseline , Tuned, & AdaBoost 

In [162]:
# Define baseline model
model_base = DecisionTreeClassifier()
model_base.fit(X_train, y_train)

In [204]:
# Make predictions on validation data and calculate accuracy on validation set
base_val_preds = model_base.predict(X_val)
base_val_accuracy = accuracy_score(y_val, base_val_preds)

print("Base Validation Accuracy:", base_val_accuracy)

Base Validation Accuracy: 0.7159514428708338


In [203]:
# Evaluate baseline model on test set
base_test_preds = model_base.predict(X_test_scaled)
base_test_accuracy = accuracy_score(y_test, base_test_preds)

print("Base Test Accuracy:", base_test_accuracy)

Base Test Accuracy: 0.7147306632591942


In [202]:
print(classification_report(y_test, base_test_preds))

              precision    recall  f1-score   support

           0       0.78      0.76      0.77     15838
           1       0.61      0.63      0.62      9205

    accuracy                           0.71     25043
   macro avg       0.69      0.70      0.70     25043
weighted avg       0.72      0.71      0.72     25043



In [199]:
# Hyperparameter tuned model
# Increase max depth, to allow model to learn more complex relationships
model_tuned = DecisionTreeClassifier(max_depth=9)
model_tuned.fit(X_train, y_train) 

In [200]:
# Evaluate hyperparameter tuned model on validation set
tuned_val_preds = model_tuned.predict(X_val)
tuned_val_accuracy = accuracy_score(y_val, tuned_val_preds)

print("Tuned Validation Accuracy:", tuned_val_accuracy)

Tuned Validation Accuracy: 0.7948567777659461


In [201]:
# Evaluate hyperparameter model on test set
tuned_test_preds = model_tuned.predict(X_test_scaled)
tuned_test_accuracy = accuracy_score(y_test, tuned_test_preds)

print("Tuned Test Accuracy:", tuned_test_accuracy)

Tuned Test Accuracy: 0.7962704148863954


In [170]:
print(classification_report(y_test, tuned_test_preds))

              precision    recall  f1-score   support

           0       0.77      0.95      0.85     15838
           1       0.86      0.52      0.65      9205

    accuracy                           0.79     25043
   macro avg       0.82      0.73      0.75     25043
weighted avg       0.80      0.79      0.78     25043



In [171]:
# ADA Boost Classifier
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost model with base DecisionTree
ada_model = AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=20)
ada_model.fit(X_train, y_train)

In [172]:
# Make predictions on validation data and calculate accuracy on validation set
ada_boost_preds = ada_model.predict(X_val)
ada_boost_accuracy = accuracy_score(y_val, ada_boost_preds)

print("ADA Boost Validation Accuracy:", ada_boost_accuracy)

ADA Boost Validation Accuracy: 0.7415610691087211


In [173]:
# Evaluate AdaBoost model on test set
ada_test_preds = ada_model.predict(X_test_scaled)
ada_test_accuracy = accuracy_score(y_test, ada_test_preds)

print("Base Test Accuracy:", base_test_accuracy)

Base Test Accuracy: 0.7147306632591942


In [174]:
print(classification_report(y_test, ada_test_preds))

              precision    recall  f1-score   support

           0       0.78      0.82      0.80     15838
           1       0.67      0.61      0.64      9205

    accuracy                           0.74     25043
   macro avg       0.73      0.72      0.72     25043
weighted avg       0.74      0.74      0.74     25043



## Results

We tested 3 different Decision Tree machine learning models to predict whether a flight will be delayed or not:

### Basic Decision Tree Model
- Accuracy: 71%
- This basic model was pretty good at predicting delays, but also frequently mislabeled on-time flights as delayed.

### Tuned Decision Tree Model
- Accuracy: 79%
- By tweaking some settings, this model got significantly better at recognizing on-time flights. But it started missing some actually delayed flights.

### AdaBoost Model
- Accuracy: 74%
- This model was in between the basic and tuned models in accuracy. It improved on-time prediction over the basic model but wasn't as good as the tuned model.

## Summary
- The models predicted flight delays with 71-79% accuracy, which is decent but not perfect.
- The tuned model performed best overall.
- But no model was great at predicting delays for a minority of flights.

So the models are useful for travelers and airlines to set better expectations on delays. But there is still room for improvement to catch more delays and tweak the models. With more work, the flight delay predictions could get even better.