## Imports

In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier


## Data path

In [2]:
DATA_PATH = os.path.join('resources', '2018.csv')

## Helper functions

In [3]:
def load_data():
    original_df = pd.read_csv(DATA_PATH)
    return original_df

original_df = load_data()

original_df.tail(10)

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
7213436,2018-12-31,AA,1812,PDX,PHX,1058,1100.0,2.0,17.0,1117.0,...,152.0,142.0,121.0,1009.0,,,,,,
7213437,2018-12-31,AA,1812,PHX,PDX,825,821.0,-4.0,15.0,836.0,...,172.0,175.0,158.0,1009.0,,,,,,
7213438,2018-12-31,AA,1813,CLT,ATL,2100,2100.0,0.0,12.0,2112.0,...,73.0,67.0,45.0,226.0,,,,,,
7213439,2018-12-31,AA,1814,DFW,PHL,1955,2026.0,31.0,12.0,2038.0,...,182.0,160.0,142.0,1303.0,,,,,,
7213440,2018-12-31,AA,1815,CLT,DCA,1321,1320.0,-1.0,12.0,1332.0,...,84.0,62.0,46.0,331.0,,,,,,
7213441,2018-12-31,AA,1815,DCA,CLT,1534,1530.0,-4.0,20.0,1550.0,...,100.0,99.0,72.0,331.0,,,,,,
7213442,2018-12-31,AA,1816,CLT,DFW,1751,1757.0,6.0,18.0,1815.0,...,181.0,176.0,148.0,936.0,,,,,,
7213443,2018-12-31,AA,1817,CLT,MEM,2015,2010.0,-5.0,36.0,2046.0,...,112.0,128.0,88.0,511.0,,,,,,
7213444,2018-12-31,AA,1818,CLT,RDU,1300,1323.0,23.0,11.0,1334.0,...,50.0,41.0,26.0,130.0,,,,,,
7213445,2018-12-31,AA,1818,RDU,CLT,1435,1443.0,8.0,8.0,1451.0,...,71.0,59.0,44.0,130.0,,,,,,


In [4]:
original_df.shape

(7213446, 28)

In [5]:
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7213446 entries, 0 to 7213445
Data columns (total 28 columns):
 #   Column               Dtype  
---  ------               -----  
 0   FL_DATE              object 
 1   OP_CARRIER           object 
 2   OP_CARRIER_FL_NUM    int64  
 3   ORIGIN               object 
 4   DEST                 object 
 5   CRS_DEP_TIME         int64  
 6   DEP_TIME             float64
 7   DEP_DELAY            float64
 8   TAXI_OUT             float64
 9   WHEELS_OFF           float64
 10  WHEELS_ON            float64
 11  TAXI_IN              float64
 12  CRS_ARR_TIME         int64  
 13  ARR_TIME             float64
 14  ARR_DELAY            float64
 15  CANCELLED            float64
 16  CANCELLATION_CODE    object 
 17  DIVERTED             float64
 18  CRS_ELAPSED_TIME     float64
 19  ACTUAL_ELAPSED_TIME  float64
 20  AIR_TIME             float64
 21  DISTANCE             float64
 22  CARRIER_DELAY        float64
 23  WEATHER_DELAY        float64
 24

In [6]:
top_airports= ["ATL","ORD","DFW","CLT","DEN"]
##"LAX","PHX","IAH","SFO","LAS"]

original_df_top10 = original_df.loc[original_df['ORIGIN'].isin(top_airports)]
original_df_top10.shape

(1471603, 28)

In [7]:
original_df_top10["FL_DATE"] = pd.to_datetime(original_df_top10["FL_DATE"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_df_top10["FL_DATE"] = pd.to_datetime(original_df_top10["FL_DATE"])


In [8]:
original_df_top10["YEAR"]=original_df_top10["FL_DATE"].dt.year
original_df_top10["MONTH"]=original_df_top10["FL_DATE"].dt.month
original_df_top10["DAY"]=original_df_top10["FL_DATE"].dt.day
original_df_top10["WEEKDAY"]=original_df_top10["FL_DATE"].dt.dayofweek

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_df_top10["YEAR"]=original_df_top10["FL_DATE"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_df_top10["MONTH"]=original_df_top10["FL_DATE"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_df_top10["DAY"]=original_df_top10["FL_DATE"].dt.day
A value is tryin

In [9]:
flight_data_df = original_df_top10.drop(["FL_DATE", "OP_CARRIER_FL_NUM", "CRS_DEP_TIME", "DEP_TIME", "TAXI_OUT", "WHEELS_OFF", "WHEELS_ON", "TAXI_IN","CRS_ARR_TIME","ARR_TIME", "CANCELLATION_CODE", "Unnamed: 27"],axis=1)
flight_data_df.head()
flight_data_df["WEEKDAY"].unique()

array([0, 1, 2, 3, 4, 5, 6], dtype=int32)

In [10]:
flight_data_df = flight_data_df.fillna(0)

In [11]:
flight_data_df.head()

Unnamed: 0,OP_CARRIER,ORIGIN,DEST,DEP_DELAY,ARR_DELAY,CANCELLED,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,YEAR,MONTH,DAY,WEEKDAY
4,UA,ORD,ALB,20.0,14.0,0.0,0.0,112.0,106.0,83.0,723.0,0.0,0.0,0.0,0.0,0.0,2018,1,1,0
5,UA,ORD,OMA,3.0,-11.0,0.0,0.0,93.0,79.0,62.0,416.0,0.0,0.0,0.0,0.0,0.0,2018,1,1,0
7,UA,DEN,CID,-6.0,-19.0,0.0,0.0,115.0,102.0,85.0,692.0,0.0,0.0,0.0,0.0,0.0,2018,1,1,0
11,UA,ORD,CLE,121.0,129.0,0.0,0.0,72.0,80.0,48.0,316.0,121.0,0.0,8.0,0.0,0.0,2018,1,1,0
14,UA,ORD,BTV,76.0,73.0,0.0,0.0,121.0,118.0,99.0,763.0,0.0,11.0,0.0,0.0,62.0,2018,1,1,0


In [12]:
flight_data_df['DELAY'] = np.where((flight_data_df.ARR_DELAY > 0), 1, 0)
flight_data_df.head()

Unnamed: 0,OP_CARRIER,ORIGIN,DEST,DEP_DELAY,ARR_DELAY,CANCELLED,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,...,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,YEAR,MONTH,DAY,WEEKDAY,DELAY
4,UA,ORD,ALB,20.0,14.0,0.0,0.0,112.0,106.0,83.0,...,0.0,0.0,0.0,0.0,0.0,2018,1,1,0,1
5,UA,ORD,OMA,3.0,-11.0,0.0,0.0,93.0,79.0,62.0,...,0.0,0.0,0.0,0.0,0.0,2018,1,1,0,0
7,UA,DEN,CID,-6.0,-19.0,0.0,0.0,115.0,102.0,85.0,...,0.0,0.0,0.0,0.0,0.0,2018,1,1,0,0
11,UA,ORD,CLE,121.0,129.0,0.0,0.0,72.0,80.0,48.0,...,121.0,0.0,8.0,0.0,0.0,2018,1,1,0,1
14,UA,ORD,BTV,76.0,73.0,0.0,0.0,121.0,118.0,99.0,...,0.0,11.0,0.0,0.0,62.0,2018,1,1,0,1


In [13]:
flight_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1471603 entries, 4 to 7213444
Data columns (total 21 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   OP_CARRIER           1471603 non-null  object 
 1   ORIGIN               1471603 non-null  object 
 2   DEST                 1471603 non-null  object 
 3   DEP_DELAY            1471603 non-null  float64
 4   ARR_DELAY            1471603 non-null  float64
 5   CANCELLED            1471603 non-null  float64
 6   DIVERTED             1471603 non-null  float64
 7   CRS_ELAPSED_TIME     1471603 non-null  float64
 8   ACTUAL_ELAPSED_TIME  1471603 non-null  float64
 9   AIR_TIME             1471603 non-null  float64
 10  DISTANCE             1471603 non-null  float64
 11  CARRIER_DELAY        1471603 non-null  float64
 12  WEATHER_DELAY        1471603 non-null  float64
 13  NAS_DELAY            1471603 non-null  float64
 14  SECURITY_DELAY       1471603 non-null  float64
 15  LAT

In [14]:
# Columns to drop target and other columns that contain delay-related values
target_cols = ['CANCELLED', 'DIVERTED', 'DELAY']
delay_cols = ['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']
leakage_cols = ['ARR_DELAY', 'DEP_DELAY']

# Drop columns
X = flight_data_df.drop(columns=target_cols + delay_cols + leakage_cols)

In [15]:
# Convert categorical data to numeric with `pd.get_dummies`
X = pd.get_dummies(X)
X.head()

Unnamed: 0,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,YEAR,MONTH,DAY,WEEKDAY,OP_CARRIER_9E,OP_CARRIER_AA,...,DEST_TUS,DEST_TVC,DEST_TXK,DEST_TYR,DEST_TYS,DEST_UIN,DEST_VEL,DEST_VLD,DEST_VPS,DEST_XNA
4,112.0,106.0,83.0,723.0,2018,1,1,0,False,False,...,False,False,False,False,False,False,False,False,False,False
5,93.0,79.0,62.0,416.0,2018,1,1,0,False,False,...,False,False,False,False,False,False,False,False,False,False
7,115.0,102.0,85.0,692.0,2018,1,1,0,False,False,...,False,False,False,False,False,False,False,False,False,False
11,72.0,80.0,48.0,316.0,2018,1,1,0,False,False,...,False,False,False,False,False,False,False,False,False,False
14,121.0,118.0,99.0,763.0,2018,1,1,0,False,False,...,False,False,False,False,False,False,False,False,False,False


In [16]:
list(X.columns)

['CRS_ELAPSED_TIME',
 'ACTUAL_ELAPSED_TIME',
 'AIR_TIME',
 'DISTANCE',
 'YEAR',
 'MONTH',
 'DAY',
 'WEEKDAY',
 'OP_CARRIER_9E',
 'OP_CARRIER_AA',
 'OP_CARRIER_AS',
 'OP_CARRIER_B6',
 'OP_CARRIER_DL',
 'OP_CARRIER_EV',
 'OP_CARRIER_F9',
 'OP_CARRIER_G4',
 'OP_CARRIER_MQ',
 'OP_CARRIER_NK',
 'OP_CARRIER_OH',
 'OP_CARRIER_OO',
 'OP_CARRIER_UA',
 'OP_CARRIER_VX',
 'OP_CARRIER_WN',
 'OP_CARRIER_YV',
 'OP_CARRIER_YX',
 'ORIGIN_ATL',
 'ORIGIN_CLT',
 'ORIGIN_DEN',
 'ORIGIN_DFW',
 'ORIGIN_ORD',
 'DEST_ABE',
 'DEST_ABI',
 'DEST_ABQ',
 'DEST_ABY',
 'DEST_ACK',
 'DEST_ACT',
 'DEST_ACY',
 'DEST_AEX',
 'DEST_AGS',
 'DEST_ALB',
 'DEST_ALO',
 'DEST_AMA',
 'DEST_ANC',
 'DEST_ASE',
 'DEST_ATL',
 'DEST_ATW',
 'DEST_AUS',
 'DEST_AVL',
 'DEST_AVP',
 'DEST_AZO',
 'DEST_BDL',
 'DEST_BFF',
 'DEST_BFL',
 'DEST_BGR',
 'DEST_BHM',
 'DEST_BIL',
 'DEST_BIS',
 'DEST_BKG',
 'DEST_BLI',
 'DEST_BMI',
 'DEST_BNA',
 'DEST_BOI',
 'DEST_BOS',
 'DEST_BPT',
 'DEST_BQK',
 'DEST_BRO',
 'DEST_BTR',
 'DEST_BTV',
 'DEST_BUF',
 '

In [17]:
# Split out target variable
y = flight_data_df['DELAY']

# Create StandardScaler instance
scaler = StandardScaler()

In [18]:
# Split data into initial train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Create scaler and fit it only on training data 
X_train_scaled = scaler.fit_transform(X_train)

# Now split scaled training data into train-val sets
X_train, X_val, y_train, y_val = train_test_split(X_train_scaled, y_train, test_size=0.33, random_state=0) 

# Apply same scaler to test data
X_test_scaled = scaler.transform(X_test)

In [37]:
# Define baseline model
model_base = DecisionTreeClassifier()
model_base.fit(X_train, y_train)

In [38]:
# Make predictions on validation data and calculate accuracy on validation set
base_val_preds = model_base.predict(X_val)
base_val_accuracy = accuracy_score(y_val, base_val_preds)

print("Base Validation Accuracy:", base_val_accuracy)

Base Validation Accuracy: 0.7051559445803572


In [39]:
# Evaluate baseline model on test set
base_test_preds = model_base.predict(X_test_scaled)
base_test_accuracy = accuracy_score(y_test, base_test_preds)

print("Base Test Accuracy:", base_test_accuracy)

Base Test Accuracy: 0.7024518716962949


In [40]:
print(classification_report(y_test, base_test_preds))

              precision    recall  f1-score   support

           0       0.76      0.75      0.76    300551
           1       0.61      0.62      0.61    185078

    accuracy                           0.70    485629
   macro avg       0.69      0.69      0.69    485629
weighted avg       0.70      0.70      0.70    485629



In [34]:
# Hyperparameter tuned model
# Increase max depth, to allow model to learn more complex relationships
model_tuned = DecisionTreeClassifier(max_depth=10)
model_tuned.fit(X_train, y_train) 

In [35]:
# Evaluate hyperparameter tuned model on validation set
tuned_val_preds = model_tuned.predict(X_val)
tuned_val_accuracy = accuracy_score(y_val, tuned_val_preds)

print("Tuned Validation Accuracy:", tuned_val_accuracy)

Tuned Validation Accuracy: 0.7545793737629544


In [36]:
# Evaluate baseline model on test set
tuned_test_preds = model_tuned.predict(X_test_scaled)
tuned_test_accuracy = accuracy_score(y_test, tuned_test_preds)

print("Tuned Test Accuracy:", tuned_test_accuracy)

Tuned Test Accuracy: 0.7530625230371333


In [41]:
print(classification_report(y_test, tuned_test_preds))

              precision    recall  f1-score   support

           0       0.73      0.96      0.83    300551
           1       0.86      0.42      0.56    185078

    accuracy                           0.75    485629
   macro avg       0.80      0.69      0.70    485629
weighted avg       0.78      0.75      0.73    485629



In [29]:
# ADA Boost Classifier
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost model with base DecisionTree
ada_model = AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=20)
ada_model.fit(X_train, y_train)

In [30]:
# Make predictions on validation data and calculate accuracy on validation set
ada_boost_preds = ada_model.predict(X_val)
ada_boost_accuracy = accuracy_score(y_val, ada_boost_preds)

print("ADA Boost Validation Accuracy:", ada_boost_accuracy)

ADA Boost Validation Accuracy: 0.7444402099750439


In [31]:
# Evaluate baseline model on test set
ada_test_preds = ada_model.predict(X_test_scaled)
ada_test_accuracy = accuracy_score(y_test, ada_test_preds)

print("Base Test Accuracy:", base_test_accuracy)

Base Test Accuracy: 0.7025177656194338


In [42]:
print(classification_report(y_test, ada_test_preds))

              precision    recall  f1-score   support

           0       0.74      0.89      0.81    300551
           1       0.73      0.51      0.60    185078

    accuracy                           0.74    485629
   macro avg       0.74      0.70      0.70    485629
weighted avg       0.74      0.74      0.73    485629

