In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import numpy as np

In [2]:
# Load the cleaned 2018 flight-delay dataset and preview its first rows.
flights_2018_csv = "Data_files/REV_2018_cleaned_delays.csv"
clean_flights_18_df = pd.read_csv(filepath_or_buffer=flights_2018_csv)
clean_flights_18_df.head(5)

Unnamed: 0,CRS_ELAPSED_TIME,DISTANCE,OP_CARRIER_AA,OP_CARRIER_DL,OP_CARRIER_OO,OP_CARRIER_UA,OP_CARRIER_WN,ORIGIN_ATL,ORIGIN_CLT,ORIGIN_DEN,...,sched_dep_time_Night,sched_arr_time_Afternoon,sched_arr_time_Evening,sched_arr_time_Morning,sched_arr_time_Night,season_Autumn,season_Spring,season_Summer,season_Winter,DELAY
0,0.191684,0.454929,0,0,0,1,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
1,-0.567662,-0.589053,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0.647291,0.454929,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,0.738413,0.454929,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,-0.324671,-0.437333,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [3]:
# Show every column name in full (the wide-frame preview elides the middle ones).
clean_flights_18_df.columns.tolist()

['CRS_ELAPSED_TIME',
 'DISTANCE',
 'OP_CARRIER_AA',
 'OP_CARRIER_DL',
 'OP_CARRIER_OO',
 'OP_CARRIER_UA',
 'OP_CARRIER_WN',
 'ORIGIN_ATL',
 'ORIGIN_CLT',
 'ORIGIN_DEN',
 'ORIGIN_DFW',
 'ORIGIN_ORD',
 'DEST_ATL',
 'DEST_CLT',
 'DEST_DEN',
 'DEST_DFW',
 'DEST_ORD',
 'WEEKDAY_Friday',
 'WEEKDAY_Monday',
 'WEEKDAY_Saturday',
 'WEEKDAY_Sunday',
 'WEEKDAY_Thursday',
 'WEEKDAY_Tuesday',
 'WEEKDAY_Wednesday',
 'sched_dep_time_Afternoon',
 'sched_dep_time_Evening',
 'sched_dep_time_Morning',
 'sched_dep_time_Night',
 'sched_arr_time_Afternoon',
 'sched_arr_time_Evening',
 'sched_arr_time_Morning',
 'sched_arr_time_Night',
 'season_Autumn',
 'season_Spring',
 'season_Summer',
 'season_Winter',
 'DELAY']

In [4]:
# Feature matrix: every column except the DELAY target.
# drop() returns a new frame, so the loaded DataFrame is left untouched.
X = clean_flights_18_df.drop(columns="DELAY")
X.head()

Unnamed: 0,CRS_ELAPSED_TIME,DISTANCE,OP_CARRIER_AA,OP_CARRIER_DL,OP_CARRIER_OO,OP_CARRIER_UA,OP_CARRIER_WN,ORIGIN_ATL,ORIGIN_CLT,ORIGIN_DEN,...,sched_dep_time_Morning,sched_dep_time_Night,sched_arr_time_Afternoon,sched_arr_time_Evening,sched_arr_time_Morning,sched_arr_time_Night,season_Autumn,season_Spring,season_Summer,season_Winter
0,0.191684,0.454929,0,0,0,1,0,0,0,1,...,1,0,1,0,0,0,0,0,0,1
1,-0.567662,-0.589053,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,0.647291,0.454929,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,0.738413,0.454929,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,-0.324671,-0.437333,0,0,0,1,0,0,0,0,...,1,0,0,0,1,0,0,0,0,1


In [5]:
# Target: the DELAY column as a single-column 2-D NumPy array.
delay_values = clean_flights_18_df["DELAY"].values
y = delay_values.reshape(-1, 1)
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]], dtype=int64)

In [6]:
# Flatten the target array y to one dimension.
y = np.ravel(y)
y[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [7]:
# Hold out a test set; the split size is left at the library default and
# the fixed seed makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [8]:
# Baseline model with default hyperparameters.
# random_state pins the estimator's internal randomness (same seed value as
# the train/test split) so the reported accuracy is reproducible after a
# kernel restart.
gbt = GradientBoostingClassifier(random_state=78)

In [9]:
# Train the baseline model on the training split.
gbt.fit(X=X_train, y=y_train)

In [10]:
# Score the baseline model on the held-out test split.
y_pred = gbt.predict(X=X_test)
baseline_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", baseline_accuracy)

Accuracy: 0.653116639380266


In [11]:
# Tuning the hyperparameters (first round).

In [12]:
# First-round search grid: 3 x 3 x 3 = 27 candidate combinations.
hyperparameters = dict(
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
)

In [13]:
# Use the grid search to identify the best hyperparameters.
# Every combination in `hyperparameters` is scored with 5-fold
# cross-validation; refit=True then retrains the best one on all of X_train.
# - random_state on the estimator makes each candidate fit repeatable, so
#   best_params_ does not change between runs.
# - n_jobs=-1 spreads the candidate/fold fits over all CPU cores; it does not
#   change the results, only the wall-clock time.
grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=78),
    param_grid=hyperparameters,
    cv=5,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [14]:
# Access the best hyperparameters: the combination the grid search selected
# from its cross-validation scores.
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Best parameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 300}


In [15]:
# Run a new GBT model with the best hyperparameters.
# Unpacking best_params passes exactly the keys the grid search tuned
# (learning_rate, n_estimators, max_depth); the fixed seed makes the fit and
# its test accuracy reproducible.
gbt_tuned = GradientBoostingClassifier(**best_params, random_state=78)

In [16]:
# Train the first tuned model on the training split.
gbt_tuned.fit(X=X_train, y=y_train)

In [17]:
# Predict the held-out test split with the first tuned model.
y_pred_tuned = gbt_tuned.predict(X=X_test)

In [18]:
# Test-set accuracy of the first tuned model.
tuned_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_tuned)
print("Accuracy:", tuned_accuracy)

Accuracy: 0.6575090843748752


In [None]:
# Second tuning round: search a narrower grid around the first round's best parameters to see if accuracy improves.

In [19]:
# Second-round search grid, centred on the first round's best values
# (learning_rate=0.05, n_estimators=300, max_depth=5).
# learning_rate previously listed 0.3, which was out of order and far outside
# this refinement range; 0.03 matches the symmetric steps used for the other
# two parameters (+/-50 estimators, +/-1 depth).
hyperparameters_2 = {
    "learning_rate": [0.03, 0.05, 0.07],
    "n_estimators": [250, 300, 350],
    "max_depth": [4, 5, 6],
}

In [20]:
# Second grid search: 5-fold cross-validation over `hyperparameters_2`, with
# the best combination refit on all of X_train.
# - random_state on the estimator makes each candidate fit repeatable, so
#   best_params_ does not change between runs.
# - n_jobs=-1 spreads the candidate/fold fits over all CPU cores; it does not
#   change the results, only the wall-clock time.
grid_search_2 = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=78),
    param_grid=hyperparameters_2,
    cv=5,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
)
grid_search_2.fit(X_train, y_train)

In [21]:
# Best parameter combination selected by the second grid search.
best_params_2 = grid_search_2.best_params_

In [22]:
# Show the winning combination from the second grid search.
print("Best parameters:", best_params_2)

Best parameters: {'learning_rate': 0.07, 'max_depth': 5, 'n_estimators': 350}


In [23]:
# Run a new GBT tuned model with the best hyperparameters from this latest grid search.
# Unpacking best_params_2 passes exactly the keys the grid search tuned
# (learning_rate, n_estimators, max_depth); the fixed seed makes the fit and
# its test accuracy reproducible.
gbt_tuned_2 = GradientBoostingClassifier(**best_params_2, random_state=78)

In [24]:
# Train the second tuned model on the training split.
gbt_tuned_2.fit(X=X_train, y=y_train)

In [25]:
# Predict the held-out test split with the second tuned model.
y_pred_tuned_2 = gbt_tuned_2.predict(X=X_test)

In [26]:
# Test-set accuracy of the second tuned model.
tuned_2_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_tuned_2)
print("Accuracy:", tuned_2_accuracy)

Accuracy: 0.6565108014215549


In [None]:
import pickle 

In [None]:
# Persist the second tuned model to disk with pickle.
# The context manager guarantees the file handle is flushed and closed, even
# if serialization raises; the bare open() used before was never closed.
file_name = "Flights18_gbt_tuned.sav"
with open(file_name, "wb") as model_file:
    pickle.dump(gbt_tuned_2, model_file)

In [None]:
# Confusion matrix for the second tuned model, labelled so that rows are
# the actual classes and columns are the predicted classes.
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_tuned_2)
cm_df = pd.DataFrame(data=cm, index=row_labels, columns=column_labels)

# Overall accuracy of the same predictions.
acc_score = accuracy_score(y_true=y_test, y_pred=y_pred_tuned_2)

In [None]:
# Report the confusion matrix, the accuracy and the per-class metrics
# for the second tuned model.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
report_text = classification_report(y_true=y_test, y_pred=y_pred_tuned_2)
print(report_text)

In [None]:
 # Feature importances of the second tuned model, one value per column of X.
importances = gbt_tuned_2.feature_importances_
# Pair each importance with its feature name, rank from most to least
# important, and show the top 10.
importances_sorted = list(zip(importances, X.columns))
importances_sorted.sort(reverse=True)
importances_sorted[:10]

In [None]:
# One-column frame of importances indexed by feature name
# (column 0 holds the importance, column 1 the feature name).
importances_df = (
    pd.DataFrame(sorted(zip(gbt_tuned_2.feature_importances_, X.columns), reverse=True))
    .set_index(1)
    .rename(columns={0: 'Feature Importances'})
)
# Sort ascending before plotting the horizontal bar chart.
importances_sorted = importances_df.sort_values(by='Feature Importances')
importances_sorted.plot.barh(color='lightgreen', title='Features Importances', legend=False)