# Mini flight delay prediction

### Kaggle imports and directory/path configurations

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Imports for preprocessing and evaluation

In [None]:
# Preprocessing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Evaluation metrics
from sklearn.metrics import classification_report

### Open and display dataframes

#### Train dataframe

In [None]:
# Read the training split of the flight-delay data from the Kaggle input directory
train_csv_path = '../input/mini-flight-delay-prediction/flight_delays_train.csv'
train_df = pd.read_csv(train_csv_path)

# Preview the first rows of the training frame
train_df.head()

In [None]:
# Summarize the training frame (columns, dtypes, non-null counts) to check for missing values
train_df.info()

#### Test dataframe

In [None]:
# Read the test split of the flight-delay data from the Kaggle input directory
test_csv_path = '../input/mini-flight-delay-prediction/flight_delays_test.csv'
test_df = pd.read_csv(test_csv_path)

# Preview the first rows of the test frame
test_df.head()

In [None]:
# Summarize the test frame (columns, dtypes, non-null counts) to check for missing values
test_df.info()

As we can see, both train and test dataframes have no missing values, so we can proceed to treat their data with that in mind.

## Data handling

### Map carrier, origin/destination airport codes and delayed to numeric attributes

#### Train DF

In [None]:
# Build one integer code per distinct label for each categorical column.
# Codes are numbered in order of first appearance in the training data, and the
# label -> code dictionaries are kept so the same codes can be reused elsewhere.

# UniqueCarrier
uc_labels = train_df.UniqueCarrier.unique().tolist()
label_dict_uc_train = {label: code for code, label in enumerate(uc_labels)}

# Origin
origin_labels = train_df.Origin.unique().tolist()
label_dict_origin_train = {label: code for code, label in enumerate(origin_labels)}

# Dest
dest_labels = train_df.Dest.unique().tolist()
label_dict_dest_train = {label: code for code, label in enumerate(dest_labels)}

# Encode 'UniqueCarrier', 'Origin' and 'Dest'.
# Series.map does a single dictionary lookup per row and returns an integer column
# directly; Series.replace with a large dict is much slower and relies on pandas
# silently downcasting the resulting object column. Every value in a column is a key
# of its dictionary by construction, so the lookup cannot miss.
train_df['UniqueCarrier'] = train_df.UniqueCarrier.map(label_dict_uc_train)
train_df['Origin'] = train_df.Origin.map(label_dict_origin_train)
train_df['Dest'] = train_df.Dest.map(label_dict_dest_train)

# Map 'dep_delayed_15min' to 1/0 and keep it as a NumPy array in 'delayed'
# (this is the target vector used to fit the final classifier)
delayed = train_df['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values

train_df.head()

#### Test DF

In [None]:
# Encode the test set with the SAME label -> code dictionaries that were built from
# the training data (label_dict_*_train). Building fresh dictionaries from the test
# data numbers labels by their order of appearance in the test file, so one carrier
# or airport would get different integers in train and test, and a model fitted on
# the training codes would misread the test features.


def _extend_label_codes(train_codes, labels):
    """Return a copy of `train_codes` that also covers every label in `labels`.

    Labels already known from training keep their training code. Labels that
    appear only in `labels` receive new codes above the largest training code,
    so they can never collide with a training label.
    """
    codes = dict(train_codes)
    next_code = max(codes.values(), default=-1) + 1
    for label in labels:
        if label not in codes:
            codes[label] = next_code
            next_code += 1
    return codes


# UniqueCarrier
uc_labels = test_df.UniqueCarrier.unique().tolist()
label_dict_uc_test = _extend_label_codes(label_dict_uc_train, uc_labels)

# Origin
origin_labels = test_df.Origin.unique().tolist()
label_dict_origin_test = _extend_label_codes(label_dict_origin_train, origin_labels)

# Dest
dest_labels = test_df.Dest.unique().tolist()
label_dict_dest_test = _extend_label_codes(label_dict_dest_train, dest_labels)

# Mapping 'UniqueCarrier', 'Origin' and 'Dest'
# (every test label is a key of its dictionary, so map() introduces no NaN)
test_df['UniqueCarrier'] = test_df.UniqueCarrier.map(label_dict_uc_test)
test_df['Origin'] = test_df.Origin.map(label_dict_origin_test)
test_df['Dest'] = test_df.Dest.map(label_dict_dest_test)

test_df.head()

### Clean attributes related to date and cast them to int

#### Train

In [None]:
# The date columns hold strings with a 'c-' prefix; split each value on '-' so the
# number after the dash can be extracted
month = train_df['Month'].str.split('-')
day = train_df['DayofMonth'].str.split('-')
dow = train_df['DayOfWeek'].str.split('-')

# Append the integer versions under temporary names (they land at the right-hand
# side of the frame, in this order), drop the original string columns, then give
# the new columns their final names
train_df = (
    train_df
    .assign(
        Mon=month.apply(lambda parts: int(parts[1])),
        DOM=day.apply(lambda parts: int(parts[1])),
        DOW=dow.apply(lambda parts: int(parts[1])),
    )
    .drop(columns=['Month', 'DayofMonth', 'DayOfWeek'])
    .rename(columns={'Mon': 'Month', 'DOM': 'DayOfMonth', 'DOW': 'DayOfWeek'})
)

train_df.head()

#### Test

In [None]:
# The date columns hold strings with a 'c-' prefix; split each value on '-' so the
# number after the dash can be extracted
month = test_df['Month'].str.split('-')
day = test_df['DayofMonth'].str.split('-')
dow = test_df['DayOfWeek'].str.split('-')

# Append the integer versions under temporary names; the original string columns
# are still present here and are dropped in a later cell
test_df = test_df.assign(
    Mon=month.apply(lambda parts: int(parts[1])),
    DOM=day.apply(lambda parts: int(parts[1])),
    DOW=dow.apply(lambda parts: int(parts[1])),
)

test_df.head()

### Convert 'DepTime' to 'timedelta' and cast it to numeric

#### Train

In [None]:
# Separate hours and minutes into their respective columns
# (integer division / remainder by 100 splits the departure time number)
train_df['DepHour'] = train_df['DepTime'] // 100

# Fold hours 24 and 25 to 0. The result is assigned back to the column on purpose:
# calling replace(..., inplace=True) on train_df['DepHour'] is chained assignment,
# which pandas warns about and which leaves the frame untouched when Copy-on-Write
# is enabled.
train_df['DepHour'] = train_df['DepHour'].replace(to_replace=[24, 25], value=0)

train_df['DepMinute'] = train_df['DepTime'] % 100

# Save the time in minutes
train_df['Minutes'] = train_df['DepMinute'] + train_df['DepHour'] * 60

# Convert time to 'timedelta'
train_df['Time'] = pd.to_timedelta(train_df['Minutes'], unit='m')

# Drop the helper columns and the original 'DepTime'
train_df = train_df.drop(['DepHour', 'DepMinute', 'Minutes', 'DepTime'], axis=1)

# Cast the timedelta column to a plain numeric column
# NOTE(review): the resulting unit is whatever pd.to_numeric yields for timedeltas
# (presumably nanoseconds) -- confirm if the absolute values ever matter
train_df['Time'] = pd.to_numeric(train_df['Time'], downcast='float')

# Rename column 'Time' to 'DepTime'
train_df = train_df.rename(columns={'Time': 'DepTime'})

train_df.head()

#### Test

In [None]:
# Separate hours and minutes into their respective columns
# (integer division / remainder by 100 splits the departure time number)
test_df['DepHour'] = test_df['DepTime'] // 100

# Fold hours 24 and 25 to 0. The result is assigned back to the column on purpose:
# calling replace(..., inplace=True) on test_df['DepHour'] is chained assignment,
# which pandas warns about and which leaves the frame untouched when Copy-on-Write
# is enabled.
test_df['DepHour'] = test_df['DepHour'].replace(to_replace=[24, 25], value=0)

test_df['DepMinute'] = test_df['DepTime'] % 100

# Save the time in minutes
test_df['Minutes'] = test_df['DepMinute'] + test_df['DepHour'] * 60

# Convert time to 'timedelta'
test_df['Time'] = pd.to_timedelta(test_df['Minutes'], unit='m')

# Cast the timedelta column to a plain numeric column
# NOTE(review): the resulting unit is whatever pd.to_numeric yields for timedeltas
# (presumably nanoseconds) -- confirm if the absolute values ever matter
test_df['Time'] = pd.to_numeric(test_df['Time'], downcast='float')

# Drop the original string date columns, the helper columns and the original 'DepTime'
test_df = test_df.drop(['Month', 'DayofMonth', 'DayOfWeek', 'DepHour',
                        'DepMinute', 'Minutes', 'DepTime'], axis=1)

# Rename the cleaned columns to 'Month', 'DayOfMonth', 'DayOfWeek' and 'DepTime'
test_df = test_df.rename(columns={'Mon': 'Month', 'DOM': 'DayOfMonth',
                                  'DOW': 'DayOfWeek', 'Time': 'DepTime'})

test_df.head()

The data is now mostly prepared. I'll normalize the values later on, because PyCaret, which I use next to compare the classifiers, already handles that at runtime.

## Automated model test using PyCaret

### Split between modeling and validation

In [None]:
# Keep a reproducible random 75% of the rows for modeling and hold out the rest.
# NOTE: DataFrame.sample draws rows at random; this split is NOT stratified by the target.
data = train_df.sample(frac=0.75, random_state=31415)
data_unseen = train_df.drop(data.index)

# Renumber both parts from 0 so their indexes no longer refer to rows of train_df
data = data.reset_index(drop=True)
data_unseen = data_unseen.reset_index(drop=True)

print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')

### Install and import PyCaret

In [None]:
!pip install pycaret
from pycaret.classification import *

### Setup

In [None]:
# Every feature column is declared numeric explicitly, including the integer-coded
# carrier and airport columns
numeric_columns = ['UniqueCarrier', 'Origin', 'Dest', 'Distance',
                   'Month', 'DayOfMonth', 'DayOfWeek', 'DepTime']

# Configure the PyCaret classification experiment on the modeling subset.
# NOTE(review): session_id presumably seeds the experiment and silent=True presumably
# skips the interactive confirmation -- confirm against the PyCaret version in use.
exp_cls101 = setup(
    data=data,
    target='dep_delayed_15min',
    session_id=27182,
    numeric_features=numeric_columns,
    data_split_stratify=True,
    silent=True,
)

### Model comparison

In [None]:
# Compare the classifiers available in the PyCaret experiment and keep the top-ranked one
best_model = compare_models()

As we can see, the best classifiers (according to those metrics) are CatBoost, LGBM and Extreme Gradient Boosting, with LGBM being quicker than both, while obtaining similar scores.

Since this submission is for a course I'm taking, I'll further evaluate LGBM, Random Forest, Gradient Boosting and Decision Tree, as we have studied the last three and LGBM has one of the best scores, while being quick to model.

I won't be evaluating CatBoost or Extreme Gradient Boosting, as these two take too long to run. The same goes for SVM and MLP: even though we have studied them, they also take a long time to run and don't offer good results (I've run them beforehand).

#### LGBM

Creating the model with PyCaret is as simple as the following code suggests. Then, we can further evaluate the results and scores by plotting the confusion matrix.

In [None]:
# Train a LightGBM classifier inside the PyCaret experiment configured by setup()
lgbm = create_model('lightgbm')

# Plot the untuned model's confusion matrix to compare delayed vs. non-delayed predictions
plot_model(lgbm, plot='confusion_matrix')

In [None]:
# Tune the LightGBM model's hyperparameters, starting from the model created above
tuned_lgbm = tune_model(lgbm)

# Plot the tuned model's confusion matrix for comparison with the untuned one
plot_model(tuned_lgbm, plot='confusion_matrix')

#### Random forest

In [None]:
# Train a Random Forest classifier inside the PyCaret experiment configured by setup()
rfc = create_model('rf')

# Plot the untuned model's confusion matrix to compare delayed vs. non-delayed predictions
plot_model(rfc, plot='confusion_matrix')

In [None]:
# Tune the Random Forest model's hyperparameters, starting from the model created above
tuned_rfc = tune_model(rfc)

# Plot the tuned model's confusion matrix for comparison with the untuned one
plot_model(tuned_rfc, plot='confusion_matrix')

#### Gradient Boosting

In [None]:
# Train a Gradient Boosting classifier inside the PyCaret experiment configured by setup()
gbc = create_model('gbc')

# Plot the untuned model's confusion matrix to compare delayed vs. non-delayed predictions
plot_model(gbc, plot='confusion_matrix')

In [None]:
# Tune the Gradient Boosting model's hyperparameters; the tuned estimator is printed
# later in the notebook to read off the parameters for the final model
tuned_gbc = tune_model(gbc)

# Plot the tuned model's confusion matrix for comparison with the untuned one
plot_model(tuned_gbc, plot='confusion_matrix')

#### Decision tree

In [None]:
# Train a Decision Tree classifier inside the PyCaret experiment configured by setup()
dtc = create_model('dt')

# Plot the untuned model's confusion matrix to compare delayed vs. non-delayed predictions
plot_model(dtc, plot='confusion_matrix')

In [None]:
# Tune the Decision Tree model's hyperparameters, starting from the model created above
tuned_dtc = tune_model(dtc)

# Plot the tuned model's confusion matrix for comparison with the untuned one
plot_model(tuned_dtc, plot='confusion_matrix')

As we can see, all selected models struggle to predict delays accurately. I've tested both CatBoost and Extreme Gradient Boosting outside of this notebook and they behave just the same (you can do the same if you want to check for yourself).

It is also notable that both LGBM and Gradient Boosting respond somewhat well to hyperparameter tuning, while Random Forest simply classifies every flight as not delayed and Decision Tree starts to drift in the same direction.

It should be noted, though, that Decision Tree (with default parameters) has the best results at predicting delays, but then struggles slightly to predict non-delayed flights.

My approach will be to try to predict non-delayed flights more accurately since, in previous tests, I couldn't find a model that predicted the delays well. I'm going to use the Gradient Boosting Classifier, as a colleague also used this same method with PyCaret and chose LGBM.

## Data handling: normalization

#### Train

In [None]:
# Remove the target column so only feature columns get scaled
# (the 0/1 target itself was saved earlier in 'delayed')
train_df = train_df.drop(columns=['dep_delayed_15min'])

# Remember the feature names in their current order
att = train_df.columns.tolist()

# Fit a min-max scaler on the training features and scale them
scaler = preprocessing.MinMaxScaler()
train_values = train_df.to_numpy()
values_scaled = scaler.fit_transform(train_values)

# Wrap the scaled array in a dataframe that carries the original column names
train_scaled_df = pd.DataFrame(values_scaled, columns=att)

train_scaled_df.head()

#### Test

In [None]:
# Use the training feature names (in training order) so each test column lines up
# with the column the scaler and the model saw during fitting; a missing column
# raises a KeyError here instead of silently shifting features
test_att = list(att)

# Normalize with the scaler that was FITTED ON THE TRAINING DATA.
# Fitting a second MinMaxScaler on the test set would use the test set's own min/max,
# so the same raw value would be scaled differently in train and test and the model
# would receive shifted features. Test values outside the training range simply fall
# outside [0, 1].
test_values = test_df[test_att].values
test_scaler = scaler  # kept under the old name; it is the training scaler
test_values_scaled = test_scaler.transform(test_values)

# Save to new dataframe
test_scaled_df = pd.DataFrame(test_values_scaled, columns=test_att)

test_scaled_df.head()

## Gradient Boost Classifier

### Print tuned hyperparameters

In [None]:
# Print the tuned Gradient Boosting estimator so its hyperparameters can be read off
# and reused when the final model is built by hand
print(tuned_gbc)

### Train model using the whole training dataframe

In [None]:
# Import gradient boost classifier
from sklearn.ensemble import GradientBoostingClassifier

# Save normalized train dataframe values to 'train_data' variable
train_data = train_scaled_df.values

# Rebuild the tuned Gradient Boosting model with only the hyperparameters that differ
# from scikit-learn's defaults. The arguments that were dropped all carried their
# default value, and three of them (presort, min_impurity_split, loss='deviance') were
# removed or renamed in later scikit-learn releases, so passing them makes this cell
# fail on newer versions without changing the model on older ones.
# random_state is set (same seed as the PyCaret session) because subsample < 1 makes
# the fit stochastic; without it every run yields a different model.
gbc_model = GradientBoostingClassifier(
    learning_rate=0.036,
    max_depth=7,
    max_features=1.0,
    min_samples_leaf=5,
    min_samples_split=9,
    n_estimators=190,
    subsample=0.3,
    random_state=27182,
)

# Fit on the whole scaled training set against the 0/1 delay labels
gbc_model.fit(train_data, delayed)

### Try and predict the results with the above model

In [None]:
# Pull the scaled test features out of the dataframe as a plain array
test_sdf_values = test_scaled_df.to_numpy()

# Ask the fitted Gradient Boosting classifier for a class label per test row
predicted = gbc_model.predict(test_sdf_values)

### Submission

In [None]:
# Turn the predicted class labels back into the dataset's 'N'/'Y' notation
# (label 0 becomes 'N', anything else becomes 'Y')
submission_labels = np.where(predicted == 0, 'N', 'Y')
submission_df = pd.Series(submission_labels)

# Write one label per line, without the index column
submission_df.to_csv('submission.csv', index=False)