# Predicting Flight Takeoff Delays:
* An analysis of Feature Engineering, Balancing, Encoding, and Machine Learning Models for Predicting Flight Takeoff Delays

Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Data Cleaning and Feature Engineering
* Preparing the dataset for machine learning modeling

Load flights dataset

In [None]:
# Read the flights CSV into a DataFrame and preview its first rows.
flights_csv = 'dataset_SCL.csv'
df = pd.read_csv(flights_csv)
df.head()

In [None]:
# Keep only the rows whose flight number is identical in the "pre" and "post" columns.
same_flight_number = df['flight_num_pre'].eq(df['flight_num_post'])
df = df[same_flight_number]

The flights are all from Santiago, so the origin-related columns are not necessary

In [None]:
# Show the distinct values of each origin-related column, then drop those columns.
origin_columns = ['origin_pre', 'origin_post', 'origin_city']
for origin_column in origin_columns:
    print(df[origin_column].unique())
df = df.drop(columns=origin_columns)

In [None]:
# Parse both timestamp columns into pandas datetimes.
for date_column in ('date_pre', 'date_post'):
    df[date_column] = pd.to_datetime(df[date_column])

# Encode the flight type as a binary flag (I = International -> 1, N = National -> 0).
# Any value other than 'I' or 'N' becomes NaN.
flight_type_codes = {'I': 1, 'N': 0}
df['international'] = df['international'].map(flight_type_codes)
df.head()

In [None]:
# Split `date_pre` into three calendar features: day of month, month, and
# weekday (pandas numbers weekdays from Monday = 0 to Sunday = 6).
pre_date_parts = df["date_pre"].dt
df["day"] = pre_date_parts.day
df["month"] = pre_date_parts.month
df["weekday"] = pre_date_parts.weekday
df.head()

In [None]:
# Month and day-of-month of `date_pre`, reused by every date window below.
pre_month = df["date_pre"].dt.month
pre_day = df["date_pre"].dt.day

# A flight is flagged as high season when `date_pre` falls in any of these windows:
#   * 15 December through 3 March
#   * 15 July through the end of July
#   * 11 September through the end of September
dec15_to_mar3 = (
    ((pre_month == 12) & (pre_day >= 15))
    | pre_month.isin([1, 2])
    | ((pre_month == 3) & (pre_day <= 3))
)
jul15_onwards = (pre_month == 7) & (pre_day >= 15)
sep11_onwards = (pre_month == 9) & (pre_day >= 11)

high_season_conditions = [dec15_to_mar3, jul15_onwards, sep11_onwards]

# Element-wise OR across the three masks yields one boolean flag per flight.
df["high_season"] = np.logical_or.reduce(high_season_conditions)

In [None]:
# Difference in minutes between the two timestamps (date_post - date_pre);
# the value is negative when date_post is earlier than date_pre.
takeoff_gap = df['date_post'] - df['date_pre']
df['time_diff'] = takeoff_gap.dt.total_seconds() / 60

If the difference is greater than or equal to 15 minutes, the flight is considered delayed

In [None]:
# Flag flights whose time difference is at least 15 minutes (threshold inclusive).
df['delayed'] = df['time_diff'].ge(15)

### External Data

Distances from Santiago's airport to every destination

In [None]:
# Load the airport distances table and attach it to each flight through `dest_pre`.
df_dist = pd.read_csv('external_data/airports_distances.csv')

# how='left' keeps every flight, even when its destination is missing from the
# distances table. validate='many_to_one' makes pandas raise a MergeError if a
# `dest_pre` value is repeated in df_dist; without it, a repeated key would
# silently duplicate flight rows and skew every count and rate computed later.
df = pd.merge(df, df_dist, on='dest_pre', how='left', validate='many_to_one')

# Spot-check a few random rows of the merged result.
df[["dest_pre", "distance"]].sample(5)

## Data Analysis

In [None]:
# Summary statistics of the time difference (in minutes).
time_diff_summary = df["time_diff"].describe()
time_diff_summary

We can see there are negative time differences, as low as -14. This would suggest that a plane took off 14 minutes before it was scheduled to. This could be an error, or maybe all the passengers were already on board, so it actually took off earlier. I'll keep those flights.

In [None]:
# Number of flights for each value of the boolean `delayed` flag.
delayed_count = df["delayed"].value_counts()

# Look the counts up by their boolean label instead of by integer position:
# value_counts() orders its result by frequency, so position 1 is the delayed
# class only while delayed flights are the minority, and integer keys on a
# non-integer index are deprecated positional access in recent pandas.
# .get() also yields 0 when one of the two classes is absent.
n_delayed = delayed_count.get(True, 0)
n_on_time = delayed_count.get(False, 0)

# The `delayed` flag uses an inclusive threshold (time_diff >= 15), so the
# message says "15 minutes or more".
print(f"There are {n_delayed} flights with a delay of 15 minutes or more, and {n_on_time} without delay.")
print(f"The {round(n_delayed / (n_on_time + n_delayed) * 100, 2)}% of the flights are delayed.")

In [None]:
# Bar chart of the number of flights per weekday (0 = Monday ... 6 = Sunday).
weekday_labels = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                  'Friday', 'Saturday', 'Sunday')
flights_per_weekday = df['weekday'].value_counts().sort_index()

weekday_ax = flights_per_weekday.plot(kind='bar')
# Replace the numeric tick labels with weekday names.
weekday_ax.set_xticks(np.arange(7))
weekday_ax.set_xticklabels(weekday_labels)
weekday_ax.set_title('Amount of flights by day of the week')
weekday_ax.set_xlabel('Day of the week')
weekday_ax.set_ylabel('Amount of flights')
plt.show()

The number of flights on each day of the week is quite similar

In [None]:
# Bar chart of the number of flights per month, drawn on a wider figure.
month_labels = ('January', 'February', 'March', 'April', 'May', 'June',
                'July', 'August', 'September', 'October', 'November', 'December')
flights_per_month = df['month'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 5))
flights_per_month.plot(kind='bar', ax=ax)
# Replace the numeric tick labels with month names (assumes all 12 months are present).
ax.set_xticklabels(month_labels)
ax.set_title('Amount of flights by month')
ax.set_xlabel('Month')
ax.set_ylabel('Amount of flights')
plt.show()

In [None]:
# Share of delayed flights per flight type (international: 1, national: 0).
delay_rate_by_type = df.groupby("international")["delayed"].mean()

international_pct = round(delay_rate_by_type.loc[1] * 100, 2)
national_pct = round(delay_rate_by_type.loc[0] * 100, 2)

print(f"The delay rate for international flights is {international_pct}%")
print(f"The delay rate for national flights is {national_pct}%")

As expected, international flights have a higher delay rate than national flights.