# Import Libraries

In [None]:
import numpy as np 
import pandas as pd 

import os

# Load the three CSV files of the flight-delays dataset.
airlines = pd.read_csv('../input/flight-delays/airlines.csv')
airports = pd.read_csv('../input/flight-delays/airports.csv')

# The airport code columns are cast to str further down in this notebook, so
# read them as strings right away: each column then has a single, explicit
# dtype instead of whatever the parser infers chunk by chunk.
# NOTE(review): presumably the file mixes IATA codes with numeric airport
# ids, which is what triggers mixed-type inference - confirm against the data.
flights = pd.read_csv(
    '../input/flight-delays/flights.csv',
    dtype={'ORIGIN_AIRPORT': str, 'DESTINATION_AIRPORT': str},
)

# Cleaning the Data and Getting the Big Picture

In [None]:
airlines

In [None]:
airports.head()

In [None]:
flights.head()

In [None]:
flights.info(show_counts=True)

**Drop columns that have a lot of missing values**

In [None]:
# Columns this notebook discards because they have a lot of missing values.
columns_to_drop = [
    'CANCELLATION_REASON',
    'AIR_SYSTEM_DELAY',
    'SECURITY_DELAY',
    'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY',
    'WEATHER_DELAY',
]

# errors='ignore' keeps the cell re-runnable once the columns are already gone.
flights = flights.drop(columns=columns_to_drop, errors='ignore')

Let's see how many flights have been canceled or diverted

In [None]:
# Value counts of both flags, separated by a row of asterisks.
print(
    flights['DIVERTED'].value_counts(),
    '*' * 30,
    flights['CANCELLED'].value_counts(),
    sep='\n',
)

Flights that are 'CANCELLED' or 'DIVERTED' should be removed because they are outliers (rare cases)

In [None]:
# Keep only the flights that were neither cancelled nor diverted.
not_cancelled = flights['CANCELLED'] == 0
not_diverted = flights['DIVERTED'] == 0
flights = flights[not_cancelled & not_diverted]

# Sanity check on the row count: all flights - cancelled - diverted (= 5714008)
assert len(flights) == 5819079 - 89884 - 15187

**Correlations**

Let's look at how much each attribute correlates with the arrival delay:

In [None]:
# Correlate numeric (and boolean) columns only. The frame still holds text
# columns such as AIRLINE at this point, and DataFrame.corr() raises on them
# in pandas >= 2.0 (older versions silently skipped them, so the result is
# unchanged there).
numeric_flights = flights.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_flights.corr()

# Correlation of every numeric attribute with the target, strongest first.
corr_matrix['ARRIVAL_DELAY'].sort_values(ascending=False)

Let's keep only useful attributes

In [None]:
# Attributes kept for the rest of the analysis (ARRIVAL_DELAY is the target).
useful_columns = [
    'MONTH', 'DAY', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
    'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY',
    'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'ELAPSED_TIME',
]

# .copy() makes the selection an independent frame: later cells add and
# overwrite columns on `flights`, which would otherwise be an assignment on a
# slice of the previous frame (SettingWithCopyWarning).
flights = flights[useful_columns].copy()
flights.head()

**Check for null values**

In [None]:
flights.isnull().sum()

**Summary of the numerical attributes**

In [None]:
flights.describe()

**Distribution of the arrival delay attribute**

In [None]:
import matplotlib.pyplot as plt

# Histogram of the arrival delay, drawn through the Axes object.
# NOTE(review): the x axis is logarithmic, so the non-positive part of the
# (-10, 1000) range cannot be displayed - confirm this is intended.
fig, ax = plt.subplots()
flights['ARRIVAL_DELAY'].hist(ax=ax, bins=1000, range=(-10, 1000))
ax.set_xscale('log')
ax.set_ylim(0, 150000)
ax.set_xlabel('Delay (minutes)')
ax.set_ylabel('Number of flights')

# Exploration with Regard to the Mean Delay

In [None]:
# Inspired from this kaggle kernel: https://www.kaggle.com/fabiendaniel/predicting-flight-delays-tutorial
import seaborn as sns 

def delay_by_attribute(attribute, df=flights, figsize=(10, 7)):
    """Plot, for each value of ``attribute``, the number of small and large departure delays.

    Parameters
    ----------
    attribute : str
        Column of ``df`` whose values are listed on the y axis.
    df : pandas.DataFrame, default ``flights``
        Frame to plot. It must contain ``attribute`` and either ``DELAY_TYPE``
        or ``DEPARTURE_DELAY``. When ``DELAY_TYPE`` is missing it is derived
        from ``DEPARTURE_DELAY`` and stored on ``df`` itself, so that later
        cells can reuse the column.
    figsize : tuple of (float, float), default (10, 7)
        Size handed to ``plt.figure``.

    Returns
    -------
    None
        The figure is drawn and shown as a side effect.
    """
    if 'DELAY_TYPE' not in df.columns:
        # 0 -> departure delay below 10 min, 1 -> everything else. Negating
        # `< 10` (rather than testing `>= 10`) keeps a missing delay in class 1,
        # because the comparison is False for NaN.
        df['DELAY_TYPE'] = (~(df['DEPARTURE_DELAY'] < 10)).astype(int)

    plt.figure(1, figsize=figsize)
    sns.countplot(y=attribute, hue='DELAY_TYPE', data=df)

    plt.xlabel('Flight count', fontsize=16, weight='bold')
    plt.ylabel(attribute, fontsize=16, weight='bold')
    plt.title(f'Delay by {attribute}', weight='bold')

    # Relabel the legend by the class each entry stands for instead of by
    # position, so a frame holding a single class is still labelled correctly.
    legend_labels = {
        '0': 'small delay (t < 10 min)',
        '1': 'large delay (t >= 10 min)',
    }
    legend = plt.legend()
    for entry in legend.get_texts():
        entry.set_text(legend_labels.get(entry.get_text(), entry.get_text()))

    plt.grid(True)
    plt.show()

delay_by_attribute('AIRLINE')

We can see that the proportion of small to large delays depends on the airline. For example, almost 50% of the flights of **UA** (United Air Lines Inc.) have a large delay, whereas only ~25% of the flights of **DL** (Delta Air Lines Inc.) do.

In [None]:
delay_by_attribute('MONTH')

In [None]:
delay_by_attribute('DAY')

From the correlation matrix and the plots above we can see that delays are correlated with neither the month nor the day.

In [None]:
# Attach the state of the origin airport to every flight. The merge uses the
# default inner join, so flights whose ORIGIN_AIRPORT has no matching
# IATA_CODE are left out of the plot.
origin_delays = flights[['ORIGIN_AIRPORT', 'DELAY_TYPE']]
airport_states = airports[['IATA_CODE', 'STATE']]
result = origin_delays.merge(
    airport_states,
    left_on='ORIGIN_AIRPORT',
    right_on='IATA_CODE',
)

delay_by_attribute('STATE', df=result, figsize=(10, 15))

**Statistics of outliers**

If we consider delays of more than 10 minutes to be significant delays, then let's see what percentage of the dataset these delays represent.

In [None]:
# Count the flights that arrived more than 10 minutes late and express the
# count as a share of all remaining flights.
is_large_delay = flights['ARRIVAL_DELAY'] > 10
nb_of_large_delays = is_large_delay.sum()
percent_of_large_delays = np.round(100 * nb_of_large_delays / len(flights), 2)

large_delay_message = 'There are {} flights with large delays (more than 10min), which represent {}% of the flights'
print(large_delay_message.format(nb_of_large_delays, percent_of_large_delays))

In [None]:
# Number of flights that have more than 150min (2.5h) delay
nb_of_rare_delays = (flights.ARRIVAL_DELAY > 150).sum()
percent_of_rare_delays = np.round(nb_of_rare_delays * 100 / len(flights), 1)

# Percent of rare delays with regard to large delays
# (nb_of_large_delays is computed in the previous cell).
percent_rare_large = np.round((nb_of_rare_delays * 100 / nb_of_large_delays), 1)

# The message now states the threshold that is actually applied (150 min = 2.5h).
print(
    'There are {} flights with rare delays (> 2.5h) which represent {}% of all flights, which also represent {}% of large delays'
     .format(nb_of_rare_delays, percent_of_rare_delays, percent_rare_large))

 <table style="width:50%; border: 1px solid black; border-collapse: collapse;">
  <tr>
    <th></th>
    <th>All flights</th>
    <th>Large delays (> 10min)</th>
    <th>Rare delays (> 150min)</th>
  </tr>
    <tr>
    <td>All flights</td>
    <td>100%</td>
    <td>/</td>
    <td>/</td>
  </tr>
  <tr>
    <td>Large delays (> 10min)</td>
    <td>22%</td>
    <td>100%</td>
    <td>/</td>
  </tr>
  <tr>
    <td>Rare delays (> 150min)</td>
    <td>1.3%</td>
    <td>5.8%</td>
    <td>100%</td>
  </tr>
</table> 

We can think of delays longer than 2.5 hours as outliers, so we need to remove them from the dataset

In [None]:
# Keep only delays of at most 150min: the statistics above count a delay as
# rare when it is strictly greater than 150, so a delay of exactly 150min is
# not an outlier and must stay in the dataset.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not write to a slice of the previous frame.
flights = flights[flights.ARRIVAL_DELAY <= 150].copy()

# Split the Dataset

In [None]:
# Store both airport code columns as strings.
for airport_column in ('ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'):
    flights[airport_column] = flights[airport_column].astype(str)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed, then separate the features from the
# ARRIVAL_DELAY target on each side.
train_set, test_set = train_test_split(flights, test_size=0.2, random_state=42)

X_train = train_set.drop(columns='ARRIVAL_DELAY')
y_train = train_set['ARRIVAL_DELAY'].copy()

X_test = test_set.drop(columns='ARRIVAL_DELAY')
y_test = test_set['ARRIVAL_DELAY'].copy()

# Encoding Categorical Attributes

I chose embedding to convert categorical attributes to numbers: ordinal encoding is not suitable since the model would assume that two nearby values are more similar than two distant values, which is obviously not the case for our categorical attributes. Also, our categorical attributes have a large number of categories (629 for airports), so one-hot encoding would result in a large number of features (more than 1000), which would hurt both training time and performance.

In [None]:
# Vocabularies for the categorical encoders, built from the training set only.
# NOTE(review): `airports` and `airlines` were DataFrames until this cell and
# are arrays of codes from here on, so earlier cells that index them as
# frames cannot be re-run afterwards.
orig_airport = X_train['ORIGIN_AIRPORT'].unique()
dest_airport = X_train['DESTINATION_AIRPORT'].unique()

# Sorted union of the codes that appear as origin or as destination.
airports = np.union1d(dest_airport, orig_airport)
airlines = X_train['AIRLINE'].unique()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import tensorflow as tf 

class EmbeddingTransorfmer(BaseEstimator, TransformerMixin):
    """Encode a categorical column as fixed random vectors.

    Every entry of ``vocab`` gets one row of a random matrix; values that are
    not in ``vocab`` are hashed into one of ``n_oov_buckets`` extra rows. The
    vectors are random, not trained.

    The matrix is drawn once, in ``fit``, and reused by every ``transform``
    call. Drawing it inside ``transform`` would encode the training set and
    the test set with different, unrelated vectors.

    Parameters
    ----------
    vocab : array-like of str
        Known categories.
    n_oov_buckets : int, default 10
        Number of extra rows shared by the values missing from ``vocab``.
    embedding_dim : int, default 10
        Length of the vector produced for each value.
    random_state : int or None, default None
        Seed of the embedding matrix. ``None`` draws a different matrix on
        every ``fit``.

    Attributes
    ----------
    embedding_matrix_ : numpy.ndarray
        float32 array of shape ``(len(vocab) + n_oov_buckets, embedding_dim)``
        created by ``fit``.
    """

    def __init__(self, vocab, n_oov_buckets=10, embedding_dim=10, random_state=None):
        self.n_oov_buckets = n_oov_buckets
        self.vocab = vocab
        self.embedding_dim = embedding_dim
        self.random_state = random_state

    def fit(self, X, y=None):
        """Draw the embedding matrix; ``X`` and ``y`` are not used. Returns ``self``."""
        n_rows = len(self.vocab) + self.n_oov_buckets
        rng = np.random.default_rng(self.random_state)
        # float32 values in [0, 1), i.e. the dtype and range that
        # tf.random.uniform produces with its default arguments.
        self.embedding_matrix_ = rng.random((n_rows, self.embedding_dim), dtype=np.float32)
        return self

    def transform(self, X):
        """Return the embedding vector of every value in ``X`` as a NumPy array."""
        if not hasattr(self, 'embedding_matrix_'):
            # Calling transform() without fit() used to work, so keep it
            # working by drawing the matrix on first use.
            self.fit(X)

        # Vocabulary lookup: known values map to their position in `vocab`,
        # unknown values to one of the out-of-vocabulary buckets.
        vocab = tf.constant(self.vocab)
        indices = tf.range(len(vocab), dtype=tf.int64)
        table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
        table = tf.lookup.StaticVocabularyTable(table_init, self.n_oov_buckets)

        cat_indices = table.lookup(tf.constant(X))
        # Row selection, equivalent to tf.nn.embedding_lookup on the matrix.
        return self.embedding_matrix_[cat_indices.numpy()]

# Preprocessing the Input Features

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Numerical attributes are standardised.
num_pipeline = Pipeline([('std_scaler', StandardScaler())])

# Every column that is not categorical is treated as numerical.
# NOTE(review): this includes the DELAY_TYPE helper column that
# delay_by_attribute adds to `flights` - confirm it is meant to be a feature.
# NOTE(review): columns such as ARRIVAL_TIME and ELAPSED_TIME look like values
# known only after the flight - confirm they should feed a delay predictor.
categorical_attribs = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']
X_train_num = X_train.drop(columns=categorical_attribs, errors='ignore')
num_attribs = X_train_num.columns.tolist()

# One encoder per categorical column; both airport columns share a vocabulary.
column_steps = [
    ('num', num_pipeline, num_attribs),
    ('cat1', EmbeddingTransorfmer(airports), 'ORIGIN_AIRPORT'),
    ('cat2', EmbeddingTransorfmer(airports), 'DESTINATION_AIRPORT'),
    ('cat3', EmbeddingTransorfmer(airlines), 'AIRLINE'),
]
full_pipeline = ColumnTransformer(column_steps)

X_train_prepared = full_pipeline.fit_transform(X_train)

# Training and Evaluation

## Ridge Regression

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Baseline model: ridge regression with a small regularisation strength.
ridge_params = {'alpha': 0.01, 'random_state': 42}
ridge_reg = Ridge(**ridge_params)
ridge_reg.fit(X_train_prepared, y_train)

Let's try to predict some data

In [None]:
# Run the first five training rows through the preprocessing pipeline and the model.
some_data, some_labels = X_train.head(5), y_train.head(5)
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", ridge_reg.predict(some_data_prepared))

In [None]:
print("Labels:", list(some_labels))

We can see that the predictions are not exactly accurate! Let's measure the RMSE on the whole training set:

In [None]:
# RMSE of the ridge model on the whole training set, shown as text in minutes.
predictions = ridge_reg.predict(X_train_prepared)
ridge_mse = mean_squared_error(y_true=y_train, y_pred=predictions)
ridge_rmse = np.sqrt(ridge_mse)
'{:.2f} min'.format(ridge_rmse)

The typical error the model makes is 12.37 min, which is not very good.

Unfortunately, I couldn't use other models because training took too long, so let's try fine-tuning the ridge regression model.

## Fine Tuning Using Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# NOTE(review): a list of dicts is searched as separate grids, not as their
# cross product: the five alphas run with the estimator's default solver and
# the two solvers run with the estimator's alpha - confirm this is intended.
alpha_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}
solver_grid = {'solver': ['cholesky', 'lsqr']}
param_grid = [alpha_grid, solver_grid]

search_options = dict(
    cv=4,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    verbose=2,
)
grid_search = GridSearchCV(ridge_reg, param_grid, **search_options)
grid_search.fit(X_train_prepared, y_train)

In [None]:
np.sqrt(-grid_search.best_score_)

# Evaluation on the Test Set

In [None]:
# Score the best estimator found by the grid search on the held-out test set.
model = grid_search.best_estimator_

# The test features go through the pipeline that was fitted on the training set.
X_test_prepared = full_pipeline.transform(X_test)
test_predictions = model.predict(X_test_prepared)

test_mse = mean_squared_error(y_true=y_test, y_pred=test_predictions)
test_rmse = np.sqrt(test_mse)

In [None]:
test_rmse

# References

* [Predicting flight delays [Tutorial]](https://www.kaggle.com/fabiendaniel/predicting-flight-delays-tutorial)
* [Flight_Delay_Prediction](https://www.kaggle.com/hrishikeshmalkar/flight-delay-prediction)
* https://github.com/Djinny/Formation-Data-Scientist/tree/master/Pr%C3%A9dire%20le%20retard%20d'avions
* https://github.com/xmontamat/OC_DataScience/tree/master/OC_Project4_Flights_delay