In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import *

# Task 1. Review and Prepare the Dataset

## Task 1a) Understand the Data
Review the data set to assess distributions of the features and the target and understand 
the relationships among the features and the target.

In [2]:
# Load the two CSV files shipped with the dataset and pool them into a single
# frame; the notebook draws its own train/test/validation partitions later.
train_df = pd.read_csv('./airline_dataset/train.csv')
test_df = pd.read_csv('./airline_dataset/test.csv')
# ignore_index=True gives the pooled frame a fresh 0..n-1 index. Without it
# each file keeps its own row labels, so labels repeat and a label-based
# lookup such as df.loc[0] would return more than one row.
df = pd.concat([train_df, test_df], ignore_index=True)
df.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,Order,Comfort,Service,satisfaction
0,0,1,13,0,1,460,3,4,3,1,...,4,4,5,5,25,18,2.75,4.5,4.0,0
1,0,0,25,1,2,235,3,2,3,3,...,3,1,4,1,1,6,2.75,1.5,3.25,0
2,1,1,26,1,2,1142,2,2,2,2,...,4,4,4,5,0,0,2.0,5.0,3.75,1
3,1,1,25,1,2,562,2,5,5,5,...,3,1,4,2,11,9,4.25,2.0,3.5,0
4,0,1,61,1,2,214,3,3,3,3,...,4,3,3,3,0,0,3.0,4.25,3.5,1


In [3]:
# Remove every row that has at least one missing value and report the frame's
# shape before and after, so the number of dropped rows is visible.
shape_before = df.shape
df = df.dropna(axis=0, how='any')
print(shape_before, df.shape, sep='\n')

(129487, 26)
(129487, 26)


In [5]:
# Show the dtype of every column in the pooled frame.
df.dtypes

Gender                                 int64
Customer Type                          int64
Age                                    int64
Type of Travel                         int64
Class                                  int64
Flight Distance                        int64
Inflight wifi service                  int64
Departure/Arrival time convenient      int64
Ease of Online booking                 int64
Gate location                          int64
Food and drink                         int64
Online boarding                        int64
Seat comfort                           int64
Inflight entertainment                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Inflight service                       int64
Cleanliness                            int64
Departure Delay in Minutes             int64
Arrival Delay in Minutes               int64
Order     

## Task 1b) Select a Metric
Choose a metric that you will use to assess the models.
Justify your selection of a metric.

## Task 1c) Prepare the data sets
Partition the data for training and evaluation.

Use either a three-way split (train/dev/test) or cross-validation.

Justify the approach to partitioning the data.

In [6]:
from sklearn.model_selection import train_test_split

# Every column except the target is used as a feature. Index.difference
# returns the remaining labels sorted (its default), which fixes X's column order.
feature_columns = df.columns.difference(['satisfaction'])
X = df.loc[:, feature_columns].to_numpy()
# Select the target as a one-column frame, then squeeze it to a 1-D vector.
y = df.loc[:, ['satisfaction']].to_numpy().squeeze()

In [7]:
# Keep 70% of the rows for training and hold out the remaining 30%.
# random_state pins the shuffle so a Restart & Run All reproduces the same
# partition (and therefore the same metrics); stratify=y keeps the class
# ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [8]:
# Halve the held-out rows into a test part and a validation part.
# random_state makes this second shuffle repeatable across runs, and
# stratify=y_test keeps the class ratio the same in both halves.
X_test, X_valid, y_test, y_valid = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42, stratify=y_test
)

# Task 2. Train a Logistic Regression Model

Train a Logistic Regression Model (LogisticRegression in scikit-learn) that predicts the target.
You may need to either normalise the data or increase the model’s hyperparameter 
max_iter to 2000 to achieve convergence of the optimiser.

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *

In [10]:
# The features are passed in unscaled, so the optimiser's iteration cap is
# raised to 2000 to give it room to converge.
logreg_params = {'max_iter': 2000}
clf = LogisticRegression(**logreg_params)
clf.fit(X_train, y_train)

LogisticRegression(max_iter=2000)

In [11]:
# Score the logistic regression on the held-out test partition and print
# per-class precision / recall / F1.
y_pred = clf.predict(X_test)
logreg_report = classification_report(y_true=y_test, y_pred=y_pred)
print(logreg_report)

              precision    recall  f1-score   support

           0       0.88      0.90      0.89     10961
           1       0.87      0.84      0.85      8462

    accuracy                           0.87     19423
   macro avg       0.87      0.87      0.87     19423
weighted avg       0.87      0.87      0.87     19423



# Task 3. Train a Decision Tree
Train a Decision Tree (DecisionTreeClassifier in scikit-learn) that predicts the target.

Optimise the model’s quality of fit by tuning its min_samples_leaf hyperparameter. 
Justify the selection of the candidate values for the hyperparameter and describe how you
identified its optimal value.

Assess the model’s quality of fit (bias/variance).

In [12]:
from sklearn.tree import DecisionTreeClassifier

In [13]:
# Each leaf must hold at least 5 training samples, which limits how finely the
# tree can carve up the training data.
# random_state fixes the estimator's internal randomness (scikit-learn shuffles
# candidate features at each split), so re-running the cell rebuilds the same tree.
tree_clf = DecisionTreeClassifier(min_samples_leaf=5, random_state=42)
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(min_samples_leaf=5)

In [14]:
# Score the decision tree on the held-out test partition and print
# per-class precision / recall / F1.
y_pred = tree_clf.predict(X_test)
tree_report = classification_report(y_true=y_test, y_pred=y_pred)
print(tree_report)

              precision    recall  f1-score   support

           0       0.95      0.96      0.96     10961
           1       0.94      0.94      0.94      8462

    accuracy                           0.95     19423
   macro avg       0.95      0.95      0.95     19423
weighted avg       0.95      0.95      0.95     19423



# Task 4. Train a Feed-Forward Neural Net
Train a Feed-Forward neural network that predicts the target. Use Tensorflow/Keras.

You will define your own network architecture. Use only Dense layers, only ReLU activation
functions in the hidden layers, and a single output unit with sigmoid activation function in 
the output layer. Use binary_crossentropy as the loss function.

Aim to develop the simplest network (fewest layers/units) that predicts no worse than the decision
tree classifier created in Task 3.

Remember that normalisation of the input data can have a significant impact on the 
performance of a neural net model.

Describe the process and decisions that you have employed to arrive at the final neural net 
architecture. Justify the selection of the number of hidden layers and units per layer and 
explain the (iterative) process that you have followed while optimising the neural net. 
Explain how you have chosen the values for any hyperparameters that you may have set 
(such as batch size, number of epochs, learning rate).
Assess the model’s quality of fit (bias/variance).

In [15]:
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers

Using TensorFlow backend.


In [16]:
def process_y(y):
    """One-hot encode binary labels into two columns.

    Args:
        y: Sequence or array of labels. A label equal to 0 maps to ``[1, 0]``;
            any other value maps to ``[0, 1]``.

    Returns:
        Integer ``numpy.ndarray`` of shape ``(number_of_labels, 2)``. Empty
        input yields shape ``(0, 2)``, so the column count is always 2.
    """
    labels = np.asarray(y).ravel()
    is_zero = labels == 0
    # Column 0 flags label 0 and column 1 flags everything else. Building both
    # columns in one vectorised step avoids a Python-level loop over samples.
    return np.column_stack((is_zero, ~is_zero)).astype(int)
# Encode the labels of each partition (train, test, validation - in that
# order) into the two-column layout returned by process_y.
train_y, test_y, valid_y = (
    process_y(labels) for labels in (y_train, y_test, y_valid)
)

In [17]:
# Two hidden ReLU layers (256 then 128 units) followed by a 2-unit sigmoid
# output layer, one unit per column of the labels produced by process_y.
# NOTE(review): the task text asks for a single sigmoid output unit; this
# network uses two - confirm which is intended.
model = Sequential([
    Dense(256, input_shape=(X.shape[1],), activation="relu"),
    Dense(128, activation="relu"),
    Dense(2, activation="sigmoid"),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 32 epochs in mini-batches of 32, evaluating on the validation
# partition after each epoch.
model.fit(
    X_train,
    train_y,
    validation_data=(X_valid, valid_y),
    epochs=32,
    batch_size=32,
    verbose=1,
)

Train on 90640 samples, validate on 19424 samples
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32

In [None]:
# The network emits one score per output unit; argmax over axis 1 picks the
# index of the higher-scoring unit as the predicted class.
class_scores = model.predict(X_test, batch_size=32)
y_pred = class_scores.argmax(axis=1)
print(classification_report(y_test, y_pred))