## Import Necessary Libraries

In [1]:
#import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

## Data Loading, Exploration and Preprocessing

In [3]:
#load the training split into a DataFrame
#(relative path: it is resolved against the kernel's working directory)
train_df = pd.read_csv("../Datasets/train.csv")

In [5]:
#report the shape of the train data
#(a bare `train_df.shape` in the middle of a cell is never displayed, so only
#the formatted message is kept)
print(f"The train data contains {train_df.shape[0]} rows and {train_df.shape[1]} columns.")

The train data contains 103904 rows and 25 columns.


In [7]:
#preview the first 5 rows (head() defaults to 5)
train_df.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [9]:
#display 10 random rows
#(random_state pins the draw so the preview is the same after Restart & Run All)
train_df.sample(10, random_state=42)

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
72522,72522,86882,Male,Loyal Customer,50,Personal Travel,Eco Plus,692,2,5,...,5,5,3,5,5,5,5,3,0.0,neutral or dissatisfied
54234,54234,44029,Male,Loyal Customer,52,Business travel,Business,2670,1,1,...,4,4,4,4,3,4,5,0,4.0,satisfied
37480,37480,59809,Female,disloyal Customer,47,Business travel,Business,997,1,1,...,3,2,3,2,2,2,3,0,0.0,neutral or dissatisfied
63556,63556,86753,Male,Loyal Customer,37,Personal Travel,Eco,821,0,4,...,4,1,2,3,3,4,4,0,0.0,satisfied
41928,41928,43879,Female,disloyal Customer,18,Business travel,Eco,174,2,2,...,4,4,5,4,3,4,4,0,59.0,neutral or dissatisfied
31238,31238,64607,Male,disloyal Customer,50,Business travel,Business,190,3,3,...,2,5,2,4,4,4,2,0,0.0,neutral or dissatisfied
75983,75983,59411,Female,Loyal Customer,55,Personal Travel,Eco,2367,2,5,...,5,5,1,5,3,5,4,0,0.0,neutral or dissatisfied
42081,42081,50191,Female,Loyal Customer,26,Business travel,Business,3879,3,2,...,3,3,4,3,1,4,3,0,0.0,neutral or dissatisfied
94532,94532,36266,Female,Loyal Customer,60,Business travel,Business,3617,4,5,...,4,4,4,4,3,4,2,20,36.0,neutral or dissatisfied
30410,30410,4140,Male,disloyal Customer,21,Business travel,Eco,431,3,4,...,3,1,5,4,3,3,3,0,0.0,neutral or dissatisfied


In [11]:
#display column names
train_df.columns

Index(['Unnamed: 0', 'id', 'Gender', 'Customer Type', 'Age', 'Type of Travel',
       'Class', 'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

In [13]:
#the "Unnamed: 0" column looks like a copy of the row index; before promoting
#it to the index, check that its non-null values are all distinct
unnamed_col = train_df["Unnamed: 0"]
unnamed_unique = unnamed_col.nunique()
if unnamed_col.count() == unnamed_unique:
    print(f"The Unnamed column contains {train_df['Unnamed: 0'].nunique()} unique values.")
else:
    #do not stay silent when the assumption fails
    print(f"The Unnamed column is NOT unique: {unnamed_col.count()} values, {unnamed_unique} distinct.")

The Unnamed column contains 103904 unique values.


In [14]:
#promote the "Unnamed: 0" column to the row index and label that index "Index"
#NOTE: not re-runnable on its own - the column no longer exists after this cell
train_df = train_df.set_index("Unnamed: 0").rename_axis("Index")

In [15]:
#view changes: the first 2 rows, now indexed by "Index"
train_df.head(2)

Unnamed: 0_level_0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied


In [17]:
#replace spaces in column names with underscore _
#(a literal one-character substitution, so the regex engine is switched off)
train_df.columns = train_df.columns.str.replace(" ", "_", regex=False)

In [20]:
#title-case column names: the first letter of every word is upper-cased,
#e.g. "Type_of_Travel" -> "Type_Of_Travel"
train_df.columns = [column_name.title() for column_name in train_df.columns]

In [22]:
#view changes: column names after the underscore / title-case renaming
train_df.columns

Index(['Id', 'Gender', 'Customer_Type', 'Age', 'Type_Of_Travel', 'Class',
       'Flight_Distance', 'Inflight_Wifi_Service',
       'Departure/Arrival_Time_Convenient', 'Ease_Of_Online_Booking',
       'Gate_Location', 'Food_And_Drink', 'Online_Boarding', 'Seat_Comfort',
       'Inflight_Entertainment', 'On-Board_Service', 'Leg_Room_Service',
       'Baggage_Handling', 'Checkin_Service', 'Inflight_Service',
       'Cleanliness', 'Departure_Delay_In_Minutes', 'Arrival_Delay_In_Minutes',
       'Satisfaction'],
      dtype='object')

In [25]:
#check info: per-column non-null counts and dtypes
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 103904 entries, 0 to 103903
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Id                                 103904 non-null  int64  
 1   Gender                             103904 non-null  object 
 2   Customer_Type                      103904 non-null  object 
 3   Age                                103904 non-null  int64  
 4   Type_Of_Travel                     103904 non-null  object 
 5   Class                              103904 non-null  object 
 6   Flight_Distance                    103904 non-null  int64  
 7   Inflight_Wifi_Service              103904 non-null  int64  
 8   Departure/Arrival_Time_Convenient  103904 non-null  int64  
 9   Ease_Of_Online_Booking             103904 non-null  int64  
 10  Gate_Location                      103904 non-null  int64  
 11  Food_And_Drink                     103904 no

In [26]:
#count missing values per column in the train data
train_df.isna().sum()

Id                                     0
Gender                                 0
Customer_Type                          0
Age                                    0
Type_Of_Travel                         0
Class                                  0
Flight_Distance                        0
Inflight_Wifi_Service                  0
Departure/Arrival_Time_Convenient      0
Ease_Of_Online_Booking                 0
Gate_Location                          0
Food_And_Drink                         0
Online_Boarding                        0
Seat_Comfort                           0
Inflight_Entertainment                 0
On-Board_Service                       0
Leg_Room_Service                       0
Baggage_Handling                       0
Checkin_Service                        0
Inflight_Service                       0
Cleanliness                            0
Departure_Delay_In_Minutes             0
Arrival_Delay_In_Minutes             310
Satisfaction                           0
dtype: int64

In [29]:
#view the first 5 rows of the only column with missing values
train_df["Arrival_Delay_In_Minutes"].head()

Index
0    18.0
1     6.0
2     0.0
3     9.0
4     0.0
Name: Arrival_Delay_In_Minutes, dtype: float64

In [31]:
#impute missing arrival delays with the column mean
#(mean() skips NaN, so the statistic comes from the observed values only)
arrival_delay = train_df["Arrival_Delay_In_Minutes"]
train_df["Arrival_Delay_In_Minutes"] = arrival_delay.fillna(arrival_delay.mean())

In [33]:
#confirm the imputation left no missing arrival delays (expected: 0)
train_df["Arrival_Delay_In_Minutes"].isna().sum()

0

In [35]:
#inspect the dtype of every column to see which ones still need encoding
train_df.dtypes

Id                                     int64
Gender                                object
Customer_Type                         object
Age                                    int64
Type_Of_Travel                        object
Class                                 object
Flight_Distance                        int64
Inflight_Wifi_Service                  int64
Departure/Arrival_Time_Convenient      int64
Ease_Of_Online_Booking                 int64
Gate_Location                          int64
Food_And_Drink                         int64
Online_Boarding                        int64
Seat_Comfort                           int64
Inflight_Entertainment                 int64
On-Board_Service                       int64
Leg_Room_Service                       int64
Baggage_Handling                       int64
Checkin_Service                        int64
Inflight_Service                       int64
Cleanliness                            int64
Departure_Delay_In_Minutes             int64
Arrival_De

In [37]:
#list the distinct Gender labels before encoding them
train_df.Gender.unique()

array(['Male', 'Female'], dtype=object)

In [39]:
#encode Gender as integers; any label missing from the mapping becomes NaN
#NOTE: not re-runnable - a second run maps the 0/1 codes themselves to NaN
gender_codes = {"Male": 0, "Female": 1}
train_df["Gender"] = train_df["Gender"].map(gender_codes)

In [41]:
#spot-check 4 random encoded Gender values
#(random_state pins the draw so the output is reproducible)
train_df.Gender.sample(4, random_state=42)

Index
96801    0
51590    1
93800    0
5434     1
Name: Gender, dtype: int64

In [43]:
#list the distinct Customer_Type labels before encoding them
train_df.Customer_Type.unique()

array(['Loyal Customer', 'disloyal Customer'], dtype=object)

In [45]:
#encode Customer_Type as integers, then show the codes that remain
#NOTE: not re-runnable - a second run maps the 0/1 codes themselves to NaN
customer_type_codes = {"Loyal Customer": 0, "disloyal Customer": 1}
train_df["Customer_Type"] = train_df["Customer_Type"].map(customer_type_codes)
train_df["Customer_Type"].unique()

array([0, 1], dtype=int64)

In [47]:
#list the distinct Type_Of_Travel labels before encoding them
train_df.Type_Of_Travel.unique()

array(['Personal Travel', 'Business travel'], dtype=object)

In [49]:
#encode Type_Of_Travel as integers, then show the codes that remain
#(the differing capitalisation of "Travel"/"travel" matches the raw labels)
travel_type_codes = {"Personal Travel": 0, "Business travel": 1}
train_df["Type_Of_Travel"] = train_df["Type_Of_Travel"].map(travel_type_codes)
train_df["Type_Of_Travel"].unique()

array([0, 1], dtype=int64)

In [51]:
#list the distinct Class labels before encoding them
train_df.Class.unique()

array(['Eco Plus', 'Business', 'Eco'], dtype=object)

In [53]:
#encode the cabin Class as integers, then show the codes that remain
#NOTE(review): the codes follow order of first appearance, not cabin tier, yet
#they are later scaled and used as one numeric feature - consider an ordered
#or one-hot encoding
class_codes = {"Eco Plus": 0, "Business": 1, "Eco": 2}
train_df["Class"] = train_df["Class"].map(class_codes)
train_df["Class"].unique()

array([0, 1, 2], dtype=int64)

In [55]:
#list the distinct target labels before encoding them
train_df.Satisfaction.unique()

array(['neutral or dissatisfied', 'satisfied'], dtype=object)

In [57]:
#encode the target: 1 = satisfied, 0 = neutral or dissatisfied
satisfaction_codes = {"neutral or dissatisfied": 0, "satisfied": 1}
train_df["Satisfaction"] = train_df["Satisfaction"].map(satisfaction_codes)
train_df["Satisfaction"].unique()

array([0, 1], dtype=int64)

In [59]:
#preview the frame now that every categorical column is integer-encoded
train_df.head()

Unnamed: 0_level_0,Id,Gender,Customer_Type,Age,Type_Of_Travel,Class,Flight_Distance,Inflight_Wifi_Service,Departure/Arrival_Time_Convenient,Ease_Of_Online_Booking,...,Inflight_Entertainment,On-Board_Service,Leg_Room_Service,Baggage_Handling,Checkin_Service,Inflight_Service,Cleanliness,Departure_Delay_In_Minutes,Arrival_Delay_In_Minutes,Satisfaction
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,70172,0,0,13,0,0,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,0
1,5047,0,1,25,1,1,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,0
2,110028,1,0,26,1,1,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,1
3,24026,1,0,25,1,1,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,0
4,119299,0,0,61,1,1,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,1


In [61]:
#scale every column by its own maximum (vectorised: the Series of column
#maxima is aligned with the columns during the division)
#NOTE: this also rescales Id and the Satisfaction target, which becomes
#float 0.0/1.0; a column whose maximum is 0 would turn into NaN
column_max = train_df.max()
train_df = train_df / column_max

In [63]:
#preview the scaled frame
train_df.head()

Unnamed: 0_level_0,Id,Gender,Customer_Type,Age,Type_Of_Travel,Class,Flight_Distance,Inflight_Wifi_Service,Departure/Arrival_Time_Convenient,Ease_Of_Online_Booking,...,Inflight_Entertainment,On-Board_Service,Leg_Room_Service,Baggage_Handling,Checkin_Service,Inflight_Service,Cleanliness,Departure_Delay_In_Minutes,Arrival_Delay_In_Minutes,Satisfaction
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.540283,0.0,0.0,0.152941,0.0,0.0,0.092314,0.6,0.8,0.6,...,1.0,0.8,0.6,0.8,0.8,1.0,1.0,0.015704,0.011364,0.0
1,0.038859,0.0,1.0,0.294118,1.0,0.5,0.04716,0.6,0.4,0.6,...,0.2,0.2,1.0,0.6,0.2,0.8,0.2,0.000628,0.003788,0.0
2,0.847151,1.0,0.0,0.305882,1.0,0.5,0.229179,0.4,0.4,0.4,...,1.0,0.8,0.6,0.8,0.8,0.8,1.0,0.0,0.0,1.0
3,0.184986,1.0,0.0,0.294118,1.0,0.5,0.112783,0.4,1.0,1.0,...,0.4,0.4,1.0,0.6,0.2,0.8,0.4,0.00691,0.005682,0.0
4,0.918532,0.0,0.0,0.717647,1.0,0.5,0.042946,0.6,0.6,0.6,...,0.6,0.6,0.8,0.8,0.6,0.6,0.6,0.0,0.0,1.0


In [65]:
#separate the target (y) from the feature columns (X)
#NOTE(review): Id is still among the features - it looks like a row
#identifier, so confirm whether it should be dropped as well
y = train_df["Satisfaction"]
X = train_df.drop(columns="Satisfaction")

In [67]:
#convert the feature frame to a NumPy array (np.array copies by default)
X = np.array(X)

In [69]:
#display the feature array (NumPy abbreviates large arrays with "...")
X

array([[5.40283338e-01, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 1.57035176e-02, 1.13636364e-02],
       [3.88589467e-02, 0.00000000e+00, 1.00000000e+00, ...,
        2.00000000e-01, 6.28140704e-04, 3.78787879e-03],
       [8.47151217e-01, 1.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [5.29912227e-01, 0.00000000e+00, 1.00000000e+00, ...,
        8.00000000e-01, 4.39698492e-03, 8.83838384e-03],
       [4.17100400e-01, 1.00000000e+00, 1.00000000e+00, ...,
        2.00000000e-01, 0.00000000e+00, 0.00000000e+00],
       [4.81729289e-01, 0.00000000e+00, 0.00000000e+00, ...,
        2.00000000e-01, 0.00000000e+00, 0.00000000e+00]])

In [71]:
#number of array dimensions of the feature array
X.ndim

2

In [73]:
#convert the target to a NumPy array and display it
y = np.array(y)
y

array([0., 0., 1., ..., 0., 0., 0.])

In [75]:
#hold out the last 10% of the rows for validation
#NOTE: rows are taken in their existing order - nothing is shuffled here
split = int(0.9 * len(y))

X_train, X_val = X[:split], X[split:]
y_train, y_val = y[:split], y[split:]

In [77]:
#show the shapes of the train / validation features and labels
display(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(93513, 23)

(10391, 23)

(93513,)

(10391,)

## Activation Functions

### Sigmoid Function

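Here we define the forward pass of the sigmoid, $\sigma(x) = \frac{1}{1 + e^{-x}}$, and its backward pass, the derivative $\sigma(x)\,(1 - \sigma(x))$, which is computed from the forward output.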

In [79]:
def sigmoid_forward(x):
    """Element-wise logistic sigmoid, 1 / (1 + exp(-x)).

    Uses an overflow-free formulation. The naive ``np.exp(-x)`` overflows to
    ``inf`` (with a RuntimeWarning) for large negative ``x``; here the
    exponential is only ever taken of a non-positive number.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy scalar or numpy.ndarray
        Sigmoid of ``x`` with the same shape as ``x``; every value lies
        in [0, 1].
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1], so it can underflow to 0 but never overflow
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) - the same function
    sig_forward = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

    # indexing with () turns a 0-d result back into a scalar, so scalar
    # input still yields a scalar; n-d arrays pass through unchanged
    return sig_forward[()]

def sigmoid_backward(sig_forward):
    """Derivative of the sigmoid, expressed through the sigmoid's output.

    Parameters
    ----------
    sig_forward : scalar or numpy.ndarray
        The value s = sigmoid(x), i.e. the activation itself, not the
        pre-activation x.

    Returns
    -------
    scalar or numpy.ndarray
        s * (1 - s), element-wise.
    """
    complement = 1 - sig_forward
    return sig_forward * complement

### ReLU Function

Here we define the forward pass of the ReLU, $\max(0, x)$, and its backward pass (a derivative of 1 where $x > 0$ and 0 elsewhere).

In [81]:
def relu_forward(x):
    """Rectified linear unit: element-wise max(0, x).

    Parameters
    ----------
    x : scalar or numpy.ndarray
        Input value(s).

    Returns
    -------
    numpy scalar or numpy.ndarray
        ``x`` with every negative entry replaced by 0.
    """
    return np.maximum(0, x)

def relu_backward(x):
    """Derivative of the ReLU: 1 where x > 0, otherwise 0.

    Because max(0, x) is positive exactly where x is positive, either the
    ReLU input or its output can be passed in.

    Parameters
    ----------
    x : scalar or numpy.ndarray
        Value(s) at which to evaluate the derivative.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 with the shape of ``x``; the derivative at
        exactly 0 is taken to be 0.
    """
    is_active = x > 0
    return np.where(is_active, 1, 0)

### Softmax Function

Here we define the forward pass for softmax, $\text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}}$. The backward pass is not implemented yet.

In [90]:
def softmax_forward(x, axis=None):
    """Numerically stable softmax: exp(x_i) / sum_j exp(x_j).

    The maximum is subtracted before exponentiating. Softmax is unchanged by
    adding a constant to every input, so the result is the same, but
    ``np.exp`` can no longer overflow to ``inf`` (which previously produced
    NaN for large inputs).

    Parameters
    ----------
    x : array-like
        Input scores (logits).
    axis : int or None, optional
        Axis along which the values are normalised. The default ``None``
        normalises over the whole array. For a batch of row vectors pass
        ``axis=1`` (or ``-1``) so that each row sums to 1.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``x`` whose entries sum to 1 along ``axis``.
    """
    x = np.asarray(x)
    if x.size == 0:
        # nothing to normalise; also avoids np.max failing on an empty array
        return np.exp(x)

    # shift so the largest exponent is exp(0) = 1
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_x_i = np.exp(shifted)
    # keepdims lets the sum broadcast against exp_x_i for any axis
    sum_exp_x_j = np.sum(exp_x_i, axis=axis, keepdims=True)
    soft_forward = exp_x_i / sum_exp_x_j
    return soft_forward

#def softmax_backward(x):

In [94]:
#sanity-check softmax on a small vector: the largest input should get the
#largest share and the printed values should sum to 1
x = np.array([8, 5, 0])
result = softmax_forward(x)

print(result)

[9.52269826e-01 4.74107229e-02 3.19450938e-04]


In [None]:
#load the test split and preview its first 5 rows
#(relative path: it is resolved against the kernel's working directory)
test_df = pd.read_csv("../Datasets/test.csv")
test_df.head()

In [None]:
#report the shape of the test data
#(a bare `test_df.shape` in the middle of a cell is never displayed, so only
#the formatted message is kept)
print(f"The test data contains {test_df.shape[0]} rows and {test_df.shape[1]} columns.")

In [None]:
#display info: per-column non-null counts and dtypes of the test data
test_df.info()

In [None]:
#count missing values per column in the test data
test_df.isna().sum()

In [None]:
#view 6 random values from the column with missing values
#(random_state pins the draw so the output is reproducible)
test_df["Arrival Delay in Minutes"].sample(6, random_state=42)

In [None]:
#impute missing arrival delays in the test data with the column mean
#NOTE(review): this uses the test set's own mean, whereas the train data was
#imputed with the train mean - confirm whether the train statistic should be
#reused here so both splits are preprocessed identically
test_arrival_delay = test_df["Arrival Delay in Minutes"]
test_df["Arrival Delay in Minutes"] = test_arrival_delay.fillna(test_arrival_delay.mean())

In [None]:
#confirm missing values have been filled: total missing cells across the
#whole test frame (expected: 0)
test_df.isnull().sum().sum()

In [None]:
#display the test data's column names (still in their raw, un-renamed form)
test_df.columns