In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# This is new
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Render floats with a thousands separator and two decimal places
pd.set_option('display.float_format', '{:,.2f}'.format)

# Interactive notebook mode: echo every top-level expression of a cell,
# not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

In [2]:
# Load the airline survey CSV (path is relative to the working directory)
# and preview its first five rows together with the column headers.
airline_data = pd.read_csv(filepath_or_buffer='Invistico_Airline.csv')
airline_data.head(n=5)

Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,...,4,2,2,0,2,4,2,5,0,0.0


In [3]:
# Keep a reference to the column labels and print them as a plain-text Index
column_names = airline_data.columns
print(column_names)

Index(['satisfaction', 'Gender', 'Customer Type', 'Age', 'Type of Travel',
       'Class', 'Flight Distance', 'Seat comfort',
       'Departure/Arrival time convenient', 'Food and drink', 'Gate location',
       'Inflight wifi service', 'Inflight entertainment', 'Online support',
       'Ease of Online booking', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Cleanliness', 'Online boarding',
       'Departure Delay in Minutes', 'Arrival Delay in Minutes'],
      dtype='object')


In [4]:
# Inspect the dtype of every column; 'object' columns are the text-valued ones
airline_data.dtypes

satisfaction                          object
Gender                                object
Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Seat comfort                           int64
Departure/Arrival time convenient      int64
Food and drink                         int64
Gate location                          int64
Inflight wifi service                  int64
Inflight entertainment                 int64
Online support                         int64
Ease of Online booking                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Cleanliness                            int64
Online boarding                        int64
Departure Delay in Minutes             int64
Arrival Delay in Minutes             float64
dtype: object

In [5]:
# Check for NaN values.

# Element-wise boolean mask of missing entries (echoed because this notebook
# displays every top-level expression, not just the last one)
airline_data.isna()
# Number of missing entries per column
airline_data.isna().sum()

Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
129876,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
129877,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
129878,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


satisfaction                           0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Seat comfort                           0
Departure/Arrival time convenient      0
Food and drink                         0
Gate location                          0
Inflight wifi service                  0
Inflight entertainment                 0
Online support                         0
Ease of Online booking                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Cleanliness                            0
Online boarding                        0
Departure Delay in Minutes             0
Arrival Delay in Minutes             393
dtype: int64

In [6]:
# Drop the rows whose 'Arrival Delay in Minutes' value is NaN.
# The name is rebound to the filtered frame instead of using inplace=True,
# so the cell reads as "new frame from old frame" and stays chainable.
airline_data = airline_data.dropna(subset=['Arrival Delay in Minutes'])

In [7]:
# Re-count missing values per column after the drop; every count should be 0
airline_data.isna().sum()

satisfaction                         0
Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Seat comfort                         0
Departure/Arrival time convenient    0
Food and drink                       0
Gate location                        0
Inflight wifi service                0
Inflight entertainment               0
Online support                       0
Ease of Online booking               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Cleanliness                          0
Online boarding                      0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
dtype: int64

#### Answer - The **"Arrival Delay in Minutes"** column has **393** NaN values in the dataset; the rows containing them are dropped.

In [8]:
# Number of rows and columns left in the dataset after removing the NaN rows,
# shown as a (rows, columns) tuple
airline_data.shape

(129487, 23)

#### Answer - After dropping NaN values, the single dataframe contains **129487 rows**

### Binary Logistic Regression

In [9]:
# Display-only preview of the independent variables: the result is not
# assigned, so airline_data itself still contains the 'satisfaction' column.
airline_data.drop(columns='satisfaction')

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.00
1,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.00
2,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.00
3,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.00
4,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,Female,disloyal Customer,29,Personal Travel,Eco,1731,5,5,5,3,...,2,2,3,3,4,4,4,2,0,0.00
129876,Male,disloyal Customer,63,Personal Travel,Business,2087,2,3,2,4,...,1,3,2,3,3,1,2,1,174,172.00
129877,Male,disloyal Customer,69,Personal Travel,Eco,2320,3,0,3,3,...,2,4,4,3,4,2,3,2,155,163.00
129878,Male,disloyal Customer,66,Personal Travel,Eco,2450,3,2,3,2,...,2,3,3,2,3,2,1,2,193,205.00


In [10]:
# List column dtypes to identify the categorical ('object') features that
# need encoding
airline_data.dtypes

satisfaction                          object
Gender                                object
Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Seat comfort                           int64
Departure/Arrival time convenient      int64
Food and drink                         int64
Gate location                          int64
Inflight wifi service                  int64
Inflight entertainment                 int64
Online support                         int64
Ease of Online booking                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Cleanliness                            int64
Online boarding                        int64
Departure Delay in Minutes             int64
Arrival Delay in Minutes             float64
dtype: object

#### There are __22 independent variables/columns__ in the dataset.
#### We have 4 categorical features in the given dataset namely: __Gender, Customer Type, Type of Travel and Class__

In [11]:
# Show the distinct levels of each categorical feature, one set per line
for categorical_col in ('Gender', 'Customer Type', 'Type of Travel', 'Class'):
    print(set(airline_data[categorical_col]))

{'Female', 'Male'}
{'Loyal Customer', 'disloyal Customer'}
{'Personal Travel', 'Business travel'}
{'Eco', 'Business', 'Eco Plus'}


In [12]:
#One hot encoding function
from sklearn.preprocessing import OneHotEncoder
def get_ohe(df, col):
    """One-hot encode a single column and return a new DataFrame.

    The column ``col`` is replaced by its dummy columns, which are appended
    on the right and named by the encoder (``<col>_<level>``). The first
    level is dropped (``drop='first'``), so a column with k levels yields
    k - 1 integer dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; a new frame is returned.
    col : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` followed by the dummy columns.
        The index is reset to 0..n-1 so that the dummy columns (built on a
        fresh positional index) line up row by row with the rest.
    """
    ohe = OneHotEncoder(drop='first', handle_unknown='error', sparse_output=False, dtype='int')
    encoded = ohe.fit_transform(df[[col]])
    dummies = pd.DataFrame(data=encoded, columns=ohe.get_feature_names_out())
    # Non-mutating drop: the caller's frame keeps its original column.
    remaining = df.drop(columns=[col]).reset_index(drop=True)
    return pd.concat([remaining, dummies], axis=1)

In [13]:
# One-hot encode the categorical features one after another; the order is
# kept because it determines the order of the appended dummy columns.
for categorical_col in ('Customer Type', 'Class', 'Type of Travel', 'Gender'):
    airline_data = get_ohe(airline_data, categorical_col)
# Display-only preview of the encoded feature matrix (result is not assigned)
airline_data.drop(columns='satisfaction')

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Class_Eco,Class_Eco Plus,Type of Travel_Personal Travel,Gender_Male
0,65,265,0,0,0,2,2,4,2,3,...,5,3,2,0,0.00,0,1,0,1,0
1,47,2464,0,0,0,3,0,2,2,3,...,2,3,2,310,305.00,0,0,0,1,1
2,15,2138,0,0,0,3,2,0,2,2,...,4,4,2,0,0.00,0,1,0,1,0
3,60,623,0,0,0,3,3,4,3,1,...,4,1,3,0,0.00,0,1,0,1,0
4,70,354,0,0,0,3,4,3,4,2,...,4,2,5,0,0.00,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129482,29,1731,5,5,5,3,2,5,2,2,...,4,4,2,0,0.00,1,1,0,1,0
129483,63,2087,2,3,2,4,2,1,1,3,...,1,2,1,174,172.00,1,0,0,1,1
129484,69,2320,3,0,3,3,3,2,2,4,...,2,3,2,155,163.00,1,1,0,1,1
129485,66,2450,3,2,3,2,3,2,2,3,...,2,1,2,193,205.00,1,1,0,1,1


#### We get **23 features(independent variables)** after performing One Hot Encoding

In [14]:
# Split into training (80%) and test (20%) sets, stratified on the target so
# both splits keep the same class proportions; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    airline_data.drop(columns=['satisfaction']),
    airline_data['satisfaction'],
    test_size=0.2,
    random_state=50,
    stratify=airline_data['satisfaction'],
)
X_train
X_test
y_train
y_test

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Class_Eco,Class_Eco Plus,Type of Travel_Personal Travel,Gender_Male
77869,44,2908,3,2,4,2,5,4,4,3,...,3,3,4,60,30.00,0,0,0,0,1
59474,30,2105,4,4,4,3,3,4,3,3,...,4,5,3,24,27.00,1,0,0,0,1
119053,59,3106,1,1,1,1,5,5,4,5,...,4,5,3,0,0.00,0,0,0,0,1
84136,58,3234,3,3,3,3,3,4,5,3,...,5,3,3,57,43.00,0,0,0,0,1
8209,60,1673,2,2,3,2,4,4,3,4,...,5,4,4,8,9.00,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75705,49,2363,2,1,1,1,2,2,2,3,...,2,1,2,185,196.00,0,0,1,0,1
63094,38,1930,5,5,5,4,1,5,1,1,...,4,4,1,32,30.00,1,0,0,0,0
98210,45,3005,1,1,1,1,5,4,5,4,...,5,4,4,36,48.00,0,0,0,0,1
96020,45,3802,3,3,3,3,5,4,4,4,...,3,4,4,1,0.00,0,0,0,0,1


Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Class_Eco,Class_Eco Plus,Type of Travel_Personal Travel,Gender_Male
126910,39,1861,1,1,1,1,3,5,4,5,...,4,5,5,8,2.00,0,0,0,0,1
113265,19,1126,5,1,1,1,5,5,4,5,...,4,4,5,0,5.00,0,1,0,0,1
95461,56,433,5,5,5,5,2,4,4,4,...,4,4,3,0,0.00,0,0,0,0,0
82721,52,3949,2,2,2,2,5,4,5,3,...,4,3,4,11,9.00,0,0,0,0,1
32709,67,2147,4,2,4,2,1,4,1,1,...,4,2,1,0,0.00,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125165,52,1403,5,5,5,5,5,5,5,5,...,3,5,5,66,63.00,0,1,0,0,1
9482,68,2047,2,2,2,2,3,2,3,3,...,1,2,3,5,5.00,0,1,0,1,1
72383,41,2236,2,2,2,2,3,3,4,2,...,1,2,4,0,0.00,0,0,0,0,0
50139,11,1811,2,4,3,4,1,3,4,1,...,4,3,1,6,0.00,1,1,0,0,1


77869     dissatisfied
59474        satisfied
119053       satisfied
84136        satisfied
8209         satisfied
              ...     
75705     dissatisfied
63094        satisfied
98210        satisfied
96020        satisfied
119246       satisfied
Name: satisfaction, Length: 103589, dtype: object

126910       satisfied
113265       satisfied
95461        satisfied
82721        satisfied
32709     dissatisfied
              ...     
125165       satisfied
9482      dissatisfied
72383     dissatisfied
50139     dissatisfied
48808     dissatisfied
Name: satisfaction, Length: 25898, dtype: object

#### length of training data = __103589__
#### length of testing data  = __25898__
#### first row's index for the train data = __77869__
#### first row's index for the test data = __126910__


In [15]:
# Scaling the data: the scaler is fitted on the training split only and the
# same fitted transformation is then applied to the test split.
# NOTE: this cell overwrites X_train / X_test, so re-running it without
# re-running the split cell would scale already-scaled data.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns = X_train.columns, index = X_train.index)
X_test = pd.DataFrame(sc.transform(X_test), columns = X_test.columns, index = X_test.index)
X_train
X_test
y_train
y_test

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Class_Eco,Class_Eco Plus,Type of Travel_Personal Travel,Gender_Male
77869,0.30,0.90,0.12,-0.65,0.80,-0.76,1.33,0.46,0.37,-0.36,...,-0.27,-0.61,0.50,1.19,0.38,-0.47,-0.90,-0.28,-0.67,1.02
59474,-0.62,0.12,0.83,0.66,0.80,0.01,-0.19,0.46,-0.40,-0.36,...,0.52,1.12,-0.27,0.24,0.31,2.11,-0.90,-0.28,-0.67,1.02
119053,1.29,1.09,-1.32,-1.30,-1.28,-1.53,1.33,1.20,0.37,1.17,...,0.52,1.12,-0.27,-0.39,-0.39,-0.47,-0.90,-0.28,-0.67,1.02
84136,1.23,1.22,0.12,0.01,0.10,0.01,-0.19,0.46,1.13,-0.36,...,1.32,-0.61,-0.27,1.11,0.72,-0.47,-0.90,-0.28,-0.67,1.02
8209,1.36,-0.30,-0.60,-0.65,0.10,-0.76,0.57,0.46,-0.40,0.40,...,1.32,0.25,0.50,-0.18,-0.16,-0.47,1.11,-0.28,1.49,-0.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75705,0.63,0.37,-0.60,-1.30,-1.28,-1.53,-0.95,-1.03,-1.16,-0.36,...,-1.06,-2.35,-1.04,4.47,4.68,-0.47,-0.90,3.59,-0.67,1.02
63094,-0.10,-0.05,1.55,1.32,1.49,0.77,-1.71,1.20,-1.93,-1.90,...,0.52,0.25,-1.81,0.45,0.38,2.11,-0.90,-0.28,-0.67,-0.98
98210,0.37,1.00,-1.32,-1.30,-1.28,-1.53,1.33,0.46,1.13,0.40,...,1.32,0.25,0.50,0.56,0.85,-0.47,-0.90,-0.28,-0.67,1.02
96020,0.37,1.77,0.12,0.01,0.10,0.01,1.33,0.46,0.37,0.40,...,-0.27,0.25,0.50,-0.36,-0.39,-0.47,-0.90,-0.28,-0.67,1.02


Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Class_Eco,Class_Eco Plus,Type of Travel_Personal Travel,Gender_Male
126910,-0.03,-0.12,-1.32,-1.30,-1.28,-1.53,-0.19,1.20,0.37,1.17,...,0.52,1.12,1.27,-0.18,-0.34,-0.47,-0.90,-0.28,-0.67,1.02
113265,-1.35,-0.83,1.55,-1.30,-1.28,-1.53,1.33,1.20,0.37,1.17,...,0.52,0.25,1.27,-0.39,-0.26,-0.47,1.11,-0.28,-0.67,1.02
95461,1.09,-1.50,1.55,1.32,1.49,1.54,-0.95,0.46,0.37,0.40,...,0.52,0.25,-0.27,-0.39,-0.39,-0.47,-0.90,-0.28,-0.67,-0.98
82721,0.83,1.91,-0.60,-0.65,-0.59,-0.76,1.33,0.46,1.13,-0.36,...,0.52,-0.61,0.50,-0.10,-0.16,-0.47,-0.90,-0.28,-0.67,1.02
32709,1.82,0.16,0.83,-0.65,0.80,-0.76,-1.71,0.46,-1.93,-1.90,...,0.52,-1.48,-1.81,-0.39,-0.39,-0.47,1.11,-0.28,1.49,1.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125165,0.83,-0.56,1.55,1.32,1.49,1.54,1.33,1.20,1.13,1.17,...,-0.27,1.12,1.27,1.35,1.24,-0.47,1.11,-0.28,-0.67,1.02
9482,1.89,0.06,-0.60,-0.65,-0.59,-0.76,-0.19,-1.03,-0.40,-0.36,...,-1.85,-1.48,-0.27,-0.25,-0.26,-0.47,1.11,-0.28,1.49,1.02
72383,0.10,0.25,-0.60,-0.65,-0.59,-0.76,-0.19,-0.28,0.37,-1.13,...,-1.85,-1.48,0.50,-0.39,-0.39,-0.47,-0.90,-0.28,-0.67,-0.98
50139,-1.88,-0.16,-0.60,0.66,0.10,0.77,-1.71,-0.28,0.37,-1.90,...,0.52,-0.61,-1.81,-0.23,-0.39,2.11,1.11,-0.28,-0.67,1.02


77869     dissatisfied
59474        satisfied
119053       satisfied
84136        satisfied
8209         satisfied
              ...     
75705     dissatisfied
63094        satisfied
98210        satisfied
96020        satisfied
119246       satisfied
Name: satisfaction, Length: 103589, dtype: object

126910       satisfied
113265       satisfied
95461        satisfied
82721        satisfied
32709     dissatisfied
              ...     
125165       satisfied
9482      dissatisfied
72383     dissatisfied
50139     dissatisfied
48808     dissatisfied
Name: satisfaction, Length: 25898, dtype: object

In [16]:
# Binary logistic regression model, fitted without regularisation (penalty=None).
# `multi_class` is intentionally not passed: 'auto' was already the default and
# the parameter is deprecated in recent scikit-learn releases.
model = LogisticRegression(fit_intercept = True, solver='lbfgs', penalty = None)

model.fit(X_train, y_train)

# Mean accuracy (fraction of correctly classified rows) on the training data
accuracy_train = model.score(X_train, y_train)

# These are the coefficients Beta_1, ..., Beta_23 (one per feature column)
model.coef_

# This is the intercept Beta_0
model.intercept_
print("The fraction of accurate prediction is: ",round(accuracy_train,4))

array([[-0.10933188, -0.11289484,  0.40343082, -0.31087083, -0.31133854,
         0.14813228, -0.0951655 ,  0.91064184,  0.11763942,  0.304656  ,
         0.39189821,  0.28826001,  0.12069634,  0.37708687,  0.09094497,
         0.20885878,  0.13813465, -0.34225785, -0.76953739, -0.36740616,
        -0.20807181, -0.35546439, -0.48941764]])

array([0.35141685])

The fraction of accurate prediction is:  0.8351


#### The score is __0.8351, or 83.51%__, which is the fraction of training observations that the model classifies correctly. This indicates a fairly good fit on the training data; note that it is a classification accuracy, not the share of variability in the dependent variable explained by the model.

#### Also, $\hat{\beta}_0$, $\hat{\beta}_1$ and $\hat{\beta}_2$ are __0.3514, -0.1093 and -0.1129__ respectively.

In [17]:
# Pair each test-set prediction with its true label (aligned on the row index)
# and report the mean accuracy on the test split.
test_output = pd.DataFrame(
    model.predict(X_test),
    index=X_test.index,
    columns=['pred_Satisfaction'],
).merge(y_test, left_index=True, right_index=True)
print(model.score(X_test, y_test))

0.8365896980461812


#### The accuracy of the model on the test dataset is __83.66%__ (0.8366).

In [18]:
# Preview the first five predicted labels next to the true labels
test_output.head()

Unnamed: 0,pred_Satisfaction,satisfaction
126910,satisfied,satisfied
113265,satisfied,satisfied
95461,satisfied,satisfied
82721,satisfied,satisfied
32709,dissatisfied,dissatisfied


In [19]:
# Add the predicted class probability taken from column 1 of predict_proba.
# NOTE(review): column 1 corresponds to model.classes_[1], presumably the
# 'satisfied' label (the target is text, not 0/1) - confirm via model.classes_.
test_output['Probability'] =  model.predict_proba(X_test)[:, 1]
test_output.head()

Unnamed: 0,pred_Satisfaction,satisfaction,Probability
126910,satisfied,satisfied,0.98
113265,satisfied,satisfied,0.98
95461,satisfied,satisfied,0.96
82721,satisfied,satisfied,0.7
32709,dissatisfied,dissatisfied,0.09


#### For the first row of the test data frame (index 126910), the predicted probability that the customer is satisfied is __0.98 or 98%__.