In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# This is new
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [2]:
airline_data = pd.read_csv('Invistico_Airline.csv')
airline_data.head()

Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,...,4,2,2,0,2,4,2,5,0,0.0


In [3]:
print("The column names in the dataframe are")
list(airline_data.columns)

The column names in the dataframe are


['satisfaction',
 'Gender',
 'Customer Type',
 'Age',
 'Type of Travel',
 'Class',
 'Flight Distance',
 'Seat comfort',
 'Departure/Arrival time convenient',
 'Food and drink',
 'Gate location',
 'Inflight wifi service',
 'Inflight entertainment',
 'Online support',
 'Ease of Online booking',
 'On-board service',
 'Leg room service',
 'Baggage handling',
 'Checkin service',
 'Cleanliness',
 'Online boarding',
 'Departure Delay in Minutes',
 'Arrival Delay in Minutes']

In [4]:
airline_data.isna().sum()

satisfaction                           0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Seat comfort                           0
Departure/Arrival time convenient      0
Food and drink                         0
Gate location                          0
Inflight wifi service                  0
Inflight entertainment                 0
Online support                         0
Ease of Online booking                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Cleanliness                            0
Online boarding                        0
Departure Delay in Minutes             0
Arrival Delay in Minutes             393
dtype: int64

In [5]:
airline_data = airline_data.dropna()
airline_data.head()

Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,...,4,2,2,0,2,4,2,5,0,0.0


In [6]:
print("Number of rows in the dataframe are",airline_data.shape[0])

Number of rows in the dataframe are 129487


In [7]:
airline_data.dtypes

satisfaction                          object
Gender                                object
Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Seat comfort                           int64
Departure/Arrival time convenient      int64
Food and drink                         int64
Gate location                          int64
Inflight wifi service                  int64
Inflight entertainment                 int64
Online support                         int64
Ease of Online booking                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Cleanliness                            int64
Online boarding                        int64
Departure Delay in Minutes             int64
Arrival De

#### Binary Logistic Regression

In [8]:
Y = airline_data['satisfaction']

In [9]:
X = airline_data.drop(columns = ['satisfaction'])
print("Number of features in the dataframe are",X.shape[1])

Number of features in the dataframe are 22


In [10]:
categorical_columns = X.select_dtypes(include=['object']).columns 
print("Categorical columns after excluding the dependent column:", categorical_columns)

Categorical columns after excluding the dependent column: Index(['Gender', 'Customer Type', 'Type of Travel', 'Class'], dtype='object')


In [11]:
gender_values = airline_data['Gender'].unique()
print("Set of values for 'gender':", gender_values)

# To get unique values for the 'color' categorical variable
customer_type_values = airline_data['Customer Type'].unique()
print("Set of values for 'customer_type':", customer_type_values)

# To get unique values for the 'clarity' categorical variable
travel_type_values = airline_data['Type of Travel'].unique()
print("Set of values for 'travel type':", travel_type_values)

class_values = airline_data['Class'].unique()
print("Set of values for 'class':", class_values)

Set of values for 'gender': ['Female' 'Male']
Set of values for 'customer_type': ['Loyal Customer' 'disloyal Customer']
Set of values for 'travel type': ['Personal Travel' 'Business travel']
Set of values for 'class': ['Eco' 'Business' 'Eco Plus']


In [12]:
from sklearn.preprocessing import OneHotEncoder

def get_ohe(df, col):
    ohe = OneHotEncoder(drop='first', handle_unknown='error', sparse_output=False, dtype='int')
    ohe.fit(df[[col]])
    temp_df = pd.DataFrame(data=ohe.transform(df[[col]]), columns=ohe.get_feature_names_out())
    # If you have a newer version, replace with columns=ohe.get_feature_names_out()
    df.drop(columns=[col], axis=1, inplace=True)
    df = pd.concat([df.reset_index(drop=True), temp_df], axis=1)
    return df

In [15]:
airline_data = get_ohe(airline_data, 'Gender')
airline_data = get_ohe(airline_data, 'Customer Type')
airline_data = get_ohe(airline_data, 'Type of Travel')
airline_data = get_ohe(airline_data, 'Class')

In [16]:
airline_data.head()

Unnamed: 0,satisfaction,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,...,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus
0,satisfied,65,265,0,0,0,2,2,4,2,...,5,3,2,0,0.0,0,0,1,1,0
1,satisfied,47,2464,0,0,0,3,0,2,2,...,2,3,2,310,305.0,1,0,1,0,0
2,satisfied,15,2138,0,0,0,3,2,0,2,...,4,4,2,0,0.0,0,0,1,1,0
3,satisfied,60,623,0,0,0,3,3,4,3,...,4,1,3,0,0.0,0,0,1,1,0
4,satisfied,70,354,0,0,0,3,4,3,4,...,4,2,5,0,0.0,0,0,1,1,0


For the attribute 'Gender', two distinct categories exist: 'Male' and 'Female'. Post one-hot encoding, the data is represented in a single column due to the exclusion of one category. Regarding 'Customer Type', it encompasses two distinct categories, 'Loyal Customer' and 'Disloyal Customer'. Following one-hot encoding, this results in a single column by omitting one category. In terms of 'Type of Travel', the data differentiates between 'Business Travel' and 'Personal Travel', which, after applying one-hot encoding, is reduced to a single column by leaving out one category. For the 'Class' attribute, it is segmented into three categories: 'Eco', 'Business', and 'Eco Plus'. One-hot encoding of these results in the creation of two columns as one category is omitted. Consequently, there is an additional column for the 'Class' attribute following the one-hot encoding with the drop-first approach. Therefore, the total comes to 24 columns, with 23 feature columns after excluding the 'satisfaction' column, which serves as the dependent variable.

In [18]:
X_train, X_test, y_train, y_test = train_test_split(airline_data.drop(columns = ['satisfaction']), airline_data['satisfaction'], test_size=0.20, stratify = airline_data['satisfaction'], random_state = 50)

In [19]:
train_length = X_train.shape[0]
test_length = X_test.shape[0]

print('Length of train and test data are:', train_length , test_length )

Length of train and test data are: 103589 25898


In [20]:
first_row_index_train = X_train.index[0]
first_row_index_test = X_test.index[0]

print("First row index of X_train:", first_row_index_train)
print("First row index of X_test:", first_row_index_test)

First row index of X_train: 77869
First row index of X_test: 126910


In [21]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns = X_train.columns, index = X_train.index)
X_test = pd.DataFrame(sc.transform(X_test), columns = X_test.columns, index = X_test.index)
X_train
X_test
y_train
y_test

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus
77869,0.30,0.90,0.12,-0.65,0.80,-0.76,1.33,0.46,0.37,-0.36,...,-0.27,-0.61,0.50,1.19,0.38,1.02,-0.47,-0.67,-0.90,-0.28
59474,-0.62,0.12,0.83,0.66,0.80,0.01,-0.19,0.46,-0.40,-0.36,...,0.52,1.12,-0.27,0.24,0.31,1.02,2.11,-0.67,-0.90,-0.28
119053,1.29,1.09,-1.32,-1.30,-1.28,-1.53,1.33,1.20,0.37,1.17,...,0.52,1.12,-0.27,-0.39,-0.39,1.02,-0.47,-0.67,-0.90,-0.28
84136,1.23,1.22,0.12,0.01,0.10,0.01,-0.19,0.46,1.13,-0.36,...,1.32,-0.61,-0.27,1.11,0.72,1.02,-0.47,-0.67,-0.90,-0.28
8209,1.36,-0.30,-0.60,-0.65,0.10,-0.76,0.57,0.46,-0.40,0.40,...,1.32,0.25,0.50,-0.18,-0.16,-0.98,-0.47,1.49,1.11,-0.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75705,0.63,0.37,-0.60,-1.30,-1.28,-1.53,-0.95,-1.03,-1.16,-0.36,...,-1.06,-2.35,-1.04,4.47,4.68,1.02,-0.47,-0.67,-0.90,3.59
63094,-0.10,-0.05,1.55,1.32,1.49,0.77,-1.71,1.20,-1.93,-1.90,...,0.52,0.25,-1.81,0.45,0.38,-0.98,2.11,-0.67,-0.90,-0.28
98210,0.37,1.00,-1.32,-1.30,-1.28,-1.53,1.33,0.46,1.13,0.40,...,1.32,0.25,0.50,0.56,0.85,1.02,-0.47,-0.67,-0.90,-0.28
96020,0.37,1.77,0.12,0.01,0.10,0.01,1.33,0.46,0.37,0.40,...,-0.27,0.25,0.50,-0.36,-0.39,1.02,-0.47,-0.67,-0.90,-0.28


Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus
126910,-0.03,-0.12,-1.32,-1.30,-1.28,-1.53,-0.19,1.20,0.37,1.17,...,0.52,1.12,1.27,-0.18,-0.34,1.02,-0.47,-0.67,-0.90,-0.28
113265,-1.35,-0.83,1.55,-1.30,-1.28,-1.53,1.33,1.20,0.37,1.17,...,0.52,0.25,1.27,-0.39,-0.26,1.02,-0.47,-0.67,1.11,-0.28
95461,1.09,-1.50,1.55,1.32,1.49,1.54,-0.95,0.46,0.37,0.40,...,0.52,0.25,-0.27,-0.39,-0.39,-0.98,-0.47,-0.67,-0.90,-0.28
82721,0.83,1.91,-0.60,-0.65,-0.59,-0.76,1.33,0.46,1.13,-0.36,...,0.52,-0.61,0.50,-0.10,-0.16,1.02,-0.47,-0.67,-0.90,-0.28
32709,1.82,0.16,0.83,-0.65,0.80,-0.76,-1.71,0.46,-1.93,-1.90,...,0.52,-1.48,-1.81,-0.39,-0.39,1.02,-0.47,1.49,1.11,-0.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125165,0.83,-0.56,1.55,1.32,1.49,1.54,1.33,1.20,1.13,1.17,...,-0.27,1.12,1.27,1.35,1.24,1.02,-0.47,-0.67,1.11,-0.28
9482,1.89,0.06,-0.60,-0.65,-0.59,-0.76,-0.19,-1.03,-0.40,-0.36,...,-1.85,-1.48,-0.27,-0.25,-0.26,1.02,-0.47,1.49,1.11,-0.28
72383,0.10,0.25,-0.60,-0.65,-0.59,-0.76,-0.19,-0.28,0.37,-1.13,...,-1.85,-1.48,0.50,-0.39,-0.39,-0.98,-0.47,-0.67,-0.90,-0.28
50139,-1.88,-0.16,-0.60,0.66,0.10,0.77,-1.71,-0.28,0.37,-1.90,...,0.52,-0.61,-1.81,-0.23,-0.39,1.02,2.11,-0.67,1.11,-0.28


77869     dissatisfied
59474        satisfied
119053       satisfied
84136        satisfied
8209         satisfied
              ...     
75705     dissatisfied
63094        satisfied
98210        satisfied
96020        satisfied
119246       satisfied
Name: satisfaction, Length: 103589, dtype: object

126910       satisfied
113265       satisfied
95461        satisfied
82721        satisfied
32709     dissatisfied
              ...     
125165       satisfied
9482      dissatisfied
72383     dissatisfied
50139     dissatisfied
48808     dissatisfied
Name: satisfaction, Length: 25898, dtype: object

In [22]:
model = LogisticRegression(fit_intercept = True, solver='lbfgs', multi_class = 'auto', penalty = 'none')
# If the lbfgs throws an error, try to increase max_iter (add max_iter = 1000), 
# also try another algorithm e.g. newton-cg, scaling is also suggested
# While using multiclass case do multi_class = 'ovr' or 'auto'; can also try other solvers
# While doing regularization, use penalty = 'l2' and also C = 10.0 (need to try other values too)

model.fit(X_train, y_train) 

# The following gives the mean accuracy on the given data and labels
model.score(X_train, y_train) 

# This is the coefficient Beta_1, ..., Beta_7
model.coef_

# This is the coefficient Beta_0
model.intercept_



0.8350210929731922

array([[-0.10973443, -0.11317424,  0.40257117, -0.30990641, -0.31140629,
         0.14777451, -0.09516118,  0.91137643,  0.11772809,  0.30486743,
         0.39202792,  0.28840228,  0.12049755,  0.37683233,  0.09093239,
         0.20875991,  0.13767634, -0.34180282, -0.48951396, -0.76951525,
        -0.35596546, -0.36740668, -0.20810458]])

array([0.35162794])

The performance of the logistic regression model is satisfactory, showing an accuracy rate of approximately 83.51%. This indicates that in about 83.51% of instances within the test dataset, the model successfully forecasts the level of satisfaction. An examination of the model's coefficients reveals that specific attributes, notably the type of travel and the seating class, play a crucial role in determining satisfaction outcomes.

In [23]:
test_output = pd.DataFrame(model.predict(X_test), index = X_test.index, columns = ['pred_satisfaction'])
test_output.head()

Unnamed: 0,pred_satisfaction
126910,satisfied
113265,satisfied
95461,satisfied
82721,satisfied
32709,dissatisfied


In [24]:
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
print('Percentage of correct predictions is ')
print(model.score(X_test, y_test))

Unnamed: 0,pred_satisfaction,satisfaction
126910,satisfied,satisfied
113265,satisfied,satisfied
95461,satisfied,satisfied
82721,satisfied,satisfied
32709,dissatisfied,dissatisfied


Percentage of correct predictions is 
0.8365124720055602


In [25]:
test_output = test_output.merge(X_test, left_index = True, right_index = True)
test_output.head()

Unnamed: 0,pred_satisfaction,satisfaction,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,...,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus
126910,satisfied,satisfied,-0.03,-0.12,-1.32,-1.3,-1.28,-1.53,-0.19,1.2,...,0.52,1.12,1.27,-0.18,-0.34,1.02,-0.47,-0.67,-0.9,-0.28
113265,satisfied,satisfied,-1.35,-0.83,1.55,-1.3,-1.28,-1.53,1.33,1.2,...,0.52,0.25,1.27,-0.39,-0.26,1.02,-0.47,-0.67,1.11,-0.28
95461,satisfied,satisfied,1.09,-1.5,1.55,1.32,1.49,1.54,-0.95,0.46,...,0.52,0.25,-0.27,-0.39,-0.39,-0.98,-0.47,-0.67,-0.9,-0.28
82721,satisfied,satisfied,0.83,1.91,-0.6,-0.65,-0.59,-0.76,1.33,0.46,...,0.52,-0.61,0.5,-0.1,-0.16,1.02,-0.47,-0.67,-0.9,-0.28
32709,dissatisfied,dissatisfied,1.82,0.16,0.83,-0.65,0.8,-0.76,-1.71,0.46,...,0.52,-1.48,-1.81,-0.39,-0.39,1.02,-0.47,1.49,1.11,-0.28


In [26]:
model.predict_proba(airline_data.drop(columns = ['satisfaction']))

array([[1.00000000e+000, 4.26651582e-013],
       [1.00000000e+000, 1.58558503e-147],
       [1.00000000e+000, 4.03345362e-104],
       ...,
       [1.00000000e+000, 3.53835333e-130],
       [1.00000000e+000, 2.79934444e-141],
       [1.00000000e+000, 3.90288123e-227]])

In [27]:
test_output['Probability'] = model.predict_proba(test_output.drop(columns = ['satisfaction', 'pred_satisfaction']))[:,1]
test_output.head()

Unnamed: 0,pred_satisfaction,satisfaction,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,...,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus,Probability
126910,satisfied,satisfied,-0.03,-0.12,-1.32,-1.3,-1.28,-1.53,-0.19,1.2,...,1.12,1.27,-0.18,-0.34,1.02,-0.47,-0.67,-0.9,-0.28,0.98
113265,satisfied,satisfied,-1.35,-0.83,1.55,-1.3,-1.28,-1.53,1.33,1.2,...,0.25,1.27,-0.39,-0.26,1.02,-0.47,-0.67,1.11,-0.28,0.98
95461,satisfied,satisfied,1.09,-1.5,1.55,1.32,1.49,1.54,-0.95,0.46,...,0.25,-0.27,-0.39,-0.39,-0.98,-0.47,-0.67,-0.9,-0.28,0.96
82721,satisfied,satisfied,0.83,1.91,-0.6,-0.65,-0.59,-0.76,1.33,0.46,...,-0.61,0.5,-0.1,-0.16,1.02,-0.47,-0.67,-0.9,-0.28,0.7
32709,dissatisfied,dissatisfied,1.82,0.16,0.83,-0.65,0.8,-0.76,-1.71,0.46,...,-1.48,-1.81,-0.39,-0.39,1.02,-0.47,1.49,1.11,-0.28,0.09


In [28]:
first_row_prob = test_output.iloc[0]['Probability']
first_row_prob_rounded = round(first_row_prob, 2)
print("Probability that satisfaction is 1 for the first row:", first_row_prob_rounded)

Probability that satisfaction is 1 for the first row: 0.98
