### Part 03 - Multinomial Logistic Regression

In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# This is new
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Render floats in DataFrame output with thousands separators and two decimals
pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode: display the value of every top-level
# expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

In [2]:
# Load the airline survey data; the CSV is expected in the notebook's
# working directory.
airline_data = pd.read_csv('Invistico_Airline.csv')
airline_data.head()

Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,...,4,2,2,0,2,4,2,5,0,0.0


In [3]:
# Remove the 'satisfaction' column, which the parts below do not use.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
airline_data = airline_data.drop(columns='satisfaction', errors='ignore')

In [4]:
# Preview the frame to confirm the 'satisfaction' column is gone
airline_data.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.0
1,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.0
2,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.0
3,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.0
4,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.0


In [5]:
# Count missing values per column to decide how to handle them
airline_data.isna().sum()

Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Seat comfort                           0
Departure/Arrival time convenient      0
Food and drink                         0
Gate location                          0
Inflight wifi service                  0
Inflight entertainment                 0
Online support                         0
Ease of Online booking                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Cleanliness                            0
Online boarding                        0
Departure Delay in Minutes             0
Arrival Delay in Minutes             393
dtype: int64

In [6]:
# Discard every row that contains at least one missing value
airline_data = airline_data.dropna()

In [7]:
# Re-check: every column should now report zero missing values
airline_data.isna().sum()

Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Seat comfort                         0
Departure/Arrival time convenient    0
Food and drink                       0
Gate location                        0
Inflight wifi service                0
Inflight entertainment               0
Online support                       0
Ease of Online booking               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Cleanliness                          0
Online boarding                      0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
dtype: int64

In [8]:
# Inspect dtypes to spot the object (string) columns that still need encoding
airline_data.dtypes

Gender                                object
Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Seat comfort                           int64
Departure/Arrival time convenient      int64
Food and drink                         int64
Gate location                          int64
Inflight wifi service                  int64
Inflight entertainment                 int64
Online support                         int64
Ease of Online booking                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Cleanliness                            int64
Online boarding                        int64
Departure Delay in Minutes             int64
Arrival Delay in Minutes             float64
dtype: obj

In [9]:
from sklearn.preprocessing import OneHotEncoder
def get_ohe(airline_data, col):
    """One-hot encode one categorical column and append the dummy columns.

    The first category of ``col`` is dropped (``drop='first'``), so a column
    with k categories yields k-1 integer indicator columns named
    ``<col>_<category>``.

    Parameters
    ----------
    airline_data : pandas.DataFrame
        Frame containing ``col``. It is NOT modified; a new frame is returned.
    col : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        A copy of ``airline_data`` without ``col``, with the indicator columns
        appended on the right. The index is reset to 0..n-1 so the encoded
        block lines up row by row with the remaining columns.
    """
    ohe = OneHotEncoder(drop='first', handle_unknown='error', sparse_output=False, dtype='int')
    encoded = ohe.fit_transform(airline_data[[col]])
    temp_df = pd.DataFrame(data=encoded, columns=ohe.get_feature_names_out())
    # Non-mutating drop: the caller's frame keeps its original column.
    remaining = airline_data.drop(columns=[col]).reset_index(drop=True)
    return pd.concat([remaining, temp_df], axis=1)

In [10]:
# One-hot encode the categorical predictors one after another.
# 'Class' is left as-is because it is used as the target further down.
for categorical_col in ['Customer Type', 'Type of Travel', 'Gender']:
    airline_data = get_ohe(airline_data, categorical_col)

In [11]:
# Preview the frame to verify the indicator columns were appended
airline_data.head()

Unnamed: 0,Age,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,...,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Gender_Male
0,65,Eco,265,0,0,0,2,2,4,2,...,0,3,5,3,2,0,0.0,0,1,0
1,47,Business,2464,0,0,0,3,0,2,2,...,4,4,2,3,2,310,305.0,0,1,1
2,15,Eco,2138,0,0,0,3,2,0,2,...,3,4,4,4,2,0,0.0,0,1,0
3,60,Eco,623,0,0,0,3,3,4,3,...,0,1,4,1,3,0,0.0,0,1,0
4,70,Eco,354,0,0,0,3,4,3,4,...,0,2,4,2,5,0,0.0,0,1,0


#### After conducting one hot encoding, the total number of independent variables stands at __21__.

In [12]:
# List the columns after encoding, then count them.
# The count includes 'Class', which is the target rather than a predictor.
airline_data.columns
airline_data.shape[1]

Index(['Age', 'Class', 'Flight Distance', 'Seat comfort',
       'Departure/Arrival time convenient', 'Food and drink', 'Gate location',
       'Inflight wifi service', 'Inflight entertainment', 'Online support',
       'Ease of Online booking', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Cleanliness', 'Online boarding',
       'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'Customer Type_disloyal Customer', 'Type of Travel_Personal Travel',
       'Gender_Male'],
      dtype='object')

22

In [13]:
# 80/20 train/test split with 'Class' as the target. Stratifying on the target
# keeps the class proportions equal in both partitions; the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    airline_data.drop(columns='Class'),
    airline_data['Class'],
    test_size=0.2,
    stratify=airline_data['Class'],
    random_state=50,
)
X_train
X_test
y_train
y_test

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Gender_Male
100897,59,2559,1,1,1,1,5,4,5,4,...,4,4,3,4,4,0,0.00,0,0,1
76456,37,520,2,3,3,3,5,4,4,2,...,2,2,1,2,3,0,0.00,0,0,0
44676,26,1487,2,0,2,3,3,2,3,3,...,2,4,4,5,3,0,0.00,1,0,0
37815,50,550,4,5,4,4,1,5,5,5,...,4,5,3,5,1,0,0.00,0,1,0
122902,39,1980,5,1,5,5,5,5,5,5,...,3,5,5,1,5,0,0.00,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106779,46,2035,3,3,3,3,3,4,5,4,...,4,4,5,4,3,0,0.00,0,0,0
67611,54,269,1,3,4,3,2,3,4,1,...,1,1,3,1,4,49,64.00,0,0,0
119841,58,101,4,4,4,4,5,3,4,5,...,5,5,5,5,4,0,0.00,0,0,0
119381,55,309,1,1,1,1,2,4,4,5,...,5,5,3,5,5,0,0.00,0,0,0


Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Gender_Male
42248,32,1727,1,1,1,3,5,1,5,5,...,3,4,3,3,5,0,0.00,1,0,1
102458,57,1889,5,5,5,5,3,5,4,4,...,4,4,4,4,5,75,77.00,0,0,1
236,58,622,0,3,0,4,3,1,4,1,...,0,1,3,1,3,0,0.00,0,1,0
74922,41,2506,2,2,2,2,4,4,4,2,...,2,2,4,2,4,0,0.00,0,0,1
14632,40,3669,4,3,3,3,4,4,4,4,...,4,4,5,1,4,0,0.00,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102243,64,256,4,5,5,5,4,3,4,4,...,4,4,2,4,2,0,0.00,0,0,0
58476,26,2130,4,4,4,1,2,4,2,2,...,4,5,5,4,2,3,16.00,1,0,1
44093,26,2154,1,4,1,4,5,1,2,5,...,4,3,3,3,5,0,0.00,1,0,0
71815,72,1111,2,1,1,1,5,3,2,2,...,2,2,1,2,3,3,0.00,0,0,0


100897    Business
76456     Business
44676     Business
37815          Eco
122902         Eco
            ...   
106779    Business
67611     Business
119841    Business
119381    Business
111902    Business
Name: Class, Length: 103589, dtype: object

42248     Business
102458    Business
236            Eco
74922     Business
14632          Eco
            ...   
102243    Eco Plus
58476     Business
44093     Business
71815          Eco
92998     Business
Name: Class, Length: 25898, dtype: object

#### The model's training accuracy is __0.6570 or 65.70%__, indicating a fit that could be improved.
#### As we are implementing multinomial regression, $\hat{\beta}_0$, $\hat{\beta}_1$, $\hat{\beta}_2$ are arrays. $\hat{\beta}_0$ is a 1-dimensional array, while the others are 2-dimensional arrays.

In [14]:
# Baseline: unpenalised logistic regression on the unscaled features, left at
# the solver's default iteration limit.
# NOTE(review): multi_class='ovr' fits one binary problem per class rather than
# a single softmax model, and recent scikit-learn releases deprecate this
# argument - confirm against the installed version.
model = LogisticRegression(
    penalty=None,
    multi_class='ovr',
    solver='lbfgs',
    fit_intercept=True,
)
model.fit(X=X_train, y=y_train)

# Mean accuracy on the training data
model.score(X=X_train, y=y_train)

# Slope coefficients: one row per class, one column per predictor
model.coef_

# Intercepts (Beta_0): one value per class
model.intercept_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.657077488922569

array([[ 6.10678351e-03,  1.00559165e-04, -3.42022248e-01,
        -2.19381683e-01,  2.45684317e-01, -4.72982119e-02,
        -3.14194543e-01,  6.21256713e-01,  1.87038585e-01,
         3.16454508e-03,  2.04028018e-01,  1.82557100e-02,
        -6.51275665e-02,  3.70569235e-02, -1.15740988e-01,
        -1.97748495e-01,  1.16350047e-03,  1.66888689e-04,
        -2.01922703e-01, -1.20092403e+00, -8.40260720e-02],
       [-1.33311158e-03, -2.15649372e-04,  2.57774453e-01,
         2.37864707e-01, -1.23283846e-01,  9.76031040e-02,
         1.83352395e-01, -4.82635788e-01, -1.73671279e-01,
        -4.19756570e-02, -1.50667348e-01, -4.01320320e-02,
         3.49607653e-02, -2.92347032e-02,  6.62486325e-02,
         1.01211661e-01,  2.45350276e-02, -1.50523828e-02,
         1.85508190e-01,  8.49635670e-01,  6.50789634e-02],
       [-7.69634696e-03, -2.46855477e-04,  2.44788889e-01,
         9.11397567e-02, -5.62327415e-02, -1.18581995e-01,
        -1.58243906e-02, -2.08524144e-01, -1.36452846e

array([-0.22698245,  0.13802955, -0.04127887])

#### Scale the training dataset.

In [15]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to the test split. The results are wrapped back into
# DataFrames so column names and row labels survive.
sc = StandardScaler()
X_train_scaled = pd.DataFrame(
    data=sc.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test_scaled = pd.DataFrame(
    data=sc.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)
X_train_scaled
X_test_scaled
y_train
y_test

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Gender_Male
100897,1.29,0.56,-1.32,-1.30,-1.28,-1.52,1.33,0.46,1.13,0.41,...,0.40,0.26,-0.27,0.26,0.50,-0.38,-0.39,-0.47,-0.67,1.02
76456,-0.16,-1.42,-0.60,0.01,0.10,0.01,1.33,0.46,0.37,-1.13,...,-1.15,-1.47,-1.86,-1.48,-0.27,-0.38,-0.39,-0.47,-0.67,-0.98
44676,-0.89,-0.48,-0.60,-1.95,-0.59,0.01,-0.19,-1.03,-0.40,-0.36,...,-1.15,0.26,0.52,1.12,-0.27,-0.38,-0.39,2.11,-0.67,-0.98
37815,0.70,-1.39,0.84,1.32,0.80,0.78,-1.71,1.20,1.13,1.17,...,0.40,1.13,-0.27,1.12,-1.81,-0.38,-0.39,-0.47,1.50,-0.98
122902,-0.03,-0.00,1.55,-1.30,1.49,1.54,1.33,1.20,1.13,1.17,...,-0.38,1.13,1.32,-2.35,1.27,-0.38,-0.39,-0.47,-0.67,-0.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106779,0.44,0.05,0.12,0.01,0.10,0.01,-0.19,0.46,1.13,0.41,...,0.40,0.26,1.32,0.26,-0.27,-0.38,-0.39,-0.47,-0.67,-0.98
67611,0.96,-1.67,-1.32,0.01,0.80,0.01,-0.95,-0.29,0.37,-1.89,...,-1.92,-2.34,-0.27,-2.35,0.50,0.90,1.26,-0.47,-0.67,-0.98
119841,1.23,-1.83,0.84,0.66,0.80,0.78,1.33,-0.29,0.37,1.17,...,1.17,1.13,1.32,1.12,0.50,-0.38,-0.39,-0.47,-0.67,-0.98
119381,1.03,-1.63,-1.32,-1.30,-1.28,-1.52,-0.95,0.46,0.37,1.17,...,1.17,1.13,-0.27,1.12,1.27,-0.38,-0.39,-0.47,-0.67,-0.98


Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Gender_Male
42248,-0.49,-0.25,-1.32,-1.30,-1.28,0.01,1.33,-1.77,1.13,1.17,...,-0.38,0.26,-0.27,-0.61,1.27,-0.38,-0.39,2.11,-0.67,1.02
102458,1.16,-0.09,1.55,1.32,1.49,1.54,-0.19,1.20,0.37,0.41,...,0.40,0.26,0.52,0.26,1.27,1.58,1.60,-0.47,-0.67,1.02
236,1.23,-1.32,-2.04,0.01,-1.98,0.78,-0.19,-1.77,0.37,-1.89,...,-2.70,-2.34,-0.27,-2.35,-0.27,-0.38,-0.39,-0.47,1.50,-0.98
74922,0.10,0.51,-0.60,-0.65,-0.59,-0.76,0.57,0.46,0.37,-1.13,...,-1.15,-1.47,0.52,-1.48,0.50,-0.38,-0.39,-0.47,-0.67,1.02
14632,0.04,1.64,0.84,0.01,0.10,0.01,0.57,0.46,0.37,0.41,...,0.40,0.26,1.32,-2.35,0.50,-0.38,-0.39,-0.47,1.50,-0.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102243,1.63,-1.68,0.84,1.32,1.49,1.54,0.57,-0.29,0.37,0.41,...,0.40,0.26,-1.06,0.26,-1.04,-0.38,-0.39,-0.47,-0.67,-0.98
58476,-0.89,0.14,0.84,0.66,0.80,-1.52,-0.95,0.46,-1.16,-1.13,...,0.40,1.13,1.32,0.26,-1.04,-0.31,0.02,2.11,-0.67,1.02
44093,-0.89,0.17,-1.32,0.66,-1.28,0.78,1.33,-1.77,-1.16,1.17,...,0.40,-0.60,-0.27,-0.61,1.27,-0.38,-0.39,2.11,-0.67,-0.98
71815,2.15,-0.85,-0.60,-1.30,-1.28,-1.52,1.33,-0.29,-1.16,-1.13,...,-1.15,-1.47,-1.86,-1.48,-0.27,-0.31,-0.39,-0.47,-0.67,-0.98


100897    Business
76456     Business
44676     Business
37815          Eco
122902         Eco
            ...   
106779    Business
67611     Business
119841    Business
119381    Business
111902    Business
Name: Class, Length: 103589, dtype: object

42248     Business
102458    Business
236            Eco
74922     Business
14632          Eco
            ...   
102243    Eco Plus
58476     Business
44093     Business
71815          Eco
92998     Business
Name: Class, Length: 25898, dtype: object

#### 1. Increase the number of iterations to 1000.
#### Even with the iteration count increased to 1000, the model still fails to converge. Nonetheless, it achieves a score of __0.7670 or 76.70%__, an improvement over the outcome observed with the model's default 100 iterations.

In [16]:
# Same unpenalised one-vs-rest model on the unscaled features, but with the
# iteration limit raised to 1000.
increased_iter_model = LogisticRegression(
    penalty=None,
    multi_class='ovr',
    solver='lbfgs',
    fit_intercept=True,
    max_iter=1000,
)
increased_iter_model.fit(X=X_train, y=y_train)

# Mean accuracy on the training data
increased_iter_model.score(X=X_train, y=y_train)

# Slope coefficients: one row per class, one column per predictor
increased_iter_model.coef_

# Intercepts (Beta_0): one value per class
increased_iter_model.intercept_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7670312484916352

array([[-4.21582119e-03,  2.31418158e-04, -4.28885211e-01,
         4.83639531e-02,  8.14375191e-02, -3.52690203e-02,
        -3.22230966e-02,  3.52233438e-01,  2.04419637e-01,
        -3.37819939e-01,  2.60816839e-01,  4.81116809e-02,
         1.95361007e-01,  1.65817667e-01,  1.93422065e-01,
         4.88389148e-02,  4.41038946e-03, -6.23967146e-03,
        -1.72986882e+00, -4.56174980e+00, -3.32232198e-02],
       [-8.11312992e-04, -2.08400731e-04,  3.46946164e-01,
        -2.74790858e-02, -9.67192269e-02, -1.02003124e-02,
         2.04496617e-02, -2.57880110e-01, -1.59741201e-01,
         1.92278879e-01, -1.87366295e-01, -3.79895912e-02,
        -1.36779443e-01, -9.92724592e-02, -1.33800825e-01,
        -1.14638943e-02, -2.43535247e-03,  3.98514759e-03,
         1.74953505e+00,  3.35595967e+00,  7.01556432e-03],
       [-6.90104724e-03, -1.99633923e-04,  1.78279234e-01,
         4.45665196e-03, -1.80283953e-02, -6.17435991e-02,
         1.16511471e-02, -1.91097679e-01, -1.15544158e

array([-1.74330546,  0.99711881, -0.11405396])

#### 2. Use a scaler.
#### When using the scaled train dataset, the model does converge in the default 100 iterations.
#### The score is __0.7723 or 77.23%__ which is a better fit than the previous two models.

In [17]:
# Same unpenalised one-vs-rest model at the default iteration limit, this time
# trained on the standardised features.
scaled_model = LogisticRegression(
    penalty=None,
    multi_class='ovr',
    solver='lbfgs',
    fit_intercept=True,
)
scaled_model.fit(X=X_train_scaled, y=y_train)

# Mean accuracy on the (scaled) training data
scaled_model.score(X=X_train_scaled, y=y_train)

# Slope coefficients: one row per class, one column per predictor
scaled_model.coef_

# Intercepts (Beta_0): one value per class
scaled_model.intercept_

0.7722538107327999

array([[ 1.92350151e-01,  3.98843275e-01, -7.48912432e-01,
         7.38734817e-02,  2.24779006e-01,  3.63847633e-02,
         5.92900411e-02,  5.73412810e-01,  3.13650234e-01,
        -3.81258474e-01,  3.51603750e-01,  1.21784845e-01,
         2.93748295e-01,  2.64173994e-01,  2.66793956e-01,
         1.51422229e-02,  1.05983253e-01, -1.67093697e-01,
        -5.20854523e-01, -2.01951663e+00,  2.44247769e-02],
       [-1.23328479e-01, -3.00680892e-01,  4.94654072e-01,
        -3.00284129e-02, -1.69629613e-01, -3.01794344e-02,
        -5.13443440e-02, -3.71721706e-01, -2.14756365e-01,
         2.84019325e-01, -2.30218397e-01, -9.80804454e-02,
        -2.05483289e-01, -1.86622955e-01, -1.98943371e-01,
        -2.87800670e-02, -6.43083629e-02,  1.16245397e-01,
         6.18625698e-01,  1.50362227e+00, -1.69963887e-02],
       [-1.01247359e-01, -2.05972879e-01,  2.72977991e-01,
         2.16801420e-03, -5.29509488e-02, -6.50657223e-02,
         1.02114691e-03, -2.59700973e-01, -1.38714436e

array([-0.28500927, -0.3125446 , -2.73450827])

#### 3. Change the algorithm to Newton-cg
#### Switching the solver to newton-cg allows the model to converge within the standard 100 iterations, even without employing a scaled dataset.
#### The score is __0.7723 or 77.23%__ which is the same as the previous model when we used scaled dataset.

In [18]:
# Same unpenalised one-vs-rest model on the unscaled features, switching the
# optimiser from lbfgs to newton-cg.
newton_cg_model = LogisticRegression(
    penalty=None,
    multi_class='ovr',
    solver='newton-cg',
    fit_intercept=True,
)
newton_cg_model.fit(X=X_train, y=y_train)

# Mean accuracy on the training data
newton_cg_model.score(X=X_train, y=y_train)

# Slope coefficients: one row per class, one column per predictor
newton_cg_model.coef_

# Intercepts (Beta_0): one value per class
newton_cg_model.intercept_

0.7722924248713666

array([[ 1.26792799e-02,  3.87909680e-04, -5.37865956e-01,
         4.81492083e-02,  1.56305350e-01,  2.77521224e-02,
         4.44519500e-02,  4.25899157e-01,  2.40195955e-01,
        -2.91995390e-01,  2.76504857e-01,  9.43919150e-02,
         2.54552282e-01,  2.09875863e-01,  2.31967599e-01,
         1.19584258e-02,  2.77820359e-03, -4.31843776e-03,
        -1.34728059e+00, -4.37062368e+00,  4.87865583e-02],
       [-8.14193772e-03, -2.92679658e-04,  3.55409523e-01,
        -1.94638825e-02, -1.18243166e-01, -2.30003209e-02,
        -3.85257425e-02, -2.75947358e-01, -1.64424863e-01,
         2.17229999e-01, -1.81214593e-01, -7.57477363e-02,
        -1.78120274e-01, -1.48272644e-01, -1.72927942e-01,
        -2.24316072e-02, -1.66424083e-03,  2.98638063e-03,
         1.59906119e+00,  3.25415802e+00, -3.38552052e-02],
       [-6.71405615e-03, -2.01119910e-04,  1.93681159e-01,
         1.02316895e-03, -3.40528722e-02, -5.05162720e-02,
         2.67520125e-03, -1.93298261e-01, -1.07623078e

array([-4.34757312,  2.38615442, -0.15695209])

#### Test accuracy when using the model with 1000 iterations.
#### The test accuracy is __0.7696 or 76.96%__.

In [19]:
# Predict the test split with the 1000-iteration model and place each
# prediction next to the true class, matched on the row index.
test_output_with_increased_iter = (
    pd.DataFrame({'predicted_class': increased_iter_model.predict(X_test)}, index=X_test.index)
    .merge(y_test, left_index=True, right_index=True)
)
test_output_with_increased_iter.head()
# Mean accuracy on the test split
print(increased_iter_model.score(X_test, y_test))

Unnamed: 0,predicted_class,Class
42248,Eco,Business
102458,Business,Business
236,Eco,Eco
74922,Business,Business
14632,Eco,Eco


0.7696733338481736


#### Test accuracy when using the model with scaled dataset.
* The test accuracy comes out to be __0.7750 or 77.50%__.

In [20]:
# Predict the scaled test split with the model trained on scaled features and
# place each prediction next to the true class, matched on the row index.
test_output_with_scaling = (
    pd.DataFrame({'predicted_class': scaled_model.predict(X_test_scaled)}, index=X_test_scaled.index)
    .merge(y_test, left_index=True, right_index=True)
)
test_output_with_scaling.head()
# Mean accuracy on the scaled test split
print(scaled_model.score(X_test_scaled, y_test))

Unnamed: 0,predicted_class,Class
42248,Eco,Business
102458,Business,Business
236,Eco,Eco
74922,Business,Business
14632,Eco,Eco


0.775040543671326


#### Test accuracy when using the model with newton-cg algorithm.
* The test accuracy comes out to be __0.7750 or 77.50%__.

In [21]:
# Predict the test split with the newton-cg model and place each prediction
# next to the true class, matched on the row index.
test_output_with_newton_cg = (
    pd.DataFrame({'predicted_class': newton_cg_model.predict(X_test)}, index=X_test.index)
    .merge(y_test, left_index=True, right_index=True)
)
test_output_with_newton_cg.head()
# Mean accuracy on the test split
print(newton_cg_model.score(X_test, y_test))

Unnamed: 0,predicted_class,Class
42248,Eco,Business
102458,Business,Business
236,Eco,Eco
74922,Business,Business
14632,Eco,Eco


0.7750791566916364


#### The 'Eco' class had the highest number of incorrect predictions, while the 'Business' class had the fewest.

In [22]:
# Compare predictions from the scaled-feature model with the true classes and
# tally, per actual class, how many test rows were misclassified.
# These are raw counts, not error rates, so class size influences the ranking.
y_pred = test_output_with_scaling['predicted_class']
y_true = test_output_with_scaling['Class']

comparison_df = pd.DataFrame({'Predicted': y_pred, 'Actual': y_true})
comparison_df['Correct'] = comparison_df['Predicted'].eq(comparison_df['Actual'])

# Keep only the misclassified rows and count them by their actual class
wrong_predictions_count = comparison_df.loc[~comparison_df['Correct'], 'Actual'].value_counts()

most_wrong_class = wrong_predictions_count.idxmax()
fewest_wrong_class = wrong_predictions_count.idxmin()

print(f"The class predicted wrong the most: {most_wrong_class}")
print(f"The class predicted wrong the fewest times: {fewest_wrong_class}")

The class predicted wrong the most: Eco
The class predicted wrong the fewest times: Business


### Part 04 -  K-neighbours Method

#### The score for the training data comes out to be __0.8566 or 85.66%__ suggesting a good fit.

In [23]:
from sklearn.neighbors import KNeighborsClassifier

# 7-nearest-neighbour classifier with uniform votes; Minkowski distance with
# p=2 is the Euclidean distance. Trained on the standardised features.
knn_model = KNeighborsClassifier(
    n_neighbors=7,
    weights='uniform',
    algorithm='auto',
    leaf_size=30,
    p=2,
    metric='minkowski',
    metric_params=None,
    n_jobs=None,
)

knn_model.fit(X_train_scaled, y_train)

# Mean accuracy on the training data
score = knn_model.score(X_train_scaled, y_train)
print(score)

0.8566063964320536


#### The accuracy on the test set is __82.86%__.

In [24]:
# Predict the scaled test split with the KNN model and place each prediction
# next to the true class, matched on the row index. The prediction column is
# labelled 'predicted_class' because the model predicts 'Class' (not
# satisfaction), matching the label used for the logistic-regression outputs.
knn_test_output = pd.DataFrame(knn_model.predict(X_test_scaled), index = X_test_scaled.index, columns = ['predicted_class'])
knn_test_output = knn_test_output.merge(y_test, left_index = True, right_index = True)
knn_test_output.head()
# Mean accuracy on the scaled test split
print(knn_model.score(X_test_scaled, y_test))

Unnamed: 0,predicted_satisfication,Class
42248,Eco,Business
102458,Business,Business
236,Eco,Eco
74922,Business,Business
14632,Eco,Eco


0.8285581898216079


### Part 05 - Multinomial Logistic Regression with penalty

* Use only the rows where the ’Inflight entertainment’ is equal to 0. This should leave you with __2968 rows__.

In [25]:
# Keep only the rows whose 'Inflight entertainment' rating is 0.
# NOTE: this rebinds airline_data, so the full dataset is no longer available
# under this name after the cell runs.
airline_data = airline_data.loc[airline_data['Inflight entertainment'].eq(0)]
airline_data.head()
airline_data.shape

Unnamed: 0,Age,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,...,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Gender_Male
2,15,Eco,2138,0,0,0,3,2,0,2,...,3,4,4,4,2,0,0.0,0,1,0
5,30,Eco,1894,0,0,0,3,2,0,2,...,4,5,5,4,2,0,0.0,0,1,1
7,10,Eco,1812,0,0,0,3,2,0,2,...,3,4,5,4,2,0,0.0,0,1,1
9,22,Eco,1556,0,0,0,3,2,0,2,...,4,5,3,4,2,30,26.0,0,1,1
11,34,Eco,3633,0,0,0,4,2,0,2,...,2,5,2,5,2,0,0.0,0,1,0


(2968, 22)

In [26]:
# 80/20 stratified split of the filtered rows, with 'Class' as the target.
# 'Inflight entertainment' is dropped from the predictors as well: it is
# constant (0) after the filter above.
(
    X_train_multi_with_penalty,
    X_test_multi_with_penalty,
    y_train_multi_with_penalty,
    y_test_multi_with_penalty,
) = train_test_split(
    airline_data.drop(columns=['Class', 'Inflight entertainment']),
    airline_data['Class'],
    test_size=0.2,
    stratify=airline_data['Class'],
    random_state=50,
)
X_train_multi_with_penalty
X_test_multi_with_penalty
y_train_multi_with_penalty
y_test_multi_with_penalty

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Gender_Male
22481,22,1317,3,3,0,4,3,3,3,3,1,5,2,4,3,18,19.00,0,1,1
11971,9,1625,2,4,0,5,2,4,2,4,2,4,4,5,2,0,0.00,0,1,1
39315,28,1505,5,5,0,3,3,3,3,4,4,5,5,2,3,0,0.00,0,1,0
19899,25,1322,3,0,0,5,2,2,2,3,4,4,5,4,2,0,0.00,0,1,1
13,35,1766,0,1,0,1,4,4,4,3,5,2,3,2,4,0,0.00,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,30,1582,1,0,0,3,1,1,1,2,4,3,5,1,1,0,2.00,0,1,1
3180,7,1865,1,4,0,1,1,1,1,4,4,5,5,4,1,0,0.00,0,1,1
24045,45,1475,3,4,0,4,4,4,4,5,4,1,2,2,4,0,0.00,0,1,1
65331,21,3857,0,5,0,2,3,3,3,2,3,5,5,1,3,0,1.00,0,0,1


Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Gender_Male
40172,22,1570,0,0,0,3,4,4,4,3,5,4,5,4,4,0,0.00,1,0,1
39921,23,1329,0,0,0,1,5,2,5,2,4,3,4,4,5,0,0.00,1,0,0
1200,64,1428,0,5,0,3,1,2,1,3,5,5,5,4,1,48,75.00,0,1,0
39891,23,2439,0,0,0,1,1,1,1,4,4,4,5,5,1,0,0.00,1,0,1
3481,7,2155,0,0,0,2,3,2,3,4,1,1,5,4,3,69,58.00,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,64,1254,0,5,0,5,5,5,5,4,5,5,3,5,5,0,0.00,0,1,1
63512,30,2186,0,0,0,1,4,4,4,4,5,3,1,4,4,6,23.00,0,0,0
40657,25,2202,0,5,0,3,4,4,4,3,4,4,5,4,4,0,0.00,1,0,1
217,17,1865,0,3,0,3,2,4,2,5,3,1,5,2,2,0,0.00,0,1,1


22481    Business
11971         Eco
39315         Eco
19899         Eco
13            Eco
           ...   
795           Eco
3180          Eco
24045         Eco
65331    Eco Plus
40281    Business
Name: Class, Length: 2374, dtype: object

40172    Business
39921    Eco Plus
1200          Eco
39891         Eco
3481          Eco
           ...   
772           Eco
63512         Eco
40657    Business
217           Eco
5756          Eco
Name: Class, Length: 594, dtype: object

In [27]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns. The scaler is fitted on the training split
# only; the test split is transformed with that same fitted scaler.
sc = StandardScaler()
train_scaled = sc.fit_transform(X_train_multi_with_penalty)
test_scaled = sc.transform(X_test_multi_with_penalty)

# Wrap the scaled arrays back into DataFrames, keeping the original column
# labels and row index of each split.
X_train_multi_with_penalty = pd.DataFrame(
    train_scaled,
    columns=X_train_multi_with_penalty.columns,
    index=X_train_multi_with_penalty.index,
)
X_test_multi_with_penalty = pd.DataFrame(
    test_scaled,
    columns=X_test_multi_with_penalty.columns,
    index=X_test_multi_with_penalty.index,
)

# Display the scaled features followed by the targets.
X_train_multi_with_penalty
X_test_multi_with_penalty
y_train_multi_with_penalty
y_test_multi_with_penalty

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Gender_Male
22481,-0.57,-0.89,1.49,0.29,-0.14,0.95,-0.03,-0.00,-0.03,-0.26,-1.66,1.06,-1.05,0.24,-0.03,0.43,0.47,-0.74,0.95,0.85
11971,-1.45,-0.41,0.78,0.78,-0.14,1.79,-0.77,0.73,-0.77,0.53,-0.91,0.25,0.51,1.09,-0.77,-0.46,-0.46,-0.74,0.95,0.85
39315,-0.17,-0.60,2.91,1.27,-0.14,0.11,-0.03,-0.00,-0.03,0.53,0.60,1.06,1.28,-1.45,-0.03,-0.46,-0.46,-0.74,0.95,-1.17
19899,-0.37,-0.88,1.49,-1.17,-0.14,1.79,-0.77,-0.73,-0.77,-0.26,0.60,0.25,1.28,0.24,-0.77,-0.46,-0.46,-0.74,0.95,0.85
13,0.30,-0.19,-0.64,-0.68,-0.14,-1.57,0.71,0.73,0.71,-0.26,1.36,-1.38,-0.27,-1.45,0.71,-0.46,-0.46,-0.74,0.95,0.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,-0.03,-0.48,0.07,-1.17,-0.14,0.11,-1.50,-1.45,-1.50,-1.04,0.60,-0.56,1.28,-2.30,-1.50,-0.46,-0.36,-0.74,0.95,0.85
3180,-1.58,-0.04,0.07,0.78,-0.14,-1.57,-1.50,-1.45,-1.50,0.53,0.60,1.06,1.28,0.24,-1.50,-0.46,-0.46,-0.74,0.95,0.85
24045,0.98,-0.64,1.49,0.78,-0.14,0.95,0.71,0.73,0.71,1.31,0.60,-2.19,-1.05,-1.45,0.71,-0.46,-0.46,-0.74,0.95,0.85
65331,-0.64,3.06,-0.64,1.27,-0.14,-0.73,-0.03,-0.00,-0.03,-1.04,-0.15,1.06,1.28,-2.30,-0.03,-0.46,-0.41,-0.74,-1.06,0.85


Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Gender_Male
40172,-0.57,-0.50,-0.64,-1.17,-0.14,0.11,0.71,0.73,0.71,-0.26,1.36,0.25,1.28,0.24,0.71,-0.46,-0.46,1.36,-1.06,0.85
39921,-0.50,-0.87,-0.64,-1.17,-0.14,-1.57,1.44,-0.73,1.44,-1.04,0.60,-0.56,0.51,0.24,1.44,-0.46,-0.46,1.36,-1.06,-1.17
1200,2.25,-0.72,-0.64,1.27,-0.14,0.11,-1.50,-0.73,-1.50,-0.26,1.36,1.06,1.28,0.24,-1.50,1.91,3.21,-0.74,0.95,-1.17
39891,-0.50,0.86,-0.64,-1.17,-0.14,-1.57,-1.50,-1.45,-1.50,0.53,0.60,0.25,1.28,1.09,-1.50,-0.46,-0.46,1.36,-1.06,0.85
3481,-1.58,0.41,-0.64,-1.17,-0.14,-0.73,-0.03,-0.73,-0.03,0.53,-1.66,-2.19,1.28,0.24,-0.03,2.94,2.38,-0.74,0.95,-1.17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,2.25,-0.99,-0.64,1.27,-0.14,1.79,1.44,1.45,1.44,0.53,1.36,1.06,-0.27,1.09,1.44,-0.46,-0.46,-0.74,0.95,0.85
63512,-0.03,0.46,-0.64,-1.17,-0.14,-1.57,0.71,0.73,0.71,0.53,1.36,-0.56,-1.83,0.24,0.71,-0.16,0.67,-0.74,-1.06,-1.17
40657,-0.37,0.49,-0.64,1.27,-0.14,0.11,0.71,0.73,0.71,-0.26,0.60,0.25,1.28,0.24,0.71,-0.46,-0.46,1.36,-1.06,0.85
217,-0.91,-0.04,-0.64,0.29,-0.14,0.11,-0.77,0.73,-0.77,1.31,-0.15,-2.19,1.28,-1.45,-0.77,-0.46,-0.46,-0.74,0.95,0.85


22481    Business
11971         Eco
39315         Eco
19899         Eco
13            Eco
           ...   
795           Eco
3180          Eco
24045         Eco
65331    Eco Plus
40281    Business
Name: Class, Length: 2374, dtype: object

40172    Business
39921    Eco Plus
1200          Eco
39891         Eco
3481          Eco
           ...   
772           Eco
63512         Eco
40657    Business
217           Eco
5756          Eco
Name: Class, Length: 594, dtype: object

#### Without any penalty, the train and test prediction accuracies come out to be __0.7810 and 0.7576 respectively__.

In [28]:
# Baseline: one-vs-rest logistic regression fitted without any penalty term.
model = LogisticRegression(
    penalty=None,
    solver='lbfgs',
    multi_class='ovr',
    fit_intercept=True,
)
model.fit(X_train_multi_with_penalty, y_train_multi_with_penalty)

# Mean accuracy on the training split, then on the test split
model.score(X_train_multi_with_penalty, y_train_multi_with_penalty)
model.score(X_test_multi_with_penalty, y_test_multi_with_penalty)

# Fitted feature coefficients: one row per class, one column per feature
model.coef_

# Fitted intercepts (Beta_0): one value per class
model.intercept_

0.780960404380792

0.7575757575757576

array([[-0.47621213,  0.50458457, -0.06448408, -0.33471057,  0.0214228 ,
         0.136524  , -0.02677323,  0.08269893, -0.02677323,  0.19339392,
         0.18297614,  0.44785498,  0.17571178,  0.3999089 , -0.02677323,
        -0.17356831,  0.24310981,  0.83162775, -0.61930582,  0.05412453],
       [ 0.11940852, -0.26717072,  0.09330552,  0.24956706, -0.03542464,
         0.0216197 ,  0.00890864,  0.0291108 ,  0.00890864, -0.17001247,
        -0.09845438, -0.24200546, -0.14589332, -0.23354806,  0.00890864,
         0.08068188, -0.06336826,  0.07606072,  0.92910376, -0.02415577],
       [ 0.10242211, -0.039159  , -0.06795014,  0.05612588, -0.75621926,
        -0.22784554,  0.02064203, -0.15553278,  0.02064203,  0.00927705,
        -0.00599129, -0.05912747,  0.0382984 , -0.02452526,  0.02064203,
        -0.00153895, -0.12096803, -1.36270782, -0.89377005, -0.04915172]])

array([-2.23536051,  0.9030872 , -2.52839323])

#### With the l1 penalty, the train and test prediction accuracies come out to be __0.7772 and 0.7643 respectively__.
#### Compared with the model without any penalty, the training accuracy has dropped slightly while the test accuracy has improved slightly.

In [29]:
# One-vs-rest logistic regression with an l1 penalty (C = 0.1).
# scikit-learn documents `random_state` as being used by the liblinear solver
# to shuffle the data, so a fixed seed is passed to make the fitted
# coefficients and scores repeatable on a fresh run of the notebook.
l1_model = LogisticRegression(
    penalty='l1',
    C=0.1,
    solver='liblinear',
    multi_class='ovr',
    fit_intercept=True,
    random_state=42,
)
l1_model.fit(X_train_multi_with_penalty, y_train_multi_with_penalty)

# Mean accuracy on the training split, then on the test split
l1_model.score(X_train_multi_with_penalty, y_train_multi_with_penalty)
l1_model.score(X_test_multi_with_penalty, y_test_multi_with_penalty)

# Fitted feature coefficients: one row per class, one column per feature
# (several entries are driven to exactly zero under the l1 penalty)
l1_model.coef_

# Fitted intercepts (Beta_0): one value per class
l1_model.intercept_

0.7771693344566133

0.7643097643097643

array([[-3.48250466e-01,  4.35071864e-01,  0.00000000e+00,
        -2.88254651e-01,  0.00000000e+00,  8.80853812e-02,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.66934438e-01,  1.29720733e-01,  3.99516293e-01,
         1.41966291e-01,  3.55856448e-01,  0.00000000e+00,
         0.00000000e+00,  3.44575507e-02,  8.10158281e-01,
        -5.51344658e-01,  8.31079273e-03],
       [ 9.04738546e-02, -2.50367626e-01,  4.58458976e-02,
         2.17872926e-01,  0.00000000e+00,  5.33678910e-04,
         1.17182967e-03,  2.68401076e-02,  0.00000000e+00,
        -1.46933884e-01, -6.77980651e-02, -2.13670681e-01,
        -1.26779288e-01, -2.07502053e-01,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         8.61684640e-01,  0.00000000e+00],
       [ 3.16981244e-02,  0.00000000e+00, -6.30075976e-02,
         0.00000000e+00,  0.00000000e+00, -1.67041123e-01,
         0.00000000e+00, -5.90888088e-02,  0.00000000e+00,
         0.00000000e+00,  0.0

array([-2.01980213,  0.84976158, -2.288445  ])

#### With the l2 penalty, the train and test prediction accuracies come out to be __0.7814 and 0.7626 respectively__.
#### Applying the l2 penalty results in a slight improvement in training accuracy over both the model with the l1 penalty and the model without any penalty.
#### The test accuracy is similar to that of the model with the l1 penalty, and slightly better than that of the model without any penalty.

In [30]:
# One-vs-rest logistic regression with an l2 penalty (C = 0.1).
l2_model = LogisticRegression(
    penalty='l2',
    C=0.1,
    solver='liblinear',
    multi_class='ovr',
    fit_intercept=True,
)
l2_model.fit(X_train_multi_with_penalty, y_train_multi_with_penalty)

# Mean accuracy on the training split, then on the test split
l2_model.score(X_train_multi_with_penalty, y_train_multi_with_penalty)
l2_model.score(X_test_multi_with_penalty, y_test_multi_with_penalty)

# Fitted feature coefficients: one row per class, one column per feature
l2_model.coef_

# Fitted intercepts (Beta_0): one value per class
l2_model.intercept_

0.7813816343723673

0.7626262626262627

array([[-0.38252057,  0.44361472, -0.07361526, -0.32634472,  0.03287364,
         0.12495599, -0.02299086,  0.06673279, -0.02299086,  0.18940882,
         0.16436397,  0.41089333,  0.16568819,  0.36659023, -0.02299086,
        -0.08634753,  0.15032678,  0.77692149, -0.5352716 ,  0.04914887],
       [ 0.12065161, -0.26105232,  0.1018327 ,  0.25260082, -0.03925346,
         0.02070732,  0.00786924,  0.03089209,  0.00786924, -0.1642268 ,
        -0.09309192, -0.22982241, -0.1401279 , -0.22075727,  0.00786924,
         0.06192805, -0.04712102,  0.03423618,  0.85764125, -0.0233684 ],
       [ 0.08098705, -0.01948679, -0.07991317,  0.03393322, -0.13813498,
        -0.19504592,  0.01616941, -0.12862822,  0.01616941,  0.00424908,
        -0.01175292, -0.06820488,  0.02646627, -0.03935987,  0.01616941,
        -0.02735243, -0.07798507, -1.10915329, -0.74178725, -0.03641305]])

array([-1.99740698,  0.86650215, -2.23776556])