In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

from tabulate import tabulate
from klib import data_cleaning


In [2]:
df = pd.read_csv('airline_passenger_satisfaction.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  object 
 3   Customer Type                      103904 non-null  object 
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  object 
 6   Class                              103904 non-null  object 
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      1039

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


# Data PreProcessing - missing values

In [3]:
df.drop(['Unnamed: 0','id'], axis=1, inplace=True)

In [4]:
df['satisfaction'] = df['satisfaction'].map({
    'neutral or dissatisfied': 0, 
    'satisfied': 1})

In [5]:
for i in df.columns:
    if df[i].isnull().any():
        if df[i].dtype == 'object':
            df[i].fillna(df[i].mode()[0], inplace=True)
        else:
            df[i].fillna(df[i].mean(), inplace=True)

In [6]:
df.isnull().sum()

Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Inflight wifi service                0
Departure/Arrival time convenient    0
Ease of Online booking               0
Gate location                        0
Food and drink                       0
Online boarding                      0
Seat comfort                         0
Inflight entertainment               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Inflight service                     0
Cleanliness                          0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
satisfaction                         0
dtype: int64

In [7]:
cat_cols = df.select_dtypes(include=['object']).columns

for col in cat_cols:
    print(col, df[col].unique())

Gender ['Male' 'Female']
Customer Type ['Loyal Customer' 'disloyal Customer']
Type of Travel ['Personal Travel' 'Business travel']
Class ['Eco Plus' 'Business' 'Eco']


In [8]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             103904 non-null  object 
 1   Customer Type                      103904 non-null  object 
 2   Age                                103904 non-null  int64  
 3   Type of Travel                     103904 non-null  object 
 4   Class                              103904 non-null  object 
 5   Flight Distance                    103904 non-null  int64  
 6   Inflight wifi service              103904 non-null  int64  
 7   Departure/Arrival time convenient  103904 non-null  int64  
 8   Ease of Online booking             103904 non-null  int64  
 9   Gate location                      103904 non-null  int64  
 10  Food and drink                     103904 non-null  int64  
 11  Online boarding                    1039

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,...,5,4,3,4,4,5,5,25,18.0,0
1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,...,1,1,5,3,1,4,1,1,6.0,0
2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,...,5,4,3,4,4,4,5,0,0.0,1
3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,...,2,2,5,3,1,4,2,11,9.0,0
4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,...,3,3,4,4,3,3,3,0,0.0,1


# Encoding

In [9]:
cat_cols = df.select_dtypes(include=['object']).columns
encoder = LabelEncoder()
for col in cat_cols:
    cardinality = df[col].nunique()
    if cardinality <= 4:
        # one-hot encoding
        dummies = pd.get_dummies(df[col], prefix=col, dtype=int)
        # drop the original column and merge the dummy columns
        df = pd.concat([df.drop(col, axis=1), dummies], axis=1)
    else:
        # Apply label encoding
        df[col] = encoder.fit_transform(df[col])


# Scaling

In [10]:
scaler = StandardScaler()
cols = df.columns[df.columns != 'satisfaction']
df[cols] = scaler.fit_transform(df[cols])
df.head()

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,...,satisfaction,Gender_Female,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus
0,-1.745279,-0.731539,0.203579,0.616172,0.173776,-1.547323,1.352264,-0.185532,1.183099,1.231704,...,0,-1.015031,1.015031,0.472767,-0.472767,-1.490614,1.490614,-0.956906,-0.904327,3.586776
1,-0.95136,-0.957184,0.203579,-0.695245,0.173776,0.018094,-1.656326,-0.185532,-1.849315,-1.769081,...,0,-1.015031,1.015031,-2.115208,2.115208,0.670865,-0.670865,1.045034,-0.904327,-0.278802
2,-0.8852,-0.047584,-0.549533,-0.695245,-0.54106,-0.764614,1.352264,1.296496,1.183099,1.231704,...,1,0.985192,-0.985192,0.472767,-0.472767,0.670865,-0.670865,1.045034,-0.904327,-0.278802
3,-0.95136,-0.629246,-0.549533,1.27188,1.603448,1.583511,-0.904178,-0.926545,-1.091211,-1.018885,...,0,0.985192,-0.985192,0.472767,-0.472767,0.670865,-0.670865,1.045034,-0.904327,-0.278802
4,1.430397,-0.978244,0.203579,-0.039537,0.173776,0.018094,0.600117,1.296496,1.183099,-0.268688,...,1,-1.015031,1.015031,0.472767,-0.472767,0.670865,-0.670865,1.045034,-0.904327,-0.278802


# Model training

In [11]:
x = df.copy()
y = df['satisfaction']
x.drop(columns=['satisfaction'], inplace=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

# Model Selection - Decision Tree Classifier

In [12]:
dt_model = DecisionTreeClassifier()
dt_model.fit(x_train, y_train)

In [13]:
y_pred = dt_model.predict(x_val)
cr_report = classification_report(y_val, y_pred)
print(classification_report(y_val, y_pred))
ac_score = accuracy_score(y_val, y_pred)
print('accuracy score: ', accuracy_score(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95      5862
           1       0.94      0.94      0.94      4529

    accuracy                           0.95     10391
   macro avg       0.95      0.95      0.95     10391
weighted avg       0.95      0.95      0.95     10391

accuracy score:  0.9468771051871813


# Model Selection - Random Forest Classifier

In [14]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)


In [15]:
y_pred = rf.predict(x_val)
cr_report1 = classification_report(y_val, y_pred)
print(cr_report1)
ac_score1 = accuracy_score(y_val, y_pred)
print('accuracy score: ', accuracy_score(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      5862
           1       0.98      0.94      0.96      4529

    accuracy                           0.96     10391
   macro avg       0.97      0.96      0.96     10391
weighted avg       0.96      0.96      0.96     10391

accuracy score:  0.964392262534886


In [16]:
data = [
  ["Decision Tree Classifier", ac_score],
  ["Random Forest Classifier", ac_score1]
]


headers = ['Model', 'accuracy score']

In [17]:
	
table = tabulate(data, headers, tablefmt='grid')
print(table)

+--------------------------+------------------+
| Model                    |   accuracy score |
| Decision Tree Classifier |         0.946877 |
+--------------------------+------------------+
| Random Forest Classifier |         0.964392 |
+--------------------------+------------------+


# Hyperparameter tuning

In [27]:
param_grid={
    'max_depth':[2,5,10,None],
    'min_samples_split':[2,5,10],
    'min_samples_leaf':[1,3,5],
    'max_features':['auto','sqrt','log','None']

}

In [28]:
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid,
                           cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

540 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
101 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/user/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/user/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Users/user/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/user/Library/Python/3.9/lib/python/site-packages/sklearn/utils/_param_validation.py", line 95, in va

In [29]:
print("Best parameters:", grid_search.best_params_)

Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10}


In [30]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_val)
ac_after = accuracy_score(y_val, y_pred)
print('accuracy score: ', ac_after)

accuracy score:  0.9446636512366471


# Randomized Search

In [24]:
random_search = RandomizedSearchCV(
    estimator=dt_model,
    param_distributions=param_grid,
    n_iter=10, # number of iterations
    verbose=1,
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(x_train, y_train) # 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


35 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
11 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/user/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/user/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Users/user/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/user/Library/Python/3.9/lib/python/site-packages/sklearn/utils/_param_validation.py", line 95, in valid

In [25]:
random_model_dt = random_search.best_estimator_ # get the best model from the random search
y_pred=random_model_dt.predict(x_test) # predict the test data using the best model
accuracy_score_dt = random_model_dt.score(x_test, y_test)
print("Accuracy Score: ", accuracy_score_dt)

Accuracy Score:  0.9340712223291626


In [32]:
data = [
  ["Decision Tree Classifier", ac_score],
  ["Random Forest Classifier", ac_score1],
  ["Best Decision Tree Classifier", ac_after],
  ["Random Search Decision Tree Classifier", accuracy_score_dt]
]
headers = ['Model', 'accuracy score']
sorted_data = sorted(data, key=lambda x: x[1], reverse=True)
data = sorted_data
table = tabulate(data, headers, tablefmt='grid')
print(table)

+----------------------------------------+------------------+
| Model                                  |   accuracy score |
| Random Forest Classifier               |         0.964392 |
+----------------------------------------+------------------+
| Decision Tree Classifier               |         0.946877 |
+----------------------------------------+------------------+
| Best Decision Tree Classifier          |         0.944664 |
+----------------------------------------+------------------+
| Random Search Decision Tree Classifier |         0.934071 |
+----------------------------------------+------------------+
