# Data: https://www.kaggle.com/teejmahal20/airline-passenger-satisfaction

# Import data

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Read the two CSV parts of the airline satisfaction dataset, in file order.
data1, data2 = (pd.read_csv(csv_name) for csv_name in ('Airline1.csv', 'Airline2.csv'))

In [None]:
# Stack the two files into a single frame.
# ignore_index=True rebuilds a unique 0..n-1 index; without it each file keeps
# its own 0-based row labels, so the combined index contains duplicates and
# label-based lookups (data.loc[i]) would return more than one row.
data = pd.concat([data1, data2], ignore_index=True)

In [None]:
data.shape

(129880, 25)

In [None]:
data.tail()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
25971,25971,78463,Male,disloyal Customer,34,Business travel,Business,526,3,3,...,4,3,2,4,4,5,4,0,0.0,neutral or dissatisfied
25972,25972,71167,Male,Loyal Customer,23,Business travel,Business,646,4,4,...,4,4,5,5,5,5,4,0,0.0,satisfied
25973,25973,37675,Female,Loyal Customer,17,Personal Travel,Eco,828,2,5,...,2,4,3,4,5,4,2,0,0.0,neutral or dissatisfied
25974,25974,90086,Male,Loyal Customer,14,Business travel,Business,1127,3,3,...,4,3,2,5,4,5,4,0,0.0,satisfied
25975,25975,34799,Female,Loyal Customer,42,Personal Travel,Eco,264,2,5,...,1,1,2,1,1,1,1,0,0.0,neutral or dissatisfied


In [None]:
# Features = every column except the last one (the target).
# .copy() makes X an independent frame: later cells write into X with .iloc,
# and writing into a slice of `data` raises SettingWithCopyWarning and may or
# may not propagate to `data`.
# NOTE(review): the leading 'Unnamed: 0.1', 'Unnamed: 0' and 'id' columns shown
# by data.tail() look like row identifiers rather than features - confirm
# whether they should be dropped here.
X = data.iloc[:, :-1].copy()

In [None]:
X.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,5,4,3,4,4,5,5,25,18.0
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,1,5,3,1,4,1,1,6.0
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,5,4,3,4,4,4,5,0,0.0
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,2,5,3,1,4,2,11,9.0
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,5,3,3,4,4,3,3,3,0,0.0


In [None]:
# Target = the last column of the frame, selected through its column label.
y = data[data.columns[-1]]

In [None]:
y.head()

0    neutral or dissatisfied
1    neutral or dissatisfied
2                  satisfied
3    neutral or dissatisfied
4                  satisfied
Name: satisfaction, dtype: object

# Handling missing data - Numeric type

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer
# Imputer for the numeric columns: NaN cells are replaced by the column mean.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)

In [None]:
#X.dtypes

In [None]:
# Positional indices of the int64 / float64 columns (consumed by .iloc below).
numerical_cols = list(np.flatnonzero((X.dtypes == np.int64) | (X.dtypes == np.float64)))

In [None]:
# Learn the per-column means used to fill missing numeric values.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split performed later in this notebook, so rows that end up in
# the test set contribute to the fitted statistics.
imp_mean.fit(X.iloc[:,numerical_cols])

SimpleImputer()

In [None]:
# Write the imputed values back into the numeric columns of X (positional,
# in-place assignment).
X.iloc[:,numerical_cols] = imp_mean.transform(X.iloc[:,numerical_cols])

### Handling missing string data

In [None]:
# Positional indices of the object-dtype (string) columns.
string_cols = list(np.flatnonzero(X.dtypes == object))

In [None]:
# Imputer for the string columns: NaN cells get the column's most frequent value.
# NOTE(review): this rebinds the name `imp_mean` (previously the mean imputer)
# although the strategy here is 'most_frequent'; the mean imputer is no longer
# reachable after this cell.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [None]:
# Learn the most frequent value of each string column.
# NOTE(review): fitted on the full dataset, i.e. before the train/test split
# performed later in this notebook.
imp_mean.fit(X.iloc[:,string_cols])

SimpleImputer(strategy='most_frequent')

In [None]:
# Write the imputed values back into the string columns of X (positional,
# in-place assignment).
X.iloc[:,string_cols] = imp_mean.transform(X.iloc[:,string_cols])

# One Hot encoder method

In [None]:
def OneHotEncoderMethod(indices, data):
    """One-hot encode the selected columns of ``data`` and pass the rest through.

    Parameters
    ----------
    indices : list of int
        Positional indices of the columns to one-hot encode.
    data : pandas.DataFrame or 2-D array-like
        Table holding both the columns to encode and the columns to keep.

    Returns
    -------
    numpy.ndarray
        Dense 2-D array with the one-hot columns first, followed by the
        remaining (untouched) columns.
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder

    # OneHotEncoder emits a sparse matrix, and by default ColumnTransformer
    # keeps the stacked result sparse whenever its density is below 0.3.
    # The rest of this notebook (MinMaxScaler, np.delete) needs a dense array,
    # so sparse_threshold=0 forces a dense result regardless of the data.
    column_transformer = ColumnTransformer(
        [('encoder', OneHotEncoder(), indices)],
        remainder='passthrough',
        sparse_threshold=0,
    )
    return column_transformer.fit_transform(data)

# Label encoding method

In [None]:
def LabelEncoderMethod(series):
    """Encode the values of ``series`` as integer class codes.

    A fresh ``LabelEncoder`` is fitted on ``series`` and immediately applied
    to it; the fitted encoder itself is not returned.

    Parameters
    ----------
    series : array-like of shape (n_samples,)
        Values to encode.

    Returns
    -------
    numpy.ndarray
        Integer code of each element of ``series``.
    """
    from sklearn.preprocessing import LabelEncoder

    return LabelEncoder().fit_transform(series)

# Label encoding target feature

In [None]:
# Replace the string target by the integer codes returned by LabelEncoderMethod.
# Note: this rebinds y, so re-running the cell encodes the already-encoded codes.
y = LabelEncoderMethod(y)

# Encoding selection for X

In [None]:
def EncodingSelection(X, threshold=10):
    """Encode every string column of ``X``, choosing the encoding per column.

    A string column is label-encoded when it has exactly 2 distinct values or
    more than ``threshold`` distinct values; every other string column is
    one-hot encoded.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is not modified; the encoding is applied to a copy.
    threshold : int, default 10
        Maximum number of distinct values for which a column is still one-hot
        encoded.

    Returns
    -------
    The output of ``OneHotEncoderMethod``: the one-hot columns followed by the
    remaining columns of the (label-encoded) table.
    """
    # Work on a copy: assigning encoded columns directly into the argument
    # would silently alter the caller's DataFrame (and leave it half-encoded
    # if a later step raises).
    X = X.copy()

    # Step 01: select the string columns (positional indices).
    string_cols = list(np.where(X.dtypes == object)[0])
    one_hot_encoding_indices = []

    # Step 02: label-encode columns with exactly 2 categories or with more
    # categories than the threshold; remember the others for one-hot encoding.
    for col in string_cols:
        col_name = X.columns[col]
        n_categories = len(pd.unique(X[col_name]))
        if n_categories == 2 or n_categories > threshold:
            X[col_name] = LabelEncoderMethod(X[col_name])
        else:
            one_hot_encoding_indices.append(col)

    # Step 03: one-hot encode the remaining string columns.
    return OneHotEncoderMethod(one_hot_encoding_indices, X)

In [None]:
# Replace X by its encoded form (the return value of EncodingSelection).
# From here on X is whatever OneHotEncoderMethod returns, no longer the
# original DataFrame, so column names are not available downstream.
X = EncodingSelection(X)

In [None]:
X.shape

(129880, 26)

# Feature selection

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
# Chi-squared scorer. k='all' keeps every feature; later cells only read the
# fitted `scores_` and do the actual selection by hand.
kbest = SelectKBest(score_func=chi2, k='all')

In [None]:
from sklearn import preprocessing
# Min-max scaler applied before scoring.
# Presumably used because chi2 only accepts non-negative feature values.
MMS = preprocessing.MinMaxScaler()

In [None]:
# Number of highest-scoring features to keep.
K_features = 10

In [None]:
# Temporary min-max scaled copy of X, used only for feature scoring.
# NOTE(review): fitted on the full dataset, before the train/test split
# performed later in this notebook.
x_temp = MMS.fit_transform(X)

In [None]:
# Score every feature against the target.
# Note: `x_temp` is rebound here from the scaled matrix to the value returned
# by kbest.fit(); the following cells read `x_temp.scores_` from it.
# NOTE(review): the scores use the labels of all rows, including those that
# later end up in the test split.
x_temp = kbest.fit(x_temp,y)

In [None]:
# Column indices of the K_features highest scores; argsort is ascending, so
# the best-scoring feature is the last element.
best_features = np.argsort(x_temp.scores_)[-K_features:]

In [None]:
best_features

array([10,  2, 16,  9, 17, 15,  6,  1,  0,  8], dtype=int64)

In [None]:
# Column indices of every feature outside the top K_features (lowest score first).
# Deliberately assigned to `features_to_delete` only: `best_features` must keep
# holding the selected top-K indices, not the discarded ones.
features_to_delete = np.argsort(x_temp.scores_)[:-K_features]

In [None]:
# Drop the low-scoring columns (axis=1) so that only the selected ones remain.
# Note: this rebinds X, so the cell is not idempotent - re-running it deletes
# columns of the already-reduced matrix.
X = np.delete(X, features_to_delete, axis=1)

In [None]:
X.shape

(129880, 10)

In [None]:
del x_temp

# Train test split

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [None]:
print(X_train.shape)

(103904, 10)


# Feature scaling

### Standardization: (X - mean(X)) / std(X)

### Normalization: (x - min(x)) / (max(x) - min(x))

In [None]:
from sklearn import preprocessing

In [None]:
# Scaler for the model inputs.
# with_mean=False disables centring, so features are only divided by their
# standard deviation.
# NOTE(review): this differs from the "(X - mean(X)) / std(X)" formula in the
# heading above - confirm that skipping the mean subtraction is intended.
sc = preprocessing.StandardScaler(with_mean=False)

In [None]:
# Fit the scaler on the training split only.
sc.fit(X_train)

StandardScaler(with_mean=False)

In [None]:
# Scale the training features with the scaler fitted on the training split.
# Note: rebinds X_train, so re-running this cell scales the data a second time.
X_train = sc.transform(X_train)

In [None]:
print(X_train.shape)

(103904, 10)


In [None]:
# Apply the training-fitted scaler to the test features (no refit).
# Note: rebinds X_test, so re-running this cell scales the data a second time.
X_test = sc.transform(X_test)

In [None]:
print(X_test.shape)

(25976, 10)


#### The data is ready!!

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

#Note that in binary classification,
#recall of the positive class is also known as “sensitivity”;
#recall of the negative class is “specificity”.

# Building KNN model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier voting over the 10 closest training samples.
knnClassifier = KNeighborsClassifier(n_neighbors=10)
knnClassifier.fit(X_train,y_train)
y_pred = knnClassifier.predict(X_test)
# sklearn metrics expect (y_true, y_pred) in that order. Accuracy is symmetric,
# but with the arguments swapped classification_report exchanges precision and
# recall and reports the support of the predictions instead of the true labels.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.932668617185094
              precision    recall  f1-score   support

           0       0.96      0.92      0.94     15395
           1       0.90      0.94      0.92     10581

    accuracy                           0.93     25976
   macro avg       0.93      0.93      0.93     25976
weighted avg       0.93      0.93      0.93     25976



# Building Logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default settings.
LRM = LogisticRegression()
LRM.fit(X_train,y_train)
y_pred = LRM.predict(X_test)
# sklearn metrics expect (y_true, y_pred) in that order; swapping them
# exchanges precision and recall in the report and reports the support of the
# predictions instead of the true labels.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8557129658145981
              precision    recall  f1-score   support

           0       0.88      0.87      0.87     15068
           1       0.82      0.84      0.83     10908

    accuracy                           0.86     25976
   macro avg       0.85      0.85      0.85     25976
weighted avg       0.86      0.86      0.86     25976



# Building GaussianNB model

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with scikit-learn's default settings.
model_GNB = GaussianNB()
model_GNB.fit(X_train,y_train)
y_pred = model_GNB.predict(X_test)
# sklearn metrics expect (y_true, y_pred) in that order; swapping them
# exchanges precision and recall in the report and reports the support of the
# predictions instead of the true labels.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.828688019710502
              precision    recall  f1-score   support

           0       0.84      0.85      0.85     14662
           1       0.81      0.80      0.80     11314

    accuracy                           0.83     25976
   macro avg       0.83      0.82      0.83     25976
weighted avg       0.83      0.83      0.83     25976



# Building SVM (SVC) model

In [None]:
from sklearn.svm import SVC
# Support vector classifier with scikit-learn's default settings.
model_SVC = SVC()
model_SVC.fit(X_train,y_train)
y_pred = model_SVC.predict(X_test)
# sklearn metrics expect (y_true, y_pred) in that order; swapping them
# exchanges precision and recall in the report and reports the support of the
# predictions instead of the true labels.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9325916230366492
              precision    recall  f1-score   support

           0       0.95      0.93      0.94     15033
           1       0.91      0.93      0.92     10943

    accuracy                           0.93     25976
   macro avg       0.93      0.93      0.93     25976
weighted avg       0.93      0.93      0.93     25976



# Building Decision tree model

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree limited to 25 leaves; a node needs at least 4 samples to be split.
model_DTC = DecisionTreeClassifier(max_leaf_nodes=25, min_samples_split=4, random_state=42)
model_DTC.fit(X_train,y_train)
y_pred = model_DTC.predict(X_test)
# sklearn metrics expect (y_true, y_pred) in that order; swapping them
# exchanges precision and recall in the report and reports the support of the
# predictions instead of the true labels.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9256621496766245
              precision    recall  f1-score   support

           0       0.95      0.92      0.94     15213
           1       0.90      0.93      0.91     10763

    accuracy                           0.93     25976
   macro avg       0.92      0.93      0.92     25976
weighted avg       0.93      0.93      0.93     25976



# Building Random Forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Forest of 500 trees, each at most 5 levels deep.
# random_state pins the bootstrap / feature sampling so that the printed
# scores are reproducible across runs (same seed as the decision tree cell).
model_RFC = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=42)
model_RFC.fit(X_train,y_train)
y_pred = model_RFC.predict(X_test)
# sklearn metrics expect (y_true, y_pred) in that order; swapping them
# exchanges precision and recall in the report and reports the support of the
# predictions instead of the true labels.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9181937172774869
              precision    recall  f1-score   support

           0       0.93      0.93      0.93     14837
           1       0.90      0.91      0.90     11139

    accuracy                           0.92     25976
   macro avg       0.92      0.92      0.92     25976
weighted avg       0.92      0.92      0.92     25976



# Building AdaBoost model

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with scikit-learn's default base estimator and number of rounds.
# random_state pins the estimator's randomness so that the printed scores are
# reproducible across runs (same seed as the decision tree cell).
model_ABC = AdaBoostClassifier(random_state=42)
model_ABC.fit(X_train,y_train)
y_pred = model_ABC.predict(X_test)
# sklearn metrics expect (y_true, y_pred) in that order; swapping them
# exchanges precision and recall in the report and reports the support of the
# predictions instead of the true labels.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9101863258392362
              precision    recall  f1-score   support

           0       0.93      0.92      0.92     14977
           1       0.89      0.90      0.89     10999

    accuracy                           0.91     25976
   macro avg       0.91      0.91      0.91     25976
weighted avg       0.91      0.91      0.91     25976



# Building XGBoost model

In [None]:
import xgboost as xgb
# Gradient-boosted trees with XGBoost's default settings.
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train,y_train)
y_pred = model_xgb.predict(X_test)
# sklearn metrics expect (y_true, y_pred) in that order; swapping them
# exchanges precision and recall in the report and reports the support of the
# predictions instead of the true labels.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



0.9410994764397905
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     15122
           1       0.92      0.94      0.93     10854

    accuracy                           0.94     25976
   macro avg       0.94      0.94      0.94     25976
weighted avg       0.94      0.94      0.94     25976

