# Data: https://www.kaggle.com/teejmahal20/airline-passenger-satisfaction

# Import data

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Full-dataset variant, left disabled: it would read 'Airline1.csv' and
# 'Airline2.csv' and concatenate them into `data` with pd.concat.
#   data1 = pd.read_csv('Airline1.csv')
#   data2 = pd.read_csv('Airline2.csv')
#   data = pd.concat([data1,data2])

# Active load: 'Airline2_tiny.csv', resolved relative to the working directory.
data = pd.read_csv('Airline2_tiny.csv')

In [None]:
data.shape

(4999, 23)

In [None]:
data.tail()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
4994,Female,Loyal Customer,41,Personal Travel,Eco Plus,746,3,5,3,4,...,3,3,3,3,5,3,4,4,28.0,neutral or dissatisfied
4995,Male,Loyal Customer,53,Business travel,Business,3095,3,3,3,3,...,3,3,3,3,3,3,5,0,0.0,satisfied
4996,Male,disloyal Customer,21,Business travel,Eco,125,4,0,4,4,...,5,3,4,4,1,1,5,58,60.0,satisfied
4997,Male,Loyal Customer,59,Business travel,Eco,302,4,3,3,3,...,4,5,2,4,1,2,4,0,0.0,satisfied
4998,Male,disloyal Customer,25,Business travel,Business,738,1,4,2,4,...,3,5,5,5,4,5,3,86,71.0,neutral or dissatisfied


In [None]:
# Feature matrix: every column except the last one (the `satisfaction` target).
# .copy() makes X an independent frame, so the in-place imputation and encoding
# writes further down neither touch `data` nor raise SettingWithCopyWarning.
# NOTE(review): 'Unnamed: 0' looks like a saved row index and is still kept as a
# feature here; confirm whether it should be dropped.
X = data.iloc[:, :-1].copy()

In [None]:
X.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,Female,Loyal Customer,52,Business travel,Eco,160,5,4,3,4,...,3,5,5,5,5,2,5,5,50,44.0
1,Female,Loyal Customer,36,Business travel,Business,2863,1,1,3,1,...,5,4,4,4,4,3,4,5,0,0.0
2,Male,disloyal Customer,20,Business travel,Eco,192,2,0,2,4,...,2,2,4,1,3,2,2,2,0,0.0
3,Male,Loyal Customer,44,Business travel,Business,3377,0,0,0,2,...,4,1,1,1,1,3,1,4,0,6.0
4,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,4,3,...,2,2,2,2,2,4,2,4,0,20.0


In [None]:
# Target vector: the final column of `data`, selected by its label.
y = data[data.columns[-1]]

In [None]:
y.head()

0                  satisfied
1                  satisfied
2    neutral or dissatisfied
3                  satisfied
4                  satisfied
Name: satisfaction, dtype: object

# Handling missing data - Numeric type

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer
# Imputer that replaces NaN entries using the 'mean' strategy.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)

In [None]:
#X.dtypes

In [None]:
# Positional indices of the int64 / float64 columns of X.
numerical_cols = list(np.flatnonzero((X.dtypes == np.int64) | (X.dtypes == np.float64)))

In [None]:
# Learn the fill values from the numeric columns. This runs on all of X,
# before the train/test split further down.
imp_mean.fit(X.iloc[:,numerical_cols])

SimpleImputer()

In [None]:
# Write the imputed values back into the same numeric column positions of X.
X.iloc[:,numerical_cols] = imp_mean.transform(X.iloc[:,numerical_cols])

### Handling missing string data

In [None]:
# Positional indices of the object-dtype (string) columns of X.
string_cols = list(np.flatnonzero(X.dtypes == object))

In [None]:
# Rebinds imp_mean: despite the name, from here on it is an imputer with the
# 'most_frequent' strategy (applied to the string columns), not the mean imputer.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [None]:
# Fit the most-frequent imputer on the string columns of the whole X.
imp_mean.fit(X.iloc[:,string_cols])

SimpleImputer(strategy='most_frequent')

In [None]:
# Write the imputed values back into the same string column positions of X.
X.iloc[:,string_cols] = imp_mean.transform(X.iloc[:,string_cols])

# One Hot encoder method

In [None]:
def OneHotEncoderMethod(indices, data):
    """One-hot encode the columns at `indices`; all other columns pass through.

    Parameters
    ----------
    indices : column selection handed to ColumnTransformer for the encoder step.
    data : the table given to ColumnTransformer.fit_transform.

    Returns
    -------
    Whatever ColumnTransformer.fit_transform produces for `data`.
    """
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.compose import ColumnTransformer

    encoder_step = ('encoder', OneHotEncoder(), indices)
    transformer = ColumnTransformer(transformers=[encoder_step], remainder='passthrough')
    return transformer.fit_transform(data)

# Label encoding method

In [None]:
def LabelEncoderMethod(series):
    """Label encode `series` and return the encoded values.

    A fresh sklearn LabelEncoder is fitted on `series` and used right away to
    transform that same input; the fitted encoder itself is not kept.
    """
    from sklearn.preprocessing import LabelEncoder

    return LabelEncoder().fit_transform(series)

# Label encoding target feature

In [None]:
# Replace the string target with the encoded labels returned by LabelEncoderMethod.
y = LabelEncoderMethod(y)

# Encoding selection for X

In [None]:
def EncodingSelection(X, threshold=10):
    """Encode every object-dtype column of X, picking the encoder by cardinality.

    Columns with exactly 2 distinct values, or with more than `threshold`
    distinct values, are label encoded and replace the original column.
    All remaining object columns are one-hot encoded.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is copied first, so the caller's frame is left
        untouched.
    threshold : int, default 10
        Largest number of distinct values that is still one-hot encoded.

    Returns
    -------
    The result of OneHotEncoderMethod applied to the partly label-encoded copy.
    """
    # Work on a copy: the label-encoding step assigns columns, which would
    # otherwise modify the caller's DataFrame (and raise SettingWithCopyWarning
    # when X is a slice of another frame).
    X = X.copy()

    # Step 01: positions of the string (object dtype) columns
    string_cols = list(np.where(X.dtypes == object)[0])
    one_hot_encoding_indices = []

    # Step 02: label encode binary and high-cardinality columns
    for col in string_cols:
        name = X.columns[col]
        n_categories = len(pd.unique(X[name]))
        if n_categories == 2 or n_categories > threshold:
            X[name] = LabelEncoderMethod(X[name])
        else:
            one_hot_encoding_indices.append(col)

    # Step 03: one-hot encode the remaining string columns
    return OneHotEncoderMethod(one_hot_encoding_indices, X)

In [None]:
# Rebinds X to the encoded output of EncodingSelection (shape (4999, 24) in the
# recorded run). Re-running this cell alone would feed that output back in.
X = EncodingSelection(X)

In [None]:
X.shape

(4999, 24)

# Feature selection

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
# chi2-based scorer; k='all' keeps every feature, so it is used for its scores only.
kbest = SelectKBest(chi2, k='all')

In [None]:
from sklearn import preprocessing
# Min-max scaler, used below to build the temporary matrix that chi2 scores.
MMS = preprocessing.MinMaxScaler()

In [None]:
# Number of top-scoring features to keep.
K_features = 10

In [None]:
# Scaled copy of X for scoring only; X itself is not reassigned here.
# Presumably scaled to [0, 1] because chi2 expects non-negative inputs (TODO confirm).
x_temp = MMS.fit_transform(X)

In [None]:
# NOTE: x_temp is rebound from the scaled matrix to the value returned by
# kbest.fit; the cells below read its scores_ attribute.
x_temp = kbest.fit(x_temp,y)

In [None]:
# Positions of the K_features largest scores (np.argsort sorts ascending, so
# the highest-scoring feature comes last).
best_features = np.argsort(x_temp.scores_)[-K_features:]

In [None]:
best_features

array([ 8,  7, 15, 21, 14, 13,  4,  1,  0,  6], dtype=int64)

In [None]:
# Positions of every feature outside the top K_features (the lowest scores).
# Kept separate from best_features, which holds the selected positions.
features_to_delete = np.argsort(x_temp.scores_)[:-K_features]

In [None]:
# Drop the low-scoring columns from X, leaving K_features columns
# (shape (4999, 10) in the recorded run). Not idempotent: X is overwritten.
X = np.delete(X, features_to_delete, axis=1)

In [None]:
X.shape

(4999, 10)

In [None]:
# Release the fitted selector; nothing below this cell reads it.
del x_temp

# Train test split

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state=1 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [None]:
print(X_train.shape)

(3999, 10)


# Feature scaling

### Standardization: (X - mean(X)) / std(X)

### Normalization: (X - min(X)) / (max(X) - min(X))

In [None]:
from sklearn import preprocessing

In [None]:
# with_mean=False turns off centering, so features are only rescaled, not
# shifted as in the standardization formula in the heading above.
sc = preprocessing.StandardScaler(with_mean=False)

In [None]:
# Fit the scaler on the training split only.
sc.fit(X_train)

StandardScaler(with_mean=False)

In [None]:
# Overwrite X_train with its scaled version (re-running this cell scales it again).
X_train = sc.transform(X_train)

In [None]:
print(X_train.shape)

(3999, 10)


In [None]:
# Scale the test split with the scaler fitted on the training split.
X_test = sc.transform(X_test)

In [None]:
print(X_test.shape)

(1000, 10)


#### The data is ready!!

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

In [None]:
from datetime import datetime

# Wall-clock timestamp printed just before the model sections start.
now = datetime.now()
current_time = format(now, "%H:%M:%S")
print("Current Time =", current_time)

Current Time = 17:26:58


# Building KNN model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

# Candidate values sampled by the randomized search.
params = {'n_neighbors': [4, 5, 6, 7],
          'leaf_size': [1, 3, 5],
          'algorithm': ['auto', 'kd_tree'],
          'weights': ['uniform', 'distance']
          }
cv = StratifiedKFold(n_splits=2)
# random_state pins which candidates are drawn, so a re-run reproduces the
# result (1 is the same seed as the train/test split).
random_search_cv = RandomizedSearchCV(estimator=model,
                                      param_distributions=params, verbose=1, cv=cv,
                                      scoring='f1', n_jobs=-1, random_state=1)

random_search_cv.fit(X_train, y_train)
print("Best Estimator", random_search_cv.best_estimator_)
print("Best score", random_search_cv.best_score_)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:    4.7s remaining:    0.4s


Best Estimator KNeighborsClassifier(leaf_size=1)
Best score 0.8774673417446253


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    5.1s finished


# Building Logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

# One dict per solver group, so only valid solver/penalty pairs are drawn:
# 'newton-cg' and 'lbfgs' are paired with 'l2' only, while 'liblinear' takes
# both 'l1' and 'l2'. A single crossed grid would spend part of the search
# budget on 'l1' candidates that fail to fit with the first two solvers.
params = [
    {'solver': ['newton-cg', 'lbfgs'],
     'penalty': ['l2'],
     'C': [100, 10, 1.0, 0.1, 0.01]},
    {'solver': ['liblinear'],
     'penalty': ['l1', 'l2'],
     'C': [100, 10, 1.0, 0.1, 0.01]},
]

cv = StratifiedKFold(n_splits=2)
# random_state pins which candidates are drawn, so a re-run reproduces the
# result (1 is the same seed as the train/test split).
random_search_cv = RandomizedSearchCV(estimator=model,
                                      param_distributions=params, verbose=1, cv=cv,
                                      scoring='f1', n_jobs=-1, random_state=1)

random_search_cv.fit(X_train, y_train)
print("Best Estimator", random_search_cv.best_estimator_)
print("Best score", random_search_cv.best_score_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished


Fitting 2 folds for each of 10 candidates, totalling 20 fits
Best Estimator LogisticRegression(C=0.01)
Best score 0.8295203666687819


# Building GaussianNB model

In [None]:
from sklearn.naive_bayes import GaussianNB
model_GNB = GaussianNB()
model_GNB.fit(X_train,y_train)
y_pred = model_GNB.predict(X_test)
# Both metrics take (y_true, y_pred). With the arguments reversed the report
# swaps precision and recall and counts support over the predictions.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.84
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       564
           1       0.82      0.81      0.82       436

    accuracy                           0.84      1000
   macro avg       0.84      0.84      0.84      1000
weighted avg       0.84      0.84      0.84      1000



# Building SVM (SVC) model

In [None]:
from sklearn.svm import SVC
model = SVC()

# Candidate values sampled by the randomized search.
params = {
    'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
    'C' : [1,5,10],
    'degree' : [3,8],
    'coef0' : [0.01,10,0.5],
    'gamma' : ['auto','scale']
}

cv = StratifiedKFold(n_splits=2)
# random_state pins which candidates are drawn, so a re-run reproduces the
# result (1 is the same seed as the train/test split).
random_search_cv = RandomizedSearchCV(estimator=model,
                                      param_distributions=params, verbose=1, cv=cv,
                                      scoring='f1', n_jobs=-1, random_state=1)

random_search_cv.fit(X_train, y_train)
print("Best Estimator", random_search_cv.best_estimator_)
print("Best score", random_search_cv.best_score_)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:    1.9s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    6.2s finished


Best Estimator SVC(C=10, coef0=0.5, degree=8)
Best score 0.9165979221213969


# Building Decision tree model

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seeded so repeated runs build the same trees.
model = DecisionTreeClassifier(random_state=1)

# Candidate values sampled by the randomized search.
params = {'max_leaf_nodes': list(range(2, 100)),
          'min_samples_split': [2, 3, 4],
         }

cv = StratifiedKFold(n_splits=2)
# random_state pins which candidates are drawn, so a re-run reproduces the
# result (1 is the same seed as the train/test split).
random_search_cv = RandomizedSearchCV(estimator=model,
                                      param_distributions=params, verbose=1, cv=cv,
                                      scoring='f1', n_jobs=-1, random_state=1)

random_search_cv.fit(X_train, y_train)
print("Best Estimator", random_search_cv.best_estimator_)
print("Best score", random_search_cv.best_score_)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
Best Estimator DecisionTreeClassifier(max_leaf_nodes=30, min_samples_split=4)
Best score 0.9069240944070234


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished


# Building Random Forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded so repeated runs build the same forest.
model = RandomForestClassifier(random_state=1)


# Random-forest search space.
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

cv = StratifiedKFold(n_splits=2)
# Search over param_grid defined above. Passing the global `params` would reuse
# whatever an earlier model cell left behind (the decision-tree space).
# random_state pins which candidates are drawn (same seed as the train/test split).
random_search_cv = RandomizedSearchCV(estimator=model,
                                      param_distributions=param_grid, verbose=1, cv=cv,
                                      scoring='f1', n_jobs=-1, random_state=1)

random_search_cv.fit(X_train, y_train)
print("Best Estimator", random_search_cv.best_estimator_)
print("Best score", random_search_cv.best_score_)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:    1.9s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.9s finished


Best Estimator RandomForestClassifier(max_leaf_nodes=96, min_samples_split=3)
Best score 0.9227615146702333


# Building ADABoost model

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Seeded so repeated runs build the same ensemble.
model = AdaBoostClassifier(random_state=1)

# Candidate values sampled by the randomized search.
params = {
     'n_estimators': np.arange(10,300,10),
     'learning_rate': [0.01, 0.05, 0.1, 1],
 }

cv = StratifiedKFold(n_splits=2)
# random_state pins which candidates are drawn, so a re-run reproduces the
# result (1 is the same seed as the train/test split).
random_search_cv = RandomizedSearchCV(estimator=model,
                                      param_distributions=params, verbose=1, cv=cv,
                                      scoring='f1', n_jobs=-1, random_state=1)

random_search_cv.fit(X_train, y_train)
print("Best Estimator", random_search_cv.best_estimator_)
print("Best score", random_search_cv.best_score_)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:    3.5s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    3.6s finished


Best Estimator AdaBoostClassifier(learning_rate=0.1, n_estimators=200)
Best score 0.8906331862757826


# Building XGBoost model

In [None]:
import xgboost as xgb
model = xgb.XGBClassifier()

# Candidate values sampled by the randomized search.
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'n_estimators': [100,500,1000],
        'learning_rate':[0.01,0.3,0.5,0.1],
        'reg_lambda':[1,2]

        }

cv = StratifiedKFold(n_splits=2)
# random_state pins which candidates are drawn, so a re-run reproduces the
# result (1 is the same seed as the train/test split).
random_search_cv = RandomizedSearchCV(estimator=model,
                                      param_distributions=params, verbose=1, cv=cv,
                                      scoring='f1', n_jobs=-1, random_state=1)

random_search_cv.fit(X_train, y_train)
print("Best Estimator", random_search_cv.best_estimator_)
print("Best score", random_search_cv.best_score_)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:    7.5s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   12.3s finished


Best Estimator XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.6,
              tree_method='exact', validate_parameters=1, verbosity=None)
Best score 0.922052180776655


$\;\;\;\;$ Fill in the hyperparameters found by the search and check the model again.

In [None]:
import xgboost as xgb
# Hyperparameters copied by hand from the best estimator printed by the search.
# NOTE(review): n_jobs=12 is tied to the machine of the recorded run.
model = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=500, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.6,
              tree_method='exact', validate_parameters=1, verbosity=None)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
# Both metrics take (y_true, y_pred). With the arguments reversed the report
# swaps precision and recall and counts support over the predictions.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.937
              precision    recall  f1-score   support

           0       0.96      0.93      0.95       583
           1       0.91      0.94      0.93       417

    accuracy                           0.94      1000
   macro avg       0.93      0.94      0.94      1000
weighted avg       0.94      0.94      0.94      1000



In [None]:
from datetime import datetime

# Wall-clock timestamp printed after the last model cell.
now = datetime.now()
current_time = format(now, "%H:%M:%S")
print("Current Time =", current_time)

Current Time = 17:27:32
