In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, in the order os.walk visits them, and print one path per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Show up to 25 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 25)

In [None]:
# Load the train and test CSV files; the first column of each file is used
# as the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '/kaggle/input/airline-passenger-satisfaction/train.csv',
        '/kaggle/input/airline-passenger-satisfaction/test.csv',
    )
)

In [None]:
# Brief overview of the train dataset: number of rows and columns,
# column names and their dtypes,
# the non-null count of each column,
# and memory usage.
train_df.info()

In [None]:
# Brief overview of the test dataset: number of rows and columns,
# column names and their dtypes,
# the non-null count of each column,
# and memory usage.
test_df.info()

In [None]:
# List all column names of the train dataset (as loaded, before renaming).
train_df.columns

In [None]:
# Rename the train columns to snake_case, following Python naming convention.
# The mapping is spelled out one column per line: original header -> new name.
train_column_names = {
    'Gender': 'gender',
    'Customer Type': 'customer_type',
    'Age': 'age',
    'Type of Travel': 'travel_type',
    'Class': 'class',
    'Flight Distance': 'flight_distance',
    'Inflight wifi service': 'inflight_wifi',
    'Departure/Arrival time convenient': 'departure_n_arrival_time_convenient',
    'Ease of Online booking': 'easy_onlinebooking',
    'Gate location': 'gate_location',
    'Food and drink': 'food_n_drink',
    'Online boarding': 'online_boarding',
    'Seat comfort': 'seat_comfort',
    'Inflight entertainment': 'inflight_entertainment',
    'On-board service': 'onboard_service',
    'Leg room service': 'leg_room_service',
    'Baggage handling': 'baggage_handling',
    'Checkin service': 'checkin_service',
    'Inflight service': 'inflight_service',
    'Cleanliness': 'cleanliness',
    'Departure Delay in Minutes': 'departure_delay_min',
    'Arrival Delay in Minutes': 'arrival_delay_minutes',
    'satisfaction': 'satisfaction',
}
train_df = train_df.rename(columns=train_column_names)

In [None]:
# Rename the test columns to snake_case, following Python naming convention.
# The mapping is spelled out one column per line: original header -> new name.
test_column_names = {
    'Gender': 'gender',
    'Customer Type': 'customer_type',
    'Age': 'age',
    'Type of Travel': 'travel_type',
    'Class': 'class',
    'Flight Distance': 'flight_distance',
    'Inflight wifi service': 'inflight_wifi',
    'Departure/Arrival time convenient': 'departure_n_arrival_time_convenient',
    'Ease of Online booking': 'easy_onlinebooking',
    'Gate location': 'gate_location',
    'Food and drink': 'food_n_drink',
    'Online boarding': 'online_boarding',
    'Seat comfort': 'seat_comfort',
    'Inflight entertainment': 'inflight_entertainment',
    'On-board service': 'onboard_service',
    'Leg room service': 'leg_room_service',
    'Baggage handling': 'baggage_handling',
    'Checkin service': 'checkin_service',
    'Inflight service': 'inflight_service',
    'Cleanliness': 'cleanliness',
    'Departure Delay in Minutes': 'departure_delay_min',
    'Arrival Delay in Minutes': 'arrival_delay_minutes',
    'satisfaction': 'satisfaction',
}
test_df = test_df.rename(columns=test_column_names)

In [None]:
# Preview the first 5 rows of the train dataset.
train_df.head()

In [None]:
# Preview the first 5 rows of the test dataset.
test_df.head()

In [None]:
# (rows, columns) of the train dataset, followed by that of the test dataset
train_df.shape, test_df.shape

In [None]:
# Count the train rows that are exact duplicates of an earlier row.
train_df.duplicated().sum()

In [None]:
# Share of each satisfaction class in the train dataset, as a percentage
# rounded to two decimals.
train_df['satisfaction'].value_counts(normalize=True).mul(100).round(2)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many train rows fall into each satisfaction class.
sns.set_theme(style='darkgrid')
class_balance_fig, class_balance_ax = plt.subplots()
sns.countplot(data=train_df, x='satisfaction', ax=class_balance_ax)
plt.show()

#### converting object dtype columns to numeric dtype

In [None]:
# Names of every train column stored with the object dtype.
colname = train_df.select_dtypes(include='object').columns.tolist()
colname

#### Checking whether the object dtype columns of the train and test datasets have the same unique values

In [None]:
# Unique values of `gender` in the train dataset and how many rows have each.
train_df['gender'].value_counts()

In [None]:
# True when train and test contain exactly the same set of `gender` values.
train_gender_levels = train_df['gender'].value_counts().sort_index().index.tolist()
test_gender_levels = test_df['gender'].value_counts().sort_index().index.tolist()
train_gender_levels == test_gender_levels

In [None]:
# Unique values of `customer_type` in the train dataset and how many rows have each.
train_df['customer_type'].value_counts()

In [None]:
# True when train and test contain exactly the same set of `customer_type` values.
train_customer_levels = train_df['customer_type'].value_counts().sort_index().index.tolist()
test_customer_levels = test_df['customer_type'].value_counts().sort_index().index.tolist()
train_customer_levels == test_customer_levels

In [None]:
# Unique values of `travel_type` in the train dataset and how many rows have each.
train_df['travel_type'].value_counts()

In [None]:
# True when train and test contain exactly the same set of `travel_type` values.
train_travel_levels = train_df['travel_type'].value_counts().sort_index().index.tolist()
test_travel_levels = test_df['travel_type'].value_counts().sort_index().index.tolist()
train_travel_levels == test_travel_levels

In [None]:
# Unique values of `class` in the train dataset and how many rows have each.
train_df['class'].value_counts()

In [None]:
# True when train and test contain exactly the same set of `class` values.
train_class_levels = train_df['class'].value_counts().sort_index().index.tolist()
test_class_levels = test_df['class'].value_counts().sort_index().index.tolist()
train_class_levels == test_class_levels

In [None]:
# Unique values of `satisfaction` in the train dataset and how many rows have each.
train_df['satisfaction'].value_counts()

In [None]:
# True when train and test contain exactly the same set of `satisfaction` values.
train_target_levels = train_df['satisfaction'].value_counts().sort_index().index.tolist()
test_target_levels = test_df['satisfaction'].value_counts().sort_index().index.tolist()
train_target_levels == test_target_levels

#### Finding the unique values and checking that they are the same in the train and test datasets. If the two datasets did not share the same unique values, converting the object dtype columns to numeric dtype separately would assign inconsistent codes to these columns, which may affect the prediction results of the model. Although in this dataset every object dtype column has the same unique values in both train and test, it is still ideal to combine train and test, perform the conversion of object dtype to numeric dtype once, and then separate them into train and test datasets again.

In [None]:
# Flag each row with its origin (1 = train, 0 = test) so the two sets can be
# separated again after they have been encoded together.
train_df['training_set'] = 1
test_df['training_set'] = 0

# Stack train on top of test; ignore_index=True gives the combined frame a
# fresh 0..n-1 row index in the same step.
frame = [train_df, test_df]
temp_df = pd.concat(frame, ignore_index=True)

In [None]:
# Replace each text column of the combined frame with its integer category
# code, so train and test rows share one consistent encoding.
for encoded_column in ('gender', 'customer_type', 'travel_type', 'class', 'satisfaction'):
    temp_df[encoded_column] = temp_df[encoded_column].astype('category').cat.codes

In [None]:
# Recover the train rows (training_set == 1) from the combined frame, drop the
# helper flag and renumber the rows from 0.
# Written as one chained expression that returns a new frame: filtering
# temp_df and then dropping a column in place on the filtered result is the
# pattern pandas reports with SettingWithCopyWarning.
train_df = (
    temp_df.loc[temp_df['training_set'] == 1]
    .drop(columns=['training_set'])
    .reset_index(drop=True)
)

In [None]:
# Recover the test rows (training_set == 0) from the combined frame, drop the
# helper flag and renumber the rows from 0.
# Written as one chained expression that returns a new frame: filtering
# temp_df and then dropping a column in place on the filtered result is the
# pattern pandas reports with SettingWithCopyWarning.
test_df = (
    temp_df.loc[temp_df['training_set'] == 0]
    .drop(columns=['training_set'])
    .reset_index(drop=True)
)

In [None]:
# Re-check the train dataset's dtypes and non-null counts after encoding.
train_df.info()

In [None]:
# Re-check the test dataset's dtypes and non-null counts after encoding.
test_df.info()

In [None]:
# Basic descriptive statistics for the columns of the train dataset.
train_df.describe()

In [None]:
# Drop the id column: it is not a useful independent variable for training the
# model or for prediction.
# New frames are returned instead of mutating in place, and errors='ignore'
# lets this cell be re-run after `id` has already been removed (an in-place
# drop raises KeyError on the second run).
train_df = train_df.drop(columns=['id'], errors='ignore')
test_df = test_df.drop(columns=['id'], errors='ignore')

In [None]:
# normalizing the data with MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Names of the independent variables: every train column except the target.
input_colname = train_df.columns.drop('satisfaction').tolist()
input_colname

In [None]:
# Fit the min-max scaler on the train features and build the scaled frame.
train_features = train_df.drop(columns=['satisfaction'])

minmax_scaler = MinMaxScaler()
# fit_transform returns a plain array, so the source column names and row
# index are restored explicitly. Keeping the index guarantees that the target
# assigned below lines up row for row, rather than relying on train_df
# already having a 0..n-1 index.
minmaxtrain_df = pd.DataFrame(
    minmax_scaler.fit_transform(train_features),
    columns=train_features.columns,
    index=train_features.index,
)
# The target is attached unscaled.
minmaxtrain_df['satisfaction'] = train_df['satisfaction']
minmaxtrain_df.head()

In [None]:
# Scale the test features with the scaler already fitted on the train data
# (transform only, no refit).
test_features = test_df.drop(columns=['satisfaction'])
# transform returns a plain array, so the source column names and row index
# are restored explicitly. Keeping the index guarantees that the target
# assigned below lines up row for row, rather than relying on test_df already
# having a 0..n-1 index.
minmaxtest_df = pd.DataFrame(
    minmax_scaler.transform(test_features),
    columns=test_features.columns,
    index=test_features.index,
)
# The target is attached unscaled.
minmaxtest_df['satisfaction'] = test_df['satisfaction']
minmaxtest_df.head()

In [None]:
# Pairwise correlation between all columns of the scaled train frame
# (features and target), drawn as an annotated heatmap.
feature_corr = minmaxtrain_df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(25, 20))
sns.heatmap(feature_corr, annot=True, cmap='RdYlGn', ax=heatmap_ax)
plt.show()

In [None]:
# Mean of every (scaled) independent variable within each satisfaction class,
# to see how the features differ between the classes.
minmaxtrain_df.groupby('satisfaction').mean()

In [None]:
# Training target y (dependent variable) and feature matrix X (independent
# variables), both taken from the scaled train frame.
y = minmaxtrain_df['satisfaction']
X = minmaxtrain_df.drop(columns=['satisfaction'])

In [None]:
# Test target and feature matrix, both taken from the scaled test frame.
y_test = minmaxtest_df['satisfaction']
X_test = minmaxtest_df.drop(columns=['satisfaction'])

In [None]:
# training XGBClassifier

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# Base XGBoost classifier: binary logistic objective, with a fixed
# random_state so repeated runs use the same seed.
xgbc_model = XGBClassifier(objective = 'binary:logistic', random_state = 42)

In [None]:
# Hyper-parameter grid for the search. Each list holds a single candidate, so
# the grid contains exactly one parameter combination.
parameters = dict(
    max_depth=[8],
    n_estimators=[500],
    learning_rate=[0.01],
)

In [None]:
# Cross-validated grid search over `parameters` for the XGBoost classifier.
gs = GridSearchCV(
    estimator=xgbc_model,
    param_grid=parameters,
    # -1 runs on every available core. The previous hard-coded 10 workers
    # oversubscribes any host with fewer cores.
    n_jobs=-1,
    cv=10,  # 10-fold cross-validation
    verbose=True,
)

In [None]:
# Run the cross-validated search on the scaled train features and target.
gs.fit(X,y)

In [None]:
# Predict the satisfaction class for every row of the test features.
pred = gs.predict(X_test)

In [None]:
# Model score on the test set. No `scoring` was passed to the search, so this
# is the estimator's own default score (presumably accuracy for a classifier).
gs.score(X_test, y_test)

In [None]:
# Unpack the flattened 2x2 confusion matrix into its four cells and report
# each one on its own line.
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

for caption, cell_count in (
    ("True Positive", tp),
    ("True Negative", tn),
    ("False Positive", fp),
    ("False Negative", fn),
):
    print(caption, cell_count)

In [None]:
# Accuracy of the predictions against the true test labels.
accuracy_score(y_test, pred)

In [None]:
# Text report of the per-class classification metrics on the test set.
print(classification_report(y_test, pred))