In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Loading necessary libraries
import pandas as pd
import numpy as np

import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot, download_plotlyjs
init_notebook_mode(connected = True)

In [None]:
# Folder of the Kaggle dataset and the two CSV files that are read from it.
directory = '../input/airline-passenger-satisfaction/'
files = ['train.csv', 'test.csv']
train_csv, test_csv = (directory + name for name in files)

# Training split.
print('Loading training data from csv file...')
raw_data_train = pd.read_csv(train_csv)
print('Training dataset loaded.')

# Test split.
print('Loading test data from csv file...')
raw_data_test = pd.read_csv(test_csv)
print('Test dataset loaded.')

In [None]:
# Helper that summarises the metadata of a dataframe, one row per column.
def master_dataframe(dataframe):
    """Build a per-column summary table for ``dataframe``.

    The result is indexed by column name and contains the dtype, the number
    and percentage of null values and the number of unique values, followed
    by the statistics produced by ``dataframe.describe(include='all')``.
    """
    null_counts = dataframe.isna().sum()
    null_percent = round(null_counts / len(dataframe) * 100, 2)

    summary = pd.DataFrame({'Datatype': dataframe.dtypes,
                            "Null Values": null_counts,
                            "Null %": null_percent,
                            "No: Of Unique Values": dataframe.nunique()})

    # describe() has one column per feature; transpose it so features become rows.
    statistics = dataframe.describe(include='all').T

    return summary.join(statistics)

In [None]:
# Checking metadata of the training dataset
master_dataframe(raw_data_train)

**From the above metadata table, we have the below information :**

1. None of the columns have Null values except "Arrival Delay in Minutes". The metadata shows that the percentage of Null values is just 0.3%, which means we could simply drop the rows containing them. We will decide how to handle them going forward.

   Replacing the Null values with the mean of "Arrival Delay in Minutes" might be a bad idea, as the field seems to have outliers that would distort the mean.

2. Almost all the features are categorical in nature, except for a few continuous features. This means the features are on different scales.
   We might have to standardize the data to a common scale before building the model, to give equal weight to each feature.
   
 
 3. Fields "Unnamed: 0" and "id" are unique identifiers to the data, hence we can exclude them from our
    set of independent features.
    
 4. There are some features which are categorical in nature and are of data type object. That means, we have 
    to properly encode them before including them to train the model.
    
  With this said let's begin our journey in deeply understanding the data available.  

In [None]:
# Let's replace the Null values in the field "Arrival Delay in Minutes" with zero, assuming that those flights were not delayed.
# You may also choose to drop the rows completely; it would not make much of a difference, as only 0.3% of the rows are affected.
# Only this column is filled: a blanket fillna(0) would also silently zero-fill nulls in any other column.

data_cleaned = raw_data_train.copy()
data_cleaned['Arrival Delay in Minutes'] = data_cleaned['Arrival Delay in Minutes'].fillna(0)
data_cleaned['Arrival Delay in Minutes'].isna().sum()

In [None]:
# Let's drop the identifier fields "Unnamed: 0" and "id".
# Re-assigning (instead of inplace = True) together with errors = 'ignore' keeps this cell safe to re-run:
# the original in-place drop raised a KeyError when the cell was executed a second time.
data_cleaned = data_cleaned.drop(columns = ['Unnamed: 0', 'id'], errors = 'ignore')
data_cleaned.columns

# Univariate Analysis

Univariate analysis is the most basic type of analysis performed by an analyst and involves only a single variable. Hence, there is no other variable to compare it with or define a relationship with. Univariate analysis involves checking the frequency distribution, range, dispersion and other characteristics of a variable using pivot tables, bar charts, pie charts, histograms, box plots etc.

In [None]:
# Let's divide the columns of our data into a categorical and a continuous group.

# Columns handled as categorical features (the target 'satisfaction' comes last).
cat_data_cols = [
    'Gender',
    'Customer Type',
    'Type of Travel',
    'Class',
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
    'satisfaction',
]

# Columns handled as continuous features.
cont_data_cols = [
    'Age',
    'Flight Distance',
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
]

In [None]:
# Function to show the frequency distribution bar chart of a categorical variable.
def frequency_distribution(feature):
    """Plot the percentage frequency distribution of a categorical feature.

    Parameters
    ----------
    feature : str
        Column of ``data_cleaned``; must be listed in ``cat_data_cols``.

    Returns
    -------
    str or None
        An error message when ``feature`` is not listed in ``cat_data_cols``;
        otherwise the return value of ``iplot`` after rendering the figure.
    """
    # Guard clause: invalid names get a message back instead of an exception.
    # (The original followed this return with an unreachable exit() call.)
    if feature not in cat_data_cols:
        return 'Enter a valid Categorical feature from the data set'

    # rename_axis + reset_index(name = ...) always yields the columns
    # [feature, 'Frequency'], independent of how the installed pandas version
    # labels the output of value_counts() (newer versions name it 'count',
    # which broke the original rename of the 'index' column).
    freq_dist = (data_cleaned[feature]
                 .value_counts()
                 .rename_axis(feature)
                 .reset_index(name = 'Frequency'))
    freq_dist['% Of Distribution'] = round(freq_dist['Frequency']/freq_dist['Frequency'].sum() * 100, 2)
    # Cast the categories to str so that plotly treats the x axis as categorical.
    freq_dist[feature] = freq_dist[feature].astype(str)
    freq_dist = freq_dist.sort_values(by = ['% Of Distribution'], ascending = True)

    data = [go.Bar(x = freq_dist[feature],
                   y = freq_dist['% Of Distribution'],
                   text = freq_dist['% Of Distribution'],
                   textposition = 'outside',
                   textfont = {'color': 'white'},
                   marker = dict(color = freq_dist['% Of Distribution'],
                                 line = {'color': 'white', 'width': 1.5}))]

    # Axis title fonts are set through title.font; the `titlefont` shortcut is
    # deprecated in plotly.
    layout = go.Layout(title = dict(text = 'Frequency Distribution : '+ feature ,
                                    x = 0.5,
                                    y = 0.88),
                       xaxis = dict(title = dict(text = feature,
                                                 font = {'size': 16})),
                       yaxis = dict(title = dict(text = '% Of Distribution',
                                                 font = {'size': 16})),
                       height = 500,
                       width = 550,
                       template = 'plotly_dark')

    fig = go.Figure(data = data,
                    layout = layout)

    return iplot(fig)

In [None]:
# Pass any categorical feature to the function to check the frequency distribution.
frequency_distribution('Ease of Online booking')

In [None]:
# Checking the frequency distribution for all the categorical variables using a for loop and the function.
for col in cat_data_cols:
    frequency_distribution(col)

In [None]:
# Function to show the dispersion of the data in continuous features.
def dispersion(feature):
    """Draw a horizontal box plot of a continuous feature of ``data_cleaned``.

    Parameters
    ----------
    feature : str
        Column of ``data_cleaned``; must be listed in ``cont_data_cols``.

    Returns
    -------
    str or None
        An error message when ``feature`` is not listed in ``cont_data_cols``;
        otherwise the return value of ``iplot`` after rendering the figure.
    """
    # Guard clause: invalid names get a message back instead of an exception.
    # (The original followed this return with an unreachable exit() call.)
    if feature not in cont_data_cols:
        return 'Enter a valid continous feature from the data set'

    data = [go.Box(x = data_cleaned[feature],
                   name = str(feature),
                   marker = dict(line = {'color': 'white', 'width': 1.5},
                                 color = 'indianred'))]

    # The axis title font is set through title.font; the `titlefont` shortcut
    # is deprecated in plotly.
    layout = go.Layout(title = dict(text = 'Dispersion : '+ feature ,
                                    x = 0.5,
                                    y = 0.88),
                       xaxis = dict(title = dict(text = 'Values',
                                                 font = {'size': 16})),
                       height = 300,
                       width = 500,
                       template = 'plotly_dark')

    fig = go.Figure(data = data,
                    layout = layout)

    return iplot(fig)

In [None]:
dispersion('Age')

In [None]:
# Checking the dispersion for all the continous variables using a for loop and the function.
for col in cont_data_cols:
    dispersion(col)

# Bivariate Analysis

Bivariate analysis is one of the simplest forms of quantitative (statistical) analysis. It involves the analysis of two variables (often denoted as X, Y), for the purpose of determining the empirical relationship between them. In other words, it is the analysis of the relationship between the two variables.

In [None]:
def satisfaction_by_feature(feature):
    """Plot, per category of ``feature``, the share of each satisfaction level.

    The percentages are computed from ``raw_data_train`` by counting the
    ``id`` column per (feature, satisfaction) pair and dividing by the count
    per feature value. They are shown as a stacked bar chart.

    Parameters
    ----------
    feature : str
        Column of ``raw_data_train``; must be listed in ``cat_data_cols``.
        The notebook calls this for ``cat_data_cols[:-1]``, i.e. for every
        categorical column except the target ``satisfaction`` itself.

    Returns
    -------
    str or None
        An error message when ``feature`` is not listed in ``cat_data_cols``;
        otherwise the return value of ``iplot`` after rendering the figure.
    """
    # Guard clause: invalid names get a message back instead of an exception.
    # (The original followed this return with an unreachable exit() call.)
    if feature not in cat_data_cols:
        return 'Enter a valid Categorical feature from the data set'

    # Raw string: '\D' is not a valid escape sequence, so the plain literal
    # triggers a warning on recent Python versions. The value is unchanged.
    rate_col = r'% Satisfied\Dissatisfied'

    grp_by_feature = raw_data_train.groupby(by = [feature, 'satisfaction'])['id'].count().reset_index()
    grp_by_total = raw_data_train.groupby(by = [feature])['id'].count().reset_index()
    # After the merge 'id_x' is the count per (feature, satisfaction) pair and
    # 'id_y' is the total count for that feature value.
    grp_by = pd.merge(left = grp_by_feature, right = grp_by_total, on = feature, how = 'inner')
    grp_by[rate_col] = round(grp_by['id_x']/grp_by['id_y'] * 100, 2)
    grp_by = grp_by.drop(columns = ['id_x', 'id_y'])

    # One bar trace per satisfaction level: (value in the data, legend label).
    data = []
    for level, legend_name in (('satisfied', 'Satisfied'),
                               ('neutral or dissatisfied', 'Dissatisfied')):
        subset = grp_by[grp_by['satisfaction'] == level]
        data.append(go.Bar(x = subset[feature],
                           y = subset[rate_col],
                           name = legend_name,
                           text = subset[rate_col],
                           textposition = 'inside',
                           textfont = {'color': 'white'},
                           marker = dict(line = {'color': 'black', 'width': 1.5})))

    # Axis title fonts are set through title.font; the `titlefont` shortcut is
    # deprecated in plotly.
    layout = go.Layout(title = dict(text = 'Satisfaction Rate By : ' + feature,
                                    font = {'size': 20},
                                    x = 0.5,
                                    y = 0.88),
                       xaxis = dict(title = dict(text = feature,
                                                 font = {'size': 16})),
                       yaxis = dict(title = dict(text = 'Satisfaction Rate',
                                                 font = {'size': 16})),
                       barmode = 'stack',
                       hovermode = 'closest',
                       height = 500,
                       width = 650)

    fig = go.Figure(data = data,
                    layout = layout)

    return iplot(fig)
      

In [None]:
# Checking the satisfaction rate relationship for all the categorical variables using a for loop and the function.
for col in cat_data_cols[:-1]:
    satisfaction_by_feature(col)

We can see how various facilities provided by the airline companies i.e. the categorical features affect the rate of satisfaction.

In [None]:
raw_data_train['Arrival Delay in Minutes'].mean()

In [None]:
raw_data_train[(raw_data_train['Arrival Delay in Minutes'] < 28 )].groupby('satisfaction')['id'].count().reset_index()

In [None]:
raw_data_train[(raw_data_train['Departure Delay in Minutes'] == 0 ) & (raw_data_train['Arrival Delay in Minutes'] == 0 )].groupby('satisfaction')['id'].count().reset_index()

In [None]:
raw_data_train[(raw_data_train['Departure Delay in Minutes'] == 0 ) & (raw_data_train['Arrival Delay in Minutes'] == 0 )].count()

# Data Transformation

Let's encode the categorical features like Gender, Customer Type, Class, Type Of Travel & Satisfaction so that we can use them to train our model.

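The features Gender, Customer Type, Type Of Travel & Satisfaction are nominal categorical features, i.e. they are not ordered. We encode them with LabelEncoder, which replaces each category with an integer code; note that such codes carry no meaningful order, which is harmless only when a feature has just two categories.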

The feature Class has three categories (and is arguably ordinal). We create dummy columns for this feature so that the model does not depend on an arbitrary integer coding of the classes.

In [None]:
# Let's start with LabelEncoder: each listed column is replaced by its integer codes.
from sklearn.preprocessing import LabelEncoder

cols_to_encode = ['Gender', 'Customer Type', 'Type of Travel', 'satisfaction']
le = LabelEncoder()
encoded_data = data_cleaned.copy()
for column in cols_to_encode:
    # The same encoder object is re-fitted for every column, so after the loop
    # `le` holds the classes of the last column only.
    encoded_data[column] = le.fit_transform(data_cleaned[column])

In [None]:
# Let's use get dummies to encode the Class feature.
# dtype = int keeps the dummy columns numeric (0/1). Recent pandas versions default to bool,
# which would turn the feature matrix taken later with `.values` into an object-dtype array.
encoded_data = pd.get_dummies(data = encoded_data, columns = ['Class'], drop_first = True, dtype = int)

In [None]:
master_dataframe(encoded_data)

In [None]:
# Let's rearrange the columns in the dataset so that the dependent variable in the last column.
cols = [col for col in encoded_data if col != 'satisfaction'] + ['satisfaction']
encoded_data = encoded_data[cols]

In [None]:
# Pin the column order explicitly: the following cells select the features and the
# target by position (iloc / numpy slices), so the order listed here matters.
# 'satisfaction' (the dependent variable) is kept as the last column.
encoded_data = encoded_data[['Gender', 'Customer Type', 'Type of Travel', 'Age', 'Flight Distance',
           'Inflight wifi service', 'Departure/Arrival time convenient',
           'Ease of Online booking', 'Gate location', 'Food and drink',
           'Online boarding', 'Seat comfort', 'Inflight entertainment',
           'On-board service', 'Leg room service', 'Baggage handling',
           'Checkin service', 'Inflight service', 'Cleanliness',
           'Departure Delay in Minutes', 'Arrival Delay in Minutes', 'Class_Eco',
           'Class_Eco Plus', 'satisfaction']]

In [None]:
# Let's separate the dependent and independent variables.
X = encoded_data.iloc[:, :-1].values
y = encoded_data.iloc[:, -1].values
print('Shape of X: ', X.shape)
print('Shape of y: ', y.shape)

In [None]:
# Let's split the data set into train and test set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print('Shape of X_train: ', X_train.shape)
print('Shape of X_test: ', X_test.shape)
print('Shape of y_train: ', y_train.shape)
print('Shape of y_test: ', y_test.shape)

In [None]:
# Let's standardise the columns.
from sklearn.preprocessing import StandardScaler

# Positional layout of X: columns 0-2 are the label-encoded features and the last
# two columns are the Class dummy variables (Class_Eco, Class_Eco Plus). Only the
# columns in between are standardised. The previous slice `3:-1` was off by one:
# it scaled Class_Eco but left Class_Eco Plus untouched.
scaled_cols = slice(3, -2)

sc = StandardScaler()
# Fit on the training split only, then apply the fitted statistics to the test split.
X_train[:, scaled_cols] = sc.fit_transform(X_train[:, scaled_cols])
X_test[:, scaled_cols] = sc.transform(X_test[:, scaled_cols])

In [None]:
X_train[:, 4]

# Logistic Regression

In [None]:
# Creating the LogisticRegression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Fit on the training split, then predict the output for the test dataset.
lr_classifier = LogisticRegression().fit(X_train, y_train)
y_pred = lr_classifier.predict(X_test)

# Let's measure the accuracy of the model using the confusion matrix:
# its diagonal cells [0, 0] and [1, 1] count the correct predictions.
cm = confusion_matrix(y_test, y_pred)
total = cm.sum()
corr_pred = cm[0, 0] + cm[1, 1]

corr_pred_per_lr = round(corr_pred/total*100, 2)
print('Percentage of correct predictions: ', corr_pred_per_lr)

# SVC

In [None]:
from sklearn.svm import SVC
# Train a support vector classifier with a linear kernel on the training split.
svc_classifier = SVC(kernel = 'linear').fit(X_train, y_train)

# Predict the output for the test dataset.
y_pred = svc_classifier.predict(X_test)

# Let's measure the accuracy of the model using the confusion matrix:
# its diagonal cells [0, 0] and [1, 1] count the correct predictions.
cm = confusion_matrix(y_test, y_pred)
total = cm.sum()
corr_pred = cm[0, 0] + cm[1, 1]

corr_pred_per_SVC = round(corr_pred/total*100, 2)
print('Percentage of correct predictions: ', corr_pred_per_SVC)

# SVM - Kernel

In [None]:
# Train a support vector classifier with an RBF kernel on the training split.
svm_classifier = SVC(kernel = 'rbf').fit(X_train, y_train)

# Predict the output for the test dataset.
y_pred = svm_classifier.predict(X_test)

# Let's measure the accuracy of the model using the confusion matrix:
# its diagonal cells [0, 0] and [1, 1] count the correct predictions.
cm = confusion_matrix(y_test, y_pred)
total = cm.sum()
corr_pred = cm[0, 0] + cm[1, 1]

corr_pred_per_SVM = round(corr_pred/total*100, 2)
print('Percentage of correct predictions: ', corr_pred_per_SVM)

# K Nearest Neighbour

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Train a 5-nearest-neighbours classifier (Minkowski metric with p = 2) on the training split.
knn_classifer = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2).fit(X_train, y_train)

# Predict the output for the test dataset.
y_pred = knn_classifer.predict(X_test)

# Let's measure the accuracy of the model using the confusion matrix:
# its diagonal cells [0, 0] and [1, 1] count the correct predictions.
cm = confusion_matrix(y_test, y_pred)
total = cm.sum()
corr_pred = cm[0, 0] + cm[1, 1]

corr_pred_per_knn = round(corr_pred/total*100, 2)
print('Percentage of correct predictions: ', corr_pred_per_knn)

# Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB
# Train a Gaussian Naive Bayes classifier on the training split.
nb_classifier = GaussianNB().fit(X_train, y_train)

# Predict the output for the test dataset.
y_pred_nb = nb_classifier.predict(X_test)

# Let's measure the accuracy of the model using the confusion matrix:
# its diagonal cells [0, 0] and [1, 1] count the correct predictions.
cm = confusion_matrix(y_test, y_pred_nb)
total = cm.sum()
corr_pred = cm[0, 0] + cm[1, 1]

corr_pred_per_nb = round(corr_pred/total*100, 2)
print('Percentage of correct predictions: ', corr_pred_per_nb)

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Train a decision tree (entropy criterion) on the training split.
# random_state is fixed (same seed as the train/test split) so that the tree, and
# therefore the reported accuracy, is identical on every Restart & Run All.
dt_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 42)
dt_classifier.fit(X_train, y_train)

# Predict the output for the test dataset.
y_pred_dt = dt_classifier.predict(X_test)

# Let's measure the accuracy of the model using the confusion matrix:
# its diagonal cells [0, 0] and [1, 1] count the correct predictions.
cm = confusion_matrix(y_test, y_pred_dt)

corr_pred = cm[0, 0] + cm[1, 1]
total = cm.sum()
corr_pred_per_dt = round(corr_pred/total*100, 2)
print('Percentage of correct predictions: ', corr_pred_per_dt)

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Train a random forest of 10 trees (entropy criterion) on the training split.
# random_state is fixed (same seed as the train/test split) so that the bootstrap
# samples and feature sub-sampling, and therefore the reported accuracy, are
# identical on every Restart & Run All.
rf_classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
rf_classifier.fit(X_train, y_train)

# Predict the output for the test dataset.
y_pred_rf = rf_classifier.predict(X_test)

# Let's measure the accuracy of the model using the confusion matrix:
# its diagonal cells [0, 0] and [1, 1] count the correct predictions.
cm = confusion_matrix(y_test, y_pred_rf)

corr_pred = cm[0, 0] + cm[1, 1]
total = cm.sum()
corr_pred_per_rf = round(corr_pred/total*100, 2)
print('Percentage of correct predictions: ', corr_pred_per_rf)