In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
plt.rcParams['figure.figsize']=(8,6)

In [None]:
df=pd.read_csv('/kaggle/input/airline-passenger-satisfaction/train.csv',index_col=0)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
#finding if there is any null values present in the dataset
df.isnull().sum()

# Preprocessing of the Dataset

In [None]:
#several features are of object (string) type; analyse them one by one and convert them to numeric type
#start with the frequency of each Gender value
df['Gender'].value_counts()

In [None]:
#ploting the Gender values
sns.countplot(x='Gender',data=df)

In [None]:
#Replacing the nominal 'Gender' feature with one dummy (indicator) column per category
gender=pd.get_dummies(df['Gender'])
#drop the original column and append the indicator columns in a single expression
df=pd.concat([df.drop(columns='Gender'),gender],axis=1)
df.head()

In [None]:
#new shape of the dataset
df.shape

In [None]:
#now converting the Customer Type features which holds nominal Categorical values
df['Customer Type'].value_counts()

In [None]:
#plotting the customer type feature values
sns.countplot(x='Customer Type',data=df)

In [None]:
#Replacing the nominal 'Customer Type' feature with one dummy (indicator) column per category
customer_type=pd.get_dummies(df['Customer Type'])
#drop the original column and append the indicator columns in a single expression
df=pd.concat([df.drop(columns='Customer Type'),customer_type],axis=1)
df.head()

In [None]:
#new shape of the dataset
df.shape

In [None]:
#also replacing the Type of Travel and Class features in the similar way
df['Type of Travel'].value_counts()

In [None]:
df['Class'].value_counts()    #class features hold ordinal Categorical values

In [None]:
#Integer-encode the 'Class' feature in place
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
#NOTE(review): LabelEncoder assigns codes following the sorted order of the labels, which need not
#match the intended ordinal ranking of the classes - verify the resulting mapping via le.classes_
df['Class']=le.fit_transform(df['Class'])
df.head()

In [None]:
df['Type of Travel']=le.fit_transform(df['Type of Travel'])
df.head()

In [None]:
#Analysing different values of the dependent feature Satisfaction
df['satisfaction'].value_counts()

In [None]:
df.info()

In [None]:
#plotting a heatmap to check the null values of Arrival Delay in Minutes features
sns.heatmap(df.isnull(),yticklabels=False,)

In [None]:
df['Arrival Delay in Minutes'].isnull().sum()

In [None]:
#histogram of the 'Arrival Delay in Minutes' values
df['Arrival Delay in Minutes'].plot.hist()

In [None]:
val=df['Arrival Delay in Minutes'].mean()
val

In [None]:
#replacing all the null values of the 'Arrival Delay in Minutes' feature with its mean (val)
#the result is assigned back to the column: replace(..., inplace=True) on a selected column is a
#chained in-place update and is not guaranteed to write through to df
df['Arrival Delay in Minutes']=df['Arrival Delay in Minutes'].fillna(val)

In [None]:
df['Arrival Delay in Minutes'].isnull().sum()

In [None]:
#dropping the id feature as it will not help during predicting calculations
df.drop('id',axis=1,inplace=True)

In [None]:
#Plotting a histogram to analyse the different age groups of people travelling with this airline
#(sns.distplot is deprecated; sns.histplot is its histogram-only replacement)
sns.histplot(df['Age'])

Passengers of different age groups, ranging from about 5 to 80 years, travel with this airline.

# Feature Selection Process

In [None]:
#partioning the independent and dependent data
X=df.drop('satisfaction',axis=1)
y=df['satisfaction']

In [None]:
#Checking whether any feature has zero variance (i.e. holds a single constant value)
from sklearn.feature_selection import VarianceThreshold
#only the fitted selector is needed; get_support() gives one boolean per column (True = kept)
vt=VarianceThreshold(threshold=0.0).fit(X)
vt.get_support()

**As all the values are True, there is no feature with constant (zero) variance.**

In [None]:
#Plotting the correlation coefficients of all the numeric features
#df still contains the un-encoded 'satisfaction' target; restrict to numeric/boolean columns
#because DataFrame.corr() no longer silently skips non-numeric columns in recent pandas
plt.figure(figsize=(18,12))
sns.heatmap(df.select_dtypes(include=['number','bool']).corr(),annot=True)

In [None]:
#function to check if any feature is having correlation coefficient value more than 0.9

def correlation_check(df,val):
    """Return the names of columns that are highly correlated with an earlier column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose pairwise column correlations (``df.corr()``) are inspected.
    val : float
        Threshold; a column is reported when its correlation coefficient with
        any column positioned before it is strictly greater than ``val``.

    Returns
    -------
    set
        Names of the later column of every pair exceeding the threshold.

    Notes
    -----
    The comparison is signed, so strongly *negative* correlations are not reported.
    """
    corr_matrix=df.corr()
    features=set()
    for i in range(len(corr_matrix.columns)):
        # walk the strictly-lower triangle so each pair is visited exactly once
        for j in range(i):
            # use the caller-supplied threshold (it was previously ignored in favour of 0.9)
            if corr_matrix.iloc[i,j]>val:
                features.add(corr_matrix.columns[i])
    return features
            

In [None]:
columns=correlation_check(X,0.9)
columns

In [None]:
#so we can remove the column 'Arrival Delay in Minutes' as this is highly correlated with the feature 'Departure delay in minutes'
X.drop(columns,axis=1,inplace=True)

In [None]:
#the correlated column was dropped from X (df itself is unchanged), so inspect X to see the result
X.head()

In [None]:
#Univariate Feature Selection
from sklearn.feature_selection import mutual_info_classif
#a fixed random_state makes the mutual-information estimates reproducible between runs
mic=mutual_info_classif(X,y,random_state=0)

In [None]:
#mutual information score of every feature, labelled with the feature name
mic=pd.Series(mic,index=X.columns)
mic

In [None]:
mic.sort_values(ascending=False)

In [None]:
#we will select the top 15 features ranked by mutual information
from functools import partial
from sklearn.feature_selection import SelectKBest
#seed the score function so that the selected feature subset is reproducible between runs
skb=SelectKBest(partial(mutual_info_classif,random_state=0),k=15)
skb

In [None]:
X_new=skb.fit_transform(X,y)

In [None]:
X_new.shape

In [None]:
X_new

In [None]:
#feature scaling: standardise the selected features with StandardScaler
#NOTE(review): the scaler is fitted here on every row, i.e. before any train/test split
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
X_norm=ss.fit_transform(X_new)

In [None]:
X_norm

In [None]:
#hold-out split (not cross-validation): 85% of the rows for training, 15% for testing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#split the unscaled selected features first ...
X_train,X_test,y_train,y_test=train_test_split(X_new,y,test_size=0.15,random_state=0)
#... then fit the scaler on the training rows only, so that test-set statistics do not leak into training
ss=StandardScaler().fit(X_train)
X_train,X_test=ss.transform(X_train),ss.transform(X_test)
X_train.shape,X_test.shape

In [None]:
y_train.shape,y_test.shape

**Applying a Logistic Regression classifier**

In [None]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(X_train,y_train)
yhat=lr.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score
accuracy_score(y_test,yhat)

In [None]:
sns.heatmap(confusion_matrix(y_test,yhat),cmap='Greens',annot=True,fmt='.0f')

In [None]:
print(classification_report(y_test,yhat))

In [None]:
f1_score(y_test,yhat,average='weighted')

**Applying a K-Nearest Neighbors model**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
#sweep K from 1 to 10 and record the accuracy reached on the test split for each value
#NOTE(review): K is being chosen on the same test split that is later used to report the final score
k_value=list(range(1,11))
accuracy=[]
for n_neighbors in k_value:
    knn=KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train,y_train)
    accuracy.append(accuracy_score(y_test,knn.predict(X_test)))

In [None]:
plt.plot(k_value,accuracy)
plt.ylabel('Accuracy')
plt.xlabel('K values')
plt.show()

In [None]:
#report the best accuracy of the sweep and the (first) K value that reached it
best_accuracy=max(accuracy)
print('The maximum accuracy {} can be obtained with k value ={}'.format(best_accuracy,k_value[accuracy.index(best_accuracy)]))

In [None]:
#refit with the K that scored best in the sweep instead of a hard-coded value,
#so this cell stays consistent if the sweep result changes on a re-run
best_k=k_value[accuracy.index(max(accuracy))]
knn=KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train,y_train)
yhat=knn.predict(X_test)

In [None]:
accuracy_score(y_test,yhat)

In [None]:
sns.heatmap(confusion_matrix(y_test,yhat),annot=True,cmap='Greens',fmt='.0f')

In [None]:
print(classification_report(y_test,yhat))

In [None]:
f1_score(y_test,yhat,average='weighted')

**Applying a Support Vector Classifier**

In [None]:
from sklearn.svm import SVC
svc=SVC(kernel='rbf')
svc.fit(X_train,y_train)
yhat=svc.predict(X_test)

In [None]:
accuracy_score(y_test,yhat)

In [None]:
sns.heatmap(confusion_matrix(y_test,yhat),cmap='Greens',annot=True, fmt='.0f')

In [None]:
print(classification_report(y_test,yhat))

In [None]:
f1_score(y_test,yhat,average='weighted')