In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Visualization Library
import matplotlib.pyplot as plt 
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from collections import Counter

import warnings
warnings.filterwarnings("ignore")
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Airline Passenger Satisfaction Analysis
![](https://ligarbatravel.com/wp-content/uploads/2015/07/ligarba_turizm_ucak_bileti.jpg)

# INTRODUCTION

 
* Hello, everyone. This notebook is a customer satisfaction analysis: based on the analysis, we will predict whether a passenger is satisfied with the trip.
* We will use various machine learning algorithms in our model.
* Let's start with content.
* Content:
    1. [Load and Check Data](#1)
    2. [Variable Description](#2)
        * [Univariate Variable Analysis](#3)
            * [Categorical Variable](#4)
            * [Numerical Variable](#5)
    3. [Basic Data Analysis](#6)        
    4. [Outlier Detection](#7)
    5. [Missing Value](#8)
        * [Find Missing Value](#9)
        * [Fill Missing Value](#10)

    6. [Visualization](#11)
        * [Correlation Matrix](#12)
        * [Gender--> Satisfaction](#13)
        * [Age--> Satisfaction](#14)
        * [Customer Type--> Satisfaction](#15)
        * [Type of Travel--> Satisfaction](#16)
        * [Class--> Satisfaction](#17)
        * [Gender-->Age-->Satisfaction](#22)
        * [Personal Travel Analysis](#18)
            * [Service Scoring of Personal Travel Passenger](#19)
            * [Eating, Drinking, Cleaning and In-flight Entertainment Scoring of Personal Travel Passenger](#20)
            * [Aircraft Specifications Scoring](#21)
    7. [Feature Engineering](#23)
        * [Customer Type](#24)
        * [Type of Travel](#25)
        * [Class](#26)
        * [Gender](#27)
        * [Drop ID](#28)
    8. [Modeling](#29)
        * [Train - Test Split](#30)
        * [Simple Logistic Regression](#31)
        * [Hyperparameter Tuning - Grid Search - Cross Validation](#32)
        * [Ensemble Modeling](#33)
        * [Prediction and Submission](#34)
    9. [Conclusion](#35)


<a id = "1"></a><br>
# Load and Check Data


In [None]:
# Load the labelled training split and the hold-out test split.
# Each frame is read from the file that matches its name (they were swapped before).
train_df=pd.read_csv("../input/airline-passenger-satisfaction/train.csv")
test_df=pd.read_csv("../input/airline-passenger-satisfaction/test.csv")

In [None]:
train_df.columns

* The "Unnamed: 0" column is not used anywhere in this analysis, so let's drop it.

In [None]:
train_df.drop(labels=["Unnamed: 0"],axis=1,inplace=True)
test_df.drop(labels=["Unnamed: 0"],axis=1,inplace=True)

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
train_df.describe()

<a id = "2"></a><br>
# Variable Description
1. id : Unique id number to each passenger.
2. Gender: Gender of the passengers (Female, Male)
3. Customer Type: The customer type (Loyal customer, disloyal customer)
4. Age: The actual age of the passengers
5. Type of Travel: Purpose of the flight of the passengers (Personal Travel, Business Travel)
6. Class: Travel class in the plane of the passengers (Business, Eco, Eco Plus)
7. Flight distance: The flight distance of this journey
8. Inflight wifi service: Satisfaction level of the inflight wifi service (0:Not Applicable;1-5)
9. Departure/Arrival time convenient: Satisfaction level of Departure/Arrival time convenient
10. Ease of Online booking: Satisfaction level of online booking
11. Gate location: Satisfaction level of Gate location
12. Food and drink: Satisfaction level of Food and drink
13. Online boarding: Satisfaction level of online boarding
14. Seat comfort: Satisfaction level of Seat comfort
15. Inflight entertainment: Satisfaction level of inflight entertainment
16. On-board service: Satisfaction level of On-board service
17. Leg room service: Satisfaction level of Leg room service
18. Baggage handling: Satisfaction level of baggage handling
19. Check-in service: Satisfaction level of Check-in service
20. Inflight service: Satisfaction level of inflight service
21. Cleanliness: Satisfaction level of Cleanliness
22. Departure Delay in Minutes: Minutes delayed when departure
23. Arrival Delay in Minutes: Minutes delayed when Arrival
24. Satisfaction: Airline satisfaction level(Satisfaction, neutral or dissatisfaction)

In [None]:
train_df.info()

* int64(18): id, Age,Flight Distance, Inflight wifi service, Departure/Arrival time convenient, Ease of Online booking, Gate location, Food and drink, Online boarding, Seat comfort, Inflight entertainment, On-board service, Leg room service, Baggage handling, Checkin service, Inflight service, Cleanliness and Departure Delay in Minutes.
* object(5): Gender, Customer Type, Type of Travel, Class and satisfaction.
* float64(1): Arrival Delay in Minutes.

<a id = "3"></a><br>
# Univariate Variable Analysis


* Categorical Variables: Inflight wifi service, Departure/Arrival time convenient, Ease of Online booking, Gate location, Food and drink, Online boarding, Seat comfort, Inflight entertainment, On-board service, Leg room service, Baggage handling, Checkin service, Inflight service, Cleanliness,Gender, Customer Type, Type of Travel, Class and satisfaction.
* Numerical Variables: id,Age, Flight Distance, Departure Delay in Minutes, Arrival Delay in Minutes

<a id = "4"></a><br>
## Categorical Variables

In [None]:
def bar_plot(variable, data=None):
    """
    Draw a bar chart of the value counts of one column and print the counts.

    Parameters
    ----------
    variable : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        ``train_df`` so existing ``bar_plot(column)`` calls behave as before;
        pass a filtered frame to chart a subset of passengers.
    """
    if data is None:
        data = train_df

    var_Value = data[variable].value_counts()

    # visualize: one bar per distinct value, bar height = number of rows
    plt.figure(figsize=(9,3))
    plt.bar(var_Value.index,var_Value.values)

    plt.xlabel("Passengers Score")
    plt.ylabel("Frequency")
    plt.title(variable)
    plt.show()
    print("{}: \n {}".format(variable,var_Value))

In [None]:
category1=["Inflight wifi service", "Departure/Arrival time convenient", "Ease of Online booking", "Gate location", "Food and drink", "Online boarding", "Seat comfort", "Inflight entertainment", "On-board service", "Leg room service", "Baggage handling", "Checkin service", "Inflight service", "Cleanliness",]
for c in category1:
    bar_plot(c)

In [None]:
category2=["Gender", "Customer Type", "Type of Travel", "Class","satisfaction"]
for c in category2:
    print("{} \n".format(train_df[c].value_counts()))

<a id = "5"></a><br>
## Numerical Variable

In [None]:
def plot_hist(variable, bins=50):
    """
    Plot a histogram of one numerical column of the notebook-level ``train_df``.

    Parameters
    ----------
    variable : str
        Name of the column to plot.
    bins : int, optional
        Number of histogram bins (default 50, the value previously hard-coded).
    """
    plt.figure(figsize=(9,3))
    plt.hist(train_df[variable],bins=bins)
    plt.xlabel(variable)
    # The axis label was misspelled as "Fruequency".
    plt.ylabel("Frequency")
    plt.title("{} distribution with histogram".format(variable))
    plt.show()


In [None]:
numericVar=["id","Age", "Flight Distance", "Departure Delay in Minutes", "Arrival Delay in Minutes"]
for n in numericVar:
    plot_hist(n)

<a id = "6"></a><br>
# Basic Data Analysis
* First of all, I will turn it into numerical to use some object properties in the data. 
* The feature we will classify will be "satisfaction". We make it numeric too. satisfaction (0:neutral or dissatisfied, 1: satisfied)
* Some titles make my job difficult. Let's solve this problem.

In [None]:
train_df.columns=[each.replace(" ","_") for each in train_df.columns]

In [None]:
train_df.head()

In [None]:
# Encode the target as 1 = "satisfied", 0 = anything else (neutral or dissatisfied).
# The guard keeps the cell idempotent: without it, re-running the cell would compare
# the already numeric labels with the string "satisfied" and set every row to 0.
if not pd.api.types.is_numeric_dtype(train_df["satisfaction"]):
    train_df["satisfaction"]=(train_df["satisfaction"]=="satisfied").astype(int)

In [None]:
train_df.head(10)

* Now let's look at the effect of some features on satisfaction

* Gender --> satisfaction
* Age --> satisfaction
* Type_of_Travel --> satisfaction
* Class --> satisfaction
* Customer_Type --> satisfaction
* Cleanliness --> satisfaction
* Inflight_wifi_service --> satisfaction
* Inflight_entertainment --> satisfaction
* Food_and_drink --> satisfaction
* Seat_comfort --> satisfaction

In [None]:
# Gender vs satisfaction
train_df[["Gender","satisfaction"]].groupby(["Gender"],as_index=False).mean().sort_values(by="satisfaction",ascending=False)


* The satisfaction rate is 44% for male passengers and 43% for female passengers.

In [None]:
# Age vs satisfaction
train_df[["Age","satisfaction"]].groupby(["Age"],as_index=False).mean().sort_values(by="satisfaction",ascending=False)

* The most satisfied age group 46-56.

In [None]:
# Type_of_Travel vs satisfaction
train_df[["Type_of_Travel","satisfaction"]].groupby(["Type_of_Travel"],as_index=False).mean().sort_values(by="satisfaction",ascending=False)

* 58% of passengers traveling on business are satisfied, but only about 10% of passengers on personal travel are satisfied. This is an interesting statistic.

In [None]:
# Class vs satisfaction
train_df[["Class","satisfaction"]].groupby(["Class"],as_index=False).mean().sort_values(by="satisfaction",ascending=False)

* %69 of passenger class on business are satisfied, %24 of passenger class on eco plus are satisfied, %19 of passenger class on eco are satisfied. It is interesting that there is such a difference in satisfaction between eco and business.

In [None]:
# Customer_Type vs satisfaction
train_df[["Customer_Type","satisfaction"]].groupby(["Customer_Type"],as_index=False).mean().sort_values(by="satisfaction",ascending=False)

* Loyal customers are more satisfied than disloyal customers.

In [None]:
# Cleanliness vs satisfaction
train_df[["Cleanliness","satisfaction"]].groupby(["Cleanliness"],as_index=False).mean().sort_values(by="satisfaction",ascending=False)

* Most of those who give 5 points to cleanliness are satisfied with the flight.

In [None]:
# Inflight_wifi_service vs satisfaction
train_df[["Inflight_wifi_service","satisfaction"]].groupby(["Inflight_wifi_service"],as_index=False).mean().sort_values(by="satisfaction",ascending=False)

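* Most of those who give 0 to wifi service on the plane are satisfied with the flight. It seems that wifi service is not that important. Note, however, that according to the variable description a score of 0 means "Not Applicable" rather than a very low rating, so this result should be read with care.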

In [None]:
# Inflight_entertainment vs satisfaction
train_df[["Inflight_entertainment","satisfaction"]].groupby(["Inflight_entertainment"],as_index=False).mean().sort_values(by="satisfaction",ascending=False)

* There is a standard distribution in flight entertainment.

In [None]:
# Food_and_drink vs satisfaction
train_df[["Food_and_drink","satisfaction"]].groupby(["Food_and_drink"],as_index=False).mean().sort_values(by="satisfaction",ascending=False)

* 20% of the passengers who do not like the food are satisfied with the flight.Interesting statitic.

In [None]:
# Seat_comfort vs satisfaction
train_df[["Seat_comfort","satisfaction"]].groupby(["Seat_comfort"],as_index=False).mean().sort_values(by="satisfaction",ascending=False)

* There is no one that gives the seat comfort 0 points.

<a id = "7"></a><br>
# Outlier Detection
* Outlier detection is very important for the correct operation of the model when installing the model.

In [None]:
numerical_features = train_df.select_dtypes(exclude=['object']).drop(["satisfaction"],axis=1).copy()
numerical_features.columns

In [None]:
fig = plt.figure(figsize=(12,18))
for i in range(len(numerical_features.columns)):
    fig.add_subplot(9,4,i+1)
    sns.boxplot(y=numerical_features.iloc[:,i])

plt.tight_layout()
plt.show()

In [None]:
def detect_outliers(df,features):
    """
    Return the index labels of rows that are IQR outliers in more than two features.

    For every column in ``features`` a value is an outlier when it lies below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to screen.
    features : iterable of str
        Names of the numeric columns to screen.

    Returns
    -------
    list
        Index labels of ``df`` flagged in at least three of the columns.
    """
    outlier_indices=[]

    for c in features:
        # Quartiles are computed on the non-missing values only. With plain
        # np.percentile a single NaN makes both quartiles NaN, every comparison
        # below is then False and the column never contributes any outlier.
        Q1,Q3=np.nanpercentile(df[c],[25,75])

        # Outlier step: 1.5 times the interquartile range
        outlier_step=(Q3-Q1)*1.5

        # Detect outliers in this column and store their indices
        is_outlier=(df[c]<Q1-outlier_step)|(df[c]>Q3+outlier_step)
        outlier_indices.extend(df[is_outlier].index)

    # Count in how many columns each row was flagged
    flag_counts=Counter(outlier_indices)
    multiple_outliers=[i for i,v in flag_counts.items() if v>2]
    return multiple_outliers

In [None]:
# Columns screened for outliers.
outlier_features=[
    'Age', 'Flight_Distance', 'Inflight_wifi_service',
    'Departure/Arrival_time_convenient', 'Ease_of_Online_booking',
    'Gate_location', 'Food_and_drink', 'Online_boarding', 'Seat_comfort',
    'Inflight_entertainment', 'On-board_service', 'Leg_room_service',
    'Baggage_handling', 'Checkin_service', 'Inflight_service',
    'Cleanliness', 'Departure_Delay_in_Minutes',
    'Arrival_Delay_in_Minutes',
]
# Display the rows that detect_outliers flags for these columns.
flagged_rows=detect_outliers(train_df,outlier_features)
train_df.loc[flagged_rows]

In [None]:
# Drop the rows flagged by detect_outliers and renumber the index from 0.
outlier_features=[
    'Age', 'Flight_Distance', 'Inflight_wifi_service',
    'Departure/Arrival_time_convenient', 'Ease_of_Online_booking',
    'Gate_location', 'Food_and_drink', 'Online_boarding', 'Seat_comfort',
    'Inflight_entertainment', 'On-board_service', 'Leg_room_service',
    'Baggage_handling', 'Checkin_service', 'Inflight_service',
    'Cleanliness', 'Departure_Delay_in_Minutes',
    'Arrival_Delay_in_Minutes',
]
rows_to_drop=detect_outliers(train_df,outlier_features)
train_df=train_df.drop(rows_to_drop,axis=0)
train_df=train_df.reset_index(drop=True)

<a id = "8"></a><br>
# Missing Value
* Find Missing Value
* Fill Missing Value

In [None]:
# Apply the same preparation to the test frame as to the training frame:
# underscores instead of spaces in column names, and a 0/1 target.
test_df.columns=[each.replace(" ","_") for each in test_df.columns]
# Guarded so that re-running the cell does not reset the already numeric labels to 0.
if not pd.api.types.is_numeric_dtype(test_df["satisfaction"]):
    test_df["satisfaction"]=(test_df["satisfaction"]=="satisfied").astype(int)

In [None]:
train_df.shape

In [None]:
train_df_len=len(train_df)
train_df= pd.concat([train_df,test_df],axis=0).reset_index(drop=True)

In [None]:
train_df.head()

<a id = "9"></a><br>
## Find Missing Value

In [None]:
import missingno as msno
msno.bar(train_df)
plt.title("Missing Value Graphs")
plt.show()

* In the missing value chart, the only column with missing values is Arrival_Delay_in_Minutes.

In [None]:
train_df.columns[train_df.isnull().any()]

In [None]:
train_df.isnull().sum()

<a id = "10"></a><br>
## Fill Missing Value
* Arrival_Delay_in_Minutes has 393 missing value
* Since I do not know the flight delay time, I will try a different method.
* I will fill the nan values with the average flight delay time. Because I don't want my model to be affected by this parameter.


In [None]:
np.mean(train_df["Arrival_Delay_in_Minutes"])

In [None]:
train_df[train_df["Arrival_Delay_in_Minutes"].isnull()]

In [None]:
train_df.shape

In [None]:
train_df["Arrival_Delay_in_Minutes"]=train_df["Arrival_Delay_in_Minutes"].fillna(np.mean(train_df["Arrival_Delay_in_Minutes"]))

In [None]:
train_df[train_df["Arrival_Delay_in_Minutes"].isnull()]

<a id = "11"></a><br>
# Visualization

<a id = "12"></a><br>
## Correlation Matrix

In [None]:
plt.figure(figsize=(10,10))
list1=["Age",'Inflight_wifi_service',
       'Departure/Arrival_time_convenient', 'Ease_of_Online_booking',
       'Gate_location', 'Food_and_drink', 'Online_boarding', 'Seat_comfort',
       'Inflight_entertainment', 'On-board_service', 'Leg_room_service',
       'Baggage_handling', 'Checkin_service', 'Inflight_service',
       'Cleanliness',"satisfaction"]
sns.heatmap(train_df[list1].corr(),annot=True,fmt=".2f")
plt.show()

<a id = "13"></a><br>
## Gender-->Satisfaction

In [None]:
# Bar height is the mean of the 0/1 target per gender, i.e. the share of satisfied passengers.
# `height` is the current name of the keyword formerly called `size`.
g=sns.catplot(x="Gender",y="satisfaction",data=train_df,kind="bar",height=6)
g.set_ylabels("Satisfaction Probability")
plt.show()

* Females and males are satisfaction probability almost equal. Male %44, Female %43 satisfaction.


<a id = "14"></a><br>
## Age-->Satisfaction

In [None]:
# One panel per value of the target, each showing the Age distribution in 25 bins.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour of
# histplot/displot — confirm against the installed version before upgrading.
g= sns.FacetGrid(train_df,col="satisfaction")
g.map(sns.distplot,"Age",bins=25)
plt.show()
# Panel titles: 0 = neutral or dissatisfied, 1 = satisfied

* 0-20 years old is  generally neutral or dissatisfied with the flight.
* 30-60 years old is generally satisfied with the flight.
* 65-80 years old passengers is neutral or dissatisfied.

<a id = "15"></a><br>
## Customer Type-->Satisfaction

In [None]:
# Share of satisfied passengers per customer type.
# catplot(..., height=...) is the current spelling of the removed factorplot(..., size=...).
g=sns.catplot(x="Customer_Type",y="satisfaction",data=train_df,kind="bar",height=6)
g.set_ylabels("Satisfaction Probability")
plt.show()

* Disloyal customer is generally neutral or dissatisfied with the flight.

<a id = "16"></a><br>
## Type of Travel-->Satisfaction

In [None]:
# Share of satisfied passengers per type of travel.
# catplot(..., height=...) is the current spelling of the removed factorplot(..., size=...).
g=sns.catplot(x="Type_of_Travel",y="satisfaction",data=train_df,kind="bar",height=6)
g.set_ylabels("Satisfaction Probability")
plt.show()

* Business travel is generally satisfied with the flight but personal travel is generally neutral or dissatisfied with the flight.
* Personal travel is %90 neutral or dissatisfied with the flight. This statistic is interesting.

<a id = "17"></a><br>
## Class-->Satisfaction

In [None]:
# Share of satisfied passengers per travel class.
# catplot(..., height=...) is the current spelling of the removed factorplot(..., size=...).
g=sns.catplot(x="Class",y="satisfaction",data=train_df,kind="bar",height=6)
# The axis label was misspelled as "Satisfation Probability".
g.set_ylabels("Satisfaction Probability")
plt.show()

* Business class is also happy here. But eco and eco plus  neutral or dissatisfied with the flight. 

<a id = "22"></a><br>
## Gender-->Age-->Satisfaction

In [None]:
sns.swarmplot(x="Gender", y="Age",hue="satisfaction", data=train_df.head(1000))
plt.show()
# 0=neutral or dissatisfied, 1=satisfied  

<a id = "18"></a><br>

# Personal Travel Analysis
* Personal travel is generally neutral or dissatisfied with the flight.
* Let's examine why.

In [None]:
personal=train_df[train_df.Type_of_Travel=="Personal Travel"]
personal.head()

<a id = "19"></a><br>
## Service Scoring of Personal Travel Passenger

In [None]:
def service_plot(variable):
    """
    Chart and print the value counts of one column of the ``personal`` frame.

    ``variable`` is the column name; the frame is the notebook-level
    ``personal`` subset (personal-travel passengers).
    """
    score_counts = personal[variable].value_counts()

    # one bar per distinct score, bar height = number of passengers
    plt.figure(figsize=(9, 3))
    plt.bar(score_counts.index, score_counts.values)
    plt.title(variable)
    plt.xlabel("Passengers Score")
    plt.ylabel("Frequency")
    plt.show()

    print("{}: \n {}".format(variable, score_counts))

In [None]:
service=["On-board_service", "Leg_room_service", "Checkin_service","Inflight_service"]

for c in service:
    service_plot(c)

* There are 507 0 points in the leg room service.4624 has 1 point, 9436 has 2 points. Personal travel passengers are not satisfied with this service.
* There are lots of 1 and 2 points in on board service and checking service.

<a id = "20"></a><br>
## Eating, Drinking, Cleaning and In-flight Entertainment Scoring of Personal Travel Passenger

In [None]:
def eat_plot(variable):
    """
    Chart and print the value counts of one column of the ``personal`` frame.

    ``variable`` is the column name; the frame is the notebook-level
    ``personal`` subset (personal-travel passengers).
    """
    counts_by_score = personal[variable].value_counts()

    # bar chart of how often each score was given
    plt.figure(figsize=(9, 3))
    plt.bar(counts_by_score.index, counts_by_score.values)
    plt.title(variable)
    plt.ylabel("Frequency")
    plt.xlabel("Passengers Score")
    plt.show()

    print("{}: \n {}".format(variable, counts_by_score))

In [None]:
eat=["Food_and_drink", "Cleanliness", "Inflight_entertainment"]

for c in eat:
    eat_plot(c)

* There are many passengers who give 1 and 2 points in eating and drinking.There are also 64 passengers giving 0 points.
* There are many passengers who give 1 and 2 points in cleaning.
* There are many passengers who get 1 and 2 points in in-flight entertainment.
* There may be a problem with these services.

<a id = "21"></a><br>
## Aircraft Specifications Scoring

In [None]:
def flight_plot(variable):
    """
    Chart and print the value counts of one column of the ``personal`` frame.

    ``variable`` is the column name; the frame is the notebook-level
    ``personal`` subset (personal-travel passengers).
    """
    frequencies = personal[variable].value_counts()

    # bar chart of how often each score was given
    plt.figure(figsize=(9, 3))
    plt.bar(frequencies.index, frequencies.values)
    plt.xlabel("Passengers Score")
    plt.title(variable)
    plt.ylabel("Frequency")
    plt.show()

    print("{}: \n {}".format(variable, frequencies))

In [None]:
flight=["Gate_location", "Seat_comfort", "Baggage_handling"]

for c in flight:
    flight_plot(c)

* The gate position of the plane is not liked.
* Seat comfort is also not liked.

<a id = "23"></a><br>
# Feature Engineering 
* Our aim in this section is to bring out new features.

<a id = "24"></a><br>
## Customer Type

In [None]:
train_df.head()

In [None]:
sns.countplot(x="Customer_Type",data=train_df)


In [None]:
train_df=pd.get_dummies(train_df,columns=["Customer_Type"])
train_df.head()

<a id = "25"></a><br>
## Type of Travel

In [None]:
train_df.Type_of_Travel.head()

In [None]:
sns.countplot(x="Type_of_Travel",data=train_df)

In [None]:
train_df=pd.get_dummies(train_df,columns=["Type_of_Travel"])
train_df.head()

<a id = "26"></a><br>
## Class

In [None]:
sns.barplot(x=train_df.Class.value_counts().values,y=train_df.Class.value_counts().index)
plt.xlabel("Number of Passenger")
plt.show()

In [None]:
train_df=pd.get_dummies(train_df,columns=["Class"])
train_df.head()

<a id = "27"></a><br>
## Gender

In [None]:
sns.countplot(x="Gender",data=train_df)

In [None]:
train_df=pd.get_dummies(train_df,columns=["Gender"])
train_df.head()

<a id = "28"></a><br>
## Drop ID

In [None]:
train_df.drop(labels=["id"],axis=1,inplace=True)

In [None]:
train_df.head()

<a id = "29"></a><br>
# Modeling
* Algorithms that we will use when creating our model;
* Logistic Regression
* Random Forest Classifer
* Decision Tree Classifer
* KNeighbors Classifer

## Import Libraries

In [None]:
from sklearn.model_selection import train_test_split,StratifiedKFold,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

<a id = "30"></a><br>
## Train - Test Split

In [None]:
train_df_len

In [None]:
# Rows after the first train_df_len rows of the concatenated frame are the test set.
# drop() on the slice returns a new frame; the previous in-place drop mutated a
# slice of train_df (chained assignment, SettingWithCopyWarning).
test=train_df[train_df_len:].drop(labels=["satisfaction"],axis=1)

In [None]:
test.head()

In [None]:
train=train_df[:train_df_len]
X_train=train.drop(labels="satisfaction",axis=1)
y_train=train["satisfaction"]
X_train,X_test,y_train,y_test=train_test_split(X_train,y_train,test_size=0.33,random_state=42)
print("X_train",len(X_train))
print("X_test",len(X_test))
print("y_train",len(y_train))
print("y_test",len(y_test))
print("test",len(test))

<a id = "31"></a><br>
## Simple Logistic Regression

In [None]:
logreg=LogisticRegression()
logreg.fit(X_train,y_train)
acc_log_train=round(logreg.score(X_train,y_train)*100,2)
acc_log_test=round(logreg.score(X_test,y_test)*100,2)
print("Training Accuracy: % {}".format(acc_log_train))
print("Test Accuracy: % {}".format(acc_log_test))

* Our logistic regression model gives 81.94% correct results.
* An error rate of about 18% is not good for this data.

## Confusion Matrix (Logistic Regression)

In [None]:
y_pred=logreg.predict(X_test)
cm=confusion_matrix(y_test,y_pred)
sns.heatmap(cm,annot=True,fmt="d") 
plt.show()

* In the confusion matrix we can see where our mistakes are.
* 0: neutral or dissatisfied, 1: satisfied.

<a id = "32"></a><br>
## Hyperparameter Tuning - Grid Search - Cross Validation
We will compare 4 ml classifier and evaluate mean accuracy of each of them by stratified cross validation.
* Decision Tree
* Random Forest
* KNN
* Logistic Regression

In [None]:
random_state=42

# Candidate models; classifier[i] is tuned with classifier_param[i] below.
classifier = [DecisionTreeClassifier(random_state=random_state),
             RandomForestClassifier(random_state=random_state),
             # liblinear supports both the "l1" and "l2" penalties searched below;
             # the default lbfgs solver rejects "l1", so half of the grid failed to fit.
             LogisticRegression(solver="liblinear",random_state=random_state),
             KNeighborsClassifier()]

# Decision tree: split threshold and depth.
dt_param_grid={"min_samples_split":range(10,500,20),
              "max_depth":range(1,20,2)}

# Random forest: feature subsampling, split/leaf sizes and number of trees.
rf_param_grid={"max_features":[1,3,10],
              "min_samples_split":[2,3,10],
              "min_samples_leaf":[1,3,10],
              "bootstrap":[False],
              "n_estimators":[100,300],
              "criterion":["gini"]}

# Logistic regression: regularisation strength (1e-3 ... 1e3) and penalty type.
logreg_param_grid={"C":np.logspace(-3,3,7),
                  "penalty":["l1","l2"]}

# KNN: odd neighbour counts 1..19, weighting scheme and distance metric.
knn_param_grid={"n_neighbors": np.linspace(1,19,10,dtype=int).tolist(),
               "weights":["uniform","distance"],
               "metric":["euclidean","manhattan"]}

# Same order as `classifier`.
classifier_param=[dt_param_grid,
                 rf_param_grid,
                 logreg_param_grid,
                 knn_param_grid]


In [None]:
# Tune every candidate with a 10-fold stratified grid search on accuracy and keep,
# per model, the best mean CV score and the corresponding best estimator.
cv_result=[]
best_estimators=[]
for estimator,param_grid in zip(classifier,classifier_param):
    clf=GridSearchCV(estimator,
                     param_grid=param_grid,
                     cv=StratifiedKFold(n_splits=10),
                     scoring="accuracy",
                     n_jobs=-1,
                     verbose=1)
    clf.fit(X_train,y_train)
    cv_result.append(clf.best_score_)
    best_estimators.append(clf.best_estimator_)
    print(cv_result[-1])

In [None]:
# Best mean cross-validated accuracy per model, taken from the grid search results
# (cv_result is filled in the same order as the classifier list) instead of
# hand-copied numbers that go stale when the data or the grids change.
cv_results = pd.DataFrame({"Cross Validation Means":cv_result,
                           "ML Models":["DecisionTreeClassifier","RandomForestClassifier",
                                        "LogisticRegression",
                                        "KNeighborsClassifier"]})

# x and y are passed by keyword: current seaborn no longer accepts them positionally.
g = sns.barplot(x="Cross Validation Means", y="ML Models", data = cv_results)
g.set_xlabel("Mean Accuracy")
g.set_title("Cross Validation Scores")
plt.show()

* DecisionTreeClassifier accuracy score: 94.2%
* RandomForestClassifier accuracy score: 95.7%
* LogisticRegression : 81.9%
* KNeighborsClassifier: 78.4%

<a id = "33"></a><br>
## Ensemble Modeling
* We will combine the 2 algorithms with the best scores: the random forest classifier and the decision tree classifier.

In [None]:
# Soft-voting ensemble of the tuned decision tree (best_estimators[0]) and the
# tuned random forest (best_estimators[1]).
ensemble_members = [("dt", best_estimators[0]), ("rfc", best_estimators[1])]
votingC = VotingClassifier(estimators=ensemble_members, voting="soft", n_jobs=-1)
votingC = votingC.fit(X_train, y_train)

# Accuracy on the part of the training data held out by train_test_split.
ensemble_pred = votingC.predict(X_test)
print(accuracy_score(ensemble_pred, y_test))

* Our model estimates 95.4% correctly. Our model works well.

<a id = "34"></a><br>
## Prediction and Submission

In [None]:
test_satisfaction = pd.Series(votingC.predict(test), name = "satisfaction").astype(int)
results = pd.concat([test_df.id, test_satisfaction],axis = 1)
results.to_csv("satisfaction.csv", index = False)
results.head()

<a id = "35"></a><br>
# Conclusion
* In this project, I tried to find out if people are satisfied with the airplane journey.
* You can support me by liking and commenting.
* See you in my next project !
