In [1]:
#Import all necessary modules 
import numpy as  np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
#load dataset
df=pd.read_csv('airline_passenger_satisfaction.csv')

In [3]:
df.head()

Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,...,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling,Satisfaction
0,1,Male,48,First-time,Business,Business,821,2,5.0,3,...,3,5,2,5,5,5,3,5,5,Neutral or Dissatisfied
1,2,Female,35,Returning,Business,Business,821,26,39.0,2,...,5,4,5,5,3,5,2,5,5,Satisfied
2,3,Male,41,Returning,Business,Business,853,0,0.0,4,...,3,5,3,5,5,3,4,3,3,Satisfied
3,4,Male,50,Returning,Business,Business,1905,0,0.0,2,...,5,5,5,4,4,5,2,5,5,Satisfied
4,5,Female,49,Returning,Business,Business,3470,0,1.0,3,...,3,4,4,5,4,3,3,3,3,Satisfied


# Pre-processing part

In [4]:
#drop unnecessary column ie ID
df.drop('ID',axis=1,inplace=True)

In [5]:
#check for null values
df.isnull().sum()

Gender                                      0
Age                                         0
Customer Type                               0
Type of Travel                              0
Class                                       0
Flight Distance                             0
Departure Delay                             0
Arrival Delay                             393
Departure and Arrival Time Convenience      0
Ease of Online Booking                      0
Check-in Service                            0
Online Boarding                             0
Gate Location                               0
On-board Service                            0
Seat Comfort                                0
Leg Room Service                            0
Cleanliness                                 0
Food and Drink                              0
In-flight Service                           0
In-flight Wifi Service                      0
In-flight Entertainment                     0
Baggage Handling                  

In [6]:
# fill null values of Arrival Delay with mean of that column
m=df['Arrival Delay'].mean()
df['Arrival Delay'].fillna(m,inplace=True)

In [7]:
#cross check
df.isnull().sum()

Gender                                    0
Age                                       0
Customer Type                             0
Type of Travel                            0
Class                                     0
Flight Distance                           0
Departure Delay                           0
Arrival Delay                             0
Departure and Arrival Time Convenience    0
Ease of Online Booking                    0
Check-in Service                          0
Online Boarding                           0
Gate Location                             0
On-board Service                          0
Seat Comfort                              0
Leg Room Service                          0
Cleanliness                               0
Food and Drink                            0
In-flight Service                         0
In-flight Wifi Service                    0
In-flight Entertainment                   0
Baggage Handling                          0
Satisfaction                    

In [8]:
#check for duplicate values
df.duplicated().sum()

0

In [9]:
#check for datatypes
df.dtypes

Gender                                     object
Age                                         int64
Customer Type                              object
Type of Travel                             object
Class                                      object
Flight Distance                             int64
Departure Delay                             int64
Arrival Delay                             float64
Departure and Arrival Time Convenience      int64
Ease of Online Booking                      int64
Check-in Service                            int64
Online Boarding                             int64
Gate Location                               int64
On-board Service                            int64
Seat Comfort                                int64
Leg Room Service                            int64
Cleanliness                                 int64
Food and Drink                              int64
In-flight Service                           int64
In-flight Wifi Service                      int64


In [10]:
#we need to convert datatype of column into numerical type of column by applyng Label Encoder
from sklearn.preprocessing import LabelEncoder

In [11]:
#seperate object type and integer type data
df_cat=df.select_dtypes('object')
df_num=df.select_dtypes(['int64','float64'])

In [12]:
df_cat.columns

Index(['Gender', 'Customer Type', 'Type of Travel', 'Class', 'Satisfaction'], dtype='object')

In [13]:
columns=df_cat.columns
for col in columns:
    le=LabelEncoder()
    df_cat[col]=le.fit_transform(df[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat[col]=le.fit_transform(df[col])


In [14]:
#cross check
df_cat

Unnamed: 0,Gender,Customer Type,Type of Travel,Class,Satisfaction
0,1,0,0,0,0
1,0,1,0,0,1
2,1,1,0,0,1
3,1,1,0,0,1
4,0,1,0,0,1
...,...,...,...,...,...
129875,1,1,1,2,0
129876,1,1,1,2,0
129877,1,1,1,2,0
129878,1,1,1,2,1


In [15]:
#merg both type of set
df_new=pd.concat([df_cat,df_num],axis=1)

In [16]:
df_new.head()

Unnamed: 0,Gender,Customer Type,Type of Travel,Class,Satisfaction,Age,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,...,Gate Location,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling
0,1,0,0,0,0,48,821,2,5.0,3,...,3,3,5,2,5,5,5,3,5,5
1,0,1,0,0,1,35,821,26,39.0,2,...,2,5,4,5,5,3,5,2,5,5
2,1,1,0,0,1,41,853,0,0.0,4,...,4,3,5,3,5,5,3,4,3,3
3,1,1,0,0,1,50,1905,0,0.0,2,...,2,5,5,5,4,4,5,2,5,5
4,0,1,0,0,1,49,3470,0,1.0,3,...,3,3,4,4,5,4,3,3,3,3


In [17]:
# split data into input and output
x=df_new.drop('Satisfaction',axis=1)#input variables
y=df_new['Satisfaction'] #output varaible

In [18]:
#give data to train_test_split class
from sklearn.model_selection import train_test_split

In [19]:
# spilt data into 2 parts ie. training(70%) and testing(30%)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=1)

In [20]:
#apply scalling on input data
#import standard scaller
from sklearn.preprocessing import StandardScaler

In [21]:
ss=StandardScaler()
x_train=ss.fit_transform(x_train)#converting into numpy array
x_test=ss.transform(x_test)

In [22]:
# check wheather the data is balance or not
# first check ytrain(0 and 1) :how many sample of o's and 1's
y_train.value_counts()

0    51380
1    39536
Name: Satisfaction, dtype: int64

In [23]:
#difference is less than 50%
#hence no need to apply sampling technique

# Create model function

In [24]:
#create model to perform ML algorithms
def create_model(model):
    model.fit(x_train,y_train)# train model with 70% data
    y_pred=model.predict(x_test)#test with 30% data
    print(classification_report(y_test,y_pred))
    print(confusion_matrix(y_test,y_pred))
    return model
#import classification_report,confusion_matrix
from sklearn.metrics import classification_report,confusion_matrix

# Using Logistic Regression

In [25]:
#First create meodel using Logistic resgression
#import logistic regression module
from sklearn.linear_model import LogisticRegression

In [26]:
lr=LogisticRegression(random_state=1)
lr=create_model(lr)

              precision    recall  f1-score   support

           0       0.88      0.91      0.89     22072
           1       0.87      0.83      0.85     16892

    accuracy                           0.87     38964
   macro avg       0.87      0.87      0.87     38964
weighted avg       0.87      0.87      0.87     38964

[[19996  2076]
 [ 2817 14075]]


In [122]:
#with Logistic Regression we are got f1 score 0=89% 1=85%

# Using Decision Tree Clasifier

# Gini-index method

In [27]:
#create meodel using DecisionTree Classifier
#import logistic regression module
from sklearn.tree import DecisionTreeClassifier

In [29]:
dt=DecisionTreeClassifier(random_state=1)
dt=create_model(dt)

              precision    recall  f1-score   support

           0       0.95      0.95      0.95     22072
           1       0.94      0.94      0.94     16892

    accuracy                           0.95     38964
   macro avg       0.94      0.95      0.94     38964
weighted avg       0.95      0.95      0.95     38964

[[20969  1103]
 [ 1012 15880]]


In [100]:
#with decision tree we got f1 score of 0=95% & 1=94%

In [30]:
#check information gain : 
dict={'Input':x.columns,'IG':dt.feature_importances_} 
df1=pd.DataFrame(dict)
#sorting in decending order 
df1.sort_values('IG',ascending=False ,ignore_index=True)

Unnamed: 0,Input,IG
0,Online Boarding,0.361078
1,In-flight Wifi Service,0.17759
2,Type of Travel,0.14945
3,In-flight Entertainment,0.048454
4,Customer Type,0.042187
5,Check-in Service,0.028661
6,Flight Distance,0.02431
7,Age,0.02161
8,Class,0.021199
9,Baggage Handling,0.018515


In [31]:
#to improve our f1score we apply puring technique
#1)max_depth technique
#2)min_samples_leaf technique

In [95]:
# using max_depth technique
'''for i in range(1,9):
    dt1=DecisionTreeClassifier(random_state=1,max_depth=i)
    print('max_depth:',i)
    dt1=create_model(dt1)'''
#heat and trial method

"for i in range(1,9):\n    dt1=DecisionTreeClassifier(random_state=1,max_depth=i)\n    print('max_depth:',i)\n    dt1=create_model(dt1)"

In [96]:
dt=DecisionTreeClassifier(random_state=1,max_depth=8)
dt=create_model(dt)

              precision    recall  f1-score   support

           0       0.94      0.95      0.94     22072
           1       0.93      0.92      0.93     16892

    accuracy                           0.94     38964
   macro avg       0.94      0.93      0.93     38964
weighted avg       0.94      0.94      0.94     38964

[[20962  1110]
 [ 1382 15510]]


In [33]:
#with max depth technique we are got f1 score 0=94% 1=93%

In [97]:
#second puring technique: min_sampleleaf
#min_samples_leaf >=45 and<=100
'''for i in range(45,101,1): #i=45 i=47,.....
 dt2=DecisionTreeClassifier(random_state=1,min_samples_leaf=i)
 #by default gini index
 print('min_samples_leaf :',i)
 #call function
 dt2=create_model(dt2)'''

"for i in range(45,101,1): #i=45 i=47,.....\n dt2=DecisionTreeClassifier(random_state=1,min_samples_leaf=i)\n #by default gini index\n print('min_samples_leaf :',i)\n #call function\n dt2=create_model(dt2)"

In [98]:
dt2=DecisionTreeClassifier(random_state=1,min_samples_leaf=45)
dt2=create_model(dt2)

              precision    recall  f1-score   support

           0       0.94      0.97      0.96     22072
           1       0.95      0.93      0.94     16892

    accuracy                           0.95     38964
   macro avg       0.95      0.95      0.95     38964
weighted avg       0.95      0.95      0.95     38964

[[21310   762]
 [ 1246 15646]]


In [99]:
#with min_samples_leaf technique we got f1 score 0=96% 1=94%

In [37]:
#check information gain : 
dict={'Input':x.columns,'IG':dt2.feature_importances_} 
df1=pd.DataFrame(dict)
#sorting in decending order 
df1.sort_values('IG',ascending=False ,ignore_index=True)

Unnamed: 0,Input,IG
0,Online Boarding,0.414184
1,In-flight Wifi Service,0.199929
2,Type of Travel,0.169368
3,In-flight Entertainment,0.053215
4,Customer Type,0.044655
5,Check-in Service,0.027294
6,Class,0.021148
7,Baggage Handling,0.011902
8,Gate Location,0.011025
9,In-flight Service,0.009904


In [38]:
#with gini index we got maximum f1 score of 0=96% & 1=94%

# Entropy method

In [40]:
dte=DecisionTreeClassifier(random_state=1,criterion='entropy')
dte=create_model(dte)

              precision    recall  f1-score   support

           0       0.96      0.95      0.95     22072
           1       0.94      0.94      0.94     16892

    accuracy                           0.95     38964
   macro avg       0.95      0.95      0.95     38964
weighted avg       0.95      0.95      0.95     38964

[[21027  1045]
 [  984 15908]]


In [101]:
#we got f1 score of 0=95% & 1=94%

In [41]:
#check information gain : 
dict={'Input':x.columns,'IG':dte.feature_importances_} 
df1=pd.DataFrame(dict)
#sorting in decending order 
df1.sort_values('IG',ascending=False ,ignore_index=True)

Unnamed: 0,Input,IG
0,Online Boarding,0.30168
1,In-flight Wifi Service,0.186771
2,Type of Travel,0.131633
3,Customer Type,0.048655
4,Class,0.039084
5,On-board Service,0.036101
6,Flight Distance,0.028197
7,In-flight Entertainment,0.028123
8,Age,0.026566
9,Check-in Service,0.02583


In [42]:
#to improve our f1score we apply puring technique
#1)max_depth technique
#2)min_samples_leaf technique

In [104]:
# using max_depth technique
'''for i in range(1,9):
    dte1=DecisionTreeClassifier(random_state=1,max_depth=i,criterion='entropy')
    print('max_depth:',i)
    dte1=create_model(dte1)'''

"for i in range(1,9):\n    dte1=DecisionTreeClassifier(random_state=1,max_depth=i,criterion='entropy')\n    print('max_depth:',i)\n    dte1=create_model(dte1)"

In [102]:
dte1=DecisionTreeClassifier(random_state=1,max_depth=8,criterion='entropy')
dte1=create_model(dte1)

              precision    recall  f1-score   support

           0       0.94      0.95      0.95     22072
           1       0.93      0.93      0.93     16892

    accuracy                           0.94     38964
   macro avg       0.94      0.94      0.94     38964
weighted avg       0.94      0.94      0.94     38964

[[20923  1149]
 [ 1222 15670]]


In [103]:
#with max depth technique we are got drop in f1 score 0=95% 1=93%

In [105]:
#second puring technique: min_sampleleaf
#min_samples_leaf >=45 and<=100
'''for i in range(45,101,1): #i=45 i=47,.....
 dte2=DecisionTreeClassifier(random_state=1,min_samples_leaf=i,criterion='entropy')
 #by default gini index
 print('min_samples_leaf :',i)
 #call function
 dte2=create_model(dte2)'''

"for i in range(45,101,1): #i=45 i=47,.....\n dte2=DecisionTreeClassifier(random_state=1,min_samples_leaf=i,criterion='entropy')\n #by default gini index\n print('min_samples_leaf :',i)\n #call function\n dte2=create_model(dte2)"

In [45]:
dte2=DecisionTreeClassifier(random_state=1,min_samples_leaf=45,criterion='entropy')
dte2=create_model(dte2)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96     22072
           1       0.96      0.93      0.94     16892

    accuracy                           0.95     38964
   macro avg       0.95      0.95      0.95     38964
weighted avg       0.95      0.95      0.95     38964

[[21387   685]
 [ 1204 15688]]


In [106]:
#with entropy method we got f1 score of 0=96% & 1=94%
# we got same output by both method 
#hence our final output after applig Decision tree is 0=96% & 1=94%

# Using RandomForest

In [48]:
#crate model using Random Forest
from sklearn.ensemble import RandomForestClassifier

In [107]:
'''for i in range(10,101):
    rfc=RandomForestClassifier(n_estimators=i,random_state=1)
    print('no. of decision tree:',i)
    rfc=create_model(rfc)'''

"for i in range(10,101):\n    rfc=RandomForestClassifier(n_estimators=i,random_state=1)\n    print('no. of decision tree:',i)\n    rfc=create_model(rfc)"

In [50]:
#we choose no_decisiontree=29(n_estimator)
rfc=RandomForestClassifier(random_state=1,n_estimators=29)
rfc=create_model(rfc)

              precision    recall  f1-score   support

           0       0.96      0.98      0.97     22072
           1       0.97      0.94      0.96     16892

    accuracy                           0.96     38964
   macro avg       0.96      0.96      0.96     38964
weighted avg       0.96      0.96      0.96     38964

[[21598   474]
 [ 1011 15881]]


In [108]:
#with Random Forest we got f1 score of 0=97% and 1=96%

In [51]:
#check information gain : 
dict={'Input':x.columns,'IG':rfc.feature_importances_} 
df1=pd.DataFrame(dict)
#sorting in decending order 
df1.sort_values('IG',ascending=False ,ignore_index=True)

Unnamed: 0,Input,IG
0,Online Boarding,0.192875
1,In-flight Wifi Service,0.148736
2,Class,0.105648
3,Type of Travel,0.096767
4,In-flight Entertainment,0.05996
5,Leg Room Service,0.039227
6,Ease of Online Booking,0.036083
7,Seat Comfort,0.035923
8,Flight Distance,0.034834
9,Age,0.032538


In [110]:
#apply pruning technique 1:max_depth method
'''for i in range (1,9):
     rfc1=RandomForestClassifier(n_estimators=29,random_state=1,max_depth=i)
     print('max depth',i)
     rfc1=create_model(rfc1)'''

"for i in range (1,9):\n     rfc1=RandomForestClassifier(n_estimators=29,random_state=1,max_depth=i)\n     print('max depth',i)\n     rfc1=create_model(rfc1)"

In [109]:
rfc1=RandomForestClassifier(n_estimators=29,random_state=1,max_depth=8)
rfc1=create_model(rfc1)

              precision    recall  f1-score   support

           0       0.94      0.95      0.95     22072
           1       0.94      0.92      0.93     16892

    accuracy                           0.94     38964
   macro avg       0.94      0.94      0.94     38964
weighted avg       0.94      0.94      0.94     38964

[[21004  1068]
 [ 1356 15536]]


In [111]:
#we got f1 score of 0=95% and 1=93%

In [113]:
#apply second pruning technique ie. min_sample_leaf(>=45 and<=100)
'''for i in range (45,100):
     rfc2=RandomForestClassifier(n_estimators=29,random_state=1,min_samples_leaf=i)
     print('minsampeles_leaf',i)
     rfc2=create_model(rfc2)'''

"for i in range (45,100):\n     rfc2=RandomForestClassifier(n_estimators=29,random_state=1,min_samples_leaf=i)\n     print('minsampeles_leaf',i)\n     rfc2=create_model(rfc2)"

In [112]:
rfc2=RandomForestClassifier(n_estimators=29,random_state=1,min_samples_leaf=45)
rfc2=create_model(rfc2)

              precision    recall  f1-score   support

           0       0.94      0.96      0.95     22072
           1       0.95      0.92      0.93     16892

    accuracy                           0.94     38964
   macro avg       0.94      0.94      0.94     38964
weighted avg       0.94      0.94      0.94     38964

[[21194   878]
 [ 1316 15576]]


In [None]:
#we got f1 score of 0=95% and 1=93%

# Using Adaboost Technique

In [54]:
# create model using Adaboosting technique
from sklearn.ensemble import AdaBoostClassifier

In [115]:
'''for i in range(1,22):#as we have 21 input varaiables
    ada=AdaBoostClassifier(n_estimators=i,random_state=1)
    print('no. of decions stump',i)
    ada=create_model(ada)'''

"for i in range(1,22):#as we have 21 input varaiables\n    ada=AdaBoostClassifier(n_estimators=i,random_state=1)\n    print('no. of decions stump',i)\n    ada=create_model(ada)"

In [114]:
ada=AdaBoostClassifier(n_estimators=,random_state=1)
ada=create_model(ada)

              precision    recall  f1-score   support

           0       0.93      0.94      0.94     22072
           1       0.92      0.91      0.92     16892

    accuracy                           0.93     38964
   macro avg       0.93      0.93      0.93     38964
weighted avg       0.93      0.93      0.93     38964

[[20816  1256]
 [ 1485 15407]]


In [56]:
#with Adaboost we got f1 score of  0=94% and 1=92%

# Using Gradient Boost

In [57]:
from sklearn.ensemble import  GradientBoostingClassifier

In [116]:
'''for i in range(10,101):
    #create the object of gradientboostclassier class
    gbc= GradientBoostingClassifier(n_estimators=i,random_state=1)
    print('No of estimators :',i)
    #call function
    gbc=create_model(gbc)'''

"for i in range(10,101):\n    #create the object of gradientboostclassier class\n    gbc= GradientBoostingClassifier(n_estimators=i,random_state=1)\n    print('No of estimators :',i)\n    #call function\n    gbc=create_model(gbc)"

In [59]:
gbc= GradientBoostingClassifier(n_estimators=35,random_state=1)
gbc=create_model(gbc)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94     22072
           1       0.93      0.90      0.92     16892

    accuracy                           0.93     38964
   macro avg       0.93      0.93      0.93     38964
weighted avg       0.93      0.93      0.93     38964

[[20901  1171]
 [ 1620 15272]]


In [None]:
#with gradient boosting we got f1 score of 0=94% and 1=92%

# Using xtreame gradient boost

In [117]:
from xgboost import XGBClassifier
'''for i in range(10,101):
 #create the object of XGBclassier class
 xgc= XGBClassifier(n_estimators=i,random_state=1,reg_alpha=1)
 print('No of estimators :',i)
 #call function
 xgc=create_model(gbc)'''

"for i in range(10,101):\n #create the object of XGBclassier class\n xgc= XGBClassifier(n_estimators=i,random_state=1,reg_alpha=1)\n print('No of estimators :',i)\n #call function\n xgc=create_model(gbc)"

In [61]:
xgc= XGBClassifier(n_estimators=10,random_state=1,reg_alpha=1)
xgc=create_model(xgc)



              precision    recall  f1-score   support

           0       0.94      0.97      0.95     22072
           1       0.95      0.93      0.94     16892

    accuracy                           0.95     38964
   macro avg       0.95      0.95      0.95     38964
weighted avg       0.95      0.95      0.95     38964

[[21310   762]
 [ 1264 15628]]


In [62]:
# using xtreame boosting we got f1 score of 0=95% and 1=94%

# Using KNN method

In [72]:
from sklearn.neighbors import KNeighborsClassifier
#create object of KNeighnors classifier
knc=KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
#p=2 means Eucidean distance method

In [73]:
#call function
knc=create_model(knc)

              precision    recall  f1-score   support

           0       0.91      0.96      0.94     22072
           1       0.95      0.88      0.91     16892

    accuracy                           0.93     38964
   macro avg       0.93      0.92      0.93     38964
weighted avg       0.93      0.93      0.93     38964

[[21276   796]
 [ 2010 14882]]


In [74]:
# using KNN we got f1 score of 0=95% and 1=94%

# Using SVM algorithum

In [76]:
from sklearn.svm import LinearSVC

In [77]:
# Create object of linearSVM class
svc= LinearSVC(random_state=1)
# we have not added any error for our outliers so we will get a line which will be called hard line
# we call the function:
svc = create_model(svc)

              precision    recall  f1-score   support

           0       0.87      0.91      0.89     22072
           1       0.87      0.83      0.85     16892

    accuracy                           0.87     38964
   macro avg       0.87      0.87      0.87     38964
weighted avg       0.87      0.87      0.87     38964

[[20015  2057]
 [ 2880 14012]]




In [78]:
#suppose outlier in our dataset means model is overfitted   
#so reduce overfit column or remove outlier by adding eternal erro at training time
#means again create object of class of  svc and pass charecter 'C' means error
# the value of c can be <=1
svc1=LinearSVC(random_state=1,C=0.9)# soft margine

In [79]:
svc1=create_model(svc1)

              precision    recall  f1-score   support

           0       0.87      0.91      0.89     22072
           1       0.87      0.83      0.85     16892

    accuracy                           0.87     38964
   macro avg       0.87      0.87      0.87     38964
weighted avg       0.87      0.87      0.87     38964

[[20015  2057]
 [ 2880 14012]]




In [80]:
# even after changing value oc 'c' we get same output 
#hence it is non linear dataset

In [81]:
#polynomial kernal function
from sklearn.svm import SVC

In [82]:
poly_svc=SVC(random_state=1,kernel='poly')

In [84]:
poly_svc=create_model(poly_svc)

              precision    recall  f1-score   support

           0       0.93      0.96      0.95     22072
           1       0.95      0.91      0.93     16892

    accuracy                           0.94     38964
   macro avg       0.94      0.94      0.94     38964
weighted avg       0.94      0.94      0.94     38964

[[21217   855]
 [ 1512 15380]]


In [85]:
#using radial function
radial_svc=SVC(random_state=1,kernel='rbf')

In [86]:
radial_svc=create_model(radial_svc)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96     22072
           1       0.96      0.93      0.95     16892

    accuracy                           0.95     38964
   macro avg       0.96      0.95      0.95     38964
weighted avg       0.95      0.95      0.95     38964

[[21418   654]
 [ 1108 15784]]


In [87]:
# using SVM(radial svc) we got f1 score of 0=96% and 1=95%

# Using Naive Bayes Theorem

In [119]:
from sklearn.naive_bayes import GaussianNB
# Creating the object of Gaussian Class
gnb = GaussianNB()

In [120]:
# Calling the function
gnb = create_model(gnb)

              precision    recall  f1-score   support

           0       0.86      0.90      0.88     22072
           1       0.86      0.81      0.84     16892

    accuracy                           0.86     38964
   macro avg       0.86      0.86      0.86     38964
weighted avg       0.86      0.86      0.86     38964

[[19853  2219]
 [ 3165 13727]]


In [121]:
#with naive bayes we got  f1 score of 0=88% and 1=84%

# Conclusion:
We found that for this dataset Random Forest algo gives best f1 score ie 0=97% 1=96%