In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the survey data, then discard every row that has a missing value
airlines = pd.read_csv('airline_satisfaction.csv')
airlines = airlines.dropna()

# Preview the first rows of the cleaned frame
airlines.head()

Unnamed: 0,id,Gender,Customer,Age,TravelType,Class,Distance,InflightWifi,ConvenientTime,OnlineBooking,...,InflightEntertainment,OnboardService,LegRoom,Baggage,CheckinService,InflightService,Cleanliness,DepartureDelay,ArrivalDelay,Satisfaction
0,70172,Male,Loyal,13,Personal,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,Unsatisfied
1,5047,Male,Disloyal,25,Business,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,Unsatisfied
2,110028,Female,Loyal,26,Business,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,Satisfied
3,24026,Female,Loyal,25,Business,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,Unsatisfied
4,119299,Male,Loyal,61,Business,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,Satisfied


In [3]:
# Encode the categorical columns as 0/1 indicators, in place on `airlines`.
labelBinary = LabelBinarizer()

# LabelBinarizer orders the class labels by sorting them, so for a two-valued
# column the label that sorts last is the one encoded as 1.

# Gender == 1 if Male
airlines['Gender'] = labelBinary.fit_transform(airlines['Gender']).ravel()
# Customer == 1 if Loyal
airlines['Customer'] = labelBinary.fit_transform(airlines['Customer']).ravel()
# TravelType == 1 if Personal
airlines['TravelType'] = labelBinary.fit_transform(airlines['TravelType']).ravel()

# Class == 1 if Business
# LabelBinarizer is not used here: with more than two categories it returns
# one column per category (which cannot be stored in a single column), and
# with exactly two it would encode 'Business' as 0 because it sorts first.
# NOTE(review): the preview shows 'Business' and 'Eco Plus'; the full file
# presumably also contains 'Eco' - confirm.
# Matching on 1 as well keeps the cell safe to re-run after encoding.
airlines['Class'] = airlines['Class'].isin(['Business', 1]).astype(int)

In [4]:
# Using fewer input features reduces computational time
X = airlines[['Gender', 'Customer', 'Age', 'TravelType', 
              'InflightWifi', 'Cleanliness', 'InflightEntertainment']]
y = airlines[['Satisfaction']]

In [None]:
# How does the number of estimators affect score for the airline satisfaction data?
score = []
# NOTE(review): predvar is never filled in this cell; kept in case later cells use it.
predvar = []

# Exclusive upper bound on the number of base models: ensembles with
# 1 .. N-1 base models are fitted (the plotting cell uses the same range(1, N)).
N = 30

# Hold out part of the data once, so that every ensemble size is scored on the
# same rows and none of those rows were seen during fitting. Scoring on the
# training rows would mostly measure memorisation, not generalisation.
y_flat = np.ravel(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_flat, test_size=0.3, random_state=0, stratify=y_flat
)

for i in range(1, N):
    # Fit the ensemble with i base models
    baggingModel = BaggingClassifier(n_estimators=i, random_state=i)
    baggingModel.fit(X_train, y_train)

    # Mean accuracy on the held-out rows
    score_i = baggingModel.score(X_test, y_test)
    score.append(score_i)

In [None]:
# One x-value per fitted ensemble size, matching the entries of `score`
estimator_counts = list(range(1, N))

# Line chart of score against ensemble size
p = sns.lineplot(x=estimator_counts, y=score, linewidth=2)
p.set_xlabel(xlabel='Number of base models', fontsize=14)
p.set_ylabel(ylabel='Score', fontsize=14)