In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from sklearn.model_selection import cross_val_score, train_test_split, cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import pairwise_distances, accuracy_score, classification_report, confusion_matrix, plot_confusion_matrix, recall_score
from numpy import mean
from numpy import std

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input mount and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def makeTable(headerRow,columnizedData,columnSpacing=2):
    """Print a technical-paper style, left-justified text table.

    Author: Christopher Collett
    Date: 6/1/2019

    Parameters
    ----------
    headerRow : sequence
        One title per column. The sequence is read only; it is never modified.
    columnizedData : sequence of sequences
        The table body given column by column (``columnizedData[i][j]`` is the
        cell in column ``i``, row ``j``). Cells are converted with ``str``.
    columnSpacing : int, default 2
        Number of blanks inserted between adjacent columns.

    Raises
    ------
    ValueError
        If the number of titles differs from the number of columns, or the
        columns do not all have the same number of cells.

    The table is written to standard output: a dashed rule, the header, a
    dashed rule, one line per row, and a closing dashed rule. Every line has
    the same width.
    """
    # Work on plain Python strings. A fixed-width NumPy string array would
    # silently truncate any padding written back into it.
    headers = [str(title) for title in headerRow]
    columns = [[str(cell) for cell in column] for column in columnizedData]

    if len(headers) != len(columns):
        raise ValueError(
            "headerRow has %d titles but columnizedData has %d columns"
            % (len(headers), len(columns))
        )

    rowCount = max((len(column) for column in columns), default=0)
    if any(len(column) != rowCount for column in columns):
        raise ValueError("all columns must contain the same number of cells")

    # A column is as wide as its widest entry, the title included.
    widths = [
        max([len(title)] + [len(cell) for cell in column])
        for title, column in zip(headers, columns)
    ]

    gap = ' ' * columnSpacing
    header = gap.join(title.ljust(width) for title, width in zip(headers, widths))
    rows = [
        gap.join(column[rowIndex].ljust(width) for column, width in zip(columns, widths))
        for rowIndex in range(rowCount)
    ]

    line = '-' * len(header)
    print(line)
    print(header)
    print(line)
    for row in rows:
        print(row)
    print(line)
    
def presentResults(scoresDictionary):
    """Print the mean and standard deviation of cross-validation scores.

    Parameters
    ----------
    scoresDictionary : mapping
        Must provide the keys 'test_precision_macro', 'test_accuracy',
        'test_recall_macro' and 'test_f1_macro', each holding the per-fold
        scores as an array-like of numbers. Other keys (for example the
        timing entries) are ignored because they are not part of the table.

    The summary is printed through makeTable with one row per metric; both
    statistics are rounded to four decimals. Nothing is returned.
    """
    # (row label, key in the scores mapping), in the order the rows are printed.
    reportedMetrics = [
        ('Precision', 'test_precision_macro'),
        ('Accuracy', 'test_accuracy'),
        ('Recall', 'test_recall_macro'),
        ('F1-score', 'test_f1_macro'),
    ]

    metricRow, meanRow, stdRow = [], [], []
    for label, key in reportedMetrics:
        foldScores = scoresDictionary[key]
        metricRow.append(label)
        meanRow.append(str(round(mean(foldScores), 4)))
        stdRow.append(str(round(std(foldScores), 4)))

    tableHeader = ['Metric', 'Mean', 'Standard Deviation']
    makeTable(tableHeader, [metricRow, meanRow, stdRow])

In [None]:
# Load each split under the name that matches its file (the two names were
# previously swapped), using the first CSV column as the index.
train = pd.read_csv("/kaggle/input/airline-passenger-satisfaction/train.csv", index_col=0)
test = pd.read_csv("/kaggle/input/airline-passenger-satisfaction/test.csv", index_col=0)

# Stack the two splits row-wise into one frame. concat states that intent
# directly; an outer merge keyed on every shared column only behaves like a
# union as long as no row appears in both files.
# Rows are ordered train first, then test, with a fresh 0..n-1 index.
# NOTE(review): assumes both files share the same columns - confirm.
dataset = pd.concat([train, test], ignore_index=True)

In [None]:
# Show the dtype of every column of the combined frame.
dataset.dtypes

In [None]:
# Remove the 'id' column (presumably a row identifier rather than a feature).
# Reassigning instead of dropping in place, together with errors='ignore',
# keeps this cell safe to re-run: a second run is a no-op, not a KeyError.
dataset = dataset.drop(columns=['id'], errors='ignore')

In [None]:
# Summary statistics produced by describe() with its default settings.
dataset.describe()

In [None]:
# Categorical columns: the same summary, with every numeric column excluded.
dataset.describe(exclude=[np.number])

In [None]:
# Count the missing values in each column.
dataset.isnull().sum()

In [None]:
# Median of the arrival-delay column, shown before its missing values are filled.
dataset['Arrival Delay in Minutes'].median()

In [None]:
#Arrival Delay in Minutes
# Fill the missing arrival delays with 0 and assign the result back to the
# column. Calling an inplace method on dataset[...] is chained assignment:
# it may act on a temporary copy and leave the frame unchanged.
dataset['Arrival Delay in Minutes'] = dataset['Arrival Delay in Minutes'].fillna(0)

# Sanity check: the number of values still missing (expected to print 0).
print(dataset['Arrival Delay in Minutes'].isnull().sum())

In [None]:
# Frequency of each distinct arrival-delay value.
dataset['Arrival Delay in Minutes'].value_counts()

In [None]:
# Categorical columns: print the distinct values found in each one.
for label, column in (
    ("Gender:", 'Gender'),
    ("Customer Type:", 'Customer Type'),
    ("Type of Travel:", 'Type of Travel'),
    ("Class:", 'Class'),
    ("Satisfaction:", 'satisfaction'),
):
    print(label, dataset[column].unique())

In [None]:
# Row count for each distinct value of 'Class'.
dataset['Class'].value_counts()

In [None]:
def _initial_key(values):
    """Upper-cased first character of every string in the Series."""
    return values.str[0].str.upper()


def _full_key(values):
    """Every string in the Series, upper-cased."""
    return values.str.upper()


# One entry per categorical column:
#   (printed label, column name, lookup-key builder, key -> integer code).
# All columns are keyed on their first letter except 'Class', whose values
# are matched on the whole upper-cased string.
categorical_encodings = [
    ("Gender:", 'Gender', _initial_key, {'M':0, 'F':1}),
    ("Customer Type:", 'Customer Type', _initial_key, {'L':0, 'D':1}),
    ("Type of Travel:", 'Type of Travel', _initial_key, {'P':0, 'B':1}),
    ("Class:", 'Class', _full_key, {'ECO':0, 'ECO PLUS': 1 ,'BUSINESS':2}),
    ("Satisfaction:", 'satisfaction', _initial_key, {'N':0, 'S':1}),
]

# Replace each column by its integer codes, then print the codes now present.
for label, column, build_key, codes in categorical_encodings:
    dataset[column] = build_key(dataset[column]).map(codes)
    print(label, dataset[column].unique())

In [None]:
# Rescale every feature column by its maximum absolute value. The label
# column 'satisfaction' is deliberately kept out of the scaler so the class
# labels keep their integer encoding instead of going through a feature
# transformer and coming back as floats.
feature_columns = dataset.columns.drop('satisfaction')
transformer = MaxAbsScaler().fit(dataset[feature_columns])
scaled_data = transformer.transform(dataset[feature_columns])
scaled_features = pd.DataFrame(scaled_data, columns=feature_columns, index=dataset.index)

# Re-attach the untouched label and restore the original column order, since
# later cells pick features and label by column position.
dataset = scaled_features.join(dataset['satisfaction'])[dataset.columns]

In [None]:
# Display the rescaled dataset as the cell output.
dataset

In [None]:
#Graphics

# Columns to plot, in reading order (left to right, top to bottom).
histogram_columns = [
    'Gender', 'Customer Type', 'Type of Travel', 'Class',
    'Age', 'Flight Distance', 'Inflight wifi service', 'Departure/Arrival time convenient',
    'Ease of Online booking', 'Gate location', 'Food and drink', 'Online boarding',
    'Seat comfort', 'Inflight entertainment', 'On-board service', 'Leg room service',
    'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness',
    'Departure Delay in Minutes', 'Arrival Delay in Minutes', 'satisfaction',
]

fig, axes = plt.subplots(6, 4, figsize=(28,22))
grid_axes = axes.ravel()

# One histogram per column, each on its own cell of the 6x4 grid.
for axis, column in zip(grid_axes, histogram_columns):
    sns.histplot(dataset, x=column, ax=axis)

# The grid has more cells than there are columns; hide the leftover axes so
# no empty frame is drawn.
for axis in grid_axes[len(histogram_columns):]:
    axis.set_visible(False)



In [None]:
plt.figure(figsize=(14, 10))
# Pass the Series by keyword. What a positional first argument means (x vs.
# data) differs between seaborn versions, which changes the orientation of
# the plot or triggers a warning; x= always draws a horizontal box.
sns.boxplot(x=dataset['Departure Delay in Minutes'])

In [None]:
plt.figure(figsize=(14, 10))
# Pass the Series by keyword. What a positional first argument means (x vs.
# data) differs between seaborn versions, which changes the orientation of
# the plot or triggers a warning; x= always draws a horizontal box.
sns.boxplot(x=dataset['Arrival Delay in Minutes'])

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(28,16))

# (x column, y column) for each panel, in reading order.
scatter_pairs = [
    ('Flight Distance', 'Age'),
    ('Flight Distance', 'Arrival Delay in Minutes'),
    ('Age', 'Arrival Delay in Minutes'),
    ('Flight Distance', 'Departure Delay in Minutes'),
]

for axis, (x_column, y_column) in zip(axes.ravel(), scatter_pairs):
    sns.scatterplot(data=dataset, x=x_column, y=y_column, ax=axis, hue='satisfaction')
    # "A x B" describes the whole panel, so it is set as the title. Putting
    # it on the x axis replaced the real x-axis label with a misleading one.
    axis.set_title(f'{x_column} x {y_column}')

In [None]:
corr = dataset.corr()
fig = plt.figure(figsize=(20, 20))
# Symmetric colour map: the same colours are used for -1 and +1.
cmap = LinearSegmentedColormap.from_list('RedGreenRed', ['crimson','gold','lime', 'lime', 'lime', 'gold', 'crimson'])
# Show only strong correlations in either direction. The absolute value must
# be taken before the comparison: abs(corr > 0.5) is just the boolean mask
# corr > 0.5 and silently hides every strong negative correlation.
sns.heatmap(corr[corr.abs() > 0.5], annot=True, vmin=-1, vmax=1, cmap=cmap)
plt.title("Data Correlation Heatmap", fontsize=14)
plt.show()

In [None]:
# Remove the arrival-delay column before modelling. Reassigning instead of
# dropping in place, together with errors='ignore', keeps this cell safe to
# re-run: a second run is a no-op, not a KeyError.
dataset = dataset.drop(columns=['Arrival Delay in Minutes'], errors='ignore')

In [None]:
# Scorer names handed to cross_validate by every model cell below.
scoring = [
    'precision_macro',
    'recall_macro',
    'f1_macro',
    'accuracy',
]

In [None]:
# 10-fold cross-validation of Gaussian naive Bayes. Every column except the
# last is used as a predictor; the last column is used as the label.
gnb = GaussianNB()
scores = cross_validate(
    estimator=gnb,
    X=dataset.iloc[:, :-1],
    y=dataset.iloc[:, -1],
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)
presentResults(scores)

# scores_gnb = cross_val_score(gnb, dataset.iloc[:, :-1], dataset.iloc[:, -1], cv=10, scoring='accuracy')
# print(scores_gnb.mean())
# print(scores_gnb.std())
# print(scores_gnb)

In [None]:
# 10-fold cross-validation of logistic regression. Every column except the
# last is used as a predictor; the last column is used as the label.
# The 'sag' solver is stochastic; random_state=0 makes repeated runs give the
# same scores, matching the other seeded models in this notebook.
logreg = LogisticRegression(solver='sag', random_state=0)
scores = cross_validate(logreg, dataset.iloc[:, :-1], dataset.iloc[:, -1], scoring=scoring, cv=10, n_jobs=-1)
presentResults(scores)

# scores_logreg = cross_val_score(logreg, dataset.iloc[:, :-1], dataset.iloc[:, -1], cv=10, scoring='accuracy')
# print(scores_logreg.mean())
# print(scores_logreg.std())
# print(scores_logreg)

In [None]:
# 10-fold cross-validation of a seeded decision tree. Every column except the
# last is used as a predictor; the last column is used as the label.
decisiontree = DecisionTreeClassifier(random_state=0)
scores = cross_validate(
    estimator=decisiontree,
    X=dataset.iloc[:, :-1],
    y=dataset.iloc[:, -1],
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)
presentResults(scores)

# scores_decisiontree = cross_val_score(decisiontree, dataset.iloc[:, :-1], dataset.iloc[:, -1], cv=10, scoring='accuracy')
# print(scores_decisiontree.mean())
# print(scores_decisiontree.std())
# print(scores_decisiontree)

In [None]:
# Fit a seeded random forest on the whole dataset with out-of-bag scoring on.
# Every column except the last is a predictor; the last column is the label.
random_forest = RandomForestClassifier(oob_score = True, random_state=0)
random_forest.fit(dataset.iloc[:, :-1], dataset.iloc[:, -1])

# Rank the predictors by the forest's feature_importances_, rounded to 3 decimals.
importances = pd.DataFrame({'feature':dataset.iloc[:, :-1].columns,'importance':np.round(random_forest.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')
print(importances)

# The score on the rows the forest was trained on used to be computed and
# thrown away; print it, labelled, next to the out-of-bag estimate.
print("Training accuracy:", random_forest.score(dataset.iloc[:, :-1], dataset.iloc[:, -1]))
print("Out-of-bag score:", random_forest.oob_score_)

In [None]:
# 10-fold cross-validation of seeded gradient boosting (learning rate 1.0).
# Every column except the last is a predictor; the last column is the label.
gradientboosting = GradientBoostingClassifier(learning_rate=1.0, random_state=0)
scores = cross_validate(
    estimator=gradientboosting,
    X=dataset.iloc[:, :-1],
    y=dataset.iloc[:, -1],
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)
presentResults(scores)

# gradientboosting.fit(dataset.iloc[:, :-1], dataset.iloc[:, -1])
# scores_gradientboosting = cross_val_score(gradientboosting, dataset.iloc[:, :-1], dataset.iloc[:, -1], cv=10, scoring='accuracy')
# print(scores_gradientboosting.mean())
# print(scores_gradientboosting.std())
# print(scores_gradientboosting)


In [None]:
# 10-fold cross-validation of an RBF-kernel support vector classifier.
# Every column except the last is a predictor; the last column is the label.
svc = SVC(kernel='rbf')
scores = cross_validate(
    estimator=svc,
    X=dataset.iloc[:, :-1],
    y=dataset.iloc[:, -1],
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)
presentResults(scores)
# svc.fit(x_treino, y_treino)
# scores_svc = cross_val_score(svc, x, y, cv=10, scoring='accuracy')
# print(scores_svc.mean())
# print(scores_svc.std())
# print(scores_svc)


In [None]:
# 10-fold cross-validation of a multi-layer perceptron trained with L-BFGS.
# Every column except the last is a predictor; the last column is the label.
# random_state=0 fixes the random weight initialisation so the reported
# scores are reproducible, matching the other seeded models in this notebook.
mlp_lbfgs = MLPClassifier(solver='lbfgs', random_state=0)
scores = cross_validate(mlp_lbfgs, dataset.iloc[:, :-1], dataset.iloc[:, -1], scoring=scoring, cv=10, n_jobs=-1)
presentResults(scores)

In [None]:
# 10-fold cross-validation of a multi-layer perceptron trained with Adam.
# Every column except the last is a predictor; the last column is the label.
# random_state=0 fixes the solver's random initialisation so the reported
# scores are reproducible, matching the other seeded models in this notebook.
mlp_adam = MLPClassifier(solver='adam', random_state=0)
scores = cross_validate(mlp_adam, dataset.iloc[:, :-1], dataset.iloc[:, -1], scoring=scoring, cv=10, n_jobs=-1)
presentResults(scores)

In [None]:
# 10-fold cross-validation of a multi-layer perceptron trained with SGD.
# Every column except the last is a predictor; the last column is the label.
# random_state=0 fixes the solver's random initialisation so the reported
# scores are reproducible, matching the other seeded models in this notebook.
mlp_sgd = MLPClassifier(solver='sgd', random_state=0)
scores = cross_validate(mlp_sgd, dataset.iloc[:, :-1], dataset.iloc[:, -1], scoring=scoring, cv=10, n_jobs=-1)
presentResults(scores)