In [48]:
### imports and warning filters ###
import pandas as pd
import matplotlib as mplib
import numpy as np
import sklearn as sk
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import category_encoders as ce
import warnings
from sklearn.impute import SimpleImputer
from sklearn import tree
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

#filtering warnings
warnings.filterwarnings('ignore')

In [34]:
### reading CSV files into dataframes ###
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
full = pd.concat([train, test], axis=0, ignore_index=True)

In [24]:
### Calculating what percentage of loyals customers are satisfied with service and what percentage of disloyal customers are satisfied ###
loyals = []
disloyals = []
for i in range(len(full)):
    currRow = full.iloc[i]
    if (currRow['Customer Type'] == 'Loyal Customer'):
        loyals.append(currRow['satisfaction'])
    else:
        disloyals.append(currRow['satisfaction'])

        
loyalsSatisfied = 0
for i in range(len(loyals)):
    if (loyals[i] == 'satisfied'):
        loyalsSatisfied += 1

        
disloyalsSatisfied = 0
for i in range(len(disloyals)):
    if (disloyals[i] == 'satisfied'):
        disloyalsSatisfied += 1

In [25]:
### printing out the percentages calculated in cell above ###
print('There were ' + str(format((loyalsSatisfied / len(loyals)) * 100, '.2f')) + '% of loyal customers who were satisfied with the airline service.')
print('There were ' + str(format((disloyalsSatisfied / len(disloyals)) * 100, '.2f')) + '% of disloyal customers who were satisfied with the airline service.')

There were 47.81% of loyal customers who were satisfied with the airline service.
There were 23.97% of disloyal customers who were satisfied with the airline service.


In [26]:
### function for avg and stdev ratings of all services for satisfied, dissatisfied and all customers ###
def avgRating(services):
    total = [[] for x in range(len(services))]
    satisfied = [[] for x in range(len(services))]
    dissatisfied = [[] for x in range(len(services))]

    for i in range(len(full)):
        currRow = full.iloc[i]
        
        if (currRow['satisfaction'] == 'satisfied'):
            for j in range(len(services)):
                if (currRow[services[j]] == 0):
                    continue
                    
                satisfied[j].append(currRow[services[j]])
        else:
            for j in range(len(services)):
                if (currRow[services[j]] == 0):
                    continue
                    
                dissatisfied[j].append(currRow[services[j]])
        
        for j in range(len(services)):
            if (currRow[services[j]] == 0):
                    continue
                    
            total[j].append(currRow[services[j]])
        
    ret = []
    for j in range(len(services)):
        ret.append([np.mean(satisfied[j]), np.std(satisfied[j]), np.mean(dissatisfied[j]), np.std(dissatisfied[j]), np.mean(total[j]), np.std(total[j])])

    return ret

In [27]:
### calling function above for each service offered to customer ###
services = ['Inflight wifi service', 'Departure/Arrival time convenient', 'Ease of Online booking', 'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness']
avgRatings = avgRating(services)

for i in range(len(avgRatings)):
    avgRatings[i].insert(0, services[i])
    
for i in range(len(avgRatings)):
    for j in range(1, len(avgRatings[i])):
        avgRatings[i][j] = format(avgRatings[i][j], '.2f')

In [28]:
### displaying calculated results in a dataframe ###
ratingsDF = pd.DataFrame(data = avgRatings)
ratingsDF.columns = ['Service', 'Average Satisfied Rating', 'Stdev of Satisfied Ratings', 'Average Dissatisfied Rating', 'Stdev of Dissatisfied Ratings', 'Average Total Rating', 'Stdev of Total Ratings']
ratingsDF

Unnamed: 0,Service,Average Satisfied Rating,Stdev of Satisfied Ratings,Average Dissatisfied Rating,Stdev of Dissatisfied Ratings,Average Total Rating,Stdev of Total Ratings
0,Inflight wifi service,3.39,1.39,2.4,0.96,2.81,1.26
1,Departure/Arrival time convenient,3.14,1.41,3.29,1.36,3.22,1.39
2,Ease of Online booking,3.24,1.4,2.62,1.15,2.88,1.3
3,Gate location,2.97,1.37,2.98,1.2,2.98,1.28
4,Food and drink,3.53,1.23,2.96,1.34,3.21,1.33
5,Online boarding,4.15,0.97,2.71,1.1,3.33,1.27
6,Seat comfort,3.97,1.14,3.04,1.3,3.44,1.32
7,Inflight entertainment,3.96,1.08,2.89,1.32,3.36,1.33
8,On-board service,3.86,1.13,3.02,1.28,3.38,1.29
9,Leg room service,3.83,1.16,3.01,1.29,3.37,1.3


In [8]:
### preparing X train, y train, X test, and y test lists ###
X_train = []
y_train = []
X_test = []
y_test = []

X_train = train[train.columns.difference(['Unnamed: 0', 'id', 'satisfaction'])]
X_test = test[test.columns.difference(['Unnamed: 0', 'id', 'satisfaction'])]

y_train = train[['satisfaction']]
y_test = test[['satisfaction']]

In [9]:
### encoding values within the training and testing data to ordinal values ###
X_encoder = ce.OrdinalEncoder(cols=['Gender', 'Customer Type', 'Type of Travel', 'Class'])

X_train = X_encoder.fit_transform(X_train)
X_test = X_encoder.transform(X_test)

y_encoder = ce.OrdinalEncoder(cols=['satisfaction'])

y_train = y_encoder.fit_transform(y_train)
y_test = y_encoder.transform(y_test)

In [10]:
### imputing NaN's in the data ###
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X_train[['Arrival Delay in Minutes']] = imputer.fit_transform(X_train[['Arrival Delay in Minutes']])
X_test[['Arrival Delay in Minutes']] = imputer.transform(X_test[['Arrival Delay in Minutes']])

In [50]:
### building and evaluating a MLP classifier on the dataset via 5-fold cross validation and a confusion matrix ###
mlp_clf = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=500).fit(X_train, y_train)

scores = cross_val_score(mlp_clf, X_train, y_train, cv=5)
print(scores)

y_pred = mlp_clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[0.92281411 0.93229392 0.92425774 0.91930128 0.93979788]
[[13693   880]
 [  977 10426]]
0.9285109331690792


In [51]:
### building a decision tree classifier and evaluating it ###
tree_clf = tree.DecisionTreeClassifier().fit(X_train, y_train)

scores = cross_val_score(tree_clf, X_train, y_train, cv=5)
print(scores)

y_pred = tree_clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[0.94004138 0.94408354 0.94600837 0.94389105 0.94475457]
[[13848   725]
 [  664 10739]]
0.9465275639051433


In [52]:
### Applying GridSearch to find the best MLP Classifier ###
parameters = {'solver': ['adam', 'lbfgs', 'sgd'], 
              'max_iter': [1000, 1200, 1500, 2000], 
              'hidden_layer_sizes': [(10,10,10), (10, 5), (5,5), (100, 10, 50)],
              'alpha': [.0001, .005], 
              'learning_rate': ['constant', 'adaptive']}

mlp_clf = MLPClassifier()
gridClf = GridSearchCV(mlp_clf, parameters, cv=3)
gridClf.fit(X_train, y_train)
print("Grid search completed.")

print("\nHere are the best parameters:",gridClf.best_params_,"\n")

means = gridClf.cv_results_['mean_test_score']
stds = gridClf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gridClf.cv_results_['params']):
    print("Mean: %0.3f, Std Dev: (+/-%0.03f), Parameters: %r"% (mean, std * 2, params))
    print()
    
#copy this in the morning...

Grid search completed.

Here are the best parameters: {'alpha': 0.0001, 'hidden_layer_sizes': (100, 10, 50), 'learning_rate': 'constant', 'max_iter': 1500, 'solver': 'adam'} 

Mean: 0.922, Std Dev: (+/-0.007), Parameters: {'alpha': 0.0001, 'hidden_layer_sizes': (10, 10, 10), 'learning_rate': 'constant', 'max_iter': 1000, 'solver': 'adam'}

Mean: 0.842, Std Dev: (+/-0.018), Parameters: {'alpha': 0.0001, 'hidden_layer_sizes': (10, 10, 10), 'learning_rate': 'constant', 'max_iter': 1000, 'solver': 'lbfgs'}

Mean: 0.567, Std Dev: (+/-0.000), Parameters: {'alpha': 0.0001, 'hidden_layer_sizes': (10, 10, 10), 'learning_rate': 'constant', 'max_iter': 1000, 'solver': 'sgd'}

Mean: 0.924, Std Dev: (+/-0.013), Parameters: {'alpha': 0.0001, 'hidden_layer_sizes': (10, 10, 10), 'learning_rate': 'constant', 'max_iter': 1200, 'solver': 'adam'}

Mean: 0.845, Std Dev: (+/-0.008), Parameters: {'alpha': 0.0001, 'hidden_layer_sizes': (10, 10, 10), 'learning_rate': 'constant', 'max_iter': 1200, 'solver': 'lb

In [46]:
### Applying GridSearch to find the best DecisionTree Classifier ###
#default params: criterion : gini, splitter : best, max_depth : None,
#   min_samples_split : 2, min_samples_leaf : 1, max_features : None
parameters = {'criterion' : ['gini', 'entropy'],
              'splitter' : ['best', 'random'],
              'min_samples_split' : [1, 2, 4, 8],
              'min_samples_leaf' : [1, 2, 4, 8],
              'max_features' : [None, 'auto', 'sqrt', 'log2'],
              'max_depth' : [2, 4, 8, 16, 32, None]}

tree_clf = tree.DecisionTreeClassifier()
gridClf = GridSearchCV(tree_clf, parameters, cv=3)
gridClf.fit(X_train, y_train)
print("Grid search completed.")

print("\nHere are the best parameters:",gridClf.best_params_,"\n")

means = gridClf.cv_results_['mean_test_score']
stds = gridClf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gridClf.cv_results_['params']):
    print("Mean: %0.3f, Std Dev: (+/-%0.03f), Parameters: %r"% (mean, std * 2, params))
    print()

Grid search completed.

Here are the best parameters: {'criterion': 'entropy', 'max_depth': 16, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'} 

Mean: nan, Std Dev: (+/-nan), Parameters: {'criterion': 'gini', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 1, 'splitter': 'best'}

Mean: nan, Std Dev: (+/-nan), Parameters: {'criterion': 'gini', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 1, 'splitter': 'random'}

Mean: 0.859, Std Dev: (+/-0.004), Parameters: {'criterion': 'gini', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}

Mean: 0.787, Std Dev: (+/-0.031), Parameters: {'criterion': 'gini', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'}

Mean: 0.859, Std Dev: (+/-0.004), Parameters: {'criterion': 'gini', 'max_depth': 2, 'max_features': None, 'min_samples_