In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Import it defensively so this cell still loads on current releases; new code
# should use ConfusionMatrixDisplay.from_estimator instead.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

In [None]:
data = pd.read_csv('../input/airline-passenger-satisfaction/airline_passenger_satisfaction.csv')

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
data = data.drop(columns = [data.columns[0]])

In [None]:
data.isnull().sum()

In [None]:
#dropping null rows in arrival_delay_in_minutes as there arent that many 

In [None]:
data = data.dropna(axis = 0, how = 'any')

In [None]:
data.isnull().sum()

In [None]:
#Label encoding categorical columns 
cat_columns = data.select_dtypes(include = ['object'])
le = LabelEncoder()
cat_encoded = cat_columns.apply(le.fit_transform)

In [None]:
cat_encoded

In [None]:
#replacing columns in dataset
cat_labels = cat_encoded.columns
data = data.drop(columns = cat_labels, axis = 1)

In [None]:
data = pd.concat([data, cat_encoded], axis = 1)

In [None]:
data.head()

In [None]:
# Pairwise correlation heatmap of the columns in `data`, annotated with the coefficients.
corr_matrix = data.corr()
plt.figure(figsize=(20, 15))
sns.heatmap(corr_matrix, annot = True)

# Observations from the heatmap:
# - the target column correlates most with online boarding, inflight entertainment, seat comfort and onboard service;
# - these are mostly inflight services;
# - satisfaction does not correlate much with customer class.

In [None]:
#corr against satisfaction
corr = data[data.columns[1:]].corr()['satisfaction'][:-1]
corr
#we are going to drop columns with near 0 correlation. 

In [None]:
names = []
for i, r in corr.items():
    if  ((r < 0.06) & (r > -0.06)):
        names.append(i)

In [None]:
names

In [None]:
data = data.drop(columns = data[names])

In [None]:
data.head()

In [None]:
data.iloc[:, :-1]

In [None]:
X = data.iloc[:, :-1]
y = data[['satisfaction']]

In [None]:
sc = StandardScaler()
x = sc.fit_transform(X)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state = 42)

In [None]:
# Sanity check on the split: one shape per line, in the order
# x_train, x_test, y_train, y_test.
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

In [None]:
# Logistic regression with default settings.
# y_train is a single-column DataFrame, so it is flattened to 1-D for fit().
log_reg = LogisticRegression().fit(x_train, y_train.to_numpy().ravel())

# Keep only the second column of predict_proba, i.e. the probability of the
# class listed second in log_reg.classes_; it is used for the ROC curve later.
log_proba = log_reg.predict_proba(x_test)[:, 1]
log_pred = log_reg.predict(x_test)

In [None]:
log_reg.score(x_test, y_test)

In [None]:
plot_confusion_matrix(log_reg, x_test, y_test, normalize = 'true')

In [None]:
print(classification_report(y_test, log_pred))

In [None]:
# Decision tree with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so that repeated
# runs build the same tree; without it, ties between equally good splits are
# broken randomly and the scores can differ from run to run.
dt = DecisionTreeClassifier(random_state = 42)
# Flatten the single-column target to 1-D, consistent with the logistic-regression fit.
dt.fit(x_train, y_train.values.ravel())

# Second column of predict_proba = probability of the class listed second in
# dt.classes_; it is used for the ROC curve later.
dt_proba = dt.predict_proba(x_test)[:, 1]
dt_pred = dt.predict(x_test)

In [None]:
dt.score(x_test,y_test)

In [None]:
plot_confusion_matrix(dt, x_test, y_test, normalize = 'true')

In [None]:
print(classification_report(y_test, dt_pred))

In [None]:
# ROC curve and area under it for each model, computed from the predicted
# probabilities on the test set.
lr_auc = roc_auc_score(y_test, log_proba)
lr_fpr, lr_tpr, _ = roc_curve(y_test, log_proba)
dt_auc = roc_auc_score(y_test, dt_proba)
dt_fpr, dt_tpr, _ = roc_curve(y_test, dt_proba)
plt.figure(figsize=(20, 15))

# The AUC values were computed but never shown; put them in the legend so the
# figure can be read on its own.
plt.plot(lr_fpr, lr_tpr, label=f'Logistic (AUC = {lr_auc:.3f})')
plt.plot(dt_fpr, dt_tpr, label=f'Decision Tree (AUC = {dt_auc:.3f})')
# Diagonal = performance of a random classifier, as a visual baseline.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curves on the test set')

plt.legend()

plt.show()