# Describe Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Row cap for the CSV read; use None to read the whole file.
nRowsRead = 1000
# Only the first `nRowsRead` rows of Invistico_Airline.csv are loaded, even if the file holds more.
df = pd.read_csv(
    '../input/airlines-customer-satisfaction/Invistico_Airline.csv',
    sep=',',
    nrows=nRowsRead,
)
nRow, nCol = df.shape[0], df.shape[1]
print(f'There are {nRow} rows and {nCol} columns')
df.head()

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Integer/float columns vs. all remaining columns (original column order is kept).
numerical = df.select_dtypes(include=['int64','float64']).columns.tolist()
categorical = [col for col in df.columns if col not in numerical]
# Rating columns: the numeric columns minus the four listed below.
ratings = pd.Index(numerical).drop(
    ['Age','Flight Distance','Departure Delay in Minutes','Arrival Delay in Minutes']
).tolist()

In [None]:
df[ratings].agg(['mean'])

In [None]:
#Ratings overall
df[ratings].stack().mean()

# Preprocessing

### Null & 0 Value

In [None]:
df.isnull().sum()

In [None]:
df.dropna(inplace=True)

In [None]:
df.isnull().sum()

In [None]:
# Replace every 0 in a rating column with that column's most frequent value.
for rating_col in ratings:
    most_frequent = df[rating_col].mode()[0]
    df[rating_col] = df[rating_col].replace(0, most_frequent)

### Duplicate Value

In [None]:
df.duplicated().sum()

In [None]:
# df_clean = df.replace(0,np.nan).dropna(subset=ratings)
# df_clean = df_clean.replace(np.nan,0)
# df_clean[numerical] = df_clean[numerical].astype('int64')

In [None]:
df_clean = df.copy()

In [None]:
#no duplicates in the data

### Feature Engineering

In [None]:
# Binary target: 1 for 'satisfied', 0 for 'dissatisfied'.
label_by_answer = {'satisfied': 1, 'dissatisfied': 0}
df_clean['satisfaction_label'] = df_clean['satisfaction'].map(label_by_answer)
df_clean['satisfaction_label'].value_counts()

In [None]:
#new column total delay
df_clean['total_delay'] = df_clean['Departure Delay in Minutes'] + df_clean['Arrival Delay in Minutes']


In [None]:
# # #travel type & gender label
# df_clean['Gender_Male'] = df_clean['Gender'].apply(lambda x: 1 if x == 'Male' else 0)
# df_clean['Travel_Personal'] = df_clean['Type of Travel'].apply(lambda x: 1 if x == 'Personal Travel' else 0)


In [None]:
# # #customer type label
# df_clean['Loyal_customer'] = df_clean['Customer Type'].apply(lambda x: 1 if x == 'Loyal Customer' else 0)


In [None]:
# # #class label
# df_clean['Class'] = df_clean['Class'].map({'Eco':'Economy', 'Eco Plus':'Economy','Business':'Business'})
# df_clean['Class_Economy'] = df_clean['Class'].apply(lambda x: 1 if x == 'Economy' else 0)

In [None]:
# EDA frame: a separate copy of df_clean without the two raw delay columns.
df_eda = df_clean.drop(columns=['Departure Delay in Minutes','Arrival Delay in Minutes'])
# df_eda.drop(columns=['Gender','Type of Travel'], inplace=True)
# df_eda.drop(columns='Customer Type', inplace=True)
# df_eda.drop(columns='Class', inplace=True)

# EDA

In [None]:
plt.figure(figsize=(18,10))
# df_eda still holds string columns (e.g. 'satisfaction', 'Gender'); DataFrame.corr()
# raises on those in pandas >= 2.0, so correlate the numeric columns only.
sns.heatmap(df_eda.select_dtypes(include='number').corr(), cmap='Reds', annot=True, fmt='.2f')

In [None]:
sns.set(style='white', font_scale=1.1)
fig = plt.figure(figsize=(5,6))
ax = sns.countplot(data=df_clean, x='satisfaction_label', palette='inferno', order=[1,0])
ax.set_xticklabels(['Satisfied','Dissatisfied'])
# Label each bar with its count and its share of all rows in df_clean.
total_rows = len(df_clean)
for bar in ax.patches:
    count = bar.get_height()
    share = (count / total_rows * 100).round(1)
    ax.annotate(f'{count} ({share}%)', (bar.get_x() + 0.1, count + 400))
plt.xlabel('Satisfaction', weight='bold', fontsize=15)
plt.ylabel('No. of Passangers', weight='bold', fontsize=15)
sns.despine()

In [None]:
sns.set(style='white', font_scale=1.1)
fig = plt.figure(figsize=(6,8))
ax = sns.countplot(
    data=df_clean, x='Type of Travel', hue='satisfaction_label',
    hue_order=[1,0], palette='pastel',
)
ax.legend(labels=['Satisfied','Dissatisfied'])
# Write each bar's count just above it.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height), (bar.get_x() + 0.1, bar_height + 400))
plt.xlabel('Travel Type', weight='bold', fontsize=15)
plt.ylabel('No. of Passangers', weight='bold', fontsize=15)
sns.despine()

In [None]:
sns.set(style='white', font_scale=1.1)
fig = plt.figure(figsize=(6,8))
ax = sns.countplot(
    data=df_clean, x='Customer Type', hue='satisfaction_label',
    hue_order=[1,0], palette='pastel',
)
ax.legend(labels=['Satisfied','Dissatisfied'])
# Write each bar's count just above it.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height), (bar.get_x() + 0.1, bar_height + 400))
plt.xlabel('Customer Type', weight='bold', fontsize=15)
plt.ylabel('No. of Passangers', weight='bold', fontsize=15)
sns.despine()

In [None]:
sns.set(style='white', font_scale=1.1)
fig = plt.figure(figsize=(6,8))
# Pin the bar order explicitly: the tick labels below are hard-coded as
# Female, Male, so the bars must not depend on which value appears first in the data.
gender_order = ['Female','Male']
ax = sns.countplot(data=df_clean, x='Gender', order=gender_order, palette='pastel',
                   hue='satisfaction_label', hue_order=[1,0])
ax.set_xticklabels(gender_order)
ax.legend(labels=['Satisfied','Dissatisfied'])
# Write each bar's count just above it.
for p in ax.patches:
    ax.annotate(str(p.get_height()),(p.get_x()+0.1,p.get_height()+400))
plt.xlabel('Gender', weight='bold', fontsize=15)
plt.ylabel('No. of Passangers', weight='bold', fontsize=15)
sns.despine()

In [None]:
sns.set(style='white', font_scale=1.1)
fig = plt.figure(figsize=(8,10))
ax = sns.countplot(
    data=df_clean, x='Class', hue='satisfaction_label',
    hue_order=[1,0], palette='pastel',
)
ax.legend(labels=['Satisfied','Dissatisfied'])
# Write each bar's count just above it.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height), (bar.get_x() + 0.1, bar_height + 400))
plt.xlabel('Class Type', weight='bold', fontsize=15)
plt.ylabel('No. of Passangers', weight='bold', fontsize=15)
sns.despine()

In [None]:
df.columns

In [None]:
# Number of 'satisfaction' entries per (Type of Travel, Class) pair, largest first.
df_be = (
    df.groupby(['Type of Travel', 'Class'])['satisfaction']
    .count()
    .reset_index()
)
df_be.sort_values(by='satisfaction', ascending=False)

In [None]:
# Row count per (satisfaction, Type of Travel, Class) combination.
# size() counts the rows of each group directly, which avoids aggregating the
# grouping key itself and the MultiIndex columns that needed renaming afterwards.
# (The former `df_be2=df_be` assignment was overwritten immediately and is dropped.)
df_be2 = (
    df.groupby(['satisfaction', 'Type of Travel', 'Class'])
    .size()
    .reset_index(name='Total')
)

df_be2.sort_values(['Total'], ascending=False)

In [None]:
sns.set(style='white',font_scale=1)
fig = plt.figure(figsize=[20,20])
# Panels at these loop positions are drawn as stacked histograms; the rest as count plots.
hist_positions = {2, 5, 20, 21}
# One panel per column 1..22 of df (column 0 is skipped).
for i, column in enumerate(df.columns[1:23]):
    fig.add_subplot(4, 6, i+1)
    if i in hist_positions:
        sns.histplot(data=df, x=column, hue='satisfaction', multiple='stack')
    else:
        sns.countplot(data=df, x=column, hue='satisfaction')
    sns.despine()
# Title, layout and export do not depend on the loop variable, so they run once
# after all panels exist instead of once per panel.
plt.suptitle('TOTAL SATISFACTION', fontsize=40)
plt.tight_layout()
fig.savefig('test.png')


In [None]:
# Boolean masks shared by the subset frames below.
_is_female = df['Gender'] == 'Female'
_is_male = df['Gender'] != 'Female'
_is_personal = df['Type of Travel'] == 'Personal Travel'
# Every other business-travel subset here uses 'Business travel' (lower-case "t").
# df_male_business_eco previously compared against 'Business Travel' and so could not
# select the same rows; it now uses the shared mask.
# NOTE(review): confirm the spelling with df['Type of Travel'].unique().
_is_business_trip = df['Type of Travel'] == 'Business travel'
_is_eco = df['Class'] == 'Eco'

df_male = df[_is_male]
df_female = df[_is_female]
df_male_personal = df[_is_male & _is_personal]
df_male_personal_eco = df[_is_male & _is_personal & _is_eco]
df_male_business_eco = df[_is_male & _is_business_trip & _is_eco]
df_male_eco = df[_is_male & _is_eco]
df_personal = df[_is_personal]
df_disloyal = df[df['Customer Type'] == 'disloyal Customer']
df_female_personal_eco = df[_is_female & _is_personal & _is_eco]
# "Eco" here covers both economy classes.
df_eco = df[df['Class'].isin(['Eco','Eco Plus'])]
df_business = df[df['Class'].isin(['Business'])]
df_businesstype = df[_is_business_trip]
df_businesstype_business = df[_is_business_trip & (df['Class'] == 'Business')]
df_businesstype_eco = df[_is_business_trip & _is_eco]
df_personal_eco = df[_is_personal & _is_eco]


In [None]:
sns.set(style='white',font_scale=1)
fig = plt.figure(figsize=[20,20])
# Panels at these loop positions are drawn as stacked histograms; the rest as count plots.
hist_positions = {2, 5, 20, 21}
for i, column in enumerate(df_personal.columns[1:23]):
    fig.add_subplot(4, 6, i+1)
    if i in hist_positions:
        sns.histplot(data=df_personal, x=column, hue='satisfaction', multiple='stack')
    else:
        sns.countplot(data=df_personal, x=column, hue='satisfaction')
    sns.despine()
# Title and layout are loop-invariant: apply them once after all panels exist.
plt.suptitle('SATISFACTION PERSONAL TRAVELER', fontsize=25)
plt.tight_layout()
    

In [None]:
sns.set(style='white',font_scale=1)
fig = plt.figure(figsize=[20,20])
# Panels at these loop positions are drawn as stacked histograms; the rest as count plots.
hist_positions = {2, 5, 20, 21}
for i, column in enumerate(df_businesstype.columns[1:23]):
    fig.add_subplot(4, 6, i+1)
    if i in hist_positions:
        sns.histplot(data=df_businesstype, x=column, hue='satisfaction', multiple='stack')
    else:
        sns.countplot(data=df_businesstype, x=column, hue='satisfaction')
    sns.despine()
# Title and layout are loop-invariant: apply them once after all panels exist.
plt.suptitle('SATISFACTION BUSINESS TRAVELER', fontsize=25)
plt.tight_layout()
    

In [None]:
sns.set(style='white',font_scale=1)
fig = plt.figure(figsize=[20,20])
# Panels at these loop positions are drawn as stacked histograms; the rest as count plots.
hist_positions = {2, 5, 20, 21}
for i, column in enumerate(df_eco.columns[1:23]):
    fig.add_subplot(4, 6, i+1)
    if i in hist_positions:
        sns.histplot(data=df_eco, x=column, hue='satisfaction', multiple='stack')
    else:
        sns.countplot(data=df_eco, x=column, hue='satisfaction')
    sns.despine()
# Title and layout are loop-invariant: apply them once after all panels exist.
plt.suptitle('SATISFACTION ECONOMY CLASS', fontsize=25)
plt.tight_layout()

In [None]:
sns.set(style='white',font_scale=1)
fig = plt.figure(figsize=[20,20])
# Panels at these loop positions are drawn as stacked histograms; the rest as count plots.
hist_positions = {2, 5, 20, 21}
for i, column in enumerate(df_business.columns[1:23]):
    fig.add_subplot(4, 6, i+1)
    if i in hist_positions:
        sns.histplot(data=df_business, x=column, hue='satisfaction', multiple='stack')
    else:
        sns.countplot(data=df_business, x=column, hue='satisfaction')
    sns.despine()
# Title and layout are loop-invariant: apply them once after all panels exist.
plt.suptitle('SATISFACTION BUSINESS CLASS', fontsize=25)
plt.tight_layout()

In [None]:
sns.set(style='white',font_scale=1)
fig = plt.figure(figsize=[20,20])
# Panels at these loop positions are drawn as stacked histograms; the rest as count plots.
hist_positions = {2, 5, 20, 21}
for i, column in enumerate(df_personal_eco.columns[1:23]):
    fig.add_subplot(4, 6, i+1)
    if i in hist_positions:
        sns.histplot(data=df_personal_eco, x=column, hue='satisfaction', multiple='stack')
    else:
        sns.countplot(data=df_personal_eco, x=column, hue='satisfaction')
    sns.despine()
# Title and layout are loop-invariant: apply them once after all panels exist.
plt.suptitle('SATISFACTION PERSONAL ECONOMY TRAVELER', fontsize=25)
plt.tight_layout()
    

In [None]:
sns.set(style='white',font_scale=1)
fig = plt.figure(figsize=[20,20])
# Panels at these loop positions are drawn as stacked histograms; the rest as count plots.
hist_positions = {2, 5, 20, 21}
for i, column in enumerate(df_businesstype_eco.columns[1:23]):
    fig.add_subplot(4, 6, i+1)
    if i in hist_positions:
        sns.histplot(data=df_businesstype_eco, x=column, hue='satisfaction', multiple='stack')
    else:
        sns.countplot(data=df_businesstype_eco, x=column, hue='satisfaction')
    sns.despine()
# Title and layout are loop-invariant: apply them once after all panels exist.
plt.suptitle('SATISFACTION BUSINESS ECONOMY TRAVELER', fontsize=25)
plt.tight_layout()
    

In [None]:
# df_pre: working copy for preprocessing; df_eda2: df_eda without the five text columns.
df_pre = df_eda.copy()
text_columns = ['satisfaction','Gender','Customer Type','Class','Type of Travel']
df_eda2 = df_eda.drop(columns=text_columns)

In [None]:
sns.set(style='white',font_scale=1.5)
fig = plt.figure(figsize=(30,20))
# One density panel per column (first 18 columns of df_eda2), split by satisfaction label.
for i, column in enumerate(df_eda2.columns[:18]):
    fig.add_subplot(3,6,i+1)
    sns.kdeplot(data=df_pre, x=column, hue='satisfaction_label')
    if i == 17:
        # Clip the x-range of the last panel only.
        plt.xlim(-50,300)
    sns.despine()
# The layout pass is loop-invariant; run it once after all panels exist.
plt.tight_layout()

In [None]:
# Share of satisfied passengers for each 'Inflight entertainment' rating.
inf_entertain = (
    df_eda.groupby(['Inflight entertainment', 'satisfaction_label'])
    .agg({'Age': 'count'})
    .reset_index()
)
inf_entertain.columns = ['Rating', 'satisfaction', 'Customer Rated']
rated = inf_entertain['Customer Rated']
# Count as a percentage of all passengers who gave the same rating.
inf_entertain['Percentage'] = rated / rated.groupby(inf_entertain['Rating']).transform('sum') * 100
# Each row is divided by itself, giving a constant 100 for the grey backdrop bars.
inf_entertain['Total_Percentage'] = rated / rated * 100
inf_entertain_sat = inf_entertain[inf_entertain['satisfaction'] == 1]

plt.figure(figsize=(10,8))
bar1 = sns.barplot(data=inf_entertain, x='Rating', y='Total_Percentage', color='darkgrey')
bar2 = sns.barplot(data=inf_entertain_sat, x='Rating', y='Percentage', color='skyblue')
plt.title('Inflight Entertainment', fontsize=16);

In [None]:
# Share of satisfied passengers for each 'Seat comfort' rating.
seat_comf = (
    df_eda.groupby(['Seat comfort', 'satisfaction_label'])
    .agg({'Age': 'count'})
    .reset_index()
)
seat_comf.columns = ['Rating', 'satisfaction', 'Customer Rated']
rated = seat_comf['Customer Rated']
# Count as a percentage of all passengers who gave the same rating.
seat_comf['Percentage'] = rated / rated.groupby(seat_comf['Rating']).transform('sum') * 100
# Each row is divided by itself, giving a constant 100 for the grey backdrop bars.
seat_comf['Total_Percentage'] = rated / rated * 100
seat_comf_sat = seat_comf[seat_comf['satisfaction'] == 1]

plt.figure(figsize=(10,8))
bar1 = sns.barplot(data=seat_comf, x='Rating', y='Total_Percentage', color='darkgrey')
bar2 = sns.barplot(data=seat_comf_sat, x='Rating', y='Percentage', color='skyblue')
plt.title('Seat comfort', fontsize=16);

In [None]:
# Share of satisfied passengers for each 'Ease of Online booking' rating.
ease_book = (
    df_eda.groupby(['Ease of Online booking', 'satisfaction_label'])
    .agg({'Age': 'count'})
    .reset_index()
)
ease_book.columns = ['Rating', 'satisfaction', 'Customer Rated']
rated = ease_book['Customer Rated']
# Count as a percentage of all passengers who gave the same rating.
ease_book['Percentage'] = rated / rated.groupby(ease_book['Rating']).transform('sum') * 100
# Each row is divided by itself, giving a constant 100 for the grey backdrop bars.
ease_book['Total_Percentage'] = rated / rated * 100
ease_book_sat = ease_book[ease_book['satisfaction'] == 1]

plt.figure(figsize=(10,8))
bar1 = sns.barplot(data=ease_book, x='Rating', y='Total_Percentage', color='darkgrey')
bar2 = sns.barplot(data=ease_book_sat, x='Rating', y='Percentage', color='skyblue')
plt.title('Booking Ease', fontsize=16);

In [None]:
# Share of satisfied passengers for each 'Departure/Arrival time convenient' rating.
deptarr = (
    df_eda.groupby(['Departure/Arrival time convenient', 'satisfaction_label'])
    .agg({'Age': 'count'})
    .reset_index()
)
deptarr.columns = ['Rating', 'satisfaction', 'Customer Rated']
rated = deptarr['Customer Rated']
# Count as a percentage of all passengers who gave the same rating.
deptarr['Percentage'] = rated / rated.groupby(deptarr['Rating']).transform('sum') * 100
# Each row is divided by itself, giving a constant 100 for the grey backdrop bars.
deptarr['Total_Percentage'] = rated / rated * 100
deptarr_sat = deptarr[deptarr['satisfaction'] == 1]

plt.figure(figsize=(10,8))
bar1 = sns.barplot(data=deptarr, x='Rating', y='Total_Percentage', color='darkgrey')
bar2 = sns.barplot(data=deptarr_sat, x='Rating', y='Percentage', color='skyblue')
plt.title('Dept/Arr Time Convenient', fontsize=16);

In [None]:
# Share of satisfied passengers for each 'Gate location' rating.
gateloc = (
    df_eda.groupby(['Gate location', 'satisfaction_label'])
    .agg({'Age': 'count'})
    .reset_index()
)
gateloc.columns = ['Rating', 'satisfaction', 'Customer Rated']
rated = gateloc['Customer Rated']
# Count as a percentage of all passengers who gave the same rating.
gateloc['Percentage'] = rated / rated.groupby(gateloc['Rating']).transform('sum') * 100
# Each row is divided by itself, giving a constant 100 for the grey backdrop bars.
gateloc['Total_Percentage'] = rated / rated * 100
gateloc_sat = gateloc[gateloc['satisfaction'] == 1]

plt.figure(figsize=(10,8))
bar1 = sns.barplot(data=gateloc, x='Rating', y='Total_Percentage', color='darkgrey')
bar2 = sns.barplot(data=gateloc_sat, x='Rating', y='Percentage', color='skyblue')
plt.title('Gate Location', fontsize=16);

In [None]:
# Share of satisfied passengers for each 'Food and drink' rating.
fnb = (
    df_eda.groupby(['Food and drink', 'satisfaction_label'])
    .agg({'Age': 'count'})
    .reset_index()
)
fnb.columns = ['Rating', 'satisfaction', 'Customer Rated']
rated = fnb['Customer Rated']
# Count as a percentage of all passengers who gave the same rating.
fnb['Percentage'] = rated / rated.groupby(fnb['Rating']).transform('sum') * 100
# Each row is divided by itself, giving a constant 100 for the grey backdrop bars.
fnb['Total_Percentage'] = rated / rated * 100
fnb_sat = fnb[fnb['satisfaction'] == 1]

plt.figure(figsize=(10,8))
bar1 = sns.barplot(data=fnb, x='Rating', y='Total_Percentage', color='darkgrey')
bar2 = sns.barplot(data=fnb_sat, x='Rating', y='Percentage', color='skyblue')
plt.title('FNB', fontsize=16);

## Normalization/Outliers Correction

In [None]:
plt.figure(figsize=(10,8))
# One horizontal box plot per continuous column, before any transformation.
for i, column in enumerate(['Age','Flight Distance','total_delay']):
    plt.subplot(3, 1, i+1)
    # Pass the series as `x=`: the meaning of the first positional argument of
    # sns.boxplot differs between seaborn releases.
    sns.boxplot(x=df_pre[column], color='green', orient='h')
# The layout pass is loop-invariant; run it once after all panels exist.
plt.tight_layout()

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# log(1 + x) transform of the two skewed columns; np.log1p is the vectorised form
# of (x + 1).apply(np.log).
for var in ['Flight Distance', 'total_delay']:
    df_pre[var] = np.log1p(df_pre[var])

# Rescale each continuous column to [0, 1] independently.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed later in this notebook, so test rows influence
# the scaling. Fit on the training rows only if that matters.
for var in ['Age','Flight Distance','total_delay']:
    # fit_transform returns an (n, 1) array; flatten it before assigning to one column.
    df_pre[var] = MinMaxScaler().fit_transform(df_pre[[var]]).ravel()

In [None]:
plt.figure(figsize=(10,8))
# One horizontal box plot per continuous column, after the log/min-max transformation.
for i, column in enumerate(['Age','Flight Distance','total_delay']):
    plt.subplot(3, 1, i+1)
    # Pass the series as `x=`: the meaning of the first positional argument of
    # sns.boxplot differs between seaborn releases.
    sns.boxplot(x=df_pre[column], color='orange', orient='h')
# The layout pass is loop-invariant; run it once after all panels exist.
plt.tight_layout()

In [None]:
# print(f'Jumlah baris sebelum memfilter outlier: {len(df_pre)}')

# filtered_entries = np.array([True] * len(df_pre))
# for col in ['Flight Distance']:
#     Q1 = df_pre[col].quantile(0.25)
#     Q3 = df_pre[col].quantile(0.75)
#     IQR = Q3 - Q1
#     low_limit = Q1 - (IQR * 1.5)
#     high_limit = Q3 + (IQR * 1.5)

#     filtered_entries = ((df_pre[col] >= low_limit) & (df_pre[col] <= high_limit)) & filtered_entries
    
# df_iqr = df_pre[filtered_entries]

# print(f'Jumlah baris setelah memfilter outlier: {len(df_iqr)}')

In [None]:
# plt.figure(figsize=(10,8))
# for i in range(0, len(['Age','Flight Distance','total_delay'])):
#     plt.subplot(3, 1, i+1)
#     sns.boxplot(df_iqr[['Age','Flight Distance','total_delay'][i]], color='orange', orient='h')
#     plt.tight_layout()

### Feature Encoding

In [None]:
df_model = df_pre.copy()

In [None]:
# One-hot encode 'Gender': indicator columns go to the end, the source column is removed.
dummies_category = pd.get_dummies(df_model['Gender'], prefix='Gender')
df_model = pd.concat([df_model.drop(columns='Gender'), dummies_category], axis=1)

In [None]:
# One-hot encode 'Customer Type': indicator columns go to the end, the source column is removed.
dummies_category = pd.get_dummies(df_model['Customer Type'], prefix='Customer Type')
df_model = pd.concat([df_model.drop(columns='Customer Type'), dummies_category], axis=1)

In [None]:
# One-hot encode 'Class': indicator columns go to the end, the source column is removed.
dummies_category = pd.get_dummies(df_model['Class'], prefix='Class')
df_model = pd.concat([df_model.drop(columns='Class'), dummies_category], axis=1)

In [None]:
# One-hot encode 'Type of Travel' (prefix 'Travel'): indicators go to the end, the source column is removed.
dummies_category = pd.get_dummies(df_model['Type of Travel'], prefix='Travel')
df_model = pd.concat([df_model.drop(columns='Type of Travel'), dummies_category], axis=1)

In [None]:
# df_model.drop(columns=['Departure/Arrival time convenient','Gate location'], axis=1, inplace=True)

In [None]:
df_model.info()

# Modelling

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRegressor

In [None]:
def eval(model,pred,xtrain,ytrain,xtest,ytest):
    """Print test-set accuracy, precision, recall, F1 and ROC AUC.

    Note: this name shadows the built-in ``eval``; it is kept because the
    rest of the notebook calls it.

    Args:
        model: fitted classifier; used for ``predict_proba`` when it has one.
        pred: hard class predictions for ``xtest``.
        xtrain, ytrain: accepted for call compatibility; not used.
        xtest: test features, used only to obtain scores for the AUC.
        ytest: true test labels.
    """
    print("Accuracy (Test Set): %.2f" % accuracy_score(ytest, pred))
    print("Precision (Test Set): %.2f" % precision_score(ytest, pred))
    print("Recall (Test Set): %.2f" % recall_score(ytest, pred))
    print("F1-Score (Test Set): %.2f" % f1_score(ytest, pred))
    # ROC AUC ranks scores; hard 0/1 labels reduce the curve to a single
    # operating point. Use class-1 probabilities when the model provides them
    # and fall back to the labels otherwise.
    if hasattr(model, 'predict_proba'):
        auc_scores = model.predict_proba(xtest)[:, 1]
    else:
        auc_scores = pred
    print('AUC Score (Test set): %.2f' % roc_auc_score(ytest, auc_scores))

In [None]:
def show_feature_importance(model, feature_names=None, top_n=25):
    """Plot the largest feature importances of a fitted model as horizontal bars.

    Args:
        model: fitted estimator exposing ``feature_importances_``.
        feature_names: labels for the importances, in feature order. Defaults
            to the columns of the notebook-level ``X`` (the original behaviour).
        top_n: number of highest-scoring features to show (default 25).
    """
    if feature_names is None:
        # Resolved at call time so X only needs to exist when the default is used.
        feature_names = X.columns
    feat_importances = pd.Series(model.feature_importances_, index=feature_names)
    ax = feat_importances.nlargest(top_n).plot(kind='barh', figsize=(10, 8))
    # Put the most important feature at the top of the chart.
    ax.invert_yaxis()

    plt.xlabel('score')
    plt.ylabel('feature')
    plt.title('feature importance score')

In [None]:
# Target: the 0/1 label. Features: every other column except the text target.
y = df_model['satisfaction_label']
target_columns = ['satisfaction','satisfaction_label']
X = df_model.drop(columns=target_columns)

In [None]:
# 70/30 train/test split with a fixed seed.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Logistic Regression

In [None]:
# Baseline: logistic regression with default settings (fit() returns the estimator).
logres = LogisticRegression().fit(Xtrain, ytrain)

y_pred = logres.predict(Xtest)
eval(logres, y_pred, Xtrain, ytrain, Xtest, ytest)

In [None]:
# Accuracy on the training rows vs. the held-out rows.
train_accuracy = logres.score(Xtrain, ytrain)
test_accuracy = logres.score(Xtest, ytest)
print(f'Train Accuracy Score : {train_accuracy}')
print(f'Test Accuracy Score : {test_accuracy}')

### Decision Tree

In [None]:
# Fixed seed so the tree (and every metric derived from it) is reproducible,
# matching the random_state=42 used for the train/test split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(Xtrain,ytrain)

y_pred = dt.predict(Xtest)
eval(dt,y_pred,Xtrain,ytrain,Xtest,ytest)

In [None]:
# Accuracy on the training rows vs. the held-out rows.
train_accuracy = dt.score(Xtrain, ytrain)
test_accuracy = dt.score(Xtest, ytest)
print(f'Train Accuracy Score : {train_accuracy}')
print(f'Test Accuracy Score : {test_accuracy}')

**Hyperparameters Tuning**

In [None]:
# Search space for the decision tree.
criterion = ['gini','entropy']
splitter = ['best','random']
max_depth =[6,10,15,20]
min_samples_split = [5,7,10]
min_samples_leaf = [2,4,5]

hyperparams = dict(criterion = criterion,
                   splitter = splitter,
                   max_depth = max_depth,
                   min_samples_split = min_samples_split,
                   min_samples_leaf = min_samples_leaf)

# Randomized search (5-fold CV, scored by F1). The search samples candidates at
# random, so a fixed seed is needed for the reported "best" parameters to be
# reproducible; 42 matches the seed used elsewhere in this notebook.
dt_tuned = RandomizedSearchCV(dt, hyperparams, cv=5, scoring='f1', random_state=42)
dt_tuned.fit(Xtrain,ytrain)

y_pred = dt_tuned.predict(Xtest)
eval(dt_tuned,y_pred,Xtrain,ytrain,Xtest,ytest)

In [None]:
# Report the tuned value of every searched hyperparameter.
best_tree_params = dt_tuned.best_estimator_.get_params()
for param_name in ('criterion', 'splitter', 'max_depth', 'min_samples_split', 'min_samples_leaf'):
    print(f'Best {param_name}:', best_tree_params[param_name])

### KNearestNeighbors

In [None]:
# k-nearest neighbours with default settings (fit() returns the estimator).
knn = KNeighborsClassifier().fit(Xtrain, ytrain)

y_pred = knn.predict(Xtest)
eval(knn, y_pred, Xtrain, ytrain, Xtest, ytest)

### Random Forest

In [None]:
# Fixed seed: this forest is reused later for feature importances, the confusion
# matrix and the impact simulations, so it has to be reproducible between runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(Xtrain,ytrain)

y_pred = rf.predict(Xtest)
eval(rf,y_pred,Xtrain,ytrain,Xtest,ytest)

In [None]:
# Accuracy on the training rows vs. the held-out rows.
train_accuracy = rf.score(Xtrain, ytrain)
test_accuracy = rf.score(Xtest, ytest)
print(f'Train Accuracy Score : {train_accuracy}')
print(f'Test Accuracy Score : {test_accuracy}')

In [None]:
show_feature_importance(rf)

### XGBoost

In [None]:
# The target is binary (0/1), so use the binary log-loss metric; 'mlogloss' is
# the multi-class variant.
xg = XGBClassifier(eval_metric='logloss')
xg.fit(Xtrain, ytrain)

y_pred = xg.predict(Xtest)
eval(xg, y_pred, Xtrain, ytrain, Xtest, ytest)

In [None]:
show_feature_importance(xg)

In [None]:
# Side-by-side view of true and predicted labels. The second positional argument
# of pd.DataFrame is the index, so the previous call used the true labels as row
# labels instead of showing them as a column.
pd.DataFrame({'actual': ytest, 'predicted': y_pred})

In [None]:
def make_confusion_matrix2(model, threshold=0.7):
    """Draw the test-set confusion matrix of ``model`` at a probability threshold.

    Args:
        model: fitted classifier exposing ``predict_proba``.
        threshold: minimum class-1 probability for a row to be predicted as
            class 1 (``model.predict`` corresponds to 0.5).

    Uses the notebook-level ``Xtest`` / ``ytest``.
    """
    # Predict class 1 if the probability of class 1 reaches the threshold.
    y_predict = (model.predict_proba(Xtest)[:, 1] >= threshold).astype(int)
    # Use the thresholded predictions of *this* model. The previous version passed
    # the global y_pred (whatever model was predicted last), so both `model` and
    # `threshold` were ignored.
    satisfaction_confusion = confusion_matrix(ytest, y_predict)
    plt.figure(dpi=120)
    ax = sns.heatmap(satisfaction_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='d',
           xticklabels=['Dissatisfied', 'Satisfied'],
           yticklabels=['Dissatisfied', 'Satisfied']);
    plt.xlabel('Prediction',weight='bold',fontsize=12)
    plt.ylabel('Actual',weight='bold',fontsize=12)
    plt.title('Confusion Matrix',weight='bold',fontsize=15)

In [None]:
# Draw the random-forest confusion matrix and export the current figure.
sns.set(style='white', font_scale=1)
make_confusion_matrix2(rf)
plt.gcf().savefig('confusionplot.png', transparent=True, bbox_inches='tight')

# Impact Model

### General Impact Model

In [None]:
# data to be sampled
df_model

In [None]:
# Draw up to 70,000 rows for the simulation. sample() raises when n exceeds the
# number of rows (the notebook may load only `nRowsRead` rows), so cap n.
n_sample = min(70000, len(df_model))
sample_imp = df_model.sample(n=n_sample, random_state=42)
# Model inputs only: drop both target columns and renumber the rows from 0.
x_imp = sample_imp.drop(columns=['satisfaction','satisfaction_label']).reset_index(drop=True)
sample_imp.shape

In [None]:
# Simulate improved ratings: every sampled row gets a 4 or a 5 for the three
# selected features.
# - A seeded generator makes the simulation reproducible.
# - Values are drawn per row. The previous dict(zip(unique(), ...)) mapping used
#   only the first few random draws and gave every row with the same original
#   rating the same new rating.
rng = np.random.default_rng(42)
for i in ['Inflight entertainment','Seat comfort','Ease of Online booking']:
    x_imp[i] = rng.integers(4, 6, size=len(x_imp))  # upper bound is exclusive -> 4 or 5

In [None]:
x_imp

In [None]:
# Random-forest predictions for the modified sample, as a one-column frame.
y_imp = pd.DataFrame(rf.predict(x_imp).tolist())

In [None]:
# Modified features plus their predicted label (column 0 of y_imp, aligned on the row index).
df_imp = x_imp.assign(satisfaction_label=y_imp[0])
df_imp.head()

In [None]:
# Label counts of the original sample (left) and of the modified sample (right).
fig, ax = plt.subplots(1, 2, figsize=(10,10))
for panel, frame in zip(ax, (sample_imp, df_imp)):
    sns.countplot(data=frame, x='satisfaction_label', ax=panel)
plt.tight_layout()

In [None]:
# Baseline rows followed by the simulated rows. DataFrame.append was removed in
# pandas 2.0; pd.concat is the equivalent (row index values are kept as they are).
df_compare = pd.concat([df_model, df_imp]).drop(columns='satisfaction')

In [None]:
df_compare

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10,10))

# Left panel: df_model; right panel: df_compare. Both are styled identically.
for panel, frame in zip(ax, (df_model, df_compare)):
    g = sns.countplot(data=frame, x='satisfaction_label', ax=panel)
    panel.set_yticks(range(0,100000,20000))
    panel.set_xlabel('Satisfaction', fontsize=16)
    panel.set_xticklabels(['Dissatisfied','Satisfied'], fontsize=14)
    panel.set_ylabel('Count', fontsize=16)

    # Label each bar with its count and its share of the frame's rows.
    frame_rows = len(frame)
    for p in g.patches:
        count = p.get_height()
        share = (count / frame_rows * 100).round(1)
        g.annotate(f'{count} ({share}%)', (p.get_x() + 0.1, count + 400))

plt.tight_layout()
sns.despine()

In [None]:
sns.set(style='white', font_scale=1.1)
fig = plt.figure(figsize=(5,6))
ax = sns.countplot(data=df_clean, x='satisfaction_label', palette='inferno', order=[1,0])
ax.set_xticklabels(['Satisfied','Dissatisfied'])
# Label each bar with its count and its share of all rows in df_clean.
total_rows = len(df_clean)
for bar in ax.patches:
    count = bar.get_height()
    share = (count / total_rows * 100).round(1)
    ax.annotate(f'{count} ({share}%)', (bar.get_x() + 0.1, count + 400))
plt.xlabel('Satisfaction', weight='bold', fontsize=15)
plt.ylabel('No. of Passangers', weight='bold', fontsize=15)
sns.despine()

In [None]:
df_compare[ratings].stack().mean()

### Random Impact

In [None]:
# Draw up to 70,000 rows for the simulation. sample() raises when n exceeds the
# number of rows (the notebook may load only `nRowsRead` rows), so cap n.
n_sample = min(70000, len(df_model))
sample_rand = df_model.sample(n=n_sample, random_state=42)
# Model inputs only: drop both target columns and renumber the rows from 0.
x_rand = sample_rand.drop(columns=['satisfaction','satisfaction_label']).reset_index(drop=True)
sample_rand.shape

In [None]:
# Control experiment: give every sampled row a 4 or a 5 for three other features.
# - A seeded generator makes the simulation reproducible.
# - Values are drawn per row. The previous dict(zip(unique(), ...)) mapping used
#   only the first few random draws and gave every row with the same original
#   rating the same new rating.
rng = np.random.default_rng(42)
for i in ['Departure/Arrival time convenient','Gate location','Inflight wifi service']:
    x_rand[i] = rng.integers(4, 6, size=len(x_rand))  # upper bound is exclusive -> 4 or 5

In [None]:
x_rand

In [None]:
# Random-forest predictions for the modified sample, as a one-column frame.
y_rand = pd.DataFrame(rf.predict(x_rand).tolist())

In [None]:
# Modified features plus their predicted label (column 0 of y_rand, aligned on the row index).
df_rand = x_rand.assign(satisfaction_label=y_rand[0])
df_rand.head()

In [None]:
# Baseline rows followed by the simulated rows. DataFrame.append was removed in
# pandas 2.0; pd.concat is the equivalent (row index values are kept as they are).
rand_compare = pd.concat([df_model, df_rand]).drop(columns='satisfaction')

In [None]:
rand_compare

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10,10))

# Left panel: df_model; right panel: rand_compare. Both are styled identically.
for panel, frame in zip(ax, (df_model, rand_compare)):
    g = sns.countplot(data=frame, x='satisfaction_label', ax=panel)
    # panel.set_yticks(range(0,100000,20000))
    panel.set_xlabel('Satisfaction', fontsize=16)
    panel.set_xticklabels(['Dissatisfied','Satisfied'], fontsize=14)
    panel.set_ylabel('Count', fontsize=16)

    # Label each bar with its count and its share of the frame's rows.
    frame_rows = len(frame)
    for p in g.patches:
        count = p.get_height()
        share = (count / frame_rows * 100).round(1)
        g.annotate(f'{count} ({share}%)', (p.get_x() + 0.1, count + 400))

plt.tight_layout()
sns.despine()

In [None]:
rand_compare[ratings].stack().mean()

### Impact Model on Eco Class

In [None]:
# Rows whose class indicator is Eco or Eco Plus, renumbered from 0.
in_economy = (df_model['Class_Eco'] == 1) | (df_model['Class_Eco Plus'] == 1)
model_eco = df_model[in_economy].reset_index(drop=True)
# oversamp = model_eco.copy()
# model_eco = model_eco.append(oversamp)
model_eco.shape

In [None]:
# Shuffle all economy rows (frac=1), then keep only the model inputs, renumbered from 0.
eco_samp = model_eco.sample(frac=1, random_state=42)
x_eco = eco_samp.drop(columns=['satisfaction','satisfaction_label']).reset_index(drop=True)

In [None]:
# new_all_eco = np.random.choice(range(1,6), size = x_eco.shape[0])
# for i in ratings:
#     x_eco[i] = x_eco[i].map(dict(zip(x_eco[i].unique(), new_all_eco)))

# Set the three selected ratings to 5 for every economy row.
# range(5, 6) holds only the value 5, so the previous random draw plus
# unique-value mapping always produced 5; assign the constant directly.
for i in ['Inflight entertainment','Seat comfort','Ease of Online booking']:
    x_eco[i] = 5

x_eco

In [None]:
# Random-forest predictions for the modified economy rows, as a one-column frame.
y_eco = pd.DataFrame(rf.predict(x_eco).tolist())

In [None]:
# Modified features plus their predicted label (column 0 of y_eco, aligned on the row index).
new_eco = x_eco.assign(satisfaction_label=y_eco[0])
new_eco.head()

In [None]:
# Economy baseline rows followed by the simulated economy rows. DataFrame.append
# was removed in pandas 2.0; pd.concat is the equivalent.
eco_compare = pd.concat([model_eco, new_eco]).drop(columns='satisfaction')
eco_compare

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10,10))

# Left panel: model_eco; right panel: eco_compare. Both are styled identically.
for panel, frame in zip(ax, (model_eco, eco_compare)):
    g = sns.countplot(data=frame, x='satisfaction_label', ax=panel)
    # panel.set_yticks(range(0,100000,20000))
    panel.set_xlabel('Satisfaction', fontsize=16)
    panel.set_xticklabels(['Dissatisfied','Satisfied'], fontsize=14)
    panel.set_ylabel('Count', fontsize=16)

    # Label each bar with its count and its share of the frame's rows.
    frame_rows = len(frame)
    for p in g.patches:
        count = p.get_height()
        share = (count / frame_rows * 100).round(1)
        g.annotate(f'{count} ({share}%)', (p.get_x() + 0.1, count + 400))

fig.suptitle('Compare Between Eco & Eco Plus Only', fontsize=24)
plt.tight_layout()

In [None]:
# Full baseline followed by the simulated economy rows. DataFrame.append was
# removed in pandas 2.0; pd.concat is the equivalent.
eco_to_total = pd.concat([df_model, new_eco]).drop(columns='satisfaction')
eco_to_total

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10,10))

# Left panel: df_model; right panel: eco_to_total. Both are styled identically.
for panel, frame in zip(ax, (df_model, eco_to_total)):
    g = sns.countplot(data=frame, x='satisfaction_label', ax=panel)
    # panel.set_yticks(range(0,100000,20000))
    panel.set_xlabel('Satisfaction', fontsize=16)
    panel.set_xticklabels(['Dissatisfied','Satisfied'], fontsize=14)
    panel.set_ylabel('Count', fontsize=16)

    # Label each bar with its count and its share of the frame's rows.
    frame_rows = len(frame)
    for p in g.patches:
        count = p.get_height()
        share = (count / frame_rows * 100).round(1)
        g.annotate(f'{count} ({share}%)', (p.get_x() + 0.1, count + 400))

fig.suptitle('Compare Base with New Eco', fontsize=24)
plt.tight_layout()

In [None]:
eco_to_total[ratings].stack().mean()

### Impact Model on Eco Class 2nd Term

In [None]:
# Random 80% of the economy rows, then keep only the model inputs, renumbered from 0.
eco_samp2 = model_eco.sample(frac=0.8, random_state=42)
x_eco2 = eco_samp2.drop(columns=['satisfaction','satisfaction_label']).reset_index(drop=True)

In [None]:
# Set the four selected ratings to 5 for every sampled economy row.
# range(5, 6) holds only the value 5, so the previous random draw plus
# unique-value mapping always produced 5; assign the constant directly.
for i in ['Inflight entertainment','Seat comfort','Ease of Online booking','Online support']:
    x_eco2[i] = 5
x_eco2

In [None]:
# Random-forest predictions for the modified economy rows, as a one-column frame.
y_eco2 = pd.DataFrame(rf.predict(x_eco2).tolist())

In [None]:
# Modified features plus their predicted label (column 0 of y_eco2, aligned on the row index).
new_eco2 = x_eco2.assign(satisfaction_label=y_eco2[0])
new_eco2.head()

In [None]:
# Economy baseline rows followed by the second simulated economy set.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
eco_compare2 = pd.concat([model_eco, new_eco2]).drop(columns='satisfaction')
eco_compare2

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10,10))

# Left panel: model_eco; right panel: eco_compare2. Both are styled identically.
for panel, frame in zip(ax, (model_eco, eco_compare2)):
    g = sns.countplot(data=frame, x='satisfaction_label', ax=panel)
    # panel.set_yticks(range(0,100000,20000))
    panel.set_xlabel('Satisfaction', fontsize=16)
    panel.set_xticklabels(['Dissatisfied','Satisfied'], fontsize=14)
    panel.set_ylabel('Count', fontsize=16)

    # Label each bar with its count and its share of the frame's rows.
    frame_rows = len(frame)
    for p in g.patches:
        count = p.get_height()
        share = (count / frame_rows * 100).round(1)
        g.annotate(f'{count} ({share}%)', (p.get_x() + 0.1, count + 400))

fig.suptitle('Compare Between Eco & Eco Plus Only', fontsize=24)
plt.tight_layout()

In [None]:
# Full baseline followed by the second simulated economy set. DataFrame.append
# was removed in pandas 2.0; pd.concat is the equivalent.
eco_to_total2 = pd.concat([df_model, new_eco2]).drop(columns='satisfaction')
eco_to_total2

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10,10))

# Left panel: df_model; right panel: eco_to_total2. Both are styled identically.
for panel, frame in zip(ax, (df_model, eco_to_total2)):
    g = sns.countplot(data=frame, x='satisfaction_label', ax=panel)
    # panel.set_yticks(range(0,100000,20000))
    panel.set_xlabel('Satisfaction', fontsize=16)
    panel.set_xticklabels(['Dissatisfied','Satisfied'], fontsize=14)
    panel.set_ylabel('Count', fontsize=16)

    # Label each bar with its count and its share of the frame's rows.
    frame_rows = len(frame)
    for p in g.patches:
        count = p.get_height()
        share = (count / frame_rows * 100).round(1)
        g.annotate(f'{count} ({share}%)', (p.get_x() + 0.1, count + 400))

fig.suptitle('Compare Base with New Eco', fontsize=24)
plt.tight_layout()

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10,10))

# Left panel: eco_to_total (1st term); right panel: eco_to_total2 (2nd term).
for panel, frame in zip(ax, (eco_to_total, eco_to_total2)):
    g = sns.countplot(data=frame, x='satisfaction_label', ax=panel)
    # panel.set_yticks(range(0,100000,20000))
    panel.set_xlabel('Satisfaction', fontsize=16)
    panel.set_xticklabels(['Dissatisfied','Satisfied'], fontsize=14)
    panel.set_ylabel('Count', fontsize=16)

    # Label each bar with its count and its share of the frame's rows.
    frame_rows = len(frame)
    for p in g.patches:
        count = p.get_height()
        share = (count / frame_rows * 100).round(1)
        g.annotate(f'{count} ({share}%)', (p.get_x() + 0.1, count + 400))

fig.suptitle('Compare Eco 1st & 2nd Term', fontsize=24)
plt.tight_layout()

# Conclusions

* Feature yang digunakan dalam modelling berjumlah 26 feature termasuk yang sudah di one-hot encode
* Segmentasi customer dapat dibagi menjadi beberapa kategori yaitu berdasarkan Type of Travel, Class, serta Loyalty customernya. Berdasarkan EDA didapatkan satisfaction terendah berada pada segmentasi customer berdasarkan Class Eco
* Terdapat 5 algoritma klasifikasi yang diaplikasikan. Dari kelima algoritma ini, yang memiliki f1-score paling tinggi ialah Random Forest dan XGBoost
* Berdasarkan EDA serta feature importance model yang terpilih, ditemukan 3 feature yang berpengaruh besar terhadap target
* Ketiga feature itu ialah Inflight entertainment, Seat comfort, dan Ease of Online booking. Feature-feature ini merupakan feature terpilih yang menjadi rekomendasi untuk ditingkatkan
* Pada simulasi impact dilakukan uji simulasi peningkatan rating dari 3 feature. Dilakukan 2 uji coba yaitu 3 feature terpilih, dan 3 feature random
* Hasil dari simulasi ialah peningkatan yang terjadi terhadap 3 feature terpilih dapat pula meningkatkan satisfaction level secara keseluruhan. Sedangkan sebaliknya, jika 3 feature random yang ditingkatkan, tidak ada peningkatan satisfaction level bahkan cenderung memberi penurunan.