In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()
import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# the deprecation warnings that seaborn/scikit-learn emit for calls used later.
warnings.filterwarnings('ignore')

## Read the file

In [None]:
df = pd.read_csv('../input/airline-passenger-satisfaction/train.csv')

## Drop absolute unnecessary columns

In [None]:
# Drop the row-number and id columns. errors='ignore' plus re-assignment keeps
# the cell safe to re-run (an in-place drop raises KeyError the second time).
df = df.drop(columns=['Unnamed: 0', 'id'], errors='ignore')

## Imbalance check

transform the satisfaction string category to numerical category for the imbalance check

In [None]:
# Encode the target: 1 for 'satisfied', 0 for every other label.
# Guarded so that re-running the cell on the already-encoded column does not
# silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['satisfaction']):
    df['satisfaction'] = (df['satisfaction'] == 'satisfied').astype(int)

In [None]:
print(df.satisfaction.value_counts()/len(df.satisfaction))

#### The classes are balanced enough, so the data does not need to be resampled.

## Not Available data analysis

In [None]:
# Names of the columns that contain at least one missing value, in frame order.
NA_columns = df.columns[df.isnull().any()].tolist()

the percentage of the NA data

In [None]:
# Fraction of missing rows per column (mean of the boolean null mask).
print(df[NA_columns].isnull().mean())

In [None]:
df[NA_columns].hist(bins=100)

#### The distribution is strongly right-skewed because the mode of this column is 0.

See the effect of the NaN rows on satisfaction and compare it with the non-NaN rows: 1 marks a row where the value is NaN and 0 a row where it is present.

In [None]:
for col in NA_columns:
    # Flag rows where THIS column is missing (1 = NaN, 0 = present). The
    # original tested the whole df[NA_columns] frame, which only works by
    # accident when exactly one column has missing values.
    df[col + '_na'] = df[col].isnull().astype(int)
    # One figure per column so successive count plots do not overlap.
    plt.figure()
    # x must be passed by keyword: recent seaborn no longer accepts it positionally.
    sns.countplot(x=col + '_na', data=df, hue='satisfaction')
    plt.tight_layout()

Because the NaN percentage is very small, I plot only the NaN rows here to see the effect.

In [None]:
# Keep only the rows whose arrival delay is missing and count them by satisfaction.
nan_data = df[df['Arrival Delay in Minutes_na'] == 1]
# x is passed by keyword because recent seaborn rejects positional data variables.
sns.catplot(x='Arrival Delay in Minutes_na', data=nan_data, kind='count', hue='satisfaction')

#### The plot shows that the NaN values affect satisfaction only slightly, or not at all, because the satisfaction percentages are the same in the NaN and non-NaN rows.

In [None]:
# Remove every '<column>_na' indicator created for the NaN analysis so none of
# them leaks into the feature set; errors='ignore' keeps the cell re-runnable.
df = df.drop(columns=[name + '_na' for name in NA_columns], errors='ignore')

## Numeric Columns Analysis

In [None]:
# Numeric columns only. Selecting by numeric dtype (instead of "not object")
# keeps string, category or datetime columns out of the numeric analysis.
num_columns = df.select_dtypes(include='number').columns.tolist()

The numeric columns are of two types, discrete and continuous. Here I extract the discrete numeric columns using a threshold of at most 10 unique values.

### Discrete Numeric Columns

In [None]:
# Numeric columns (target excluded) that take at most 10 distinct values.
num_columns_discrete = [
    column
    for column in num_columns
    if column != 'satisfaction' and df[column].nunique(dropna=False) <= 10
]

the num_columns _discrete is appropriate according to the dataset as service satisfaction value in each columns

transform data 0 (Not Applicable) to mode in each columns

In [None]:
for col in num_columns_discrete:
    # Mode of the real ratings only: if 0 itself were the most frequent value,
    # including it would turn the replacement into a no-op.
    rated = df.loc[df[col] != 0, col]
    if rated.empty:
        continue
    # Numeric replace of the value 0 (the old string replace rewrote every
    # '0' character, which would corrupt a value such as 10).
    df[col] = df[col].replace(0, rated.mode().iloc[0]).astype(int)

In [None]:
for col in num_columns_discrete:
    plt.figure()
    # Faded bars: overall count per rating; solid bars on top: split by satisfaction.
    # x is passed by keyword because recent seaborn rejects it positionally.
    sns.countplot(x=col, data=df, alpha=0.5)
    sns.countplot(x=col, data=df, hue='satisfaction')
    plt.tight_layout()

#### For inflight wifi service, most ratings are 2 or 3 and largely end in dissatisfaction; the same happens for ease of online booking and gate location. Departure/arrival time convenience has a good rating trend but does not lead to satisfaction with the airline. Food and drink seems neutral and unimpressive. Online boarding, seat comfort, inflight entertainment, on-board service, leg room service and cleanliness are in good shape, with most ratings at 4 and a trend that leads to satisfaction with the airline. Baggage handling, check-in service and inflight service behave somewhat like departure/arrival time convenience but still earn a lot of satisfaction at rating 5.

#### so in summary inflight wifi service, online booking and gate location need big improvement for better overall service performance.

#### Food and drink, baggage handling, checkin service and inflight service just need little improvement cause they already have the good trends

#### and big question mark for the departure/arrival time convenient, why they have good trends but end up in dissatisfaction.

### Continuous Numeric Columns

In [None]:
# Continuous columns are the numeric columns that are neither discrete nor the target.
_not_continuous = set(num_columns_discrete) | {'satisfaction'}
num_columns_continue = [column for column in num_columns if column not in _not_continuous]

In [None]:
for col in num_columns_continue:
    fig, axs = plt.subplots(1, 2, figsize=(10, 5))
    # Left panel: distribution of the column for each satisfaction class.
    sns.violinplot(y=col, data=df, x='satisfaction', ax=axs[0])
    # Titles are set on the axes explicitly; plt.title() only targets the
    # current axes, so both titles used to end up on the right-hand panel.
    axs[0].set_title(col + ' and satisfaction')
    # Right panel: overall distribution (histplot replaces the deprecated distplot).
    sns.histplot(x=df[col], kde=True, stat='density', ax=axs[1])
    axs[1].set_title(col)
    fig.tight_layout()

#### Passengers younger than 40 or older than 60 tend to be not satisfied, while passengers aged 40 - 60 tend to be satisfied with the airline. On short flight distances passengers tend to be not satisfied, while on longer distances they tend to be satisfied. For arrival and departure delay, shorter delays tend to satisfy the passengers.

## Categorical columns analysis

In [None]:
# Categorical columns are whatever is left once the numeric columns are removed.
_numeric_names = set(num_columns)
cat_columns = [column for column in df.columns if column not in _numeric_names]

In [None]:
for col in cat_columns:
    plt.figure()
    # Faded bars: overall count per category; solid bars on top: split by satisfaction.
    # x is passed by keyword because recent seaborn rejects it positionally.
    sns.countplot(x=col, data=df, alpha=0.5)
    sns.countplot(x=col, data=df, hue='satisfaction')
    plt.tight_layout()

#### For gender and customer type, the data shows that they are less important for overall satisfaction. Passengers in business class and on business travel tend to be satisfied, while the eco class shows the worst share of passengers who are not satisfied with the airline.

## Check the correlation between columns in the data

In [None]:
print(num_columns_discrete)

In [None]:
print(num_columns_continue)

In [None]:
print(cat_columns)

### Correlation with the gender

In [None]:
# One count plot per categorical column, bars split by gender.
for col in cat_columns:
    fig, ax = plt.subplots()
    sns.countplot(data=df, x=col, hue='Gender', ax=ax)

#### the gender seem balance, therefore gender is less essential for the another categorical columns

In [None]:
# One count plot per discrete rating column, bars split by gender.
for col in num_columns_discrete:
    fig, ax = plt.subplots()
    sns.countplot(data=df, x=col, hue='Gender', ax=ax)

#### its also the same for the numerical discrete columns, which is the gender is the less important factor 

In [None]:
# Side-by-side (dodged) histograms of each continuous column for the two genders.
for col in num_columns_continue:
    fig, ax = plt.subplots()
    sns.histplot(data=df, x=col, hue='Gender', multiple='dodge', ax=ax)

#### yeah its the same result, no correlation

### Correlation with the age

In [None]:
# Scatter each continuous column against Age, coloured by satisfaction.
for col in num_columns_continue:
    fig, ax = plt.subplots()
    sns.scatterplot(data=df, x=col, y='Age', hue='satisfaction', ax=ax)

#### there is no correlation here

In [None]:
for col in cat_columns:
    fig, axs = plt.subplots(1, 2, figsize=(10, 5))
    ax_by_satisfaction, ax_overall = axs
    # Left: age distribution per category, split by satisfaction class.
    sns.violinplot(data=df, x=col, y='Age', hue='satisfaction', split=True, ax=ax_by_satisfaction)
    # Right: age distribution per category regardless of satisfaction.
    sns.violinplot(data=df, x=col, y='Age', palette='Greys', ax=ax_overall)

#### Gender for the age are completely the same, less important. the passenger with age > 40 tend to be loyal and satisfied with the airline, while passenger with the younger age tend to be disloyal but satisfied with the airline. again the passenger with age over 40 dominance the business travel type of travel and tend to be satisfied with the airline. eco plus and eco dominance by the younger passenger, < 20, with tend to be not satisfied, while bussiness class are dominance by the older passenger, with age over 40 years and tend to be satisfied with the airline. 

In [None]:
# Age distribution for every rating value of each discrete column.
for col in num_columns_discrete:
    fig, ax = plt.subplots()
    sns.violinplot(data=df, x=col, y='Age', ax=ax)

#### online boarding, seat comfort, inflight entertainment, on-board service, leg room service are dominant to provide the good service for older people, age 40 - 60. and Baggage handling tend to be bad service for the older people

### Another Table correlation information

In [None]:
sns.countplot(data=df, x='Class', hue='Type of Travel')

In [None]:
sns.violinplot(data=df, x='Type of Travel', y='Flight Distance')

In [None]:
sns.violinplot(data=df, x='Class', y='Flight Distance')

#### Business class is dominated by the business type of travel with long-range flight distances.

## Conclusion for Exploratory Data Analysis

#### Airline passenger satisfaction is dominated by older passengers aged 40 - 60 on business travel, flying business class over long distances; this is supported and reinforced by the good service scores that older passengers give to seat comfort, on-board service and leg room.

# Feature Engineering

## Fix the NAN and Not Applicable data

In [None]:
# Impute each column that has missing values with that column's own median.
for col in NA_columns:
    median_value = df[col].median()
    df[col] = df[col].where(df[col].notnull(), median_value)

check the NA after the fillna

In [None]:
df.isnull().sum()

transform the Not Applicable (0) with the modus in columns services (num_columns_discrete)

In [None]:
for col in num_columns_discrete:
    # Mode of the real ratings only: if 0 itself were the most frequent value,
    # including it would turn the replacement into a no-op.
    rated = df.loc[df[col] != 0, col]
    if rated.empty:
        continue
    # Numeric replace of the value 0 (the old string replace rewrote every
    # '0' character, which would corrupt a value such as 10). Safe to re-run:
    # once no zeros remain the column is left unchanged.
    df[col] = df[col].replace(0, rated.mode().iloc[0]).astype(int)

## Transform not normally distributed data to normally distributed

In [None]:
# Histogram of every continuous column to inspect the shape of its distribution.
for col in num_columns_continue:
    fig, ax = plt.subplots()
    sns.histplot(data=df, x=col, ax=ax)

The Flight Distance column needs to be transformed to become more normally distributed. Here I use two types of transform and compare their results.

In [None]:
# Natural-log transform of the flight distance, then a histogram of the result.
flight_distance = df['Flight Distance']
FD_log = np.log(flight_distance)
fig, ax = plt.subplots()
sns.histplot(FD_log, ax=ax)

In [None]:
import scipy.stats as stats
# stats.boxcox returns (transformed_values, fitted_lambda) when no lambda is
# given; unpack it so that only the transformed values reach the histogram.
FD_boxcox, FD_boxcox_lambda = stats.boxcox(df['Flight Distance'])
plt.figure()
sns.histplot(FD_boxcox)

From the results, the log transform gives the more normally distributed data, so I choose the log transform method.

In [None]:
df['Flight Distance'] = FD_log

## Encode string categorical column into numeric

The encoding is ordered by the sum of satisfaction within each category of each column.

In [None]:
def encode_category(data, column, target):
    """Ordinal-encode ``column`` in place, ordered by the summed ``target`` per category.

    The category with the largest target sum gets code 0, the next one 1, and
    so on. The previous implementation sorted by the category label itself, so
    the codes did not reflect satisfaction at all.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that is modified in place.
    column : str
        Name of the categorical column to encode.
    target : str or list of str
        Target column name(s) whose per-category sum defines the order.

    Returns
    -------
    dict
        Mapping from original category to the assigned integer code.
    """
    totals = data.groupby(column)[target].sum()
    # A list-like target yields a DataFrame; collapse it to one total per category.
    if isinstance(totals, pd.DataFrame):
        totals = totals.sum(axis=1)
    # mergesort is stable, so tied categories keep a deterministic order.
    ordinal_data = totals.sort_values(ascending=False, kind='mergesort').index
    ordinal_num = {k: i for i, k in enumerate(ordinal_data, start=0)}
    data[column] = data[column].map(ordinal_num)
    print(ordinal_data)
    return ordinal_num

In [None]:
for col in cat_columns:
    encode_category(df, col, ['satisfaction'])

## Scale the dataset

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scaler with default settings, applied to every column except the target.
sc_X = MinMaxScaler()
columns_training = [column for column in df.columns if column not in ['satisfaction']]
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split performed later in the notebook, so test rows influence the fitted
# min/max — consider fitting on the training split only.
df[columns_training] = sc_X.fit_transform(df[columns_training])

# Feature Selection

In [None]:
y = df['satisfaction']
X = df.drop(['satisfaction'], axis=1)

## Filter Methods

### Continuous Columns

In [None]:
sns.heatmap(df[num_columns_continue].corr(), annot=True)
plt.tight_layout()

'Departure Delay in Minutes' and 'Arrival Delay in Minutes' are too strongly correlated for two independent features, so one of them needs to be dropped.

In [None]:
from sklearn.feature_selection import f_classif, SelectKBest
# Rank the continuous features by ANOVA F-score against the target and keep the best three.
selector_cont = SelectKBest(score_func=f_classif, k=3)
X_continuous = X[num_columns_continue]
selector_cont.fit(X_continuous, y)
cont_scores = {'Features': X_continuous.columns,
               'F-Score': selector_cont.scores_,
               'p-value': selector_cont.pvalues_}
pd.DataFrame(cont_scores)

Because 'Departure Delay in Minutes' has the smallest F-score, I drop it.

In [None]:
cont_select = X[num_columns_continue].columns[selector_cont.get_support()].tolist()

### Categorical columns

for categorical columns I use chi squared function.

In [None]:
# Every feature that is not continuous is scored as categorical.
_continuous_names = set(num_columns_continue)
categorical_columns = [column for column in X.columns if column not in _continuous_names]

In [None]:
from sklearn.feature_selection import chi2
# Rank the categorical/discrete features with the chi-squared statistic and keep the best 15.
selector_cat = SelectKBest(score_func=chi2, k=15)
X_categorical = X[categorical_columns]
selector_cat.fit(X_categorical, y)
cat_scores = {'Features': X_categorical.columns,
              'score': selector_cat.scores_,
              'p-value': selector_cat.pvalues_}
pd.DataFrame(cat_scores)

From the scores, 'Gender', 'Departure/Arrival time convenient' and 'Gate location' are the least important features and have a big score difference from the other features.

In [None]:
cat_select = X[categorical_columns].columns[selector_cat.get_support()].tolist()

All selected features

In [None]:
selected_features = cont_select + cat_select

# Machine Learning Modeling

## Define the X, Y and split it into Training and Test dataset

In [None]:
y = df['satisfaction']
X = df[selected_features]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=41, test_size=0.2)

## Model Comparison

#### Model selection with metrics: the metrics I use for model selection are accuracy and precision. Precision is chosen because I want to minimize the false positives (not-satisfied passengers classified as satisfied) to get the worst-case scenario, so the airline can make the maximum improvement to its service.

In [None]:
def model_selection(X_train, y_train, X_test, y_test, models):
    """Fit every model and compare accuracy and precision on the test split.

    Each estimator in ``models`` is fitted on the training data (in place) and
    scored on the test data. Two stacked line plots are drawn, accuracy on top
    and precision below, sharing one x axis labelled with the model names.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used for scoring.
    models : iterable of unfitted scikit-learn style classifiers.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns 'models', 'accuracy', 'precission'.
    """
    from sklearn.metrics import accuracy_score, precision_score

    accuracy_result = []
    precission_result = []
    str_models = []

    for model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy_result.append(accuracy_score(y_test, y_pred))
        precission_result.append(precision_score(y_test, y_pred))
        str_models.append(str(model))

    # Plot against integer positions: a string x axis would merge models that
    # share the same repr, and it left the accuracy panel without model labels.
    positions = list(range(len(str_models)))
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 10), sharex=True)

    ax1.plot(positions, accuracy_result, 'o-')
    ax1.set_ylabel('accuracy_score')

    ax2.plot(positions, precission_result, 'o-')
    ax2.set_ylabel('precission_result')
    # Fix the tick positions before labelling them so each label matches its model.
    ax2.set_xticks(positions)
    ax2.set_xticklabels(str_models, rotation=90)
    fig.tight_layout()

    return pd.DataFrame({'models': models, 'accuracy': accuracy_result, 'precission': precission_result})

In [None]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# Candidate classifiers, all with default hyperparameters except the ridge alpha.
# NOTE(review): none of the estimators is given a random_state, so the scores of
# the randomised ones may differ between runs — consider seeding them.
models = [LogisticRegression(), RidgeClassifier(alpha=0.005), LinearSVC(), SVC(), KNeighborsClassifier(),
          RadiusNeighborsClassifier(), DecisionTreeClassifier(), RandomForestClassifier(),
          AdaBoostClassifier(), MLPClassifier()]
 
# Fits every candidate and returns the accuracy/precision table (shown as the cell output).
model_selection(X_train, y_train, X_test, y_test, models)

## Best Classifier Model

In [None]:
rf = RandomForestClassifier()
svc = SVC()

## Random Forest

### Confusion Matrix 

In [None]:
def confusion(X_train, y_train, X_test, y_test, model):
    """Fit ``model`` and return its test-set confusion matrix as a labelled frame.

    Rows are the predicted class and columns the actual class, both ordered
    positive (1) first, negative (0) second.

    Parameters
    ----------
    X_train, y_train : training features and labels (the model is fitted in place).
    X_test, y_test : held-out features and labels.
    model : scikit-learn style classifier.

    Returns
    -------
    pandas.DataFrame
        2x2 table of counts.
    """
    from sklearn.metrics import confusion_matrix
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # confusion_matrix puts the true labels on the rows; transpose it so that
    # rows are predictions, matching the index/column labels below. (The old
    # code obtained the same layout by swapping the positional arguments.)
    confu_score = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[1, 0]).T
    return pd.DataFrame(confu_score, columns=['Actual Positive', 'Actual Negative'],
                        index=['Predicted Positive', 'Predicted Negative'])

In [None]:
confusion(X_train, y_train, X_test, y_test, rf)

#### The Random Forest has the better performance based on the confusion matrix, as shown by its smaller number of false positives.

## ROC_AUC

In [None]:
def ROC_AUC_test(X_train, y_train, X_test, y_test, model):
    """Fit ``model``, print its test-set ROC AUC and plot the ROC curve.

    Parameters
    ----------
    X_train, y_train : training features and labels (the model is fitted in place).
    X_test, y_test : held-out features and labels.
    model : classifier exposing ``predict_proba``.
    """
    from sklearn.metrics import roc_auc_score, roc_curve
    model.fit(X_train, y_train)
    # Second probability column is used as the score of the positive class
    # (assumes the fitted classes are ordered [0, 1] — TODO confirm).
    y_score = model.predict_proba(X_test)[:, 1]
    AUC = roc_auc_score(y_test, y_score)
    # roc_curve returns (fpr, tpr, thresholds); the names used to be swapped,
    # which only happened to plot correctly because the plot call swapped them too.
    fpr, tpr, _ = roc_curve(y_test, y_score)
    print('AUC: ' + str(AUC))
    plt.plot(fpr, tpr)
    plt.title('ROC performance for ' + str(model))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/

In [None]:
ROC_AUC_test(X_train, y_train, X_test, y_test, rf)

### Overfit and underfit check

In [None]:
def fit_check(model, kfolds, X_data=None, y_data=None):
    """Plot training vs. validation precision across K folds to spot overfitting.

    Parameters
    ----------
    model : scikit-learn style classifier; it is re-fitted in place on every fold.
    kfolds : int
        Number of consecutive (unshuffled) folds.
    X_data, y_data : optional features and labels.
        When omitted, the notebook-level ``X`` and ``y`` are used, which is what
        the function always did before these parameters existed.
    """
    from sklearn.model_selection import KFold
    from sklearn.metrics import precision_score

    if X_data is None:
        X_data = X
    if y_data is None:
        y_data = y

    kf = KFold(n_splits=kfolds)
    train_precisions = []
    test_precisions = []
    for train_index, test_index in kf.split(X_data):
        X_fold_train, X_fold_test = X_data.iloc[train_index], X_data.iloc[test_index]
        y_fold_train, y_fold_test = y_data.iloc[train_index], y_data.iloc[test_index]
        model.fit(X_fold_train, y_fold_train)
        train_precisions.append(precision_score(y_fold_train, model.predict(X_fold_train)))
        test_precisions.append(precision_score(y_fold_test, model.predict(X_fold_test)))

    fold_numbers = range(1, kf.get_n_splits() + 1)
    # The original assigned figsize to an unused local; actually create the figure.
    plt.figure(figsize=(5, 5))
    plt.plot(fold_numbers, train_precisions, 'o-', label='training')
    plt.plot(fold_numbers, test_precisions, 'o-', label='testing')
    plt.xlabel('number of fold')
    plt.ylabel('Precision')
    plt.title('Precision across folds')
    plt.legend()
    plt.tight_layout()
    plt.show()

In [None]:
fit_check(rf, 5)

#### The overfit and underfit check shows that the model has an indication of overfitting, because the precision on the training set is better than the precision on the test set.

### Hyperparameter using RandomizedsearchCV

In [None]:
def model_randomCV(X, y, model, parameters, n_iter=10, cv=3, random_state=None):
    """Run a randomized hyperparameter search optimising precision.

    Parameters
    ----------
    X, y : features and labels used for the cross-validated search.
    model : estimator to tune.
    parameters : dict
        Parameter distributions/lists passed to ``RandomizedSearchCV``.
    n_iter : int, default 10
        Number of sampled parameter settings (10 is what was used implicitly before).
    cv : int, default 3
        Number of cross-validation folds (previously hard-coded).
    random_state : int or None, default None
        Seed for the parameter sampling; pass an int to make the search
        reproducible. None keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        ``cv_results_`` sorted by ``rank_test_score``.
    """
    from sklearn.model_selection import RandomizedSearchCV

    randCV = RandomizedSearchCV(estimator=model, scoring='precision', param_distributions=parameters,
                                n_iter=n_iter, n_jobs=-1, cv=cv, random_state=random_state)

    randCV.fit(X, y)

    print('best_parameters: ' + str(randCV.best_params_))
    print('best_score: ' + str(randCV.best_score_))
    print('best_estimator: ' + str(randCV.best_estimator_))

    return pd.DataFrame(randCV.cv_results_).sort_values(by='rank_test_score')

In [None]:
# Search space around the region that performed well for the random forest.
# 'auto' was removed from max_features: for a random forest classifier it was an
# alias of 'sqrt', and current scikit-learn releases no longer accept it, so
# every sampled candidate using it would fail to fit.
parameters = {'n_estimators': np.arange(290, 310, 5), 'max_features': ['sqrt', 'log2'], 'max_depth': np.arange(18, 23, 1),
              'min_samples_split': np.arange(3, 8, 1), 'criterion': ['gini', 'entropy']}

model_randomCV(X_train, y_train, rf, parameters)

#### The best random forest model got slightly better in precision score, which is 0.968

### Random Forest after Hyperparameter Tuning

In [None]:
rf_best = RandomForestClassifier(criterion='entropy', max_depth=19, max_features='sqrt',
                       min_samples_split=3, n_estimators=295)

In [None]:
confusion(X_train, y_train, X_test, y_test, rf_best)

In [None]:
ROC_AUC_test(X_train, y_train, X_test, y_test, rf_best)

In [None]:
fit_check(rf_best, 5)

#### The fit check of the Random Forest after tuning shows that the overfitting decreased slightly, but not enough to dispel the overfitting indication seen in the default Random Forest.

## Support Vector Classifier

In [None]:
confusion(X_train, y_train, X_test, y_test, svc)

In [None]:
svc_prob_true = SVC(probability=True)

ROC_AUC_test(X_train, y_train, X_test, y_test, svc_prob_true)

In [None]:
fit_check(svc, 5)

#### The SVC's performance (precision and ROC_AUC) is no better than the Random Forest's, but its fit is better than the Random Forest's.

### SVC with parameters

Because hyperparameter tuning of the SVC model takes a long time to train, especially with the gamma and poly-kernel parameters (I already tried it in my Jupyter Notebook), here I simply test the model fit with the parameters that I already have.

In [None]:
svc_hyp = SVC(C=10)

fit_check(svc_hyp, 5)

In [None]:
svc_hyp_1 = SVC(gamma=1)

fit_check(svc_hyp_1, 5)

### Model Conclusion

In [None]:
def model_conclusion(X_train, y_train, X_test, y_test, models):
    """Compare training and test precision for a list of models.

    Each estimator is fitted on the training split (in place) and its precision
    is computed on both splits; the two series are drawn as lines over the
    model names.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    models : iterable of scikit-learn style classifiers.

    Returns
    -------
    pandas.DataFrame
        Columns 'models', 'precision_train', 'precission_test'.
    """
    from sklearn.metrics import precision_score

    train_result = []
    test_result = []
    str_models = []

    for model in models:
        model.fit(X_train, y_train)
        train_result.append(precision_score(y_train, model.predict(X_train)))
        test_result.append(precision_score(y_test, model.predict(X_test)))
        str_models.append(str(model))

    # The original assigned figsize to an unused local; actually create the figure.
    plt.figure(figsize=(10, 10))

    # x/y are passed by keyword (recent seaborn rejects positional data
    # variables); sort=False keeps the models in the order they were given.
    sns.lineplot(x=str_models, y=train_result, label='train', sort=False)
    sns.lineplot(x=str_models, y=test_result, label='test', sort=False)
    plt.ylabel('Precision')
    plt.title('Precision for models')
    # Only rotate the existing categorical tick labels; strings are not valid tick positions.
    plt.xticks(rotation=90)
    plt.legend()
    plt.tight_layout()

    return pd.DataFrame({'models': models, 'precision_train': train_result, 'precission_test': test_result})

In [None]:
rf = RandomForestClassifier()
svc = SVC()
rf_best = RandomForestClassifier(criterion='entropy', max_depth=19, max_features='sqrt', min_samples_split=3, n_estimators=295)
svc_hype = SVC(C=10)
svc_hype_1 = SVC(gamma=1)
models = [rf, svc, rf_best, svc_hype, svc_hype_1]

model_conclusion(X_train, y_train, X_test, y_test, models)

#### From the model conclusion, I choose the SVC model with the gamma parameter, to prevent overfitting while getting better precision.