# Task for Today  

***

## Airline Customer Satisfaction Prediction  

Given *data about airline customers*, let's try to predict if a given customer will be **satisfied with the airline**.

We will use a logistic regression model to make our predictions, but first we will detect and remove outliers using z-scores.

# Getting Started

In [None]:
import numpy as np
import scipy.stats
import pandas as pd
# Show every column when a DataFrame is displayed. The fully qualified option
# name is used because the bare 'max_columns' pattern also matches
# 'styler.render.max_columns' in newer pandas releases, which makes
# pd.set_option raise an OptionError (pattern matched multiple keys).
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

In [None]:
# Load the airline customer satisfaction CSV into a DataFrame.
data = pd.read_csv('../input/airlines-customer-satisfaction/Invistico_Airline.csv')

In [None]:
# Display the loaded DataFrame as the cell output.
data

In [None]:
# Print the DataFrame summary (DataFrame.info) to inspect the columns before preprocessing.
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df):
    """Return an encoded copy of the raw airline DataFrame.

    The input frame is left untouched. On the copy:

    * missing 'Arrival Delay in Minutes' values are replaced by the column mean,
    * 'Gender', 'Customer Type' and 'Type of Travel' labels are replaced by 0/1,
    * 'Class' is replaced by one-hot columns prefixed with 'Flight_class',
      appended after the remaining columns.
    """
    encoded = df.copy()

    # Impute missing arrival delays with the mean of the observed values
    delay_column = 'Arrival Delay in Minutes'
    mean_delay = encoded[delay_column].mean()
    encoded[delay_column] = encoded[delay_column].fillna(mean_delay)

    # Two-valued categorical columns and the 0/1 code assigned to each label
    binary_encodings = {
        'Gender': {'Female': 0, 'Male': 1},
        'Customer Type': {'disloyal Customer': 0, 'Loyal Customer': 1},
        'Type of Travel': {'Personal Travel': 0, 'Business travel': 1},
    }
    for column, codes in binary_encodings.items():
        encoded[column] = encoded[column].replace(codes)

    # Swap the 'Class' column for its one-hot indicator columns
    class_indicators = pd.get_dummies(encoded['Class'], prefix='Flight_class')
    return pd.concat([encoded.drop('Class', axis=1), class_indicators], axis=1)

In [None]:
# Encode the raw data with preprocess_inputs; the result still includes the 'satisfaction' column.
X = preprocess_inputs(data)

In [None]:
# Display the preprocessed DataFrame as the cell output.
X

# Outlier Detection

In [None]:
# Explore the feature columns only (the 'satisfaction' column is excluded)
eda_df = X.drop('satisfaction', axis=1).copy()

# Keep columns with more than two distinct values; two-valued columns are skipped
nonbinary_columns = [column for column in eda_df.columns if len(eda_df[column].unique()) > 2]

# Derive the grid height from the number of plotted columns so the subplot
# index never exceeds the grid; up to 18 columns this is the original 3 x 6 layout.
n_grid_cols = 6
n_grid_rows = max(3, -(-len(nonbinary_columns) // n_grid_cols))

plt.figure(figsize=(20, 20))

# One boxplot per non-binary column (subplot positions are 1-based)
for position, column in enumerate(nonbinary_columns, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    sns.boxplot(data=eda_df[column], color='darkviolet')
    plt.title(column)

plt.suptitle("Boxplots With Outliers", size=30)
plt.show()

# Outlier Removal

In [None]:
def remove_outliers(df, columns, threshold):
    """Return a copy of ``df`` without the rows that are z-score outliers.

    A row is an outlier when, in at least one of ``columns``, its z-score lies
    outside the central ``1 - threshold`` probability mass of the standard
    normal distribution. The number of dropped rows is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    columns : list of str
        Numeric columns whose z-scores are checked.
    threshold : float
        Two-tailed significance level in [0, 1]. Half of it is cut from each
        tail, so smaller values drop fewer rows (0.05 corresponds to bounds
        of roughly -1.96 and +1.96).

    Returns
    -------
    pandas.DataFrame
        The remaining rows, re-indexed from 0.

    Raises
    ------
    ValueError
        If ``threshold`` is outside [0, 1]; such values would otherwise give
        NaN or inverted bounds and silently drop nothing or everything.
    """
    if not 0 <= threshold <= 1:
        raise ValueError("threshold must be between 0 and 1, got {!r}".format(threshold))

    # Calculate the lower and upper bounds on the Z distribution given a threshold value
    lower_bound = scipy.stats.norm.ppf(q=(threshold / 2), loc=0, scale=1)
    upper_bound = scipy.stats.norm.ppf(q=1 - (threshold / 2), loc=0, scale=1)

    # Calculate z-scores of every example in the columns specified.
    # nan_policy='omit' keeps one missing value from turning the whole column's
    # z-scores into NaN, which would silently disable detection for that column.
    values = df.loc[:, columns].to_numpy(dtype=float)
    zscores = scipy.stats.zscore(values, axis=0, nan_policy='omit')

    # Boolean array marking the outlier examples. Comparisons with NaN are
    # False, so rows with missing values are never flagged because of them.
    is_outlier = ((zscores < lower_bound) | (zscores >= upper_bound)).any(axis=1)

    # Drop the outliers by position rather than by index label, so rows that
    # merely share an index label with an outlier are kept.
    n_dropped = int(is_outlier.sum())
    remaining = df.loc[~is_outlier].reset_index(drop=True)
    print(n_dropped, "examples dropped.")

    return remaining

In [None]:
# Despite its name, outliers_df holds the rows of X that remain after
# remove_outliers has dropped the outliers found in the listed columns.
outliers_df = remove_outliers(
    df=X,
    columns=[
        'On-board service',
        'Checkin service',
        'Departure Delay in Minutes',
        'Arrival Delay in Minutes',
        'Online boarding'
    ],
    threshold=0.08
)

# Derive the grid height from the number of plotted columns so the subplot
# index never exceeds the grid; up to 18 columns this is the original 3 x 6 layout.
n_grid_cols = 6
n_grid_rows = max(3, -(-len(nonbinary_columns) // n_grid_cols))

plt.figure(figsize=(20, 20))

# Same columns as the "with outliers" figure, drawn from the filtered data
for position, column in enumerate(nonbinary_columns, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    sns.boxplot(data=outliers_df[column], color='cornflowerblue')
    plt.title(column)

plt.suptitle("Boxplots Without Outliers", size=30)
plt.show()

# Finalizing Model Inputs

In [None]:
def finalize_inputs(df, keep_outliers=True, outlier_threshold=0.05, outlier_columns=None):
    """Split the preprocessed data and return scaled model inputs.

    The data is split 70/30 (shuffled, ``random_state=1``) before anything
    else, so outlier removal and scaler fitting only ever see training rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed data containing the 'satisfaction' column.
    keep_outliers : bool
        When false, outliers are removed from the training split only; the
        test split is never filtered.
    outlier_threshold : float
        Threshold passed to ``remove_outliers``.
    outlier_columns : list of str, optional
        Columns checked for outliers. Defaults to the on-board service,
        check-in service and departure/arrival delay columns.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the X frames are standardized
        with statistics fitted on the training features.
    """
    if outlier_columns is None:
        outlier_columns = [
            'On-board service',
            'Checkin service',
            'Departure Delay in Minutes',
            'Arrival Delay in Minutes'
        ]

    # Train-test split (returns new frames, so the caller's df is not modified)
    train_df, test_df = train_test_split(df, train_size=0.7, shuffle=True, random_state=1)

    # Remove outliers from the training rows only
    if not keep_outliers:
        train_df = remove_outliers(
            train_df,
            columns=list(outlier_columns),
            threshold=outlier_threshold
        )

    # Split each frame into features (X) and the 'satisfaction' column (y)
    y_train = train_df['satisfaction']
    y_test = test_df['satisfaction']
    X_train = train_df.drop('satisfaction', axis=1)
    X_test = test_df.drop('satisfaction', axis=1)

    # Scale X: fit on the training features, then apply the same transform to both splits
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

# Training Without Outlier Removal

In [None]:
# Baseline run: build the model inputs with every training row kept
X_train1, X_test1, y_train1, y_test1 = finalize_inputs(X, keep_outliers=True)

# Fit a logistic regression with default settings on the training split
model1 = LogisticRegression().fit(X_train1, y_train1)

# Report the score on the held-out split as a percentage
accuracy1 = model1.score(X_test1, y_test1)
print("Test Accuracy: {:.3f}%".format(accuracy1 * 100))

# Training With Outlier Removal

In [None]:
# Comparison run: outliers are removed from the training split before fitting
X_train2, X_test2, y_train2, y_test2 = finalize_inputs(
    X,
    keep_outliers=False,
    outlier_threshold=0.0000001,
)

# Fit a logistic regression with default settings on the filtered training split
model2 = LogisticRegression().fit(X_train2, y_train2)

# Report the score on the held-out split as a percentage
accuracy2 = model2.score(X_test2, y_test2)
print("Test Accuracy: {:.3f}%".format(accuracy2 * 100))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/drTBYwjFjn4