In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the churn data set. The final two columns are discarded, as suggested
# by the dataset author.
raw_path = '../input/credit-card-customers/BankChurners.csv'
credit_card = pd.read_csv(raw_path).iloc[:, :-2]
credit_card.head()

# Explore data

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
credit_card.info()

In [None]:
# Partition the columns by dtype: object columns are treated as categorical
# features, everything else as numerical features.
object_cols = [name for name, dtype in credit_card.dtypes.items() if dtype == 'object']
numerical_cols = [name for name, dtype in credit_card.dtypes.items() if dtype != 'object']

# Drop the customer identifier by name rather than by list position, so a
# change in column order cannot silently drop a real feature instead.
# NOTE(review): assumes the identifier column is named 'CLIENTNUM' (the
# previous code dropped the first numerical column) -- confirm against
# credit_card.columns. list.remove raises ValueError if the name is absent.
numerical_cols.remove('CLIENTNUM')

# Attrition_Flag is the outcome: keep it as Y and take it out of the features.
Y = credit_card['Attrition_Flag']
object_cols.remove('Attrition_Flag')

print(f"Numerical column:\n {numerical_cols}\n")
print(f"Object column:\n {object_cols}")

## Numerical data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
credit_card.describe()

## Categorical data

In [None]:
# Draw one count plot per object-typed column, with bars split by Attrition_Flag.
categorical_names = [
    name for name in credit_card.columns
    if credit_card[name].dtype == 'object'
]
for name in categorical_names:
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.countplot(x=name, hue='Attrition_Flag', data=credit_card, ax=ax)
    plt.show()

# Prepare data

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Standardise the numerical features and one-hot encode the categorical ones.
# handle_unknown='ignore' makes categories unseen during fit encode as all
# zeros at transform time instead of raising.
preprocessor = make_column_transformer(
    (StandardScaler(), numerical_cols),
    (OneHotEncoder(handle_unknown='ignore'), object_cols),
)

X = credit_card[numerical_cols + object_cols]

# Encode the target as integers. LabelEncoder numbers the classes in sorted
# label order, so the actual mapping is printed rather than assumed.
# The target is read from the frame again so re-running this cell always
# starts from the original string labels.
le = LabelEncoder()
Y = le.fit_transform(credit_card['Attrition_Flag'])
print(f"Target encoding: { {str(label): code for code, label in enumerate(le.classes_)} }")

# stratify - make sure classes are evenly represented across splits.
# A fixed random_state keeps the split (and every score derived from it)
# reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, Y, stratify=Y, train_size=0.75, random_state=0
)

# Fit the preprocessor on the training split only, then apply the fitted
# transform to the validation split, to avoid leaking validation statistics.
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)

input_shape = [X_train.shape[1]] # This is for deep learning

# Logistic regression

In [None]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split.
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

# Mean accuracy on the validation split, plus the hard predictions for the matrix.
score = clf.score(X_valid, y_valid)
predictions = clf.predict(X_valid)

# Confusion-matrix heatmap, following:
# https://towardsdatascience.com/logistic-regression-using-python-sklearn-numpy-mnist-handwriting-recognition-matplotlib-a6b31e2b166a
cm = metrics.confusion_matrix(y_valid, predictions)
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r', ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
ax.set_title('Accuracy Score: {0}%'.format(round(score*100,1)), size=15);

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are randomised (bootstrap samples, feature sub-sampling);
# fixing random_state makes the reported accuracy reproducible across runs.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
predictions = clf.predict(X_valid)
score = clf.score(X_valid, y_valid)  # mean accuracy on the validation split

# Plotting: confusion matrix of validation predictions, titled with the accuracy
cm = metrics.confusion_matrix(y_valid, predictions)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}%'.format(round(score*100,1))
plt.title(all_sample_title, size = 15);

# k-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit k-nearest neighbours with scikit-learn's default settings.
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)

# Validation predictions and mean accuracy.
predictions = clf.predict(X_valid)
score = clf.score(X_valid, y_valid)

# Confusion-matrix heatmap titled with the validation accuracy.
cm = metrics.confusion_matrix(y_valid, predictions)
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r', ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
ax.set_title('Accuracy Score: {0}%'.format(round(score*100,1)), size=15);

# Support Vector Classifier (SVC, default RBF kernel)

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier. With default arguments SVC uses the RBF
# kernel, not a linear one.
clf = SVC()
clf.fit(X_train, y_train)

# Validation predictions and mean accuracy.
predictions = clf.predict(X_valid)
score = clf.score(X_valid, y_valid)

# Confusion-matrix heatmap titled with the validation accuracy.
cm = metrics.confusion_matrix(y_valid, predictions)
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r', ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
ax.set_title('Accuracy Score: {0}%'.format(round(score*100,1)), size=15);

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier on the training split.
clf = GaussianNB()
clf.fit(X_train, y_train)

# Validation predictions and mean accuracy.
predictions = clf.predict(X_valid)
score = clf.score(X_valid, y_valid)

# Confusion-matrix heatmap titled with the validation accuracy.
cm = metrics.confusion_matrix(y_valid, predictions)
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r', ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
ax.set_title('Accuracy Score: {0}%'.format(round(score*100,1)), size=15);

# Deep learning

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Binary classifier: two 256-unit ReLU blocks, each followed by batch
# normalisation and 50% dropout, ending in a single sigmoid unit.
model = keras.Sequential()

# First hidden block (declares the input width)
model.add(layers.Dense(256, activation='relu', input_shape=input_shape))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.5))

# Second hidden block
model.add(layers.Dense(256, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.5))

# Output layer
model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and binary
# accuracy as the tracked metric.
compile_options = {
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
    'metrics': ['binary_accuracy'],
}
model.compile(**compile_options)

In [None]:
# Stop training after 5 epochs without an improvement of at least 0.001 in
# the monitored quantity (Keras' default monitor), then restore the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    min_delta=0.001,
    patience=5,
)

# Train for up to 200 epochs, evaluating on the validation split each epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=200,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping],
)

# Per-epoch learning curves: loss and accuracy, training vs. validation.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot(title="Cross-entropy")
history_df[['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is thresholding the predicted probability at 0.5.
probabilities = model.predict(X_valid)
predictions = (probabilities > 0.5).astype('int32').ravel()

# Score the same validation predictions that the matrix shows, so the title is
# comparable with the other models. The last-epoch training accuracy from the
# fit history is not a validation score.
score = metrics.accuracy_score(y_valid, predictions)

# Plotting
cm = metrics.confusion_matrix(y_valid, predictions)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}%'.format(round(score*100,1))
plt.title(all_sample_title, size = 15);

# Conclusion

Using a vanilla approach (no hyperparameter tuning), RandomForestClassifier gives the highest accuracy, while Naive Bayes gives the lowest accuracy.