Business Case Description

A manager at the bank is concerned that more and more customers are leaving the bank's credit card services. They would like to be able to predict which customers are likely to churn, so that the bank can proactively reach out to those customers, offer them better services, and persuade them to stay.

In [None]:
# Import relevant libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures drawn below
sns.set()

# Show every column when a DataFrame is displayed.
# The fully qualified option name is required: in recent pandas releases the
# bare 'max_columns' pattern also matches 'styler.render.max_columns', which
# makes the lookup ambiguous and raises an OptionError.
pd.set_option('display.max_columns', None)

In [None]:
# Read the bank churners CSV into a DataFrame

dataset = pd.read_csv(filepath_or_buffer='../input/credit-card-customers/BankChurners.csv')

In [None]:
# Preview the first five rows of the data

dataset.head(5)

In [None]:
# Keep every column except the final two, which are not needed

dataset = dataset.iloc[:, :dataset.shape[1] - 2]

In [None]:
# Count the missing values in each column

dataset.isnull().sum()

## Starting with some EDA

In [None]:
# Summary statistics for every column, numeric and categorical alike

dataset.describe(include='all')

In [None]:
# Frequency of each education level
dataset.loc[:, 'Education_Level'].value_counts()

In [None]:
# Frequency of each marital status
dataset.loc[:, 'Marital_Status'].value_counts()

In [None]:
# Frequency of each income bracket
dataset.loc[:, 'Income_Category'].value_counts()

In [None]:
# Frequency of each card category
dataset.loc[:, 'Card_Category'].value_counts()

In [None]:
# Histogram of customer ages
dataset['Customer_Age'].plot.hist(figsize=(10, 8))
plt.title("Customer Age", fontsize=20)
plt.show()

In [None]:
# Histogram of the number of dependents
dataset['Dependent_count'].plot.hist(figsize=(10, 8))
plt.title("Dependent Count", fontsize=20)
plt.show()

In [None]:
# Histogram of the Months_on_book column
dataset['Months_on_book'].plot.hist(figsize=(10, 8))
plt.title("Months on book", fontsize=20)
plt.show()

In [None]:
# Histogram of the Total_Relationship_Count column
dataset['Total_Relationship_Count'].plot.hist(figsize=(10, 8))
plt.title("Total Relationship Count", fontsize=20)
plt.show()

In [None]:
# Histogram of the Months_Inactive_12_mon column
dataset['Months_Inactive_12_mon'].plot.hist(figsize=(10, 8))
plt.title("Months Inactive in the last 12 months", fontsize=20)
plt.show()

In [None]:
# Histogram of the Contacts_Count_12_mon column
dataset['Contacts_Count_12_mon'].plot.hist(figsize=(10, 8))
plt.title("Contacts in the last 12 months", fontsize=20)
plt.show()

In [None]:
# Histogram of credit limits
dataset['Credit_Limit'].plot.hist(figsize=(10, 8))
plt.title("Credit Limit", fontsize=20)
plt.show()

In [None]:
# Histogram of total revolving balances
dataset['Total_Revolving_Bal'].plot.hist(figsize=(10, 8))
plt.title("Total Revolving Balance", fontsize=20)
plt.show()

In [None]:
# Histogram of the Avg_Open_To_Buy column
dataset['Avg_Open_To_Buy'].plot.hist(figsize=(10, 8))
plt.title("Open to Buy Credit (Avg. Last 12 months)", fontsize=20)
plt.show()

In [None]:
# Histogram of the Total_Amt_Chng_Q4_Q1 column
# (the title previously ended with a stray extra closing parenthesis)
dataset['Total_Amt_Chng_Q4_Q1'].plot(kind='hist', figsize=(10, 8))
plt.title("Change in Transaction Amount (Q4 over Q1)", size=20)
plt.show()

In [None]:
# Histogram of total transaction amounts
dataset['Total_Trans_Amt'].plot.hist(figsize=(10, 8))
plt.title("Total Transaction Amount", fontsize=20)
plt.show()

In [None]:
# Histogram of total transaction counts
dataset['Total_Trans_Ct'].plot.hist(figsize=(10, 8))
plt.title("Total Transaction Count", fontsize=20)
plt.show()

In [None]:
# Histogram of the Total_Ct_Chng_Q4_Q1 column
dataset['Total_Ct_Chng_Q4_Q1'].plot.hist(figsize=(10, 8))
plt.title("Change in Transaction Count (Q4 over Q1)", fontsize=20)
plt.show()

In [None]:
# Histogram of the Avg_Utilization_Ratio column
# (the title previously misspelled "Utilization")
dataset['Avg_Utilization_Ratio'].plot(kind='hist', figsize=(10, 8))
plt.title("Avg. Card Utilization", size=20)
plt.show()

##### The data looks pretty normal: there are no outliers or erroneous entries

## Preparing the data for analysis

In [None]:
# Feature matrix: every column from position 2 onwards
X = dataset.iloc[:, 2:dataset.shape[1]]

In [None]:
# Response variable: the column at position 1
y = dataset.iloc[:, 1]

In [None]:
# Hold out 20% of the rows as a test set (fixed seed for reproducibility)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Encode the response variable as 0s and 1s
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both splits share exactly the same label -> integer mapping.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
print(y_train)
print(y_test)

In [None]:
# Perform feature scaling on the continuous variables

from sklearn.preprocessing import StandardScaler

# Positions (within X) of the columns that get standardised
numeric_idx = [0, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]

# Cast those columns to float before writing the scaled values back.
# astype() returns a new DataFrame, so the assignments below neither write
# into a frame derived from the train/test split (SettingWithCopyWarning)
# nor push floats into integer columns (incompatible-dtype assignment).
X_train = X_train.astype(dict.fromkeys(X_train.columns[numeric_idx], 'float64'))
X_test = X_test.astype(dict.fromkeys(X_test.columns[numeric_idx], 'float64'))

# Fit the scaler on the training data only, then apply it to both splits
sc = StandardScaler()
X_train.iloc[:, numeric_idx] = sc.fit_transform(X_train.iloc[:, numeric_idx])
X_test.iloc[:, numeric_idx] = sc.transform(X_test.iloc[:, numeric_idx])

In [None]:
# Turn the categorical variables into dummy variables

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore': a category that appears only in the test set is
# encoded as all zeros instead of raising an error.
# sparse_threshold=0: always return a dense array, so np.array() never ends up
# wrapping a scipy sparse matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [1, 3, 4, 5, 6])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Learn the categories from the training set only; the test set is transformed
# with that same fitted encoder so both arrays have an identical column layout.
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.transform(X_test))

## Test various models to see which one performs best

### Logistic Regression

In [None]:
# Fit a logistic regression classifier on the training data

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
classifier

In [None]:
# Predict on the test set and print predictions next to the true labels

y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

In [None]:
# Confusion matrix and accuracy on the test set

from sklearn.metrics import accuracy_score, confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

### K-NN

In [None]:
# Fit a 5-nearest-neighbours classifier (Minkowski metric with p=2)

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(X_train, y_train)
classifier

In [None]:
# Predict on the test set and print predictions next to the true labels

y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

In [None]:
# Confusion matrix and accuracy on the test set

from sklearn.metrics import accuracy_score, confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

### SVM

In [None]:
# Fit a support vector classifier with a linear kernel

from sklearn.svm import SVC
classifier = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
classifier

In [None]:
# Predict on the test set and print predictions next to the true labels

y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

In [None]:
# Confusion matrix and accuracy on the test set

from sklearn.metrics import accuracy_score, confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

### Naive Bayes

In [None]:
# Fit a Gaussian naive Bayes classifier

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB().fit(X_train, y_train)
classifier

In [None]:
# Predict on the test set and print predictions next to the true labels

y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

In [None]:
# Confusion matrix and accuracy on the test set

from sklearn.metrics import accuracy_score, confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

### Decision Tree

In [None]:
# Fit a decision tree that splits on the entropy criterion

from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)
classifier

In [None]:
# Predict on the test set and print predictions next to the true labels

y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

In [None]:
# Confusion matrix and accuracy on the test set

from sklearn.metrics import accuracy_score, confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

### Random Forest

In [None]:
# Fit a random forest of 10 entropy-based trees

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
classifier

In [None]:
# Predict on the test set and print predictions next to the true labels

y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

In [None]:
# Confusion matrix and accuracy on the test set

from sklearn.metrics import accuracy_score, confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

### ANN

In [None]:
# Import tensorflow

import tensorflow as tf

In [None]:
# Build the ANN as a stack of fully connected layers

ann = tf.keras.models.Sequential([
    # First hidden layer (also receives the input features)
    tf.keras.layers.Dense(units=12, activation='relu'),
    # Second hidden layer
    tf.keras.layers.Dense(units=12, activation='relu'),
    # Output layer: a single sigmoid unit for the binary response
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [None]:
# Configure the optimizer, loss and reported metric for training

ann.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the network for 100 epochs in mini-batches of 32 rows

ann.fit(x=X_train, y=y_train, batch_size=32, epochs=100)

In [None]:
# Test the model

# The network outputs one probability per row as an (n, 1) array.
# Threshold at 0.5, then convert to a flat array of 0/1 integer labels so the
# predictions have the same shape and encoding as y_test and as the
# predictions produced by the other models.
y_pred = (ann.predict(X_test) > 0.5).astype(int).ravel()
print(np.column_stack((y_pred, y_test)))

In [None]:
# Confusion matrix and accuracy on the test set

from sklearn.metrics import accuracy_score, confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

### XGBoost

In [None]:
# Fit an XGBoost classifier with its default hyper-parameters

from xgboost import XGBClassifier
classifier = XGBClassifier().fit(X_train, y_train)
classifier

In [None]:
# Predict on the test set, then report the confusion matrix and accuracy

from sklearn.metrics import accuracy_score, confusion_matrix
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# 10-fold cross validation of the classifier on the training data

from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
print(f"Accuracy: {accuracies.mean() * 100:.2f} %")
print(f"Standard Deviation: {accuracies.std() * 100:.2f} %")

##### It looks like the best-performing model is XGBoost, with an accuracy score of 97.16%