# Part I Import packages and data

In [1]:
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data processing and machine learning packages
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show
import shap

print('packages installed')

packages installed


In [2]:
# Load the customer table and, in the same step, discard the columns the
# analysis treats as irrelevant features (row number, customer id, surname).
data = pd.read_csv('data.csv').drop(
    columns=['RowNumber', 'CustomerId', 'Surname']
)
# Preview the first rows of the trimmed frame.
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# With default arguments only the numeric columns are summarised, which is
# why Geography and Gender do not appear in the resulting table.
data.describe()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [4]:
# Defined variables

# Colour palette (hex codes) reused by the plots in this notebook.
# Blues, ordered from darkest to lightest.
DarkBlue, MediumBlue, LightBlue, SkyBlue = "#24477F", "#005EB8", "#00AEEF", "#7EC8E3"
# Neutrals.
White, Black = "#FFFFFF", "#000000"
# Accent colour.
Orange = "#EF7C00"

# Part II Exploratory data analysis

In [None]:
# Check missing values: isnull() flags each missing cell and sum() adds the
# flags up column by column, giving the number of missing entries per column.
data.isnull().sum()

In [None]:
# Class balance of the target: one bar per value of 'Exited'.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Exited', data=data, ax=ax)
ax.set_title('Distribution of Exited')
plt.show()

In [None]:
# Histograms of the numerical features, one panel per feature on a 3x2 grid.
numerical_features = [
    'CreditScore', 'Age', 'Tenure',
    'Balance', 'NumOfProducts', 'EstimatedSalary',
]
# `column=` restricts the frame to the listed features before plotting.
data.hist(
    column=numerical_features,
    bins=15,
    figsize=(15, 10),
    layout=(3, 2),
    color=Orange,
)
plt.suptitle('Distribution of Numerical Features')
plt.show()

In [None]:
# Compare each numerical feature between the two 'Exited' groups with
# side-by-side boxplots laid out on a 3x2 grid.
plt.figure(figsize=(15, 10))
for slot, feature in enumerate(numerical_features, start=1):
    # Subplot positions are 1-based, hence start=1.
    ax = plt.subplot(3, 2, slot)
    sns.boxplot(x='Exited', y=feature, data=data, color=DarkBlue, ax=ax)
    ax.set_title(f'{feature} by Exited')
plt.tight_layout()
plt.show()

In [None]:
# Category counts split by the target, one panel per categorical feature
# on a 2x2 grid.
categorical_features = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']

# Fixed colour per 'Exited' value so every panel uses the same legend colours.
palette = {0: DarkBlue, 1: Orange}

plt.figure(figsize=(15, 10))
for slot, feature in enumerate(categorical_features, start=1):
    # Subplot positions are 1-based, hence start=1.
    ax = plt.subplot(2, 2, slot)
    sns.countplot(x=feature, hue='Exited', data=data, palette=palette, ax=ax)
    ax.set_title(f'{feature} by Exited')
plt.tight_layout()
plt.show()

In [None]:
# One-hot encode the two text columns (keeping every level) so that all
# columns can take part in the correlation computation.
numerical_data = pd.get_dummies(
    data,
    columns=['Geography', 'Gender'],
    drop_first=False,
)

# Correlation matrix
correlation_matrix = numerical_data.corr()

# Annotated heatmap of the pairwise correlations.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# Part III Baseline model

In [6]:
# Target is the 'Exited' column; the features are every other column.
y = data['Exited']
X = data.drop(columns='Exited')

# Two-stage split with a fixed seed: 60% train, then the remaining 40% is
# halved into validation and test (20% of the data each).
# NOTE(review): neither call passes `stratify`, so the class ratio can differ
# between the splits — consider stratifying on the target; TODO confirm intent.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [8]:
# Zero R model
# Zero R ignores every feature and always predicts the most frequent class.
# The class is learned from the training labels rather than hard-coded, so the
# baseline stays correct if the class balance of the data changes.
majority_class = y_train.mode().iloc[0]

# np.full keeps the label dtype of the training target (np.zeros would
# produce floats).
y_test_pred = np.full(len(y_test), majority_class)

# Evaluate the model
print("Test Set Evaluation")
print(confusion_matrix(y_test, y_test_pred))
# The baseline never predicts the minority class, so its precision is
# undefined; zero_division=0 reports it as 0 without emitting
# UndefinedMetricWarning for every average.
print(classification_report(y_test, y_test_pred, zero_division=0))

Test Set Evaluation
[[1570    0]
 [ 430    0]]
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1570
           1       0.00      0.00      0.00       430

    accuracy                           0.79      2000
   macro avg       0.39      0.50      0.44      2000
weighted avg       0.62      0.79      0.69      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
