# Bank Customers Churn Model

**Author:** Steve Githinji
***

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_selection import (VarianceThreshold, SelectKBest, f_regression, mutual_info_regression, 
    RFE, RFECV)

## Business Understanding

## Data Understanding

In [2]:
data = pd.read_csv('data/Churn_Modelling.csv', index_col='RowNumber')

data.head()

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   Surname          10000 non-null  object 
 2   CreditScore      10000 non-null  int64  
 3   Geography        10000 non-null  object 
 4   Gender           10000 non-null  object 
 5   Age              10000 non-null  int64  
 6   Tenure           10000 non-null  int64  
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  int64  
 10  IsActiveMember   10000 non-null  int64  
 11  EstimatedSalary  10000 non-null  float64
 12  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1.1+ MB


In [4]:
data.describe()

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [5]:
data.isna().sum()

CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [6]:
print('Raw counts: \n')
print(data['Exited'].value_counts())
print('Normalized counts: \n')
print(data['Exited'].value_counts(normalize=True))

Raw counts: 

0    7963
1    2037
Name: Exited, dtype: int64
Normalized counts: 

0    0.7963
1    0.2037
Name: Exited, dtype: float64


In [7]:
data.nunique().sort_values(ascending=False)

CustomerId         10000
EstimatedSalary     9999
Balance             6382
Surname             2932
CreditScore          460
Age                   70
Tenure                11
NumOfProducts          4
Geography              3
Exited                 2
IsActiveMember         2
HasCrCard              2
Gender                 2
dtype: int64

In [8]:
data['Geography'].value_counts()


France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64

Half of the records are for customers living in France, while.....

In [9]:
def view_salary_balance_range(df, salaries, balances):
    min_sal, max_sal = df[salaries].min(), df[salaries].max()
    min_bal, max_bal = df[balances].min(), df[balances].max()
    print(f"The dataset contains customers whose salaries range from €{min_sal} to €{max_sal}. Their account balances range from {min_bal} to {max_bal}.")

In [10]:
view_salary_balance_range(data, 'EstimatedSalary', 'Balance')

The dataset contains customers whose salaries range from €11.58 to €199992.48. Their account balances range from 0.0 to 250898.09.


## Data Preparation

### Categorical Features

In [11]:
X = data.drop(columns=['Exited', 'CustomerId', 'Surname'], axis=1)
y = data['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

In [12]:
X_train_categorical = X_train.select_dtypes(exclude=['int64', 'float64']).copy()
X_train_categorical

Unnamed: 0_level_0,Geography,Gender
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1
2263,France,Male
1455,Spain,Female
3902,Spain,Male
6247,France,Male
1332,Germany,Male
...,...,...
1294,France,Male
4024,France,Male
7260,Spain,Male
5201,Germany,Male


In [13]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

ohe.fit(X_train_categorical)

X_train_ohe = pd.DataFrame(
    ohe.transform(X_train_categorical),
    index=X_train_categorical.index,
    columns=np.hstack(ohe.categories_)
)
X_train_ohe

Unnamed: 0_level_0,France,Germany,Spain,Female,Male
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2263,1.0,0.0,0.0,0.0,1.0
1455,0.0,0.0,1.0,1.0,0.0
3902,0.0,0.0,1.0,0.0,1.0
6247,1.0,0.0,0.0,0.0,1.0
1332,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...
1294,1.0,0.0,0.0,0.0,1.0
4024,1.0,0.0,0.0,0.0,1.0
7260,0.0,0.0,1.0,0.0,1.0
5201,0.0,1.0,0.0,0.0,1.0


### Normalization

In [14]:
numeric_features = ['CreditScore', 'Age','Tenure', 'Balance', 'NumOfProducts',
                    'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
X_train_numeric = X_train[numeric_features].copy()
X_train_numeric.tail()

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1294,641,30,2,87505.47,2,0,1,7278.57
4024,535,38,8,85982.07,1,1,0,9238.35
7260,625,32,7,106957.28,1,1,1,134794.02
5201,512,42,9,93955.83,2,1,0,14828.54
3776,528,22,5,93547.23,2,0,1,961.57


In [15]:
scaler = MinMaxScaler()

scaler.fit(X_train_numeric)
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train_numeric),
    # index is important to ensure we can concatenate with other columns
    index=X_train_numeric.index,
    columns=X_train_numeric.columns
)
X_train_scaled

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2263,0.444,0.337838,0.2,0.633911,0.000000,1.0,0.0,0.132222
1455,0.342,0.216216,0.7,0.319280,0.000000,1.0,1.0,0.101183
3902,0.450,0.162162,0.2,0.000000,0.333333,1.0,1.0,0.411140
6247,0.566,0.297297,0.4,0.679710,0.000000,0.0,1.0,0.173331
1332,0.998,0.256757,0.7,0.647546,0.333333,1.0,1.0,0.086430
...,...,...,...,...,...,...,...,...
1294,0.582,0.162162,0.2,0.395000,0.333333,0.0,1.0,0.036342
4024,0.370,0.270270,0.8,0.388123,0.000000,1.0,0.0,0.046143
7260,0.550,0.189189,0.7,0.482806,0.000000,1.0,1.0,0.674050
5201,0.324,0.324324,0.9,0.424117,0.333333,1.0,0.0,0.074100


In [16]:
X_train_full = pd.concat([X_train_scaled, X_train_ohe], axis=1)
X_train_full

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,France,Germany,Spain,Female,Male
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2263,0.444,0.337838,0.2,0.633911,0.000000,1.0,0.0,0.132222,1.0,0.0,0.0,0.0,1.0
1455,0.342,0.216216,0.7,0.319280,0.000000,1.0,1.0,0.101183,0.0,0.0,1.0,1.0,0.0
3902,0.450,0.162162,0.2,0.000000,0.333333,1.0,1.0,0.411140,0.0,0.0,1.0,0.0,1.0
6247,0.566,0.297297,0.4,0.679710,0.000000,0.0,1.0,0.173331,1.0,0.0,0.0,0.0,1.0
1332,0.998,0.256757,0.7,0.647546,0.333333,1.0,1.0,0.086430,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1294,0.582,0.162162,0.2,0.395000,0.333333,0.0,1.0,0.036342,1.0,0.0,0.0,0.0,1.0
4024,0.370,0.270270,0.8,0.388123,0.000000,1.0,0.0,0.046143,1.0,0.0,0.0,0.0,1.0
7260,0.550,0.189189,0.7,0.482806,0.000000,1.0,1.0,0.674050,0.0,0.0,1.0,0.0,1.0
5201,0.324,0.324324,0.9,0.424117,0.333333,1.0,0.0,0.074100,0.0,1.0,0.0,0.0,1.0


## Modelling

In [17]:
# Handling categorical data
X_test_categorical = X_test.select_dtypes(exclude=['int64', 'float64']).copy()
X_test_ohe = pd.DataFrame(
    ohe.transform(X_test_categorical),
    index=X_test_categorical.index,
    columns=np.hstack(ohe.categories_)
)

# Normalization
X_test_numeric = X_test[numeric_features].copy()
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_numeric),
    index=X_test_numeric.index,
    columns=X_test_numeric.columns
)

# Concatenating categorical and numeric data
X_test_full = pd.concat([X_test_scaled, X_test_ohe], axis=1)
X_test_full

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,France,Germany,Spain,Female,Male
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3105,0.352,0.175676,0.5,0.656956,0.000000,1.0,0.0,0.662101,0.0,1.0,0.0,0.0,1.0
6354,0.496,0.229730,0.8,0.515556,0.000000,1.0,1.0,0.371632,1.0,0.0,0.0,0.0,1.0
8690,0.384,0.662162,1.0,0.584254,0.000000,0.0,1.0,0.106683,1.0,0.0,0.0,0.0,1.0
5858,0.488,0.513514,0.7,0.000000,0.000000,1.0,0.0,0.131048,1.0,0.0,0.0,1.0,0.0
6012,0.340,0.364865,0.1,0.555612,0.000000,1.0,1.0,0.205196,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7653,0.708,0.081081,0.2,0.668963,0.333333,1.0,0.0,0.914004,1.0,0.0,0.0,0.0,1.0
7828,0.510,0.310811,1.0,0.000000,0.333333,0.0,1.0,0.486107,1.0,0.0,0.0,0.0,1.0
5170,0.602,0.175676,0.7,0.622969,0.333333,1.0,0.0,0.649638,0.0,1.0,0.0,0.0,1.0
9313,0.458,0.297297,1.0,0.204976,0.333333,1.0,0.0,0.093626,0.0,1.0,0.0,0.0,1.0


#

In [18]:
def print_metrics(labels, preds):
    print("Precision Score: {}".format(precision_score(labels, preds)))
    print("Recall Score: {}".format(recall_score(labels, preds)))
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))
    print("F1 Score: {}".format(f1_score(labels, preds)))

## start

In [None]:
rf_param_grid = {
    "n_estimators": [10, 30, 50, 100],
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 2, 6, 10, 12],
    "min_samples_split": [2, 5, 10, 12],
    "min_samples_leaf": [1, 3, 6],
}

#### resample

In [None]:
# Previous original class distribution
print(y_train.value_counts()) 

# Fit SMOTE to training data
X_train_resampled, y_train_resampled = SMOTE().fit_resample(X_train_full, y_train) 

# Preview synthetic sample class distribution
print('\n')
print(pd.Series(y_train_resampled).value_counts()) 

In [None]:
forest = RandomForestClassifier()
selector = SelectKBest(score_func=f_regression, k=11)
X_train_k_best = selector.fit_transform(X_train_resampled, y_train_resampled)
X_test_k_best = selector.transform(X_test_full)

In [22]:
# Get the indices of the selected features
selected_indices = selector.get_support(indices=True)

# Get the names of the selected features
selected_feature_names = np.array(data[selected_indices]

# Print the selected feature names
print(selected_feature_names)

SyntaxError: invalid syntax (<ipython-input-22-3475e2656b2b>, line 8)

In [None]:
rf4_grid_search = GridSearchCV(forest, rf_param_grid, cv=3)
rf4_grid_search.fit(X_train_k_best, y_train_resampled)

print(f"Training Accuracy: {rf4_grid_search.best_score_ :.2%}")
print("")
print(f"Optimal Parameters: {rf4_grid_search.best_params_}")

Use the optimized hyperparameters.

In [None]:
rf4_tuned = RandomForestClassifier(criterion='gini', max_depth=None,
                                  min_samples_leaf=3, min_samples_split=5, 
                                  n_estimators=100)

rf4_tuned.fit(X_train_k_best, y_train_resampled)

y_train_rf4_preds = rf4_tuned.predict(X_train_k_best)

y_test_rf4_preds = rf4_tuned.predict(X_test_k_best)


In [None]:
print_metrics(y_train_resampled, y_train_rf4_preds)

In [None]:
print_metrics(y_test, y_test_rf4_preds)