# Bank Dataset

## --> Create Dataframe

In [2]:
import pandas as pd

# Shareable Google Drive link to the CSV file.
url = 'https://drive.google.com/file/d/16mqr9dRGhUn2_m2BjJHkXMZ9GBmlvGm3/view?usp=share_link'
# The file id is the second-to-last path segment; rebuild the link in `uc?id=<file id>` form.
url = f"https://drive.google.com/uc?id={url.split('/')[-2]}"

df = pd.read_csv(url)
df

ModuleNotFoundError: No module named 'pandas'

## 1) Preprocessing

### A) Basics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

: 

RowNumber, CustomerId and Surname only identify a row or a customer; they are not useful as model features and should be dropped.

In [None]:
# Drop the identifier columns. Re-assigning the result (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run once the columns are already gone;
# the in-place version raises a KeyError on a second execution.
df = df.drop(columns=['CustomerId', 'RowNumber', 'Surname'], errors='ignore')

: 

In [None]:
# Preview the first five rows of the frame
df.head()

: 

In [None]:
# Number of missing values in each column
df.isna().sum()

: 

There are no null values to deal with

In [None]:
# Column names, non-null counts and dtypes
df.info()

: 

In order to train an ANN, every feature must be numeric, so the object-typed columns need to be converted to int or float.

### B) Label Encoding

In [None]:
def print_unique_col_val(df):
    """Print the distinct values of every text-like column of ``df``.

    A column is reported when its dtype is ``object`` or a pandas string dtype.
    Checking through ``pd.api.types`` (rather than ``dtype == 'object'``) also
    catches the dedicated string dtypes, which compare unequal to ``'object'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected. It is not modified.

    Returns
    -------
    None
        One ``"<column> : <unique values>"`` line is printed per matching column.
    """
    for col in df:
        dtype = df[col].dtype
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            print(f'{col} : {df[col].unique()}')

: 

In [None]:
print_unique_col_val(df)        # Lists the distinct values of each object-typed (text) column, i.e. the ones that still need encoding

: 

We need to label-encode Gender (it has two values) and one-hot encode Geography.

In [None]:
# Encode Gender as 0/1. The mapped column is assigned back to df: calling
# replace(..., inplace=True) on the df['Gender'] selection is a chained in-place
# operation, which pandas warns about and which leaves df unchanged under Copy-on-Write.
gender_codes = {'Male': 0, 'Female': 1}
gender_encoded = df['Gender'].map(gender_codes)
# Fail loudly on unexpected labels (or an already-encoded column) before touching df.
assert gender_encoded.notna().all(), 'Gender contains values other than Male/Female'
df['Gender'] = gender_encoded.astype(int)

: 

In [None]:
# Sanity check: list the distinct values now present in Gender
df['Gender'].unique()

: 

In [None]:
# One-hot encode Geography into one indicator column per country.
# dtype=int forces 0/1 integer columns; pandas >= 2.0 otherwise creates boolean
# dummies, which would leave a non-numeric dtype mixed into the feature matrix.
df = pd.get_dummies(data=df, columns=['Geography'], dtype=int)

: 

In [None]:
# Inspect five randomly chosen rows
df.sample(5)

: 

In [None]:
# Re-check the column dtypes
df.info()

: 

All data types are now numbers. There are no strings or objects.

### C) Scaling

We will use the min-max scaler to rescale the numeric columns listed below to the range 0 to 1, so that no feature dominates because of its magnitude.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns that get rescaled to the [0, 1] range.
cols_to_scale = [
    'CreditScore', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'EstimatedSalary',
]

# NOTE(review): the scaler is fitted on the whole frame, i.e. before the train/test
# split performed later in the notebook, so test-set minima/maxima influence the scaling.
scaler = MinMaxScaler()
scaler.fit(df[cols_to_scale])
df[cols_to_scale] = scaler.transform(df[cols_to_scale])

: 

In [None]:
# Inspect three randomly chosen rows
df.sample(3)

: 

## 2) Model Creation

In [None]:
# 1) Separate the target column from the feature columns

y = df['Exited']
X = df.drop(columns=['Exited'])

: 

In [None]:
# 2) Split

from sklearn.model_selection import train_test_split as tts

# 80/20 train/test split.
# random_state fixes the shuffle so the split (and every metric derived from it) is
# reproducible across runs; stratify=y keeps the share of Exited = 0/1 identical in
# the training and the test set.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42, stratify=y)

: 

In [None]:
# 3) Build and compile the network

import tensorflow as tf
from tensorflow import keras

# Derive the input width from the data instead of hard-coding 12, so the model keeps
# working if a feature column is added or removed during preprocessing.
n_features = X_train.shape[1]

model = keras.Sequential([                                                  # Container that stacks the layers in order
    keras.layers.Dense(10, input_shape=(n_features,), activation='relu'),   # 1. First hidden layer; input_shape declares the number of input features
    keras.layers.Dense(7, activation='relu'),                               # 2. Second hidden layer (optional); its input shape is inferred from the previous layer
    keras.layers.Dense(1, activation='sigmoid')                             # 3. Output layer; a single sigmoid unit because the target is 0 or 1
])

model.compile(optimizer='adam',               # Very commonly used optimizer
              loss='binary_crossentropy',     # Because our output is binary
              metrics=['accuracy'])           # Report accuracy during training

: 

In [None]:
# 4) Fit the compiled model on the training set for 100 epochs
model.fit(X_train, y_train, epochs=100)

: 

In [None]:
# 5) Predict

# The output layer is a single sigmoid unit, so each test row gets one value in [0, 1]
y_pred = model.predict(X_test)
y_pred

: 

Since the output layer uses a sigmoid function, the y_pred values lie between 0 and 1<br>But the y_test values are either 0 or 1

To compare the two, we convert the 2D y_pred array into a 1D list, replacing values up to 0.5 with 0 and values greater than 0.5 with 1

In [None]:
# Threshold the sigmoid outputs at 0.5 (strictly greater -> 1, otherwise 0) and
# flatten the result into a plain 1-D list of ints.
y_pred_binary = (y_pred > 0.5).astype(int).ravel().tolist()

y_pred_binary[:5]

: 

## 3) Accuracy

In [None]:
from sklearn.metrics import confusion_matrix, classification_report as clarep

# Text report of the per-class metrics for the thresholded predictions
report = clarep(y_test, y_pred_binary)
print(report)

: 

In [None]:
import seaborn as sn
import matplotlib.pyplot as plt

# Confusion matrix: rows are the actual labels, columns the predicted labels
cm = tf.math.confusion_matrix(labels=y_test, predictions=y_pred_binary)

fig, ax = plt.subplots(figsize=(10,7))
sn.heatmap(cm, annot=True, fmt='d', ax=ax)       # fmt='d' prints the cell counts as plain integers
ax.set_xlabel('Predicted')
ax.set_ylabel('Actuals')

: 

Sources<br>
https://www.youtube.com/watch?v=MSBY28IJ47U