# Loading Data from google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')
project_path = '/content/drive/My Drive/assignments/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


Import statements

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, auc
import matplotlib.pyplot as plt
from tensorflow.keras import optimizers
import seaborn as sn
import matplotlib.pyplot as plt

  import pandas.util.testing as tm


Load CSV data

In [3]:
dataset_file = project_path + 'bank.csv' # Full Path to the CSV file in the google drive

In [4]:
data = pd.read_csv(dataset_file)

Check if the data is loaded correctly.

In [5]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Drop the columns which are unique for all users like IDs

Drop the RowNumber and CustomerId and surname columns as they are the unique columns for each of the users.

In [6]:
data = data.drop(labels=['RowNumber', 'CustomerId', 'Surname'], axis=1)

In [7]:
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


There are 10000 entries and 11 columns and none of the data is null.

 # Distinguish the feature and target set

In [9]:
X_data = data.iloc[:, :-1] #Features

In [10]:
X_data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


Convert the categorical columns like Geography and Gender to multiple columns by encoding using the dummies.

In [11]:
#Get Dummies
X_data = pd.get_dummies(X_data, prefix_sep='_', drop_first=False)

In [12]:
X_data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,1,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,1,0
3,699,39,1,0.0,2,0,0,93826.63,1,0,0,1,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,1,0


In [13]:
X_data.shape

(10000, 13)

In [14]:
Y_data = data.iloc[:, data.columns.size -1:] #Target


In [15]:
Y_data.head()

Unnamed: 0,Exited
0,1
1,0
2,1
3,0
4,0


# Divide the data set into training and test sets

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size = 0.3, random_state = 4)

Convert the labels into categorical values

In [17]:
num_classes = 2 # number of classes is 2. Existed/ non-Exited (0/1)
y_train_class = tf.keras.utils.to_categorical(y_train, num_classes)   # Converting the target into categorical which is stored as numeric
y_test_class = tf.keras.utils.to_categorical(y_test, num_classes)  # Keras converst these into 1-hot coded vectors as these are lables 

# Normalize the train and test data

In [18]:
X_train = preprocessing.normalize(X_train) # Normalize train data

In [19]:
X_test = preprocessing.normalize(X_test) # Normalize test data

 # Initialize & build the model

In [20]:
# Initialize Sequential model
model = tf.keras.models.Sequential()

#Reshape data to 1D
model.add(tf.keras.layers.Reshape((13,),input_shape=(13,)))
model.add(tf.keras.layers.Dense(10, activation="relu"))
model.add(tf.keras.layers.BatchNormalization())
# Add and output layer
model.add(tf.keras.layers.Dense(2, activation="sigmoid"))

In [21]:
# Compile the model - add mse as loss and stochastic gradient descent as optimizer
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [22]:
model.fit(X_train, y_train_class, batch_size = 50, epochs = 10, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fee70444f98>

In [23]:
results = model.evaluate(X_test, y_test_class)



### 7. Predict the results using 0.5 as a threshold

In [24]:
Y_pred = model.predict(X_test, batch_size=30, verbose=0)

In [25]:
Y_pred

array([[0.75257576, 0.28844917],
       [0.86590743, 0.14856084],
       [0.7608329 , 0.30976593],
       ...,
       [0.7599057 , 0.30579633],
       [0.7611685 , 0.31122455],
       [0.8659465 , 0.14840092]], dtype=float32)

### 2. Print the Accuracy score and confusion matrix

In [26]:
print(model.metrics_names)
print(results)    

['loss', 'accuracy']
[0.49087414145469666, 0.8053333163261414]


Confusion Matrix

In [27]:
conf = confusion_matrix(y_test_class.argmax(axis=1), Y_pred.argmax(axis=1))

In [28]:
# Confusion Matrix
conf

array([[2416,    0],
       [ 584,    0]])