# Importing the data

In [9]:
# First we import the library we will need
import numpy as np
import pandas as pd
#from google.colab import drive
import tensorflow as tf
#import torch
#import torch.nn as nn
from sklearn.metrics import confusion_matrix

In [12]:
# Define the file path and import the datas in a pandas dataframe named df
path ='/home/splatch58/DSTI/ML_PythonLabs/DOCKER_PYTHON/python_code/Churn_Modelling.csv'

In [13]:
df = pd.read_csv(path)
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Analysing the dataset

In [6]:
df.shape

(10000, 14)

In [7]:
df[df['Exited']==1]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
16,17,15737452,Romeo,653,Germany,Male,58,1,132602.88,1,1,0,5097.67,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9981,9982,15672754,Burbidge,498,Germany,Male,42,3,152039.70,1,1,1,53445.17,1
9982,9983,15768163,Griffin,655,Germany,Female,46,7,137145.12,1,1,0,115146.40,1
9991,9992,15769959,Ajuluchukwu,597,France,Female,53,4,88381.21,1,1,0,69384.71,1
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1


we have 10 000 samples with 13 features (Rownumber is not considered as a usable feature)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


There is 3 categorical features (to be transformed later) and the remaining data are numerical features.
Let's now check if there is some missing values.

In [8]:
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

Let's check the number of unique value.

In [9]:
df.nunique()

RowNumber          10000
CustomerId         10000
Surname             2932
CreditScore          460
Geography              3
Gender                 2
Age                   70
Tenure                11
Balance             6382
NumOfProducts          4
HasCrCard              2
IsActiveMember         2
EstimatedSalary     9999
Exited                 2
dtype: int64

We map the gender columns as a binary value because we can't use an object/categorical data as an input for the model.
We also one-hot encode the Geography feature so that we can use these categorical features in a model.

We then remove the unneeded columns ('RowNumber','CustomerId','Surname': because they are not usabel for the model, 'Exited' because it is the label (Y)  and 'Geography' because we just one-hot encoded it) and create the X dataset and the label dataset (Y) (we are working on a supervised model).

In [10]:
df['Gender'] = df['Gender'].map({'Female': 0, 'Male':1}) # mapping gender
df = pd.concat([df,pd.get_dummies(df['Geography'], prefix = 'country',
              drop_first= True)], axis = 1) # One-hot encoding Geography feature

df_X = df.drop(['Geography','Exited','RowNumber','CustomerId','Surname'],
            axis = 1)
X = df_X.values # creating X dataset
Y = df['Exited'].values # creating Y label

In [11]:
print(X.shape, Y.shape, type(X), type(Y))

(10000, 11) (10000,) <class 'numpy.ndarray'> <class 'numpy.ndarray'>


# Modelling
First we split the datas in train / test  dataset

In [12]:
#split the data into training and test set (20% for the test set)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state = 10)
# we use random_state to make sure spliting contains the same data each time
# (the train/test dataset will have the same rows each time)

In [13]:
Y_train

array([0, 0, 0, ..., 1, 0, 1])

In [14]:
# standardize the data (x_standardised = (x - x_mean)/std_dev)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test) # we use the scale set calculated from the
# training set just above and we apply it to transform the test set

Let's try ANN with tensorflow

In [None]:
model = tf.keras.models.Sequential()

#add input layer  and first hidden layer
model.add(tf.keras.layers.Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu')) #initializer=uniform means to all the weights will initialized with the same value

#xxx.Dense means that every node is connected with the nodes next to himself

#add 2nd hidden layer
model.add(tf.keras.layers.Dense(units = 6, kernel_initializer='uniform', activation = 'relu'))

# Add output layer
model.add(tf.keras.layers.Dense(units = 1, kernel_initializer='uniform', activation='sigmoid')) # sigmoid for binary, Softmax for multiclass

# compilation
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Training
history = model.fit(X_train, Y_train, batch_size  = 10, epochs = 100, verbose = 2)

Epoch 1/100
800/800 - 4s - loss: 0.4890 - accuracy: 0.7977
Epoch 2/100
800/800 - 4s - loss: 0.4234 - accuracy: 0.7981
Epoch 3/100
800/800 - 4s - loss: 0.4165 - accuracy: 0.8166
Epoch 4/100
800/800 - 4s - loss: 0.4104 - accuracy: 0.8322
Epoch 5/100
800/800 - 4s - loss: 0.4054 - accuracy: 0.8356
Epoch 6/100
800/800 - 6s - loss: 0.4019 - accuracy: 0.8375
Epoch 7/100
800/800 - 6s - loss: 0.3993 - accuracy: 0.8375
Epoch 8/100
800/800 - 8s - loss: 0.3980 - accuracy: 0.8380
Epoch 9/100
800/800 - 6s - loss: 0.3967 - accuracy: 0.8395
Epoch 10/100
800/800 - 3s - loss: 0.3956 - accuracy: 0.8394
Epoch 11/100
800/800 - 4s - loss: 0.3948 - accuracy: 0.8385
Epoch 12/100
800/800 - 4s - loss: 0.3942 - accuracy: 0.8380
Epoch 13/100
800/800 - 6s - loss: 0.3933 - accuracy: 0.8395
Epoch 14/100
800/800 - 5s - loss: 0.3928 - accuracy: 0.8413
Epoch 15/100
800/800 - 5s - loss: 0.3922 - accuracy: 0.8389
Epoch 16/100
800/800 - 6s - loss: 0.3924 - accuracy: 0.8389
Epoch 17/100
800/800 - 4s - loss: 0.3915 - accura

In [None]:
y_ann_pred = model.predict(X_test)
y_ann_pred = (y_ann_pred > 0.5)
cm_ann = confusion_matrix(y_ann_pred, Y_test)

accuracy_ann = round(model.evaluate(X_test, Y_test, verbose = False)[1],2)

print(cm_ann)
print(accuracy_ann)

[[1495  215]
 [  83  207]]
0.85


We will use pytorch so we transform the train/test dataset and labels as tensor.

In [14]:
X_train = torch.from_numpy(X_train.astype(np.float32))
Y_train = torch.from_numpy(Y_train.astype(np.float32))

X_test = torch.from_numpy(X_test.astype(np.float32))
Y_test = torch.from_numpy(Y_test.astype(np.float32))

In [15]:
# Here we flatten the labels in order to get
# the right input dimension in the model
Y_train = Y_train.view(-1,1)
Y_test = Y_test.view(-1,1)

In [16]:
D_in = X_train.shape[1] # This is the number of features of our X dataset
H1 = 15
H2 = 6
H3 = 4
D_out = 1 # This is our binary output

# Defining the model: we choose here a two hidden layers with 15 and 6 neurons
# with ReLU activation functions for the hidden layers and sigmoid function for
# the output layer which is ideal for binary output

model = nn.Sequential(
    nn.Linear(D_in, H1),
    nn.ReLU(),
    nn.Linear(H1, H2),
    nn.ReLU(),
    nn.Linear(H2, H3),
    nn.ReLU(),
    nn.Linear(H3, D_out),
    nn.Sigmoid()
)

# Defining the loss function: we choose the binary cross entropy which is ideal
# for binary classification problem like ours
loss_fn = nn.BCELoss()

# Defining the optimizer: ADAM optimizer is chosen here
learning_rate = 1e-2
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [None]:
batch_size = X_train.shape[0]
N_iter = 3 * batch_size
log_interval = 200

# Training the model
iter = 0

for iter in range(N_iter):
  y_pred = model(X_train.float())
  loss = loss_fn(input = y_pred, target = Y_train)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  epoch = 0
  if iter % batch_size == 0:
    epoch = epoch + 1

    y_pred = model(X_test)>0.5
    cm = confusion_matrix(y_pred, Y_test)
    accuracy = round((cm[0,0] + cm[1,1])/ len(Y_test),2)

    print(f'\nEpoch {int(iter / batch_size + 1)} \t Loss: {round(loss.item(), 4)} \t Accuracy: {accuracy * 100} %')



Let's try logistic regression

In [18]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression() # create the model
logisticRegr.fit(X_train, Y_train.reshape(8000)) # train the model

accuracy_log = logisticRegr.score(X_test, Y_test)

print(f'Logistic regression accuracy: {accuracy_log}')

Logistic regression accuracy: 0.7995


In [30]:
y_log_pred = logisticRegr.predict(X_test)
cm_logistic = confusion_matrix(y_log_pred, Y_test)
cm_logistic

array([[1515,  338],
       [  63,   84]])

This model is less accurate than the ANN.

In [32]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier() # create the model
knn.fit(X_train, Y_train.reshape(8000)) # train the model

accuracy_knn = knn.score(X_test, Y_test)
print(f'KNN accuracy: {accuracy_knn}')


y_knn_pred = knn.predict(X_test)
cm_knn = confusion_matrix(y_knn_pred, Y_test)
print(cm_knn)


KNN accuracy: 0.8205
[[1490  271]
 [  88  151]]


Let's try a decision tree, naive bayes, support vector machine

In [20]:
new_customer1 = [[619, 0, 42, 2, 0, 1,1,1, 101348, 0, 0]]
new_customer1 = sc.transform(new_customer1)
new_customer1 = torch.from_numpy(new_customer1.astype(np.float32))

new_customer2 = [[824, 1, 49, 8, 133231, 1,1,1, 67885, 1, 0]]
new_customer2 = sc.transform(new_customer2)
new_customer2 = torch.from_numpy(new_customer2.astype(np.float32))


In [21]:
print(model(new_customer1), model(new_customer2))

tensor([[0.2776]], grad_fn=<SigmoidBackward0>) tensor([[0.4036]], grad_fn=<SigmoidBackward0>)


In [22]:
y_pred

tensor([[0.4086],
        [0.0631],
        [0.0084],
        ...,
        [0.1156],
        [0.1226],
        [0.3905]], grad_fn=<SigmoidBackward0>)

We finally choose the ANN model because it gives more accurate predictions

In [23]:
print(X_test[10],Y_test[10])

tensor([ 1.4487,  0.9126, -0.2827,  0.3461,  0.5570, -0.9187,  0.6401,  0.9753,
         1.5626, -0.5766, -0.5739]) tensor([0.])


In [24]:
len(Y_test[Y_test==1])

422

In [25]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,country_Germany,country_Spain
0,1,15634602,Hargrave,619,France,0,42,2,0.0,1,1,1,101348.88,1,0,0
1,2,15647311,Hill,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0,0,1
2,3,15619304,Onio,502,France,0,42,8,159660.8,3,1,0,113931.57,1,0,0
3,4,15701354,Boni,699,France,0,39,1,0.0,2,0,0,93826.63,0,0,0
4,5,15737888,Mitchell,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0,0,1
