In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/kaggle/input/ann-deep-learning/Churn_Modelling.csv')  
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# dropping columns which do not help the neural network learn anything, because they might not contain meaningful paterns for churning
df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

# we find x, split features, and y, label, which is basically y being the target  (which we want to predict) and x being all the other variables except y
X = df.drop('Exited', axis=1)
y = df['Exited']

In [4]:
# now we perform label encoding on gender, as it has only two values to evaluate (it basically converts the text to number)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X['Gender'] = le.fit_transform(X['Gender'])

# one hot encoding on geography since it has 3 values and label encoding might give false learnings to the network. in one hot encoding the 3 values are split into 3 columns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('geo', OneHotEncoder(drop='first'), ['Geography'])], remainder='passthrough')
X = ct.fit_transform(X)

# drop first is used because we dont need 3 columns, only 2 columns are enough to tell which person leaves where (00->Germany, 10->France, 01->Spain)
# X has now become a numpy matrix

In [5]:
# we scale the values so that its easier and faster for the network to learn, otherwise it confuses the gradient descent algo
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)
# x_scaled = (x-mean)/std
X

array([[-0.57873591, -0.57380915, -0.32622142, ...,  0.64609167,
         0.97024255,  0.02188649],
       [-0.57873591,  1.74273971, -0.44003595, ..., -1.54776799,
         0.97024255,  0.21653375],
       [-0.57873591, -0.57380915, -1.53679418, ...,  0.64609167,
        -1.03067011,  0.2406869 ],
       ...,
       [-0.57873591, -0.57380915,  0.60498839, ..., -1.54776799,
         0.97024255, -1.00864308],
       [ 1.72790383, -0.57380915,  1.25683526, ...,  0.64609167,
        -1.03067011, -0.12523071],
       [-0.57873591, -0.57380915,  1.46377078, ...,  0.64609167,
        -1.03067011, -1.07636976]])

In [6]:
# we will split our data into 80% training and 20% testing. random_state ensures same splitting on every run. train_test_split shuffles the data and randomly mixes them
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# now we convert our data to be feasible for pytorch understanding. so we convert them into tensors of correct datatypes
import torch
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32)

In [8]:
# since ANN training is done in mini-batches we use a DataLoader for faster training
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test,y_test)
# corresponds X to y

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# shuffle on training data but not on testing data

In [9]:
# the input layer has 11 neurons(11 columns). our structure will have 64 neurons and 32 neurons in hidden layers and 1 output
import torch.nn as nn

class ANNModel(nn.Module):
    def __init__(self, input_dim):
        super(ANNModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.out = nn.Linear(64, 1)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    def forward(self,x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.out(x))
        return x
# x -> fc1 -> relu -> fc2 -> relu -> out -> sigmoid
input_size = X_train.shape[1]
model = ANNModel(input_size)

In [10]:
#error calculation and backprop
import torch.optim as optim

criterion = nn.BCELoss()
# learning rate means how much step should be taken for each weight
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [11]:
# data shape was (8000) when it should be (8000,1)
y_train = y_train.unsqueeze(1)
y_test = y_test.unsqueeze(1)

In [12]:
epochs = 100
for epoch in range(epochs):
    model.train()

    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if (epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

Epoch [10/100], Loss: 0.5800
Epoch [20/100], Loss: 0.5144
Epoch [30/100], Loss: 0.4840
Epoch [40/100], Loss: 0.4567
Epoch [50/100], Loss: 0.4382
Epoch [60/100], Loss: 0.4255
Epoch [70/100], Loss: 0.4173
Epoch [80/100], Loss: 0.4107
Epoch [90/100], Loss: 0.4039
Epoch [100/100], Loss: 0.3965


In [13]:
model.eval()

with torch.no_grad():
    y_pred_test = model(X_test)

# if probability is more than 0.5 then y_pred_class will be 1
y_pred_class = (y_pred_test >=0.5).float()

In [14]:
# calculating accuracy
correct = (y_pred_class == y_test).sum().item()
total = y_test.shape[0]

acc = correct/total
print(f"Accuracy is {acc*100:.2f}%")

Accuracy is 83.40%


In [15]:
# confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_test.numpy(), y_pred_class.numpy())
print(cm)
print(classification_report(y_test.numpy(), y_pred_class.numpy()))

[[1548   59]
 [ 273  120]]
              precision    recall  f1-score   support

         0.0       0.85      0.96      0.90      1607
         1.0       0.67      0.31      0.42       393

    accuracy                           0.83      2000
   macro avg       0.76      0.63      0.66      2000
weighted avg       0.81      0.83      0.81      2000

