In [22]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F

# Load dataset (update path if needed)
df = pd.read_csv("/kaggle/input/churn-modelling/Churn_Modelling.csv")

df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [23]:
# Drop non-informative columns
df = df.drop(["RowNumber", "CustomerId", "Surname"], axis=1)

# Encode Gender
le = LabelEncoder()
df["Gender"] = le.fit_transform(df["Gender"])

# One-hot encode Geography
df = pd.get_dummies(df, columns=["Geography"], drop_first=True)

df.head()


Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,False,False
1,608,0,41,1,83807.86,1,0,1,112542.58,0,False,True
2,502,0,42,8,159660.8,3,1,0,113931.57,1,False,False
3,699,0,39,1,0.0,2,0,0,93826.63,0,False,False
4,850,0,43,2,125510.82,1,1,1,79084.1,0,False,True


In [24]:
X = df.drop("Exited", axis=1).values
y = df["Exited"].values

print("Feature shape:", X.shape)
print("Target shape:", y.shape)


Feature shape: (10000, 11)
Target shape: (10000,)


In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [26]:
# StandardScaler is applied to normalize all input features.
# Neural networks perform better when features are on a similar scale,
# as this improves gradient descent convergence and training stability.
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [27]:
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test  = torch.tensor(X_test, dtype=torch.float32)

y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
y_test  = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

In [28]:
class ChurnANN(nn.Module):
    def __init__(self):
        super(ChurnANN, self).__init__()
        
        # First hidden layer with 16 neurons.
        # 16 neurons provide sufficient capacity to capture
        # non-linear relationships without causing overfitting.
        self.fc1 = nn.Linear(11, 16)
        
        # Second hidden layer with 16 neurons.
        # Using two hidden layers improves model expressiveness
        # compared to a single-layer network.
        self.fc2 = nn.Linear(16, 16)
        
        # Output layer with 1 neuron for binary classification.
        # Sigmoid activation will be applied to produce probabilities.
        self.out = nn.Linear(16, 1)
        
    def forward(self, x):
        # ReLU activation is used to avoid vanishing gradients
        # and enable faster convergence.
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        
        # Sigmoid activation converts output into probability (0â€“1)
        x = torch.sigmoid(self.out(x))
        return x

In [29]:
model = ChurnANN()

# Binary Cross Entropy Loss is chosen because the task
# involves binary classification (Exited: 0 or 1).
criterion = nn.BCELoss()

# Adam optimizer is used for faster and more stable convergence.
# Learning rate of 0.001 provides a good balance between
# training speed and stability.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.001
)

In [30]:
# The model is trained for 100 epochs.
# Fewer epochs caused underfitting, while more epochs
# showed minimal improvement and risked overfitting.
epochs = 100

for epoch in range(epochs):
    
    # Forward pass: compute predictions
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    
    # Backward pass: compute gradients
    optimizer.zero_grad()
    loss.backward()
    
    # Update model parameters
    optimizer.step()
    
    # Print loss every 10 epochs to monitor training progress
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

Epoch [10/100], Loss: 0.6900
Epoch [20/100], Loss: 0.6751
Epoch [30/100], Loss: 0.6581
Epoch [40/100], Loss: 0.6383
Epoch [50/100], Loss: 0.6159
Epoch [60/100], Loss: 0.5917
Epoch [70/100], Loss: 0.5665
Epoch [80/100], Loss: 0.5418
Epoch [90/100], Loss: 0.5190
Epoch [100/100], Loss: 0.4996


In [31]:
with torch.no_grad():
    # Predictions on test data
    y_pred = model(X_test)
    
    # Convert probabilities to class labels using 0.5 threshold
    y_pred_class = (y_pred >= 0.5).float()
    
    # Accuracy calculation
    accuracy = (y_pred_class == y_test).sum() / y_test.shape[0]
    
    print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.8035
