# Neural Network Saturation with Sigmoid Activation
This notebook demonstrates how large initial weights in a neural network using sigmoid activation can cause training to stall due to saturation, and provides a fix.

In [None]:
import numpy as np

# XOR truth table: each row of X is one (a, b) input pair, and the row at the
# same index in y is the target a XOR b, kept as a column vector of shape (4, 1).
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

In [None]:
import matplotlib.pyplot as plt

def sigmoid(x):
    """Logistic sigmoid, 1 / (1 + exp(-x)), evaluated without overflow.

    The naive formula calls exp(-x), which overflows (and emits a
    RuntimeWarning) once x is a large negative number. Here the exponential
    is only ever taken of -|x|, which lies in (0, 1], and the algebraically
    equivalent form exp(x) / (1 + exp(x)) is selected for negative inputs.

    Parameters
    ----------
    x : scalar or array-like of numbers
        Input value(s).

    Returns
    -------
    NumPy scalar or ndarray
        Element-wise sigmoid with the same shape as ``x``; a scalar input
        yields a NumPy scalar rather than a 0-d array.
    """
    x = np.asarray(x)
    # exp(-|x|) can underflow to 0 but can never overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)).
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as arrays.
    return result[()]

def sigmoid_derivative(x):
    """Derivative of the sigmoid with respect to its input, evaluated at ``x``.

    Uses the identity sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)), so ``x`` is
    the pre-activation value, not an already-activated output.
    """
    activation = sigmoid(x)
    complement = 1 - activation
    return activation * complement

# Visualize sigmoid and its gradient
x_vals = np.linspace(-10, 10, 500)
sig_vals = sigmoid(x_vals)
grad_vals = sigmoid_derivative(x_vals)

# Use explicit Figure/Axes objects rather than pyplot's implicit "current
# axes" state, so each panel is configured through its own handle.
fig, (ax_sig, ax_grad) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: the activation itself.
ax_sig.plot(x_vals, sig_vals, label="sigmoid(x)", color="blue")
ax_sig.set(title="Sigmoid Function", xlabel="x", ylabel="sigmoid(x)")
ax_sig.grid(True)
ax_sig.legend()  # without this call the label= passed to plot() is never shown

# Right panel: its derivative.
ax_grad.plot(x_vals, grad_vals, label="sigmoid'(x)", color="red")
ax_grad.set(title="Sigmoid Gradient", xlabel="x", ylabel="sigmoid'(x)")
ax_grad.grid(True)
ax_grad.legend()

fig.tight_layout()
plt.show()

In [None]:
# Network with large initial weights — will stall
#
# A 2-4-1 sigmoid network on the XOR data. The standard-normal weights are
# scaled by 10, so the pre-activations Z1/Z2 start out large in magnitude and
# the sigmoid_derivative factors used in the backward pass start out small.

np.random.seed(0)
# Draw order matters for reproducibility: W1 first, then W2.
W1 = 10 * np.random.randn(2, 4)
W2 = 10 * np.random.randn(4, 1)
b1 = np.zeros(shape=(1, 4))
b2 = np.zeros(shape=(1, 1))

lr, epochs = 0.1, 5000

print("🚫 Training with saturated sigmoid (large weights):")
for epoch in range(epochs):
    # ---- forward pass ----
    Z1 = X @ W1 + b1
    A1 = sigmoid(Z1)
    Z2 = A1 @ W2 + b2
    A2 = sigmoid(Z2)

    # Mean squared error over every sample/output entry (reported only).
    loss = ((y - A2) ** 2).mean()

    # ---- backward pass: output layer ----
    dA2 = -(y - A2)
    dZ2 = sigmoid_derivative(Z2) * dA2
    db2 = dZ2.sum(axis=0, keepdims=True)
    dW2 = A1.T @ dZ2

    # ---- backward pass: hidden layer (uses W2 before it is updated) ----
    dA1 = dZ2 @ W2.T
    dZ1 = sigmoid_derivative(Z1) * dA1
    db1 = dZ1.sum(axis=0, keepdims=True)
    dW1 = X.T @ dZ1

    # ---- in-place gradient-descent step; all gradients are already fixed ----
    W2 -= lr * dW2
    b2 -= lr * db2
    W1 -= lr * dW1
    b1 -= lr * db1

    # Report the pre-update loss every 1000 epochs, starting at epoch 0.
    if not epoch % 1000:
        print(f"Epoch {epoch}, Loss: {loss:.6f}")

In [None]:
# ✅ Fixed version: small weight init + optional BCE loss
#
# Same 2-4-1 sigmoid network on the XOR data, but the standard-normal weights
# are scaled by 0.1 instead of 10 and the loss is binary cross-entropy (BCE).

np.random.seed(0)
W1 = np.random.randn(2, 4) * 0.1
b1 = np.zeros((1, 4))
W2 = np.random.randn(4, 1) * 0.1
b2 = np.zeros((1, 1))

lr = 0.1
epochs = 10000
eps = 1e-8  # keeps log() finite if A2 ever reaches exactly 0 or 1

print("\n✅ Training with small weights and BCE loss:")
for epoch in range(epochs):
    # Forward pass
    Z1 = X @ W1 + b1
    A1 = sigmoid(Z1)
    Z2 = A1 @ W2 + b2
    A2 = sigmoid(Z2)

    # Mean BCE over the samples (reported only; the gradients below are
    # summed over samples rather than averaged).
    loss = -np.mean(y * np.log(A2 + eps) + (1 - y) * np.log(1 - A2 + eps))

    # Backward pass, output layer.
    # With a sigmoid output and BCE loss the chain-rule product
    #   (-(y / A2) + (1 - y) / (1 - A2)) * A2 * (1 - A2)
    # simplifies exactly to A2 - y. Using the closed form avoids dividing by a
    # near-zero A2 or (1 - A2) and then multiplying by a near-zero sigmoid
    # derivative when the output saturates.
    dZ2 = A2 - y
    dW2 = A1.T @ dZ2
    db2 = np.sum(dZ2, axis=0, keepdims=True)

    # Backward pass, hidden layer (uses W2 before it is updated).
    dA1 = dZ2 @ W2.T
    dZ1 = dA1 * sigmoid_derivative(Z1)
    dW1 = X.T @ dZ1
    db1 = np.sum(dZ1, axis=0, keepdims=True)

    # In-place gradient-descent step.
    W1 -= lr * dW1
    b1 -= lr * db1
    W2 -= lr * dW2
    b2 -= lr * db2

    if epoch % 1000 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.6f}")

# The A2 left over from the loop was computed before the last weight update,
# so run one more forward pass to make the predictions reflect the final weights.
Z1 = X @ W1 + b1
A1 = sigmoid(Z1)
Z2 = A1 @ W2 + b2
A2 = sigmoid(Z2)

preds = A2.round()
print("\nPredictions:\n", preds)