In [3]:
import numpy as np
import pandas as pd

# Activation Functions
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

# XOR Data
X = np.array([[0,0],
              [0,1],
              [1,0],
              [1,1]])
y = np.array([[0],
              [1],
              [1],
              [0]])

# Hyperparameters
np.random.seed(42)
input_size, hidden_size, output_size = 2, 4, 1
learning_rate = 0.05
epochs = 10000
l2_lambda = 0.001
dropout_rate = 0.2
beta1, beta2 = 0.9, 0.999
epsilon = 1e-8
patience = 300

# Parameter Initialization
weights1 = np.random.randn(input_size, hidden_size)
bias1 = np.random.randn(1, hidden_size)
weights2 = np.random.randn(hidden_size, output_size)
bias2 = np.random.randn(1, output_size)

# Store initial parameters
initial_weights1 = weights1.copy()
initial_weights2 = weights2.copy()
initial_bias1 = bias1.copy()
initial_bias2 = bias2.copy()

# Adam optimizer variables
m_w1, v_w1 = np.zeros_like(weights1), np.zeros_like(weights1)
m_b1, v_b1 = np.zeros_like(bias1), np.zeros_like(bias1)
m_w2, v_w2 = np.zeros_like(weights2), np.zeros_like(weights2)
m_b2, v_b2 = np.zeros_like(bias2), np.zeros_like(bias2)

# Early stopping setup
best_loss = np.inf
no_improve = 0

# Training Loop
for epoch in range(1, epochs + 1):
    # Forward pass
    z1 = np.dot(X, weights1) + bias1
    a1 = sigmoid(z1)

    # Dropout
    dropout_mask = (np.random.rand(*a1.shape) > dropout_rate).astype(float)
    a1 *= dropout_mask
    a1 /= (1 - dropout_rate)

    z2 = np.dot(a1, weights2) + bias2
    output = sigmoid(z2)

    # Compute loss (MSE + L2 Regularization)
    loss = np.mean((y - output) ** 2) + l2_lambda * (np.sum(weights1 ** 2) + np.sum(weights2 ** 2))

    # Backpropagation
    d_output = (y - output) * sigmoid_derivative(output)
    d_weights2 = np.dot(a1.T, d_output) / len(X) + l2_lambda * weights2
    d_bias2 = np.mean(d_output, axis=0, keepdims=True)

    d_hidden = np.dot(d_output, weights2.T) * sigmoid_derivative(a1)
    d_weights1 = np.dot(X.T, d_hidden) / len(X) + l2_lambda * weights1
    d_bias1 = np.mean(d_hidden, axis=0, keepdims=True)

    # Adam Optimizer Update
    m_w1 = beta1 * m_w1 + (1 - beta1) * d_weights1
    v_w1 = beta2 * v_w1 + (1 - beta2) * (d_weights1 ** 2)
    m_b1 = beta1 * m_b1 + (1 - beta1) * d_bias1
    v_b1 = beta2 * v_b1 + (1 - beta2) * (d_bias1 ** 2)

    m_w2 = beta1 * m_w2 + (1 - beta1) * d_weights2
    v_w2 = beta2 * v_w2 + (1 - beta2) * (d_weights2 ** 2)
    m_b2 = beta1 * m_b2 + (1 - beta1) * d_bias2
    v_b2 = beta2 * v_b2 + (1 - beta2) * (d_bias2 ** 2)

    # Bias correction
    m_w1_hat = m_w1 / (1 - beta1 ** epoch)
    v_w1_hat = v_w1 / (1 - beta2 ** epoch)
    m_b1_hat = m_b1 / (1 - beta1 ** epoch)
    v_b1_hat = v_b1 / (1 - beta2 ** epoch)

    m_w2_hat = m_w2 / (1 - beta1 ** epoch)
    v_w2_hat = v_w2 / (1 - beta2 ** epoch)
    m_b2_hat = m_b2 / (1 - beta1 ** epoch)
    v_b2_hat = v_b2 / (1 - beta2 ** epoch)

    # Update parameters
    weights1 += learning_rate * m_w1_hat / (np.sqrt(v_w1_hat) + epsilon)
    bias1 += learning_rate * m_b1_hat / (np.sqrt(v_b1_hat) + epsilon)
    weights2 += learning_rate * m_w2_hat / (np.sqrt(v_w2_hat) + epsilon)
    bias2 += learning_rate * m_b2_hat / (np.sqrt(v_b2_hat) + epsilon)

    # Early stopping
    if loss < best_loss - 1e-6:
        best_loss = loss
        no_improve = 0
    else:
        no_improve += 1
    if no_improve > patience:
        print(f"\nEarly stopping at epoch {epoch}")
        break

# Final Evaluation
final_hidden = sigmoid(np.dot(X, weights1) + bias1)
final_output = sigmoid(np.dot(final_hidden, weights2) + bias2)
predicted = np.round(final_output)

# Display Results
print("\n==================== TRAINING SUMMARY ====================")
print(f"Total Epochs Used: {epoch}")
print(f"Final Loss: {best_loss:.6f}\n")

print("Input Data (X):\n", X)
print("\nTarget Output (y):\n", y)

print("\nInitial Weights Layer 1:\n", initial_weights1)
print("Initial Bias Layer 1:\n", initial_bias1)
print("\nInitial Weights Layer 2:\n", initial_weights2)
print("Initial Bias Layer 2:\n", initial_bias2)

print("\n-----------------------------------------------")
print("Final Weights Layer 1:\n", weights1)
print("Final Bias Layer 1:\n", bias1)
print("\nFinal Weights Layer 2:\n", weights2)
print("Final Bias Layer 2:\n", bias2)
print("-----------------------------------------------")

# Tabular Output
results = pd.DataFrame({
    "Input 1": X[:, 0],
    "Input 2": X[:, 1],
    "Target Output": y.flatten(),
    "Predicted Output": predicted.flatten()
})

print(results.to_string(index=False))
print("===============================================================")



Early stopping at epoch 319

Total Epochs Used: 319
Final Loss: 0.174733

Input Data (X):
 [[0 0]
 [0 1]
 [1 0]
 [1 1]]

Target Output (y):
 [[0]
 [1]
 [1]
 [0]]

Initial Weights Layer 1:
 [[ 0.49671415 -0.1382643   0.64768854  1.52302986]
 [-0.23415337 -0.23413696  1.57921282  0.76743473]]
Initial Bias Layer 1:
 [[-0.46947439  0.54256004 -0.46341769 -0.46572975]]

Initial Weights Layer 2:
 [[ 0.24196227]
 [-1.91328024]
 [-1.72491783]
 [-0.56228753]]
Initial Bias Layer 2:
 [[-1.01283112]]

-----------------------------------------------
Final Weights Layer 1:
 [[ 21.58247024 -20.56320003 -20.43346894  24.0117975 ]
 [ 17.57390747 -21.63234466  21.56489899  21.09123143]]
Final Bias Layer 1:
 [[-1.13156993  0.18133697  2.94857857 -1.99120494]]

Final Weights Layer 2:
 [[  0.64213963]
 [-14.71749449]
 [ -1.81830192]
 [  0.9841282 ]]
Final Bias Layer 2:
 [[0.42782053]]
-----------------------------------------------

--- XOR Gate Results (After Training with Adam + Regularization) ---
 Inp