In [3]:
import numpy as np

def tanh(x):
    """Apply the hyperbolic tangent element-wise.

    Args:
        x: Scalar or array of pre-activation values.

    Returns:
        ``np.tanh(x)``, shaped like ``x``.
    """
    squashed = np.tanh(x)
    return squashed

def tanh_derivative(y):
    """Derivative of tanh expressed through its output.

    Args:
        y: Value(s) already produced by ``tanh`` (not the pre-activation).

    Returns:
        ``1 - y**2``, element-wise.
    """
    squared = y ** 2
    return 1 - squared

def sigmoid(x):
    """Logistic sigmoid, applied element-wise.

    The input is clipped to [-50, 50] before exponentiation so that
    ``np.exp`` cannot overflow for large-magnitude inputs. This is the same
    guard the later cells of this notebook use. Inside the clip range the
    result is unchanged.

    Args:
        x: Scalar or array of pre-activation values.

    Returns:
        ``1 / (1 + exp(-x))`` evaluated on the clipped input.
    """
    x = np.clip(x, -50, 50)  # prevent overflow in exp
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(y):
    """Derivative of the sigmoid expressed through its output.

    Args:
        y: Value(s) already produced by ``sigmoid`` (not the pre-activation).

    Returns:
        ``y * (1 - y)``, element-wise.
    """
    complement = 1 - y
    return y * complement

# --- Training setup ---
# One training sample with four input features and a single target value.
# float64 matches the dtype used by the later cells of this notebook.
X = np.array([[1, 1, 0, 1]], dtype=np.float64)
y = np.array([[1]], dtype=np.float64)

lr = 0.74
tolerance = 0.001
max_epochs = 10000  # safety cap

# Seeded generator: the initial weights are identical on every
# Restart & Run All, so the printed epoch count is reproducible.
rng = np.random.default_rng(42)
W1 = rng.normal(0, 1, (4, 3))
b1 = np.zeros((1, 3))

W2 = rng.normal(0, 1, (3, 2))
b2 = np.zeros((1, 2))

W3 = rng.normal(0, 1, (2, 1))
b3 = np.zeros((1, 1))

# --- Training loop: 4 -> 3 (tanh) -> 2 (tanh) -> 1 (sigmoid) ---
epoch = 0
while epoch < max_epochs:
    # Forward pass
    z1 = np.dot(X, W1) + b1
    h1 = tanh(z1)

    z2 = np.dot(h1, W2) + b2
    h2 = tanh(z2)

    z3 = np.dot(h2, W3) + b3
    o = sigmoid(z3)

    # Error (desired - actual)
    error = y - o

    # Backpropagation. Every delta is computed before any weight is touched,
    # so each one uses the weights that produced this forward pass.
    delta_out = error * sigmoid_derivative(o)
    delta_h2 = delta_out.dot(W3.T) * tanh_derivative(h2)
    delta_h1 = delta_h2.dot(W2.T) * tanh_derivative(h1)

    # Update weights and biases. The sign is "+" because the deltas are built
    # from (y - o) rather than (o - y).
    W3 += lr * h2.T.dot(delta_out)
    b3 += lr * delta_out

    W2 += lr * h1.T.dot(delta_h2)
    b2 += lr * delta_h2

    W1 += lr * X.T.dot(delta_h1)
    b1 += lr * delta_h1

    # Check convergence. `o` and `error` come from the forward pass above,
    # while the printed weights already include this epoch's update.
    if abs(error[0][0]) < tolerance:
        print("Converged at epoch:", epoch)
        print("Output (O):", o)
        print("Error (D - O):", error)
        print("Updated W1:\n", W1)
        print("Updated b1:\n", b1)
        print("Updated W2:\n", W2)
        print("Updated b2:\n", b2)
        break

    epoch += 1
else:
    # The loop ran out of epochs without a break: report instead of ending silently.
    print("Did not converge within max_epochs.")
    print("Last Output (O):", o)
    print("Last Error (D - O):", error)


In [5]:
import numpy as np

# --- Activation functions and derivatives ---
def sigmoid(x):
    """Logistic sigmoid with the input bounded to [-50, 50].

    Args:
        x: Scalar or array of pre-activation values.

    Returns:
        ``1 / (1 + exp(-x))`` evaluated on the clipped input.
    """
    bounded = np.clip(x, -50, 50)  # keeps exp() from overflowing
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def sigmoid_derivative(y):
    """Sigmoid slope computed from the sigmoid's output ``y``.

    Returns:
        ``y * (1 - y)``, element-wise.
    """
    one_minus_y = 1 - y
    return y * one_minus_y

def tanh(x):
    """Hyperbolic tangent activation; a thin wrapper around ``np.tanh``."""
    activated = np.tanh(x)
    return activated

def tanh_derivative(y):
    """tanh slope computed from the tanh output ``y``.

    Returns:
        ``1 - y**2``, element-wise.
    """
    y_squared = y ** 2
    return 1 - y_squared

def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified

def relu_derivative(y):
    """ReLU slope as a float mask.

    Args:
        y: Array to test; entries strictly greater than 0 map to 1.0,
            all others (including exactly 0) map to 0.0.

    Returns:
        Float array with the same shape as ``y``.
    """
    is_positive = y > 0
    return is_positive.astype(float)

# Lookup table: activation name -> (activation function, derivative function).
# Each derivative listed here takes the activation's OUTPUT as its argument.
activations = dict(
    sigmoid=(sigmoid, sigmoid_derivative),
    tanh=(tanh, tanh_derivative),
    relu=(relu, relu_derivative),
)

# --- Training setup ---
# Single sample with four features and a scalar target of 1.
X = np.array([[1, 1, 0, 1]], dtype=np.float64)
y = np.array([[1]], dtype=np.float64)

lr = 0.05
tolerance = 1e-3
max_epochs = 10000

# All weights come from one seeded generator, drawn in the order W1, W2, W3.
# Biases start at zero.
rng = np.random.default_rng(42)
W1 = rng.normal(loc=0, scale=1, size=(4, 3))
W2 = rng.normal(loc=0, scale=1, size=(3, 2))
W3 = rng.normal(loc=0, scale=1, size=(2, 1))

b1 = np.zeros(shape=(1, 3))
b2 = np.zeros(shape=(1, 2))
b3 = np.zeros(shape=(1, 1))

# Activation / derivative pair for each layer
act1, d_act1 = activations["tanh"]
act2, d_act2 = activations["relu"]
act3, d_act3 = activations["tanh"]  # output layer

# --- Training loop ---
epoch = 0
while epoch < max_epochs:
    # Forward pass through the three layers
    z1 = np.dot(X, W1) + b1
    h1 = act1(z1)

    z2 = np.dot(h1, W2) + b2
    h2 = act2(z2)

    z3 = np.dot(h2, W3) + b3
    o = act3(z3)

    # Error and loss (half mean squared error)
    error = y - o
    mse = 0.5 * np.mean(error**2)

    # Backpropagation: every delta is computed before any weight changes
    delta_out = (o - y) * d_act3(o)
    delta_h2 = np.dot(delta_out, W3.T) * d_act2(h2)
    delta_h1 = np.dot(delta_h2, W2.T) * d_act1(h1)

    # Gradient-descent step, input side first. The deltas are already fixed,
    # so the order of these updates does not affect the result.
    W1 -= lr * np.dot(X.T, delta_h1)
    b1 -= lr * delta_h1

    W2 -= lr * np.dot(h1.T, delta_h2)
    b2 -= lr * delta_h2

    W3 -= lr * np.dot(h2.T, delta_out)
    b3 -= lr * delta_out

    # Convergence check (uses the loss from this epoch's forward pass)
    if mse < tolerance:
        print("Converged at epoch:", epoch)
        print("Output (O):", o)
        print("MSE:", mse)
        break

    epoch += 1

# Reached only when the epoch budget ran out with the loss still at or above tolerance.
if epoch == max_epochs and mse >= tolerance:
    print("Did not converge within max_epochs.")
    print("Last Output (O):", o)
    print("Final MSE:", mse)


Converged at epoch: 194
Output (O): [[0.9554197]]
MSE: 0.0009937016008367034


In [7]:
import numpy as np
def sigmoid(x):
    """Element-wise logistic function on an input clipped to [-50, 50].

    The clip keeps ``np.exp`` within floating-point range.
    """
    clipped = np.clip(x, -50, 50)
    exp_neg = np.exp(-clipped)
    return 1 / (1 + exp_neg)
def sigmoid_derivative(y):
    """Return ``y * (1 - y)``, the sigmoid slope in terms of its output ``y``."""
    remainder = 1 - y
    return y * remainder
def tanh(x):
    """Element-wise hyperbolic tangent of ``x`` (delegates to ``np.tanh``)."""
    result = np.tanh(x)
    return result
def tanh_derivative(y):
    """Return ``1 - y**2``, the tanh slope in terms of its output ``y``."""
    output_sq = y ** 2
    return 1 - output_sq
def relu(x):
    """ReLU activation: negative entries become 0, the rest pass through."""
    clamped = np.maximum(0, x)
    return clamped
def relu_derivative(y):
    """Return 1.0 where ``y > 0`` and 0.0 elsewhere, as a float array."""
    active = y > 0
    return active.astype(float)
def leaky_relu(x, alpha=0.01):
    """Leaky ReLU activation.

    Args:
        x: Array of pre-activation values.
        alpha: Factor applied to entries that are not strictly positive.

    Returns:
        ``x`` where ``x > 0`` and ``alpha * x`` elsewhere.
    """
    leaked = alpha * x
    return np.where(x > 0, x, leaked)
def leaky_relu_derivative(y, alpha=0.01):
    """Leaky ReLU slope.

    Args:
        y: Array to test against zero.
        alpha: Slope returned for entries that are not strictly positive.

    Returns:
        Array holding 1 where ``y > 0`` and ``alpha`` elsewhere.
    """
    on_positive_side = y > 0
    return np.where(on_positive_side, 1, alpha)
def swish(x):
    """Swish activation: the input multiplied by its own sigmoid."""
    gate = sigmoid(x)
    return x * gate
def swish_derivative(x):
    """Derivative of swish with respect to its input.

    Args:
        x: The PRE-activation value (the argument that was passed to
            ``swish``), not the swish output.

    Returns:
        ``s + x * s * (1 - s)`` where ``s = sigmoid(x)``.

    NOTE(review): the other ``*_derivative`` helpers in this cell take the
    activation output, and the training loop in this cell calls ``d_act*``
    with layer outputs (``h1``, ``h2``, ``o``). Selecting "swish" there
    would feed this function the wrong quantity. Confirm before using it.
    """
    gate = sigmoid(x)
    gate_slope_term = x * gate * (1 - gate)
    return gate + gate_slope_term
# Registry of selectable activations: name -> (function, derivative).
activations = dict(
    sigmoid=(sigmoid, sigmoid_derivative),
    tanh=(tanh, tanh_derivative),
    relu=(relu, relu_derivative),
    leakyrelu=(leaky_relu, leaky_relu_derivative),
    swish=(swish, swish_derivative),
)
# --- Training setup ---
# One training sample (four features) and its scalar target.
X = np.array([[1, 1, 0, 1]], dtype=np.float64)
y = np.array([[1]], dtype=np.float64)
lr = 0.74
tolerance = 1e-3
max_epochs = 10000  # safety cap so the loop ends even if the loss never reaches tolerance
rng = np.random.default_rng(42)
W1 = rng.normal(0, 1, (4, 3))
b1 = np.zeros((1, 3))
W2 = rng.normal(0, 1, (3, 2))
b2 = np.zeros((1, 2))
W3 = rng.normal(0, 1, (2, 1))
b3 = np.zeros((1, 1))
# Activation / derivative pair per layer (the third pair is the output layer).
act1, d_act1 = activations["relu"]
act2, d_act2 = activations["relu"]
act3, d_act3 = activations["sigmoid"]
# --- Training loop ---
epoch = 0
while epoch < max_epochs:
    # Forward pass
    z1 = X.dot(W1) + b1
    h1 = act1(z1)
    z2 = h1.dot(W2) + b2
    h2 = act2(z2)
    z3 = h2.dot(W3) + b3
    o = act3(z3)
    # Error and loss (half mean squared error)
    error = y - o
    mse = 0.5 * np.mean(error**2)
    # Backpropagation: all deltas use the weights from this forward pass.
    delta_out = (o - y) * d_act3(o)
    delta_h2 = delta_out.dot(W3.T) * d_act2(h2)
    delta_h1 = delta_h2.dot(W2.T) * d_act1(h1)
    # Gradient-descent step
    W3 -= lr * h2.T.dot(delta_out)
    b3 -= lr * delta_out
    W2 -= lr * h1.T.dot(delta_h2)
    b2 -= lr * delta_h2
    W1 -= lr * X.T.dot(delta_h1)
    b1 -= lr * delta_h1
    # Convergence check: `o` and `mse` are from the forward pass above, while
    # the printed weights already include this epoch's update.
    if mse < tolerance:
        print("Converged at epoch:", epoch)
        print("Output (O):", o)
        print("E:", mse)
        print("Updated W1:\n", W1)
        print("Updated b1:\n", b1)
        print("Updated W2:\n", W2)
        print("Updated b2:\n", b2)
        break

    epoch += 1
else:
    # The loop ran out of epochs without a break: report instead of spinning forever.
    print("Did not converge within max_epochs.")
    print("Last Output (O):", o)
    print("Final E:", mse)


Converged at epoch: 38
Output (O): [[0.95530503]]
E: 0.0009988201092671605
Updated W1:
 [[ 0.40040796 -1.03998411  1.04724543]
 [ 1.0362556  -1.95103519 -1.00538527]
 [ 0.1278404  -0.31624259 -0.01680116]
 [-0.75735305  0.87939797  1.07458617]]
Updated b1:
 [[0.09569088 0.         0.29679424]]
Updated W2:
 [[ 0.35528704  1.12400792]
 [ 0.46750934 -0.85929246]
 [ 0.77805364 -0.96139341]]
Updated b2:
 [[ 0.57116743 -0.00817777]]


In [5]:
import numpy as np
def relu(x):
    """Rectifier: element-wise ``max(0, x)``."""
    non_negative = np.maximum(0, x)
    return non_negative
def relu_derivative(y):
    """Float indicator of the strictly positive entries of ``y`` (1.0 or 0.0)."""
    gt_zero = y > 0
    return gt_zero.astype(float)

def softmax(x):
    """Row-wise softmax of a 2-D array.

    Each row's maximum is subtracted before exponentiating, which keeps
    ``np.exp`` from overflowing on large inputs.

    Args:
        x: 2-D array; the reduction runs along axis 1.

    Returns:
        Array shaped like ``x`` whose rows are the normalised exponentials.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    exp_x = np.exp(x - row_max)
    row_total = np.sum(exp_x, axis=1, keepdims=True)
    return exp_x / row_total
# --- Training setup ---
# One training sample (four features) with a one-hot target over two classes.
X = np.array([[1, 1, 0, 1]], dtype=np.float64)
y = np.array([[0, 1]], dtype=np.float64)
lr = 0.74
tolerance = 1e-3
max_epochs = 10000  # safety cap so the loop ends even if the loss never reaches tolerance
epoch = 0
rng = np.random.default_rng(42)
W1 = rng.normal(0, 1, (4, 3))
b1 = np.zeros((1, 3))
W2 = rng.normal(0, 1, (3, 2))
b2 = np.zeros((1, 2))
W3 = rng.normal(0, 1, (2, 2))
b3 = np.zeros((1, 2))
act1, d_act1 = relu, relu_derivative
act2, d_act2 = relu, relu_derivative
act3 = softmax  # output layer; no separate derivative, see delta_out below
# --- Training loop ---
while epoch < max_epochs:
    # Forward pass
    z1 = X.dot(W1) + b1
    h1 = act1(z1)
    z2 = h1.dot(W2) + b2
    h2 = act2(z2)
    z3 = h2.dot(W3) + b3
    o = act3(z3)
    # Cross-entropy loss; the 1e-9 keeps log() away from log(0).
    loss = -np.mean(np.sum(y * np.log(o + 1e-9), axis=1))
    # For softmax combined with cross-entropy the output delta is (o - y).
    delta_out = o - y
    delta_h2 = delta_out.dot(W3.T) * d_act2(h2)
    delta_h1 = delta_h2.dot(W2.T) * d_act1(h1)
    # Gradient-descent step
    W3 -= lr * h2.T.dot(delta_out)
    b3 -= lr * delta_out
    W2 -= lr * h1.T.dot(delta_h2)
    b2 -= lr * delta_h2
    W1 -= lr * X.T.dot(delta_h1)
    b1 -= lr * delta_h1
    # Convergence check: `o` and `loss` are from the forward pass above, while
    # the printed weights already include this epoch's update.
    if loss < tolerance:
        print("Converged at epoch:", epoch)
        print("Output (Softmax):", o)
        print("Loss:", loss)
        print("\nUpdated W1:\n", W1)
        print("Updated b1:\n", b1)
        print("\nUpdated W2:\n", W2)
        print("Updated b2:\n", b2)
        print("\nUpdated W3:\n", W3)
        print("Updated b3:\n", b3)
        break

    epoch += 1
else:
    # The loop ran out of epochs without a break: report instead of spinning forever.
    print("Did not converge within max_epochs.")
    print("Last Output (Softmax):", o)
    print("Final Loss:", loss)


Converged at epoch: 676
Output (Softmax): [[0.00099949 0.99900051]]
Loss: 0.000999986191424735

Updated W1:
 [[ 0.05075092 -1.03998411  0.80502349]
 [ 0.68659856 -1.95103519 -1.24760721]
 [ 0.1278404  -0.31624259 -0.01680116]
 [-1.10701008  0.87939797  0.83236423]]
Updated b1:
 [[-0.25396616  0.          0.0545723 ]]

Updated W2:
 [[-0.08301365  1.04760107]
 [ 0.46750934 -0.85929246]
 [ 0.28285009 -1.00478265]]
Updated b2:
 [[-0.37998458 -0.2030404 ]]

Updated W3:
 [[ 0.8337298  -0.00520541]
 [-0.27710971 -0.58868219]]
Updated b3:
 [[-3.45437365  3.45437365]]
