In [2]:
import numpy as np
from tabulate import tabulate
np.random.seed(1)

In [3]:
def relu(x):
    return (x > 0) * x 

def relu_grad(x):
    return x > 0

In [4]:
streetlights = np.array([[1,0,1], 
                         [0,1,1], 
                         [0,0,1], 
                         [1,1,1], 
                         [0,1,1], 
                         [1,0,1]])

In [5]:
walk_vs_stop = np.array([[0], [1], [0], [1], [1], [0]])

In [6]:
X,y = streetlights, walk_vs_stop

In [7]:
hidden_nodes = 8

In [8]:
epochs = 100  # number of iterations to go through the network

lr = 0.01      # how much we change the weights of the network each iteration

In [9]:
ws_1 = np.random.rand(X.shape[1], hidden_nodes) - 0.5
ws_2 = np.random.rand(hidden_nodes, y.shape[1]) - 0.5

In [10]:
for epoch in range(epochs):     #number of training iterations, or times to change the weights of the nn
    for i in range(X.shape[0]): #for all samples in X, each streetlight
        layer_in = X[i:i+1]
        
        #forward pass/prediction
        layer_1 = relu(layer_in.dot(ws_1))
        
        layer_out = layer_1.dot(ws_2)
        
        #calc error/distance (how far are we from goal)
        delta_2 = layer_out - y[i:i+1]
        
        #calc the the error each node in prev layer contributed
        delta_1 = delta_2.dot(ws_2.T) * relu_grad(layer_1)
        
        #update weights
        ws_2 -= lr * (layer_1.T.reshape(hidden_nodes,1).dot(delta_2))
        ws_1 -= lr * (layer_in.T.reshape(X.shape[1],1).dot(delta_1))
    
    if epoch % 10 == 0:
        error = delta_2**2
        print(round(error[0][0],6))#, end='\r')

0.019479
0.046095
0.062677
0.061989
0.052434
0.041336
0.031658
0.024053
0.018328
0.01518


Task 1: Replace the activation function with the Sigmoid function

In [11]:
def Sigmoid(x):
    # Convert Input to a NumPy Array
    x = np.array(x)
    # Initialize an Empty Array for Sigmoid Outputs
    Sigmoid = np.empty_like(x, dtype=np.float64)
    
    # Create masks for positive and negative values
    positive = x >= 0
    negative = x < 0

    # Compute sigmoid for positive values
    Sigmoid[positive] = 1 / (1 + np.exp(-x[positive]))

    # Compute sigmoid for negative values to prevent overflow
    Sigmoid[negative] = np.exp(x[negative]) / (1 + np.exp(x[negative]))

    return Sigmoid

def Sigmoid_grad(x):
    return np.exp(-x)/pow((1 + np.exp(-x)),2)

In [12]:
for epoch in range(epochs): #number of training iterations, or times to change the weights of the nn
    for i in range(X.shape[0]): #for all samples in X, each streetlight
        layer_in = X[i:i+1]
        
        #forward pass/prediction
        layer_1 = Sigmoid(layer_in.dot(ws_1))
        
        layer_out = layer_1.dot(ws_2)
        
        #calc error/distance (how far are we from goal)
        delta_2 = layer_out - y[i:i+1]
        
        #calc the the error each node in prev layer contributed
        delta_1 = delta_2.dot(ws_2.T) * Sigmoid_grad(layer_1)
        
        #update weights
        ws_2 -= lr * (layer_1.T.reshape(hidden_nodes,1).dot(delta_2))
        ws_1 -= lr * (layer_in.T.reshape(X.shape[1],1).dot(delta_1))
    
    if epoch % 10 == 0:
        error = delta_2**2
        print(round(error[0][0],6))#, end='\r')

0.039486
0.129486
0.158598
0.161098
0.155551
0.147852
0.139694
0.131555
0.123576
0.115803


Switching from ReLU to Sigmoid limited outputs to between 0 and 1 and slowed training due to smaller gradients. ReLU provided more varied outputs and faster convergence. This highlights how the activation function choice affects both the network’s behavior and training efficiency. The sigmoid function maps input values to an output ranged between 0 and 1. For very large positive or negative inputs, the function saturates, meaning the output approaches 1 or 0 asymptotically.

Task 2: learning rate: [0.001, 0.01, 0.1, 1, 10]

In [13]:
learning_rates = [0.001, 0.01, 0.1, 1, 10]
x = np.random.rand(100,10)
y = np.random.rand(100,5)
hidden_node = 8

def best_learning_rate(epochs, X, y, learning_rates, hidden_nodes):
    errors = [[],[],[],[],[]]

    for lr in range(len(learning_rates)):
        ws_1 = np.random.rand(X.shape[1], hidden_nodes) - 0.5
        ws_2 = np.random.rand(hidden_nodes, y.shape[1]) - 0.5
        for epoch in range(epochs):     #number of training iterations, or times to change the weights of the nn
            for i in range(X.shape[0]): #for all samples in X, each streetlight
                layer_in = X[i:i+1]
                
                #forward pass/prediction
                layer_1 = Sigmoid(layer_in.dot(ws_1))
                
                layer_out = layer_1.dot(ws_2)
                
                #calc error/distance (how far are we from goal)
                delta_2 = layer_out - y[i:i+1]
                
                #calc the the error each node in prev layer contributed
                delta_1 = delta_2.dot(ws_2.T) * Sigmoid_grad(layer_1)
                
                #update weights
                ws_2 -= learning_rates[lr] * (layer_1.T.reshape(hidden_nodes,1).dot(delta_2))
                ws_1 -= learning_rates[lr] * (layer_in.T.reshape(X.shape[1],1).dot(delta_1))
            
            if epoch % 10 == 0:
                error = delta_2**2
                errors[lr].append(float(round(error[0][0],6)))

    transposed_errors = list(zip(*errors))

    # Prepare Headers
    headers = [f"lr {lr} results" for lr in learning_rates]

    # Generate Table Data
    table = transposed_errors

    # Print the Table
    print(tabulate(table, headers=headers, tablefmt="grid"))
        
# Run the function
best_learning_rate(epochs, X, y, learning_rates, hidden_nodes)

+--------------------+-------------------+------------------+----------------+-----------------+
|   lr 0.001 results |   lr 0.01 results |   lr 0.1 results |   lr 1 results |   lr 10 results |
|           0.029294 |          0.151207 |         0.34972  |       0.085761 |        0.536373 |
+--------------------+-------------------+------------------+----------------+-----------------+
|           0.038451 |          0.125369 |         0.152473 |       0.116792 |        0.536373 |
+--------------------+-------------------+------------------+----------------+-----------------+
|           0.047498 |          0.119752 |         0.158513 |       0.122696 |        0.536373 |
+--------------------+-------------------+------------------+----------------+-----------------+
|           0.056238 |          0.119213 |         0.162423 |       0.124186 |        0.536373 |
+--------------------+-------------------+------------------+----------------+-----------------+
|           0.064546 |        

Task 3: Add another “hidden” layer

In [14]:
# Size of layers
hidden_nodes1 = 8
hidden_nodes2 = 6

#np.random.seed(42)  # For reproducibility
X = np.random.rand(100, 5)  # 100 samples, 5 features
y = np.random.rand(100, 1)  # 100 samples, 1 output

# Hyperparameters
epochs = 1000
lr = 0.1  # Learning rate

ws_1 = np.random.rand(X.shape[1], hidden_nodes1) - 0.5
ws_2 = np.random.rand(hidden_nodes1, hidden_nodes2) - 0.5
ws_3 = np.random.rand(hidden_nodes2, y.shape[1]) - 0.5

for epoch in range(epochs): #number of training iterations, or times to change the weights of the nn
    for i in range(X.shape[0]): #for all samples in X, each streetlight
        layer_in = X[i:i+1]
        
        #forward pass/prediction
        layer_1 = Sigmoid(layer_in.dot(ws_1))
        layer_2 = Sigmoid(layer_1.dot(ws_2))
        layer_out = layer_2.dot(ws_3)

        #calc error/distance (how far are we from goal)
        delta_out = layer_out - y[i:i+1]
        
        #calc error/distance (how far are we from goal)
        delta_2 = delta_out.dot(ws_3.T) * Sigmoid_grad(layer_2)
        
        #calc the the error each node in prev layer contributed
        delta_1 = delta_2.dot(ws_2.T) * Sigmoid_grad(layer_1)
        
        #update weights
        ws_3 -= lr * (layer_2.T.reshape(hidden_nodes2,1).dot(delta_out))
        ws_2 -= lr * (layer_1.T.reshape(hidden_nodes1,1).dot(delta_2))
        ws_1 -= lr * (layer_in.T.reshape(X.shape[1],1).dot(delta_1))
    
    if epoch % 10 == 0:
        error = np.mean(delta_out**2)
        print(f'Epoch {epoch}: Error {error:.6f}')

Epoch 0: Error 0.170505
Epoch 10: Error 0.173425
Epoch 20: Error 0.176280
Epoch 30: Error 0.179096
Epoch 40: Error 0.181887
Epoch 50: Error 0.184659
Epoch 60: Error 0.187413
Epoch 70: Error 0.190147
Epoch 80: Error 0.192855
Epoch 90: Error 0.195530
Epoch 100: Error 0.198161
Epoch 110: Error 0.200733
Epoch 120: Error 0.203233
Epoch 130: Error 0.205644
Epoch 140: Error 0.207948
Epoch 150: Error 0.210129
Epoch 160: Error 0.212174
Epoch 170: Error 0.214071
Epoch 180: Error 0.215814
Epoch 190: Error 0.217397
Epoch 200: Error 0.218823
Epoch 210: Error 0.220095
Epoch 220: Error 0.221220
Epoch 230: Error 0.222207
Epoch 240: Error 0.223068
Epoch 250: Error 0.223814
Epoch 260: Error 0.224458
Epoch 270: Error 0.225010
Epoch 280: Error 0.225482
Epoch 290: Error 0.225884
Epoch 300: Error 0.226225
Epoch 310: Error 0.226514
Epoch 320: Error 0.226757
Epoch 330: Error 0.226962
Epoch 340: Error 0.227133
Epoch 350: Error 0.227276
Epoch 360: Error 0.227394
Epoch 370: Error 0.227492
Epoch 380: Error 0.2275

Adding another hidden layer can improve performance by capturing more complex patterns. However, it also increases complexity, leading to longer training times and a higher risk of overfitting. You'll need to adjust the learning rate and possibly increase epochs to effectively train the deeper network.

Task 4: Including an activation functions at various stages

Case (a): Activation function after hidden layer 1 only

In [15]:
# Size of layers
hidden_nodes1 = 8
hidden_nodes2 = 6
# Hyperparameters
epochs = 20

ws_1 = np.random.rand(X.shape[1], hidden_nodes1) - 0.5
ws_2 = np.random.rand(hidden_nodes1, hidden_nodes2) - 0.5
ws_3 = np.random.rand(hidden_nodes2, y.shape[1]) - 0.5

for epoch in range(epochs):
    for i in range(X.shape[0]):
        layer_in = X[i:i+1]
        
        # Apply activation after the first hidden layer only
        layer_1 = Sigmoid(layer_in.dot(ws_1)) # Activation applied here
        layer_2 = layer_1.dot(ws_2)
        layer_out = layer_2.dot(ws_3)

        # Calculate errors and backpropagation          
        delta_out = layer_out - y[i:i+1]
        delta_2 = delta_out.dot(ws_3.T) * layer_2
        delta_1 = delta_2.dot(ws_2.T) * Sigmoid_grad(layer_1)
        
        #update weights
        ws_3 -= lr * layer_2.T.dot(delta_out)
        ws_2 -= lr * layer_1.T.dot(delta_2)
        ws_1 -= lr * layer_in.T.dot(delta_1)
    
    if epoch % 10 == 0:
        error = np.mean(delta_out**2)
        print(f'Epoch {epoch}: Error {error:.6f}')

Epoch 0: Error 0.175893
Epoch 10: Error 0.141324


The second hidden layer is linear without an activation function, limiting the network's ability to model non-linear patterns. It can be removed without losing expressive power, reducing the number of weight parameters.

Case (b): Activation Function after Hidden Layer 2 Only

In [16]:
# Size of layers
hidden_nodes1 = 8
hidden_nodes2 = 6
# Hyperparameters
epochs = 240

ws_1 = np.random.rand(X.shape[1], hidden_nodes1) - 0.5
ws_2 = np.random.rand(hidden_nodes1, hidden_nodes2) - 0.5
ws_3 = np.random.rand(hidden_nodes2, y.shape[1]) - 0.5

for epoch in range(epochs): #number of training iterations, or times to change the weights of the nn
    for i in range(X.shape[0]): #for all samples in X, each streetlight
        layer_in = X[i:i+1]
        
        #forward pass/prediction
        layer_1 = (layer_in.dot(ws_1))
        layer_2 = Sigmoid(layer_1.dot(ws_2)) # Activation applied here
        layer_out = layer_2.dot(ws_3) 

        # Calculate errors and backpropagation
        delta_out = layer_out - y[i:i+1]
        delta_2 = delta_out.dot(ws_3.T) * Sigmoid_grad(layer_2)
        delta_1 = delta_2.dot(ws_2.T) * layer_1
        
        #update weights
        ws_3 -= lr * (layer_2.T.dot(delta_out))
        ws_2 -= lr * (layer_1.T.dot(delta_2))
        ws_1 -= lr * (layer_in.T.dot(delta_1))
    
    if epoch % 10 == 0:
        error = np.mean(delta_out**2)
        print(f'Epoch {epoch}: Error {error:.6f}')

Epoch 0: Error 0.201496
Epoch 10: Error 0.183157
Epoch 20: Error 0.176965
Epoch 30: Error 0.175760
Epoch 40: Error 0.176750
Epoch 50: Error 0.178677
Epoch 60: Error 0.181012
Epoch 70: Error 0.183576
Epoch 80: Error 0.186350
Epoch 90: Error 0.189376
Epoch 100: Error 0.192696
Epoch 110: Error 0.196319
Epoch 120: Error 0.200177
Epoch 130: Error 0.204132
Epoch 140: Error 0.208002
Epoch 150: Error 0.211626
Epoch 160: Error 0.214931
Epoch 170: Error 0.217959
Epoch 180: Error 0.220868
Epoch 190: Error 0.223775
Epoch 200: Error 0.226011
Epoch 210: Error 0.225529
Epoch 220: Error 0.222401
Epoch 230: Error 0.218594


The first hidden layer is linear, with non-linearity only after the second. Both layers can be collapsed into one, reducing weight parameters without changing functionality.

Case (c): Activation Function after Both Hidden Layers

In [17]:
# Size of layers
hidden_nodes1 = 8
hidden_nodes2 = 6

ws_1 = np.random.rand(X.shape[1], hidden_nodes1) - 0.5
ws_2 = np.random.rand(hidden_nodes1, hidden_nodes2) - 0.5
ws_3 = np.random.rand(hidden_nodes2, y.shape[1]) - 0.5

for epoch in range(epochs): #number of training iterations, or times to change the weights of the nn
    for i in range(X.shape[0]): #for all samples in X, each streetlight
        layer_in = X[i:i+1]
        
        # Apply activation after both hidden layers
        layer_1 = Sigmoid(layer_in.dot(ws_1)) # Activation applied here
        layer_2 = Sigmoid(layer_1.dot(ws_2)) # Activation applied here
        layer_out = layer_2.dot(ws_3)

        # Calculate errors and backpropagation
        delta_out = layer_out - y[i:i+1]
        delta_2 = delta_out.dot(ws_3.T) * Sigmoid_grad(layer_2)
        delta_1 = delta_2.dot(ws_2.T) * Sigmoid_grad(layer_1)
        
        #update weights
        ws_3 -= lr * (layer_2.T.dot(delta_out))
        ws_2 -= lr * (layer_1.T.dot(delta_2))
        ws_1 -= lr * (layer_in.T.dot(delta_1))
    
    if epoch % 10 == 0:
        error = np.mean(delta_out**2)
        print(f'Epoch {epoch}: Error {error:.6f}')

Epoch 0: Error 0.171383
Epoch 10: Error 0.176684
Epoch 20: Error 0.181491
Epoch 30: Error 0.185874
Epoch 40: Error 0.189887
Epoch 50: Error 0.193573
Epoch 60: Error 0.196968
Epoch 70: Error 0.200101
Epoch 80: Error 0.202996
Epoch 90: Error 0.205675
Epoch 100: Error 0.208152
Epoch 110: Error 0.210439
Epoch 120: Error 0.212547
Epoch 130: Error 0.214483
Epoch 140: Error 0.216252
Epoch 150: Error 0.217859
Epoch 160: Error 0.219309
Epoch 170: Error 0.220606
Epoch 180: Error 0.221757
Epoch 190: Error 0.222767
Epoch 200: Error 0.223645
Epoch 210: Error 0.224401
Epoch 220: Error 0.225043
Epoch 230: Error 0.225584


Both hidden layers apply activation functions, making the network more capable of learning complex patterns. you could potentially reduce the number of neurons in either layer to simplify the model.