This the first dev notebook for building a deep learning library from scarcth in native numpy 

the overall goal : grasp a deep understanding of the main building blocks in DL systems

this notebook goal : design a simple nn to predict on mack random data point

In [1]:
import numpy as np 

x = np.array([[1, 2, 3],
            [4, 5, 6],
            [7, 8, 9]])

y = np.array([1,0,0])

print(x.shape)
print(y.shape)


(3, 3)
(3,)


![Neural Network](images/nn.jpeg)


In [2]:
# lets decide the number of hiiden neurons we want
# we only we use on hidden  layer
n_neurons = 6

num_of_example , num_of_features = x.shape 

# initialize the weights and biases
W1 = np.random.randn(num_of_features, n_neurons)
b1 = np.zeros((n_neurons,)) 

print(W1)
print(b1)
print(W1.shape)
print(b1.shape)



[[ 1.02008805 -0.48709471  0.49338108 -0.0247763  -2.13042258  0.65129621]
 [ 0.52289062 -1.10864025 -0.24051315 -0.64075336 -0.27325533 -0.4135334 ]
 [ 0.57556149 -1.13897708  1.73982202  0.15234672 -0.2840554  -0.55961484]]
[0. 0. 0. 0. 0. 0.]
(3, 6)
(6,)


In [3]:
# the forward pass (linear transformation)

z1 = np.dot(x, W1) + b1

print(z1)
print(z1.shape)

[[  3.79255376  -6.12130646   5.23182085  -0.84924286  -3.52909944
   -1.85461511]
 [ 10.14817425 -14.3254426   11.20989071  -2.38879168 -11.59229938
   -2.8201712 ]
 [ 16.50379475 -22.52957874  17.18796057  -3.9283405  -19.65549931
   -3.78572728]]
(3, 6)


In [4]:
# activation function
## adding the non linearity to the linear transformation

a1 = np.tanh(z1)

print(a1)
print(a1.shape)



[[ 0.9989846  -0.99999036  0.99994289 -0.69067372 -0.99828083 -0.9521787 ]
 [ 1.         -1.          1.         -0.98330786 -1.         -0.99292183]
 [ 1.         -1.          1.         -0.99922599 -1.         -0.99897064]]
(3, 6)


In [5]:
# the output of the network
out_layer_neurons = 1
W2 = np.random.randn(n_neurons, out_layer_neurons)
b2 = np.zeros((out_layer_neurons,))

print(W2)
print(b2)
print(W2.shape)
print(b2.shape)


[[-1.33424614]
 [-0.28318377]
 [ 0.3744034 ]
 [-0.89129719]
 [ 1.64891485]
 [-0.60307707]]
[0.]
(6, 1)
(1,)


In [6]:
# second forward pass
z2 = np.dot(a1 , W2) + b2 
print(z2)
print(z2.shape)



[[-1.13157567]
 [-0.85034588]
 [-0.83251021]]
(3, 1)


In [7]:
## adding the non linearity to the linear transformation 
## this type we will use sigmoid activation function since this is a binary classification problem 

def sigmoid(x):
    return 1 / (1 + np.exp(-x))


a2 = sigmoid(z2)

print(a2)
print(a2.shape)


[[0.24387043]
 [0.29936031]
 [0.30311456]]
(3, 1)


In [8]:
## next we must calculate the loss
## we will use binary cross entropy loss function , again becuse this is a binary classification problem

def binary_cross_entropy(y_true, y_pred):
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    loss = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return loss

loss = binary_cross_entropy(y, a2)

print(loss)

0.6448607372060217


In [9]:
## now we must optimize (minmize this loss function) by the power of chain rule and gradient descent 

def binary_cross_entropy_derivative(y_true, y_pred):
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -((y_true / y_pred) - ((1 - y_true) / (1 - y_pred)))

d_loss_d_a2 = binary_cross_entropy_derivative(y, a2)
print(d_loss_d_a2)

[[-4.10053807  1.32252466  1.32252466]
 [-3.34045623  1.42726712  1.42726712]
 [-3.29908267  1.43495609  1.43495609]]


In [10]:
def sigmoid_derivative(x):
    return x*(1 - x) 

d_a2_d_z2 = sigmoid_derivative(a2)

print(d_a2_d_z2)


[[0.18439765]
 [0.20974371]
 [0.21123612]]


In [11]:
out_layer_grad = d_loss_d_a2 * d_a2_d_z2 
print(out_layer_grad)

[[-0.75612957  0.24387043  0.24387043]
 [-0.70063969  0.29936031  0.29936031]
 [-0.69688544  0.30311456  0.30311456]]


In [12]:
## TO simply we can use the this function and get the same result :

def out_layer_grad(y_true, y_pred):
    epsilon = 1e-15
    
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return (y_pred - y_true)

# Reshape y to match a2
y = y.reshape(-1, 1)  # Ensure y has shape (3,1)

# Compute the gradient of the loss with respect to a2
grad_a2 = out_layer_grad(y, a2)

print("Gradient w.r.t. a2:", grad_a2)
print(grad_a2.shape)


Gradient w.r.t. a2: [[-0.75612957]
 [ 0.29936031]
 [ 0.30311456]]
(3, 1)


SAME RESULT !! by the power of math 

In [13]:
## lets move to the backprop of the hidden layer:
def tanh_derivative(x):
    return 1 - np.tanh(x)**2 


grad_z2 = tanh_derivative(a2) * grad_a2

print(grad_z2)
print(grad_z2.shape)

[[-0.71288511]
 [ 0.2740577 ]
 [ 0.27688598]]
(3, 1)


In [14]:
# now we need  to calcukate the grad for the parametrs in the hidden layer (W2 , B2)
dW2 = np.dot( a1.T, grad_z2) / num_of_example
db2 = np.sum(grad_z2, axis=0, keepdims=True) / num_of_example

print(dW2)
print(dW2.shape)

print(db2)
print(db2.shape)

[[-0.05373919]
 [ 0.05397819]
 [-0.05396691]
 [-0.01792792]
 [ 0.05357195]
 [ 0.04335839]]
(6, 1)
[[-0.05398048]]
(1, 1)


In [15]:
grad_a1 = np.dot(grad_z2, W2.T)

print(grad_a1)
print(grad_a1.shape)


[[ 0.95116421  0.2018775  -0.26690661  0.6353925  -1.17548684  0.42992467]
 [-0.36566043 -0.07760869  0.10260814 -0.24426686  0.45189781 -0.16527792]
 [-0.36943405 -0.07840962  0.10366705 -0.2467877   0.4565614  -0.16698359]]
(3, 6)


In [16]:
dz1  = grad_a1 * tanh_derivative(a1)
print(dz1)
print(dz1.shape)

[[ 0.4000827   0.08478461 -0.11210368  0.4078584  -0.49496814  0.19400912]
 [-0.153568   -0.03259366  0.04309278 -0.10521511  0.18978549 -0.07016341]
 [-0.15515282 -0.03293003  0.0435375  -0.10376674  0.19174407 -0.07023883]]
(3, 6)


In [17]:
# now we need  to calcukate the grad for the parametrs in the hidden layer (W2 , B2)
dW1 = np.dot( x.T, dz1) / num_of_example
db1 = np.sum(dz1, axis=0, keepdims=True) / num_of_example

print(dW2)
print(dW2.shape)

print(db2)
print(db2.shape)

[[-0.05373919]
 [ 0.05397819]
 [-0.05396691]
 [-0.01792792]
 [ 0.05357195]
 [ 0.04335839]]
(6, 1)
[[-0.05398048]]
(1, 1)


In [18]:
lr = 0.01

W1 -= lr * dW1
b1 -= lr * db1.ravel()  # Fix here
W2 -= lr * dW2
b2 -= lr * db2.ravel()  # No need to fix since b2 is already (1,)


print("new W1:", W1)
print("new b1:", b1)
print("new W2:", W2)
print("new bw:", b2)



new W1: [[ 1.02442225 -0.48617437  0.49216432 -0.02231174 -2.13577719  0.65322393]
 [ 0.52692028 -1.10778412 -0.24164501 -0.63895171 -0.27823181 -0.41178437]
 [ 0.5792866  -1.13818515  1.73877508  0.15348544 -0.28865375 -0.5580445 ]]
new b1: [-3.04539601e-04 -6.42030881e-05  8.49113114e-05 -6.62921834e-04
  3.78128603e-04 -1.78689580e-04]
new W2: [[-1.33370875]
 [-0.28372355]
 [ 0.37494307]
 [-0.89111792]
 [ 1.64837913]
 [-0.60351065]]
new bw: [0.0005398]


In [19]:
# Forward pass
z1 = np.dot(x, W1) + b1
a1 = np.tanh(z1)

z2 = np.dot(a1, W2) + b2
a2 = sigmoid(z2)

# Compute new loss
loss = binary_cross_entropy(y, a2)

print("New loss:", loss)


New loss: 0.7103104629722242


In [20]:
# Hyperparameters
lr = 0.01
epochs = 100  # Number of training iterations

for epoch in range(epochs):
    # Forward pass
    z1 = np.dot(x, W1) + b1
    a1 = np.tanh(z1)

    z2 = np.dot(a1, W2) + b2
    a2 = sigmoid(z2)

    # Compute loss
    loss = binary_cross_entropy(y, a2)

    # Backpropagation
    grad_a2 = out_layer_grad(y, a2)
    grad_z2 = grad_a2 * sigmoid(a2) * (1 - sigmoid(a2))

    dW2 = np.dot(a1.T, grad_z2) / num_of_example
    db2 = np.sum(grad_z2, axis=0, keepdims=True) / num_of_example

    grad_z1 = np.dot(grad_z2, W2.T) * tanh_derivative(a1)

    dW1 = np.dot(x.T, grad_z1) / num_of_example
    db1 = np.sum(grad_z1, axis=0, keepdims=True) / num_of_example

    # Parameter updates
    W1 -= lr * dW1
    b1 -= lr * db1.ravel()
    W2 -= lr * dW2
    b2 -= lr * db2.ravel()

    # Print loss every 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss}")


Epoch 0, Loss: 0.7103104629722242
Epoch 10, Loss: 0.7130924490207929
Epoch 20, Loss: 0.7158640169142508
Epoch 30, Loss: 0.7186140713945002
Epoch 40, Loss: 0.7213306956938573
Epoch 50, Loss: 0.7240011629424207
Epoch 60, Loss: 0.7266118963590932
Epoch 70, Loss: 0.7291483650060098
Epoch 80, Loss: 0.7315948999691964
Epoch 90, Loss: 0.7339344128948743


In [21]:
import torch 
import torch.nn as nn 
import torch.optim as optim

tensor_x = torch.tensor(x , dtype=torch.float32)
tensor_y = torch.tensor(y , dtype = torch.float32)

class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.fc1 = nn.Linear(3, 6)  # input: 3, output: 6
        self.tanh = nn.Tanh()
        self.fc2 = nn.Linear(6, 1)  # input: 6, output: 1
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        z1 = self.fc1(x)         # automatically: z1 = x * W1^T + b1, shape: (3,6)
        a1 = self.tanh(z1)
        z2 = self.fc2(a1)        # shape: (3,1)
        a2 = self.sigmoid(z2)
        return a2 
    
np.random.seed(42)

W1 = np.random.randn(3, 6)
b1 = np.zeros((6,))
W2 = np.random.randn(6, 1)
b2 = np.zeros((1,))

model = MyNet()

model.fc1.weight.data = torch.tensor(W1.T, dtype=torch.float32)  # shape: (6,3)
model.fc1.bias.data = torch.tensor(b1, dtype=torch.float32)        # shape: (6,)

# For fc2: your NumPy W2 is (6,1), and PyTorch expects (1,6)
model.fc2.weight.data = torch.tensor(W2.T, dtype=torch.float32)  # shape: (1,6)
model.fc2.bias.data = torch.tensor(b2, dtype=torch.float32)        # shape: (1,)

# Verify parameter shapes:
print("fc1 weight shape:", model.fc1.weight.data.shape)  # (6,3)
print("fc1 bias shape:", model.fc1.bias.data.shape)      # (6,)
print("fc2 weight shape:", model.fc2.weight.data.shape)  # (1,6)
print("fc2 bias shape:", model.fc2.bias.data.shape)      # (1,)

# Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop (using the same hyperparameters)
epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()         # Reset gradients
    output = model(tensor_x)        # Forward pass, output shape: (3,1)
    loss = criterion(output, tensor_y)
    loss.backward()               # Backward pass (computes gradients automatically)
    optimizer.step()              # Update parameters

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# Final forward pass to see predictions
with torch.no_grad():
    final_output = model(tensor_x)
    print("Final predictions:", final_output)



fc1 weight shape: torch.Size([6, 3])
fc1 bias shape: torch.Size([6])
fc2 weight shape: torch.Size([1, 6])
fc2 bias shape: torch.Size([1])
Epoch 0, Loss: 0.8977
Epoch 10, Loss: 0.8208
Epoch 20, Loss: 0.7587
Epoch 30, Loss: 0.7182
Epoch 40, Loss: 0.6902
Epoch 50, Loss: 0.6694
Epoch 60, Loss: 0.6528
Epoch 70, Loss: 0.6385
Epoch 80, Loss: 0.6252
Epoch 90, Loss: 0.6119
Final predictions: tensor([[0.4061],
        [0.3600],
        [0.3600]])
