In [None]:
################################################################
# simple_ml_model.ipynb
# (c) May 2024 Sachin Alexander Reddy
#################################################################

In [None]:
import numpy as np

Define the input data (X) and ground truth (y)

In [None]:
# --- Features (X) ---
# One sample with three input values.
# Inputs are called "features" and, by convention, are written as an uppercase X.
X = np.array([
    [0.1, 0.2, 0.3],
])

# --- Target (y) ---
# One sample with one output value.
# The ground truth is also called the "target" and, by convention, is written as a lowercase y.
# The network's job is to predict y; that prediction is written y_hat, y_pred, or y prime.
y = np.array([
    [0.6],
])

# This example is deliberately tiny: a real dataset has many samples and many features.

Define the neural network parameters

In [None]:
# Fixing the seed makes the random weights (and so every number printed below)
# reproducible. Production training code usually does not pin the seed like this.
np.random.seed(0)

# --- Model input / output sizes ---
# X is (samples, features) and y is (samples, outputs): read the second axis of each.
input_size = np.shape(X)[1]   # number of features
output_size = np.shape(y)[1]  # number of outputs

# --- Hyperparameters ---
# Hyperparameters are chosen before training starts; they are not learned.
hidden_size = 4  # number of neurons in the hidden layer
# The learning rate scales each update: how far the weights move along the
# negative loss gradient on every step.
learning_rate = 0.1

# --- Weights and biases ---
# These are the values that backpropagation updates.
# Weights start random so that the hidden neurons differ from one another
# (this "breaks symmetry"); biases can safely start at zero.
# NOTE: the two random draws must stay in this order (W1, then W2) to reproduce
# the same numbers from the seed.
W1 = np.random.standard_normal(size=(input_size, hidden_size))
b1 = np.zeros(shape=(1, hidden_size))  # leading 1 keeps it a 2-D row: (1, hidden_size)
W2 = np.random.standard_normal(size=(hidden_size, output_size))
b2 = np.zeros(shape=(1, output_size))

Forward pass. Here the inputs are modified by the weights and biases. Then the prediction and loss are calculated

In [None]:
# --- Hidden layer: linear step ---
# Weighted sum of the features in X plus the bias: one value per hidden neuron.
hidden_layer_input = X.dot(W1) + b1

# --- Hidden layer: activation function (ReLU) ---
# Keeps positive values and replaces negative ones with 0.
# This non-linearity is where ML gets its power.
hidden_layer_output = np.maximum(0, hidden_layer_input)

# --- Additional hidden layers ---
# Production code would stack more layers here (multiple layers == deep learning).
# 'Dropout' and 'batch normalization' are often inserted between hidden layers
# to help reduce overfitting.

# --- Output layer ---
# A linear step with no activation; the result has the same shape as y.
output_layer_input = hidden_layer_output.dot(W2) + b2
y_pred = output_layer_input  # the prediction, often written $\hat{y}$

The first prediction

In [None]:
# --- First prediction ---
# The weights are still random, so this prediction is essentially a guess:
# no 'learning' has happened yet.
print(f'1st Prediction (y`): {y_pred[0][0]:.2f}')
print(f'Ground Truth (y): {y[0][0]:.2f}')

# --- Loss function ---
# The loss measures how far the prediction is from the ground truth; it is the
# error that training tries to minimize. It is also called the 'cost function'.
# Mean Squared Error (MSE) is a common choice. The squared errors are summed
# here, which equals the mean because there is exactly one sample and one output.
loss_e1 = round(np.sum(np.square(y_pred - y)), 4)
print(f'1st MSE Loss: {loss_e1}')

Backward pass. Here the gradients of the loss with respect to the weights and biases are calculated using the chain rule from calculus.

In [None]:
# --- Backpropagation ---
# Work backwards from the loss to find how each weight and bias should change
# to reduce it. This is where the 'learning' happens.

# --- Gradient at the output ---
# The prediction error: predicted value minus ground truth.
# (This is the derivative of 0.5 * (y_pred - y)**2; the constant factor of 2
# from the plain squared error is left out and only rescales the step size.)
grad_output = y_pred - y

# --- Gradients for the output-layer parameters (W2, b2) ---
# The hidden activations are transposed so the product has the same shape as W2.
grad_W2 = hidden_layer_output.T.dot(grad_output)  # shape (hidden_size, output_size)
# With a single sample this is the same value as grad_output.
grad_b2 = grad_output.sum(axis=0, keepdims=True)  # shape (1, output_size)

# --- Gradient flowing back into the hidden layer ---
# Push the error back through W2, then apply the ReLU derivative: neurons whose
# pre-activation was <= 0 passed nothing forward, so they receive no gradient.
grad_hidden = np.where(
    hidden_layer_input <= 0,
    0.0,
    grad_output.dot(W2.T),
)  # shape (1, hidden_size)

# --- Gradients for the hidden-layer parameters (W1, b1) ---
grad_W1 = X.T.dot(grad_hidden)                    # shape (input_size, hidden_size)
grad_b1 = grad_hidden.sum(axis=0, keepdims=True)  # shape (1, hidden_size)

# Every gradient above comes from the chain rule: the derivative of the loss
# with respect to a parameter is built up layer by layer, from the output back to the input.

Update the weights and biases

In [None]:
# Plain gradient descent: move every parameter a small step *against* its
# gradient (hence the subtraction), because we want the loss to go down.
# Optimizers such as Adam or RMSProp replace this step with a smarter rule;
# this is the simplest possible version.
# The arrays are updated in place, so W1, b1, W2 and b2 keep their identity.
for param, grad in ((W1, grad_W1), (b1, grad_b1), (W2, grad_W2), (b2, grad_b2)):
    param -= learning_rate * grad

# That completes one iteration of training: forward pass, loss, backward pass, update.
# Training repeats this many times.
# One full pass over the training data is called an 'epoch'.

Epoch 2 - Repeat the above

In [None]:
# --- Forward pass (using the weights updated in epoch 1) ---
hidden_layer_input = X.dot(W1) + b1
hidden_layer_output = np.maximum(0, hidden_layer_input)  # ReLU
output_layer_input = hidden_layer_output.dot(W2) + b2
y_pred = output_layer_input  # the prediction, often written $\hat{y}$

# --- Report results ---
loss_e2 = round(np.sum(np.square(y_pred - y)), 4)
print(f'2nd Prediction (y`): {y_pred[0][0]:.2f}')
print(f'Ground Truth (y): {y[0][0]:.2f}')
print(f'2nd MSE Loss: {loss_e2}')

# --- Backpropagation ---
# All gradients are computed from the current weights before any of them is updated.
grad_output = y_pred - y
grad_W2 = hidden_layer_output.T.dot(grad_output)
grad_b2 = grad_output.sum(axis=0, keepdims=True)
# ReLU derivative: no gradient where the pre-activation was <= 0.
grad_hidden = np.where(hidden_layer_input <= 0, 0.0, grad_output.dot(W2.T))
grad_W1 = X.T.dot(grad_hidden)
grad_b1 = grad_hidden.sum(axis=0, keepdims=True)

# --- Gradient-descent update (in place) ---
for param, grad in ((W1, grad_W1), (b1, grad_b1), (W2, grad_W2), (b2, grad_b2)):
    param -= learning_rate * grad


Epoch 3 - Repeat the above

In [None]:
# --- Forward pass (using the weights updated in epoch 2) ---
hidden_layer_input = X.dot(W1) + b1
hidden_layer_output = np.maximum(0, hidden_layer_input)  # ReLU
output_layer_input = hidden_layer_output.dot(W2) + b2
y_pred = output_layer_input  # the prediction, often written $\hat{y}$

# --- Report results ---
loss_e3 = round(np.sum(np.square(y_pred - y)), 4)
print(f'3rd Prediction (y`): {y_pred[0][0]:.2f}')
print(f'Ground Truth (y): {y[0][0]:.2f}')
print(f'3rd MSE Loss: {loss_e3}')

# --- Backpropagation ---
# All gradients are computed from the current weights before any of them is updated.
grad_output = y_pred - y
grad_W2 = hidden_layer_output.T.dot(grad_output)
grad_b2 = grad_output.sum(axis=0, keepdims=True)
# ReLU derivative: no gradient where the pre-activation was <= 0.
grad_hidden = np.where(hidden_layer_input <= 0, 0.0, grad_output.dot(W2.T))
grad_W1 = X.T.dot(grad_hidden)
grad_b1 = grad_hidden.sum(axis=0, keepdims=True)

# --- Gradient-descent update (in place) ---
for param, grad in ((W1, grad_W1), (b1, grad_b1), (W2, grad_W2), (b2, grad_b2)):
    param -= learning_rate * grad

# Compare the three printed losses: after 3 epochs the model is 'learning'
# and the loss is being minimized.

This can all be combined into a single function with a training loop.

In [None]:
def machine_learning(X, y, learning_rate, epochs):
    """Train a one-hidden-layer ReLU network with plain gradient descent.

    Each epoch runs a forward pass, computes the summed squared error,
    backpropagates it, updates the parameters, and prints one progress line.

    Args:
        X: 2-D array of shape (samples, features).
        y: 2-D array of shape (samples, outputs) holding the ground truth.
        learning_rate: Step size applied to every gradient update.
        epochs: Number of training iterations to run.

    Returns:
        dict with the trained parameters under the keys 'W1', 'b1', 'W2'
        and 'b2' (the same layout the notebook saves to disk).
    """
    #Seed is fixed to make the results reproducible. This is not used in production code.
    np.random.seed(0)

    #--- Input and Output of model ---
    input_size = X.shape[1] # Number of features
    output_size = y.shape[1] # Number of output neurons

    # -- Hyperparameters ---
    hidden_size = 4 # Number of neurons in the hidden layer
    # learning_rate is supplied by the caller and must not be overridden here.

    #--- Weights and biases initialization ---
    W1 = np.random.randn(input_size, hidden_size)
    b1 = np.zeros((1,hidden_size)) # leading 1 keeps the bias a 2-D row vector
    W2 = np.random.randn(hidden_size, output_size)
    b2 = np.zeros((1,output_size))

    for epoch in range(epochs):
        # --- Forward Pass ---
        hidden_layer_input = np.dot(X, W1) + b1
        hidden_layer_output = np.maximum(0, hidden_layer_input)  # ReLU
        output_layer_input = np.dot(hidden_layer_output, W2) + b2
        y_pred = output_layer_input

        # --- Loss Function ---
        loss = np.square(y_pred - y).sum()

        # --- Backpropagation ---
        # All gradients are computed from the current weights before any update.
        grad_output = y_pred - y
        grad_W2 = np.dot(hidden_layer_output.T, grad_output)
        grad_b2 = np.sum(grad_output, axis=0, keepdims=True)
        grad_hidden = np.dot(grad_output, W2.T)
        grad_hidden[hidden_layer_input <= 0] = 0  # ReLU passes no gradient where it output 0
        grad_W1 = np.dot(X.T, grad_hidden)
        # Computed locally so the update below never depends on a notebook global.
        grad_b1 = np.sum(grad_hidden, axis=0, keepdims=True)

        # --- Update Weights and Biases ---
        W1 -= learning_rate * grad_W1
        b1 -= learning_rate * grad_b1
        W2 -= learning_rate * grad_W2
        b2 -= learning_rate * grad_b2

        print (f'Epoch {epoch+1}, y: {y[0][0]:.2f}, y`: {y_pred[0][0]:.2f}, Loss:{loss:.4f}')

    return {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

# --- Training data ---
# One sample with three features, and its single target value.
X = np.array([
    [0.1, 0.2, 0.3],
])
y = np.array([
    [0.6],
])

# --- Run the training loop ---
# Change `epochs` to see how the model learns over more (or fewer) iterations.
# The trailing ';' stops the notebook from echoing the call's result under the cell.
machine_learning(X, y, epochs=10, learning_rate=0.1);

Save the model, load it back, and run inference. (This demo reuses the training sample X; in practice you would predict on unseen data.)

In [None]:
# --- Save model ---
# A trained model is just its weights and biases, so saving it means writing those out.
# PyTorch, TensorFlow, and other libraries do this with .save() or .state_dict() methods.
# Here the four arrays are bundled in a dict and written to a single .npy file.
model = dict(W1=W1, b1=b1, W2=W2, b2=b2)
np.save('model.npy', model)

In [None]:
# --- Load model ---
# The file holds a dict wrapped in a 0-d object array, so .item() unwraps it.
# CAUTION: allow_pickle=True can execute arbitrary code while loading;
# only use it on files you created yourself or otherwise trust.
model = np.load('model.npy', allow_pickle=True).item()
W1, b1, W2, b2 = (model[key] for key in ('W1', 'b1', 'W2', 'b2'))

# --- Inference ---
# Inference means using the trained model to make predictions.
# It is fast because it needs only a forward pass: a short series of dot
# products and activation functions using the final weights and biases.
# No gradients are computed and nothing is updated.

hidden_layer_input = X.dot(W1) + b1
hidden_layer_output = np.maximum(0, hidden_layer_input)  # ReLU
output_layer_input = hidden_layer_output.dot(W2) + b2
y_pred = output_layer_input

print(f'Inference, y: {y[0][0]:.2f}, y`: {y_pred[0][0]:.2f}')