In [1]:
import matplotlib.pyplot as plt
import numpy as np
import time
%matplotlib tk

In [20]:
## =============== ##
## Define our data ##
## =============== ##

# Model inputs as a column vector, one row per observation. Represents time in seconds.
# reshape(-1, 1) derives the number of rows from the data instead of hard-coding it.
x_data = np.array([0, 1, 2]).reshape(-1, 1)
# Outputs associated to each input, row by row. Represents amount of rainfall in mm^3.
t_data = np.array([0.2, 1.3, 2.4]).reshape(-1, 1)

# Every input needs exactly one target; fail early if the two lists drift apart.
assert x_data.shape == t_data.shape, "x_data and t_data must have the same shape"

## display
plt.plot(x_data, t_data, 'o', markersize=8, label='data observations')
plt.xlabel('tiempo')
plt.ylabel('cantidad de lluvia')
# The trailing semicolon keeps the Legend repr out of the cell output.
plt.legend();

<matplotlib.legend.Legend at 0x7f80ea6b8950>

In [21]:
## ======================================================= ##
## ======== functionality for computational graph ======== ##
## ======================================================= ##

## function implementing an activation function
def activation_function_linear(x):
    '''Linear (identity) activation: hands its argument back unchanged.'''
    activated = x
    return activated

## function that implements the computational graph
def computation_graph_linear(x,w,b):
    '''Forward pass of a linear computational graph: activation(x @ w + b).

    x sits on the left of the matrix product with w, and b is added to the
    product before the linear activation is applied.
    '''
    # affine pre-activation: matrix product of inputs and weights, plus the bias
    pre_activation = np.matmul(x, w) + b
    # the activation is applied last; its output is the prediction
    return activation_function_linear(pre_activation)

## function that implements the computational graph without the bias term
def computation_graph_linear_just_weight(x,w):
    '''Forward pass of a linear computational graph that has a weight but no bias: activation(x @ w).'''
    # matrix product of inputs and weights; nothing is added before the activation
    weighted_input = np.matmul(x, w)
    return activation_function_linear(weighted_input)

## function that initializes the values of a computational graph
def create_computation_graph_linear(n_in,n_out, rng=None):
    ''' Create the randomly initialised parameters of a linear computational graph.

    Args:
        n_in: number of inputs (rows of the weight matrix).
        n_out: number of outputs (columns of the weight matrix, length of the bias).
        rng: optional random source exposing ``standard_normal`` (for example
            ``np.random.default_rng(seed)`` or ``np.random.RandomState(seed)``).
            When omitted, NumPy's global random state is used, which draws the
            same values as the previous ``np.random.randn`` calls.

    Returns:
        Tuple ``(w, b)``. ``w`` has shape (n_in, n_out) with entries drawn from a
        Gaussian with mean 1 and standard deviation 1. ``b`` has shape (n_out,)
        with entries drawn from a Gaussian with mean 0 and standard deviation 5.
    '''
    # the numpy.random module itself exposes standard_normal on the global state
    source = np.random if rng is None else rng

    # weights: standard normal samples shifted by +1, i.e. centred on 1, not 0
    w = source.standard_normal((n_in, n_out)) + 1
    # bias: standard normal samples scaled by 5, i.e. standard deviation 5
    b = source.standard_normal((n_out,)) * 5

    return w, b

## function implementing squared loss function
def squared_loss_function_just_weight(x,t,w):
    '''Element-wise squared error of the bias-free linear model: (activation(x @ w) - t)**2.

    Nothing is summed here; the result keeps the broadcast shape of the residual.
    '''
    # residual between the network output and the targets
    residual = activation_function_linear(np.matmul(x, w)) - t
    return residual ** 2

def grad_squared_loss_just_weight(x,t,w):
    '''Gradient with respect to w of the summed squared loss of the bias-free linear model.

    The loss is sum((activation(x @ w) - t)**2). The derivative below treats the
    activation as the identity (derivative 1), which gives
    dL/dw = 2 * x^T (x @ w - t).

    Args:
        x: 2-D inputs, shape (N, n_in), one observation per row.
        t: targets, shape (N, n_out).
        w: weights, shape (n_in, n_out).

    Returns:
        Array of shape (n_in, n_out), i.e. the same shape as w, so it can be
        used directly in the update ``w - lr * grad_w``.
    '''
    # forward operation
    y_pred = activation_function_linear(np.matmul(x,w))

    # backward operation (compute gradients / backpropagation / reverse mode autodiff)
    residual = y_pred - t
    # x^T @ residual sums over the observations and yields one entry per weight.
    # An element-wise product with x would only have the right shape for n_in == 1.
    grad_w = 2 * np.matmul(np.transpose(x), residual)

    return grad_w

In [None]:
## ====================================== ##
## ========== Gradient Descent ========== ##
## ====================================== ##

## Animated gradient descent on a single weight.
## The left panel (ax1) shows the accumulated squared loss as a function of the
## weight together with the tangent at the current weight and the update step.
## The right panel (ax2) shows the data, the current function y = w*x and the
## per-point squared loss.

def _refresh_figure(fig, pause=0.0):
    '''Redraw fig, let the GUI backend process pending events, then wait pause seconds.'''
    fig.canvas.draw()
    fig.canvas.flush_events()
    if pause > 0:
        time.sleep(pause)

## number of points in the domain used to plot the functions
N_points_domain = 100
x_range = np.linspace(-1,4, N_points_domain).reshape((N_points_domain,1))

## specify our computational graph
n_in = 1
n_out = 1

## drawing constants shared by every animation frame
loss_ylim = (-200, 900)  # y-limits of the loss panel
marker_y = -190          # height at which the weights are marked on the loss panel
step_pause = 2           # seconds to wait between animation steps

## first of all draw loss function against a set of parameters
w_range = np.linspace(-10,15,500).reshape((500,n_in,n_out))

# matmul broadcasts over the leading axis of w_range: one loss per candidate weight and datapoint
loss_range = squared_loss_function_just_weight(x_data,t_data,w_range)

## accumulate loss per datapoint
loss_acc_range = np.sum(loss_range, axis = 1)

## squeeze and display
loss_acc_range = np.squeeze(loss_acc_range)
w_range = np.squeeze(w_range)

# plot grid
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 6))

# display loss function
ax1.plot(w_range, loss_acc_range, color = 'C0')
ax1.set_xlabel('Weight')
ax1.set_ylabel('Loss')

# Initialize parameters
w = np.array([9]).reshape(n_in,n_out)

## gradient descent parameters
lr = 0.025 # try 0.1, 0.01, 0.15, 0.21 to show: fast convergence, slow convergence, convergence with bumping, divergence
epochs = 10

for e in range(epochs):

    ## forward plus backward
    grad_w = grad_squared_loss_just_weight(x_data,t_data,w)

    ## compute function at current parameter value
    function = computation_graph_linear_just_weight(x_range, w)

    ## compute predictions at current parameter value
    y_pred = computation_graph_linear_just_weight(x_data, w)

    ## compute loss at current parameter value
    loss = squared_loss_function_just_weight(x_data,t_data,w)
    loss_acc = np.sum(loss)

    ## get the gradient function at the point w (tangent at the point):
    ## a line with slope grad_w that passes through (w, loss_acc)
    gradient_function_w_at_current_w = grad_w * w_range + loss_acc - grad_w * w
    tangent_plot = np.squeeze(gradient_function_w_at_current_w)

    ## ============= ##
    ## ============= ##
    ## START DRAWING ##
    ## ============= ##
    ## ============= ##
    # Clear previous data
    ax1.clear()
    ax2.clear()

    # squeezed copies: scalars / 1-D arrays are easier to plot and format
    w_plot = np.squeeze(w)
    grad_w_plot = np.squeeze(grad_w)
    x_data_plot = np.squeeze(x_data)
    t_data_plot = np.squeeze(t_data)
    y_pred_plot = np.squeeze(y_pred)
    loss_plot = np.squeeze(loss)

    # get new weight after grad descent. Just for illustration purposes, the real step is done at the end of the loop
    w_new_plot = np.squeeze(w-lr*grad_w)

    ## ================ ##
    ## function picture ##
    ax2.plot(x_range,function, color = 'C1', label = 'function: y = w*x')
    ax2.plot(x_data,t_data,'o', markersize = 8, label = 'data observations')

    ## plot squared loss associated at each point and draw line between dots to highlight what the loss measures
    for idx, (xi, ti, yi, sl) in enumerate(zip(x_data_plot,t_data_plot,y_pred_plot,loss_plot)):
        if idx == 0:
            # only the first marker carries a label so the legend has a single entry
            ax2.plot(xi,yi, 'x', color = 'C1', label = 'network prediction')
        else:
            ax2.plot(xi,yi, 'x', color = 'C1')
        ax2.plot([xi,xi], [ti, yi], '--',color = "C1", alpha = 0.5)
        ax2.text(xi, yi, f'{sl:.2f}', fontsize=12, va='top', color = "C1" )

    # label function with the weight at that moment.
    # Index both axes so the text position is given as scalars, not 1-element arrays.
    ax2.text(x_range[-20, 0], function[-20, 0], f'w = {w_plot}', color = 'k', fontsize = 12)

    ax2.text(1, 33, f"Iteration {e}, squared loss = {loss_acc:.2f}", fontsize=12, va='bottom', color = "C1" )
    ax2.set_xlabel('tiempo')
    ax2.set_ylabel('cantidad de lluvia')
    ax2.set_ylim([-10,30])
    ax2.legend()

    ## draw
    _refresh_figure(fig)

    ## ===================== ##
    ## loss function picture ##

    ## 0. label and axis limits
    ax1.set_xlabel('Weight')
    ax1.set_ylabel('Loss')
    ax1.set_ylim(loss_ylim)

    ## 1. display loss function
    ax1.plot(w_range, loss_acc_range, color = 'C0', label = 'loss', zorder = 20)

    ## 2. display current weight
    ax1.plot(w_plot, marker_y, '*', color = 'C1', label = 'current weight', zorder = 50, markersize = 10)
    ax1.text(w_plot + 1, marker_y , f"w = {w_plot:.2f}", fontsize=12, va='bottom', color = "C1" , zorder = 50)
    ax1.legend()

    _refresh_figure(fig, pause = step_pause)

    ## animation by drawing a vertical line from the bottom of the panel up to the loss at the current parameter
    ax1.vlines(w_plot, ymin=loss_ylim[0], ymax=loss_acc, color='k', linestyles='dotted', zorder = -50)

    ## 3. display current loss
    ax1.plot(w_plot, loss_acc, 'o', color = 'C0', label = 'loss at current weight', zorder = 20)
    ax1.text(w_plot + 0.5, loss_acc , f"loss = {loss_acc:.2f}", fontsize=12, va='bottom', color = "C0" , zorder = 50)
    ax1.legend()

    _refresh_figure(fig, pause = step_pause)

    ## 4. display the gradient function
    ax1.plot(w_range, tangent_plot, color = 'C2', label = 'gradient function: f(w) = grad_w * w + loss - grad_w * w', zorder = 20)
    ax1.text(w_range[-1], tangent_plot[-1], f"grad_w = {grad_w_plot:.2f}", fontsize=12, va='bottom', color = "C2" , zorder = 200)
    ax1.legend()

    _refresh_figure(fig, pause = step_pause)

    ## draw rest of lines to show update: horizontal line from the current weight to the updated one ...
    ax1.hlines(y = loss_acc, xmin=w_new_plot, xmax=w_plot, color='k', linestyles='dotted', zorder = -50)

    _refresh_figure(fig, pause = step_pause)

    ## ... and a vertical line down to the updated weight
    ax1.vlines(w_new_plot, ymin=loss_ylim[0], ymax=loss_acc, color='k', linestyles='dotted', zorder = -50)

    _refresh_figure(fig, pause = step_pause)

    ## 5. display new weight
    ax1.plot(w_new_plot, marker_y, '*', color = 'C3', label = 'updated weight: w_new = w - lr*grad_w', zorder = 200, markersize = 10)
    # lr is formatted with :g so that e.g. 0.025 is not rounded to 0.03, which would make the printed arithmetic wrong
    ax1.text(w_new_plot, -160, f"w_new = {w_plot:.2f} -{lr:g}*{grad_w_plot:.2f} = {w_new_plot:.2f}", fontsize=12, va='bottom', color = "C3" , zorder = 200)
    ax1.legend()

    _refresh_figure(fig, pause = step_pause)

    ## wait to see
    time.sleep(3)

    ## update parameter with gradient descent, for the next update
    w = w-lr*grad_w