# Backpropagation in NumPy vs TensorFlow vs PyTorch

## 1) NumPy

In [1]:
import numpy as np

In [2]:
# Shape shared by every matrix in the NumPy walkthrough below.
rowdim = 4
coldim = 3

#### create data

In [4]:
# X: random inputs drawn from a standard normal distribution.
Xdata = np.random.randn(rowdim, coldim)
print(Xdata)
# Y: a matrix of ones, so the element-wise product X * Y is easy to check by eye.
Ydata = np.ones((rowdim, coldim))
print(Ydata)
# Z: the integers 1..rowdim*coldim in row-major order. The range and shape are
# derived from rowdim/coldim (instead of the literals 13 and (4, 3)) so that
# all three inputs stay the same shape if the dimensions are changed.
Zdata = np.arange(1, rowdim * coldim + 1).reshape(rowdim, coldim)
print(Zdata)

[[ 0.07687585  0.11738723 -0.95467756]
 [ 1.27800295  0.1663312   0.30979514]
 [-1.19850963  0.3288992  -0.82811468]
 [ 1.25750666  1.02112264 -1.2381702 ]]
[[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]]
[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]


#### forward computations

In [5]:
# Multiplication node: element-wise (Hadamard) product of X and Y,
# not a matrix product.
node1 = np.multiply(Xdata, Ydata)
node1

array([[ 0.07687585,  0.11738723, -0.95467756],
       [ 1.27800295,  0.1663312 ,  0.30979514],
       [-1.19850963,  0.3288992 , -0.82811468],
       [ 1.25750666,  1.02112264, -1.2381702 ]])

In [6]:
# Addition node: element-wise sum of the product and Z.
node2 = np.add(node1, Zdata)
node2

array([[  1.07687585,   2.11738723,   2.04532244],
       [  5.27800295,   5.1663312 ,   6.30979514],
       [  5.80149037,   8.3288992 ,   8.17188532],
       [ 11.25750666,  12.02112264,  10.7618298 ]])

In [7]:
# Reduction node: collapse the whole matrix to one scalar by summing every entry.
node3 = node2.sum()
node3

78.336448806556831

#### backprop

In [8]:
# Seed of the backward pass. Call the final result r; r is node3 itself,
# so dr/dnode3 is simply 1.
grad_node3 = 1.0

In [9]:
# Backpropagate through the sum.
# Every entry of node2 enters node3 with weight 1, so the local derivative
# dnode3/dnode2 is a matrix of ones with the same shape as node2.
dnode3_dnode2 = np.ones((rowdim, coldim))
# Chain rule: dr/dnode2 = dr/dnode3 * dnode3/dnode2
grad_node2 = grad_node3 * dnode3_dnode2
grad_node2

array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])

In [10]:
# Backpropagate through the addition node2 = node1 + Z (node1 branch).
# dnode2/dnode1 is 1 element-wise, so
# dr/dnode1 = dr/dnode3 * dnode3/dnode2 * dnode2/dnode1 = dr/dnode2.
# A fresh copy is taken so grad_node1 does not alias grad_node2.
grad_node1 = np.array(grad_node2, copy=True)
grad_node1

array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])

In [11]:
# Backpropagate through the addition node2 = node1 + Z (Z branch).
# dnode2/dZ is 1 element-wise, so
# dr/dZ = dr/dnode3 * dnode3/dnode2 * dnode2/dZ = dr/dnode2.
# A fresh copy is taken so grad_Z does not alias grad_node2.
grad_Z = np.array(grad_node2, copy=True)
grad_Z

array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])

In [12]:
# Backpropagate through the product node1 = X * Y (Y branch).
# The local derivative dnode1/dY is X element-wise, so
# dr/dY = dr/dnode3 * dnode3/dnode2 * dnode2/dnode1 * dnode1/dY = dr/dnode1 * X.
grad_Y = np.multiply(grad_node1, Xdata)
grad_Y

array([[ 0.07687585,  0.11738723, -0.95467756],
       [ 1.27800295,  0.1663312 ,  0.30979514],
       [-1.19850963,  0.3288992 , -0.82811468],
       [ 1.25750666,  1.02112264, -1.2381702 ]])

In [13]:
# Backpropagate through the product node1 = X * Y (X branch).
# The local derivative dnode1/dX is Y element-wise, so
# dr/dX = dr/dnode3 * dnode3/dnode2 * dnode2/dnode1 * dnode1/dX = dr/dnode1 * Y.
grad_X = np.multiply(grad_node1, Ydata)
grad_X

array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])

## 2) TensorFlow

In [14]:
import tensorflow as tf

#### define graph

In [15]:
# One float32 placeholder per graph input, created in the order X, Y, Z.
# No static shape is declared; the values are supplied through feed_dict at run time.
X, Y, Z = (tf.placeholder(tf.float32) for _ in range(3))

In [16]:
# Multiplication node of the graph: symbolic element-wise product of the X and Y
# placeholders. The cell displays the resulting tf.Tensor handle, not a value.
node1 = X * Y
node1

<tf.Tensor 'mul:0' shape=<unknown> dtype=float32>

In [17]:
# Addition node of the graph: symbolic element-wise sum of node1 and the Z placeholder.
node2 = node1 + Z
node2

<tf.Tensor 'add:0' shape=<unknown> dtype=float32>

In [18]:
# Reduction node of the graph: sums all entries of node2
# (no axis is given to tf.reduce_sum).
node3 = tf.reduce_sum(node2)
node3

<tf.Tensor 'Sum:0' shape=<unknown> dtype=float32>

#### nodes to get the gradients

In [19]:
# From the TensorFlow docs:
#   tf.gradients(ys, xs, grad_ys=None, name='gradients', ...)
#   "Constructs symbolic partial derivatives of sum of `ys` w.r.t. x in `xs`."
# One gradient tensor is returned per entry of `xs`, in the same order.
grad_X, grad_Y, grad_Z = tf.gradients(ys=node3, xs=[X, Y, Z])

#### run

In [20]:
# Everything to evaluate in a single pass: the forward result and the three gradients.
fetches = [node3, grad_X, grad_Y, grad_Z]
# Bind each placeholder to the NumPy array of the same role from section 1.
feed = {X: Xdata, Y: Ydata, Z: Zdata}

with tf.Session() as sess:
    results = sess.run(fetches, feed_dict=feed)

node3_eval, grad_X_eval, grad_Y_eval, grad_Z_eval = results
# Printed in fetch order: node3, dX, dY, dZ.
for evaluated in results:
    print(evaluated)

78.3364
[[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]]
[[ 0.07687585  0.11738723 -0.95467758]
 [ 1.27800298  0.1663312   0.30979514]
 [-1.19850957  0.3288992  -0.82811469]
 [ 1.25750661  1.02112269 -1.23817015]]
[[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]]


## 3) PyTorch

In [None]:
import torch
from torch.autograd import Variable

#### Variables hold data (Tensors) and gradients

In [None]:
# Same shape as in the NumPy and TensorFlow sections.
rowdim, coldim = 4, 3

# Leaf variables of the graph; requires_grad=True asks autograd to track them
# so that .grad is available after the backward pass.
X = Variable(torch.randn(rowdim, coldim), requires_grad=True)
Y = Variable(torch.ones(rowdim, coldim), requires_grad=True)
# torch.arange with integer bounds yields an integer tensor on current PyTorch,
# and only floating-point tensors may require gradients, so cast explicitly.
# The upper bound is derived from rowdim/coldim so the view always fits.
Z = Variable(
    torch.arange(1, rowdim * coldim + 1).float().view(rowdim, coldim),
    requires_grad=True,
)

X, Y, Z

#### forward computation

In [None]:
# Multiplication node: element-wise product of X and Y.
node1 = torch.mul(X, Y)
node1

In [None]:
# Addition node: element-wise sum of the product and Z.
node2 = torch.add(node1, Z)
node2

In [None]:
# Reduction node: sum every entry of node2 into a single scalar.
node3 = node2.sum()
node3

#### autograd!

In [None]:
# Run the backward pass from the scalar node3. The next cell reads the
# resulting gradients from X.grad, Y.grad and Z.grad.
node3.backward()

In [None]:
# Show the gradient of node3 with respect to each input, in the order X, Y, Z.
for leaf in (X, Y, Z):
    print(leaf.grad.data)