# Backpropagation in NumPy vs TensorFlow vs PyTorch

## 1) NumPy

In [1]:
import numpy as np

In [2]:
# Shape shared by every matrix in the NumPy / TensorFlow sections.
rowdim = 4  # number of rows
coldim = 3  # number of columns

#### create data

In [3]:
# Build the three input matrices, all of shape (rowdim, coldim):
#   Xdata - samples from a standard normal (no seed is set, so values differ per run)
#   Ydata - all ones
#   Zdata - the integers 1 .. rowdim*coldim, filled row by row
Xdata = np.random.randn(rowdim, coldim)
print(Xdata)
Ydata = np.ones((rowdim, coldim))
print(Ydata)
# Derive Zdata's size and shape from rowdim/coldim rather than hard-coding 4x3,
# so the element-wise ops further down stay shape-compatible if the dims change.
Zdata = np.arange(1, rowdim * coldim + 1).reshape(rowdim, coldim)
print(Zdata)

[[ 0.43787304  0.96557349  1.0921883 ]
 [ 0.13662005 -0.38701585  0.68148381]
 [-0.09396063  1.12197495 -1.1509547 ]
 [ 0.90929498 -0.33493472 -0.35300799]]
[[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]]
[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]


#### forward computations

In [4]:
# First graph node: the Hadamard (element-by-element) product of X and Y,
# not a matrix product.
node1 = np.multiply(Xdata, Ydata)
node1

array([[ 0.43787304,  0.96557349,  1.0921883 ],
       [ 0.13662005, -0.38701585,  0.68148381],
       [-0.09396063,  1.12197495, -1.1509547 ],
       [ 0.90929498, -0.33493472, -0.35300799]])

In [5]:
# Second graph node: add Z element-wise to the product computed above.
node2 = np.add(node1, Zdata)
node2

array([[  1.43787304,   2.96557349,   4.0921883 ],
       [  4.13662005,   4.61298415,   6.68148381],
       [  6.90603937,   9.12197495,   7.8490453 ],
       [ 10.90929498,  10.66506528,  11.64699201]])

In [6]:
# Final graph node: reduce the whole matrix to a single scalar by summing
# every element.
node3 = node2.sum()
node3

81.025134745767133

#### backprop

In [8]:
# Name the final result r (r = node3).
# The seed gradient dr/dnode3 is the derivative of r with respect to itself: 1.
grad_node3 = 1.0

In [9]:
# Chain rule: dr/dnode2 = dr/dnode3 * dnode3/dnode2.
# node3 is a plain sum, so dnode3/dnode2 is 1 for each element of node2;
# the upstream gradient is simply spread over a (rowdim, coldim) matrix of ones.
grad_node2 = np.ones((rowdim, coldim)) * grad_node3
grad_node2

array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])

In [10]:
# Chain rule, one step further back:
#   dr/dnode1 = dr/dnode3 * dnode3/dnode2 * dnode2/dnode1
# node2 = node1 + Z, so dnode2/dnode1 is 1 and the gradient passes through
# unchanged (copied so the two arrays stay independent).
grad_node1 = np.copy(grad_node2)
grad_node1

array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])

In [11]:
# Chain rule for the other input of the addition:
#   dr/dZ = dr/dnode3 * dnode3/dnode2 * dnode2/dZ
# node2 = node1 + Z, so dnode2/dZ is 1 and the gradient passes through
# unchanged (copied so the two arrays stay independent).
grad_Z = np.copy(grad_node2)
grad_Z

array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])

In [12]:
# Chain rule through the multiplication node:
#   dr/dY = dr/dnode3 * dnode3/dnode2 * dnode2/dnode1 * dnode1/dY
# node1 = X * Y (element-wise), so dnode1/dY is X.
grad_Y = np.multiply(grad_node1, Xdata)
grad_Y

array([[ 0.43787304,  0.96557349,  1.0921883 ],
       [ 0.13662005, -0.38701585,  0.68148381],
       [-0.09396063,  1.12197495, -1.1509547 ],
       [ 0.90929498, -0.33493472, -0.35300799]])

In [13]:
# Chain rule through the multiplication node:
#   dr/dX = dr/dnode3 * dnode3/dnode2 * dnode2/dnode1 * dnode1/dX
# node1 = X * Y (element-wise), so dnode1/dX is Y.
grad_X = np.multiply(grad_node1, Ydata)
grad_X

array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])

## 2) TensorFlow

In [14]:
import tensorflow as tf

#### define graph

In [15]:
# Symbolic float32 inputs for the graph; created in the order X, Y, Z.
# No shape is given here; concrete arrays are supplied later via feed_dict.
X, Y, Z = (tf.placeholder(tf.float32) for _ in range(3))

In [16]:
# Element-wise product of the two placeholders. This yields a symbolic
# tf.Tensor; values are only produced when the graph is run in a session.
node1 = X * Y
node1

<tf.Tensor 'mul:0' shape=<unknown> dtype=float32>

In [17]:
# Add the third placeholder to the product. Still symbolic: the result is a
# tf.Tensor, not a numeric array.
node2 = node1 + Z
node2

<tf.Tensor 'add:0' shape=<unknown> dtype=float32>

In [18]:
# Reduce node2 to a single value by summing; this is the output whose
# gradients are requested from tf.gradients below.
node3 = tf.reduce_sum(node2)
node3

<tf.Tensor 'Sum:0' shape=<unknown> dtype=float32>

#### nodes to get the gradients

In [20]:
# Signature, as quoted from the TensorFlow docs:
#   tf.gradients(ys, xs, grad_ys=None, name='gradients',
#                colocate_gradients_with_ops=False, gate_gradients=False,
#                aggregation_method=None)
# It constructs symbolic partial derivatives of sum of `ys` w.r.t. x in `xs`,
# returning one gradient tensor per entry of `xs`, in the same order.
grad_X, grad_Y, grad_Z = tf.gradients(ys=node3, xs=[X, Y, Z])

#### run

In [21]:
# Evaluate the forward result and all three gradients in a single run,
# feeding the NumPy arrays into the placeholders.
with tf.Session() as sess:
    node3_eval, grad_X_eval, grad_Y_eval, grad_Z_eval = sess.run(
        fetches=[node3, grad_X, grad_Y, grad_Z],
        feed_dict={X: Xdata, Y: Ydata, Z: Zdata},
    )
    # One value per line, in the order: sum, dX, dY, dZ.
    print(node3_eval, grad_X_eval, grad_Y_eval, grad_Z_eval, sep="\n")

81.0251
[[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]]
[[ 0.43787304  0.96557349  1.09218836]
 [ 0.13662006 -0.38701585  0.68148381]
 [-0.09396063  1.12197495 -1.15095472]
 [ 0.90929496 -0.33493471 -0.35300797]]
[[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]]


## 3) PyTorch

In [1]:
import torch
from torch.autograd import Variable

#### Variables hold data (Tensors) and gradients

In [3]:
rowdim, coldim = 4,3

# Leaf variables of shape (rowdim, coldim); requires_grad=True asks autograd
# to accumulate a gradient for each of them during backward().
#   X - standard-normal samples
#   Y - all ones
#   Z - 1 .. rowdim*coldim, filled row by row
X = Variable(torch.randn(rowdim, coldim), requires_grad = True)
Y = Variable(torch.ones(rowdim, coldim), requires_grad = True)
# The explicit .float() matters on current PyTorch: arange with integer bounds
# produces an integer tensor there, and only floating-point tensors may require
# gradients. The length is derived from rowdim*coldim so the view always fits.
Z = Variable(torch.arange(1, rowdim * coldim + 1).float().view(rowdim, coldim), requires_grad = True)

X, Y, Z

(Variable containing:
 -0.1362 -0.6896  1.0246
 -0.6552  1.0568 -0.0626
  0.5539  0.2326  0.0676
  0.1412  0.1692  0.3174
 [torch.FloatTensor of size 4x3], Variable containing:
  1  1  1
  1  1  1
  1  1  1
  1  1  1
 [torch.FloatTensor of size 4x3], Variable containing:
   1   2   3
   4   5   6
   7   8   9
  10  11  12
 [torch.FloatTensor of size 4x3])

#### forward computation

In [4]:
# Element-wise product of X and Y, recorded by autograd for the backward pass.
node1 = X.mul(Y)
node1

Variable containing:
-0.1362 -0.6896  1.0246
-0.6552  1.0568 -0.0626
 0.5539  0.2326  0.0676
 0.1412  0.1692  0.3174
[torch.FloatTensor of size 4x3]

In [5]:
# Element-wise addition of Z to the product.
node2 = node1.add(Z)
node2

Variable containing:
  0.8638   1.3104   4.0246
  3.3448   6.0568   5.9374
  7.5539   8.2326   9.0676
 10.1412  11.1692  12.3174
[torch.FloatTensor of size 4x3]

In [6]:
# Sum every element of node2 to get the final value that backward() starts from.
node3 = node2.sum()
node3

Variable containing:
 80.0198
[torch.FloatTensor of size 1]

#### autograd!

In [7]:
# Run backpropagation starting from node3; afterwards the gradients can be
# read from the .grad attribute of X, Y and Z.
node3.backward()

In [8]:
# Show the accumulated gradients in the order dX, dY, dZ, one after another.
print(X.grad.data, Y.grad.data, Z.grad.data, sep="\n")


 1  1  1
 1  1  1
 1  1  1
 1  1  1
[torch.FloatTensor of size 4x3]


-0.1362 -0.6896  1.0246
-0.6552  1.0568 -0.0626
 0.5539  0.2326  0.0676
 0.1412  0.1692  0.3174
[torch.FloatTensor of size 4x3]


 1  1  1
 1  1  1
 1  1  1
 1  1  1
[torch.FloatTensor of size 4x3]

