In [1]:
import numpy as np

# Fit a two-layer fully connected ReLU network to random data with plain
# gradient descent, implementing the forward and backward passes by hand.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Seed numpy's global generator so that the random data, the initial weights
# and therefore the printed losses are identical on every run of this cell.
np.random.seed(0)

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y.
    # h is the hidden pre-activation, h_relu applies the ReLU non-linearity.
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss (sum of squared errors over the whole batch)
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of the loss with respect to w1 and w2.
    # Each line applies the chain rule one step further back through the
    # forward pass above.
    grad_y_pred = 2.0 * (y_pred - y)        # d(loss)/d(y_pred)
    grad_w2 = h_relu.T.dot(grad_y_pred)     # d(loss)/d(w2)
    grad_h_relu = grad_y_pred.dot(w2.T)     # d(loss)/d(h_relu)
    # ReLU passes the gradient through unchanged where its input was
    # non-negative and blocks it where the input was negative. Copy first so
    # grad_h_relu itself is not modified.
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)               # d(loss)/d(w1)

    # Update weights with one step of gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 39790405.63040405
1 35351362.98309002
2 31457561.85181899
3 24469432.051714174
4 16077411.470824562
5 9279603.281604454
6 5170980.676031596
7 3035958.4280475522
8 1970408.6065058778
9 1407998.974299888
10 1080224.8688999384
11 866513.0698874111
12 714003.4930162297
13 598501.4533967425
14 507371.1886547646
15 433779.59745261003
16 373387.5551833365
17 323307.8357475868
18 281273.0695333455
19 245712.17354216552
20 215486.91168461976
21 189629.496489537
22 167401.53218895098
23 148218.72253041278
24 131592.23081915738
25 117142.27681352958
26 104550.81335277238
27 93533.29596371009
28 83843.96643112894
29 75307.82836401036
30 67766.25789154563
31 61085.562540082916
32 55155.63600887217
33 49880.159440235395
34 45176.71139718444
35 40973.05298449869
36 37209.7342049423
37 33834.68660541959
38 30803.281575393194
39 28076.783787170065
40 25620.31003030524
41 23402.418551968516
42 21398.18675167228
43 19587.350668829422
44 17948.747190727874
45 16462.224987174588
46 15111.80331343602
47 1

^^ Warm-up: numpy
Before introducing PyTorch, we will first implement the network using numpy.

Numpy provides an n-dimensional array object, and many functions for manipulating these arrays. Numpy is a generic framework for scientific computing; it does not know anything about computation graphs, or deep learning, or gradients. However, we can easily use numpy to fit a two-layer network to random data by manually implementing the forward and backward passes through the network using numpy operations:

PyTorch: Tensors and autograd
In the above examples, we had to manually implement both the forward and backward passes of our neural network. Manually implementing the backward pass is not a big deal for a small two-layer network, but can quickly get very hairy for large complex networks.

Thankfully, we can use automatic differentiation to automate the computation of backward passes in neural networks. The autograd package in PyTorch provides exactly this functionality. When using autograd, the forward pass of your network will define a computational graph; nodes in the graph will be Tensors, and edges will be functions that produce output Tensors from input Tensors. Backpropagating through this graph then allows you to easily compute gradients.

This sounds complicated, but it’s pretty simple to use in practice. Each Tensor represents a node in a computational graph. If x is a Tensor that has x.requires_grad=True then x.grad is another Tensor holding the gradient of some scalar value with respect to x.

Here we use PyTorch Tensors and autograd to implement our two-layer network; now we no longer need to manually implement the backward pass through the network:

In [3]:
# -*- coding: utf-8 -*-
import torch

# Fit a two-layer fully connected ReLU network to random data, letting
# autograd compute the backward pass.

dtype = torch.float
# Run on the GPU when one is available and fall back to the CPU otherwise, so
# this cell also works on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Seed the random number generator so that the random data and the initial
# weights are the same on every run of this cell.
torch.manual_seed(0)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# requires_grad is left at its default of False, which indicates that we do
# not need to compute gradients with respect to these Tensors during the
# backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors.
    # x.mm(w1) is the hidden layer, clamp(min=0) applies the ReLU and
    # .mm(w2) produces the output. We do not need to keep references to the
    # intermediate values since we are not implementing the backward pass
    # by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # loss is a zero-dimensional Tensor (its shape is ());
    # loss.item() gets the scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights; backward()
        # accumulates into .grad, so stale values would otherwise be added to
        # the next iteration's gradients.
        w1.grad.zero_()
        w2.grad.zero_()

0 28349808.0
1 22321058.0
2 20010376.0
3 18303112.0
4 16003870.0
5 12884274.0
6 9564278.0
7 6625170.5
8 4428145.0
9 2934918.5
10 1985720.0
11 1392191.25
12 1020905.5
13 782016.875
14 622424.5625
15 510774.0
16 429086.3125
17 366645.625
18 317201.375
19 276993.03125
20 243570.71875
21 215356.71875
22 191216.625
23 170415.1875
24 152421.25
25 136692.71875
26 122883.359375
27 110721.1484375
28 99993.0
29 90497.421875
30 82047.7890625
31 74514.234375
32 67780.0234375
33 61752.3125
34 56363.12890625
35 51524.43359375
36 47166.1328125
37 43235.1484375
38 39679.5
39 36459.140625
40 33539.21875
41 30892.853515625
42 28484.873046875
43 26292.73828125
44 24293.60546875
45 22468.052734375
46 20798.646484375
47 19270.3359375
48 17870.283203125
49 16585.92578125
50 15409.5029296875
51 14328.341796875
52 13333.5810546875
53 12416.6728515625
54 11571.3984375
55 10791.607421875
56 10071.951171875
57 9406.5361328125
58 8790.986328125
59 8221.302734375
60 7693.255859375
61 7203.74609375
62 6749.64160156