In [None]:
%matplotlib inline


PyTorch: Defining New autograd Functions
----------------------------------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
Tensors, and uses PyTorch autograd to compute gradients.

Here we define our own custom autograd function to compute the ReLU
nonlinearity, rather than relying on a built-in operation.



In [1]:
import torch


class MyReLU(torch.autograd.Function):
    """
    ReLU written as a custom autograd Function.

    A new differentiable operation is defined by subclassing
    torch.autograd.Function and supplying static ``forward`` and ``backward``
    methods, both of which work directly on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Compute the element-wise ReLU of ``input``.

        ``ctx`` is the context object shared with ``backward``. The input
        Tensor is stashed on it via ``ctx.save_for_backward`` so the backward
        pass can later tell which elements were negative.

        Returns a Tensor equal to ``input`` with every element clamped to a
        minimum of 0.
        """
        output = torch.clamp(input, min=0)
        ctx.save_for_backward(input)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """
        Propagate the gradient through the ReLU.

        ``grad_output`` is the gradient of the loss with respect to the output
        of ``forward``. The returned Tensor is the gradient of the loss with
        respect to the input: a copy of ``grad_output`` in which every
        position whose saved input was strictly negative is set to 0.
        """
        (saved_input,) = ctx.saved_tensors
        # Out-of-place fill: grad_output itself is never modified.
        return grad_output.masked_fill(saved_input < 0, 0)


# Compute device and floating-point precision shared by every Tensor below.
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU
dtype = torch.float

# Problem dimensions.
N = 64  # batch size
D_in = 1000  # input dimension
H = 100  # hidden dimension
D_out = 10  # output dimension

# Random input batch and the random targets the network is fitted to.
x = torch.randn((N, D_in), dtype=dtype, device=device)
y = torch.randn((N, D_out), dtype=dtype, device=device)

# Randomly initialised weight matrices (the network has no biases). They are
# flagged as requiring gradients so that autograd tracks operations on them.
w1 = torch.randn((D_in, H), dtype=dtype, device=device).requires_grad_()
w2 = torch.randn((H, D_out), dtype=dtype, device=device).requires_grad_()

# Step size used for the gradient-descent weight updates.
learning_rate = 1e-6
# A custom autograd Function is invoked through its ``apply`` method. The
# alias never changes between iterations, so it is bound once, outside the loop.
relu = MyReLU.apply

for t in range(500):
    # Forward pass: hidden layer -> custom ReLU -> output layer.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Squared Euclidean distance between prediction and target. ``.item()``
    # pulls the scalar out as a plain Python float before it is formatted.
    loss = (y_pred - y).pow(2).sum()
    print("%4d\t%.6f" % (t, loss.item()))

    # Backward pass: autograd fills in w1.grad and w2.grad.
    loss.backward()

    # The weight update is plain arithmetic that must not itself be recorded
    # by autograd, hence the no_grad() context.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Gradients accumulate across backward() calls, so reset them before
        # the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

   0	54046960.000000
   1	55070600.000000
   2	46665268.000000
   3	27945894.000000
   4	12396962.000000
   5	5251271.000000
   6	2775569.000000
   7	1860413.250000
   8	1416878.375000
   9	1138690.500000
  10	937261.875000
  11	781948.562500
  12	658709.812500
  13	559323.062500
  14	478091.625000
  15	411089.218750
  16	355327.406250
  17	308593.875000
  18	269250.656250
  19	235910.250000
  20	207449.953125
  21	183025.984375
  22	161971.046875
  23	143749.812500
  24	127946.617188
  25	114168.718750
  26	102113.039062
  27	91527.234375
  28	82210.187500
  29	73987.578125
  30	66709.687500
  31	60248.187500
  32	54499.656250
  33	49374.042969
  34	44789.984375
  35	40686.312500
  36	37007.113281
  37	33701.589844
  38	30728.539062
  39	28048.916016
  40	25633.107422
  41	23452.644531
  42	21479.587891
  43	19691.175781
  44	18070.726562
  45	16598.136719
  46	15258.511719
  47	14038.626953
  48	12926.473633
  49	11911.927734
  50	10985.184570
  51	10136.625000
  52	9359.799805
  53	