In [1]:
# Code in file autograd/two_layer_net_autograd.py
import torch
from torch.autograd import Variable

dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Variables during the backward pass.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Variables during the backward pass.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
  # Forward pass: compute predicted y using operations on Variables; these
  # are exactly the same operations we used to compute the forward pass using
  # Tensors, but we do not need to keep references to intermediate values since
  # we are not implementing the backward pass by hand.
  y_pred = x.mm(w1).clamp(min=0).mm(w2)
  
  # Compute and print loss using operations on Variables.
  # Now loss is a Variable of shape (1,) and loss.data is a Tensor of shape
  # (1,); loss.data[0] is a scalar value holding the loss.
  loss = (y_pred - y).pow(2).sum()
  print(t, loss.data[0])
  
  # Use autograd to compute the backward pass. This call will compute the
  # gradient of loss with respect to all Variables with requires_grad=True.
  # After this call w1.grad and w2.grad will be Variables holding the gradient
  # of the loss with respect to w1 and w2 respectively.
  loss.backward()

  # Update weights using gradient descent; w1.data and w2.data are Tensors,
  # w1.grad and w2.grad are Variables and w1.grad.data and w2.grad.data are
  # Tensors.
  w1.data -= learning_rate * w1.grad.data
  w2.data -= learning_rate * w2.grad.data

  # Manually zero the gradients 
  w1.grad.data.zero_()
  w2.grad.data.zero_()



0 tensor(3.0381e+07)
1 tensor(2.2981e+07)
2 tensor(2.0299e+07)
3 tensor(1.8700e+07)
4 tensor(1.6627e+07)
5 tensor(1.3689e+07)
6 tensor(1.0334e+07)
7 tensor(7.2445e+06)
8 tensor(1.00000e+06 *
       4.8503)
9 tensor(1.00000e+06 *
       3.1961)
10 tensor(1.00000e+06 *
       2.1316)
11 tensor(1.00000e+06 *
       1.4636)
12 tensor(1.00000e+06 *
       1.0449)
13 tensor(1.00000e+05 *
       7.7647)
14 tensor(1.00000e+05 *
       5.9878)
15 tensor(1.00000e+05 *
       4.7620)
16 tensor(1.00000e+05 *
       3.8826)
17 tensor(1.00000e+05 *
       3.2277)
18 tensor(1.00000e+05 *
       2.7241)
19 tensor(1.00000e+05 *
       2.3266)
20 tensor(1.00000e+05 *
       2.0064)
21 tensor(1.00000e+05 *
       1.7440)
22 tensor(1.00000e+05 *
       1.5256)
23 tensor(1.00000e+05 *
       1.3417)
24 tensor(1.00000e+05 *
       1.1855)
25 tensor(1.00000e+05 *
       1.0517)
26 tensor(93667.4766)
27 tensor(83688.8672)
28 tensor(74994.2031)
29 tensor(67387.7031)
30 tensor(60708.0547)
31 tensor(54814.9141)


391 tensor(1.00000e-03 *
       1.0175)
392 tensor(1.00000e-04 *
       9.8592)
393 tensor(1.00000e-04 *
       9.5280)
394 tensor(1.00000e-04 *
       9.2384)
395 tensor(1.00000e-04 *
       8.9280)
396 tensor(1.00000e-04 *
       8.6540)
397 tensor(1.00000e-04 *
       8.3971)
398 tensor(1.00000e-04 *
       8.1254)
399 tensor(1.00000e-04 *
       7.8516)
400 tensor(1.00000e-04 *
       7.5842)
401 tensor(1.00000e-04 *
       7.3690)
402 tensor(1.00000e-04 *
       7.1448)
403 tensor(1.00000e-04 *
       6.9386)
404 tensor(1.00000e-04 *
       6.7201)
405 tensor(1.00000e-04 *
       6.5115)
406 tensor(1.00000e-04 *
       6.3204)
407 tensor(1.00000e-04 *
       6.1427)
408 tensor(1.00000e-04 *
       5.9509)
409 tensor(1.00000e-04 *
       5.7960)
410 tensor(1.00000e-04 *
       5.6185)
411 tensor(1.00000e-04 *
       5.4747)
412 tensor(1.00000e-04 *
       5.2945)
413 tensor(1.00000e-04 *
       5.1639)
414 tensor(1.00000e-04 *
       5.0154)
415 tensor(1.00000e-04 *
       4.8763)
