# Simple neural network with PyTorch

Using one hidden layer with relU activation and squared error loss for the output.

In [1]:
import torch
from torch.autograd import Variable

## Create computational graph

In [2]:
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU
dtype = torch.FloatTensor

In [3]:
# N is batch size; D_in is input dimension; H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

In [4]:
# wrap tensors in Variables
# a Variable v is a node in a computational graph
# v.data is a tensor
# v.grad is another Variable holding the gradient of x wrt to some scalar
# requires_grad=False: no need to compute gradients during the backward pass
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

In [5]:
# wrap weights in variables
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

In [6]:
learning_rate = 1e-6

## Forward propagation

In [7]:
# no need to keep references to intermediate values since we are not implementing the backward pass by hand
y_pred = x.mm(w1).clamp(min=0).mm(w2)

In [8]:
# loss is a Variable of shape (1,)
# loss.data is a Tensor of shape (1,)
# loss.data[0] is a scalar value holding the loss
loss = (y_pred - y).pow(2).sum()
print(loss.data[0])

29532964.0


## Backpropagation using autograd

In [9]:
# Manually zero the gradients before running the backward pass
# will be needed when doing this in a loop
w1.grad.data.zero_()
w2.grad.data.zero_()


    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
[torch.FloatTensor of size 100x10]

In [10]:
# Use autograd to compute the complete backward pass
# w1.data and w2.data will be Variables holding the gradient of the loss with respect to w1 and w2 respectively
loss.backward()

In [13]:
# Update weights using gradient descent
# w1.data and w2.data are the weights 
# w1.grad.data and w2.grad.data are the gradients wrt to the weights
w1.data -= learning_rate * w1.grad.data
w2.data -= learning_rate * w2.grad.data

## Backprop loop

In [14]:
for t in range(500):
  y_pred = x.mm(w1).clamp(min=0).mm(w2)
  loss = (y_pred - y).pow(2).sum()
  if t % 10 == 0: print("Iteration: {}\t loss:{}".format(t, loss.data[0]))
    
  w1.grad.data.zero_()
  w2.grad.data.zero_()

  loss.backward()

  w1.data -= learning_rate * w1.grad.data
  w2.data -= learning_rate * w2.grad.data

Iteration: 0	 loss:21833930.0
Iteration: 10	 loss:1697919.25
Iteration: 20	 loss:171826.984375
Iteration: 30	 loss:51356.10546875
Iteration: 40	 loss:18955.466796875
Iteration: 50	 loss:8050.6484375
Iteration: 60	 loss:3793.65673828125
Iteration: 70	 loss:1937.0438232421875
Iteration: 80	 loss:1051.53466796875
Iteration: 90	 loss:596.9180297851562
Iteration: 100	 loss:350.2346496582031
Iteration: 110	 loss:210.4900360107422
Iteration: 120	 loss:128.79383850097656
Iteration: 130	 loss:79.85877227783203
Iteration: 140	 loss:50.01691436767578
Iteration: 150	 loss:31.564390182495117
Iteration: 160	 loss:20.0388240814209
Iteration: 170	 loss:12.78122329711914
Iteration: 180	 loss:8.183023452758789
Iteration: 190	 loss:5.255309581756592
Iteration: 200	 loss:3.383953332901001
Iteration: 210	 loss:2.183619499206543
Iteration: 220	 loss:1.4116019010543823
Iteration: 230	 loss:0.9141182899475098
Iteration: 240	 loss:0.5927556157112122
Iteration: 250	 loss:0.3848434388637543
Iteration: 260	 loss: