# Simple neural network with PyTorch: nn

This notebook trains a network with one hidden layer, a ReLU activation, and a squared-error loss on the output.

In [1]:
import torch
from torch.autograd import Variable

## Create computational graph

In [2]:
# Tensor type used for the input and target data below: 32-bit floats on the CPU.
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU
# NOTE: in this notebook dtype is only applied to the data tensors via .type(dtype);
# to run on the GPU the model would also have to be moved there (e.g. model.cuda()).
dtype = torch.FloatTensor

In [3]:
# Problem dimensions used throughout the notebook.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [4]:
# Wrap the input and target tensors in Variables.
# A Variable v is a node in the computational graph:
#   v.data is the underlying tensor
#   v.grad is another Variable holding the gradient of some scalar (e.g. the loss) with respect to v
# requires_grad=False: no gradients are computed for x and y during the backward pass
# (they are fixed data, not parameters being learned).
# x has shape (N, D_in) and y has shape (N, D_out); both are filled with standard-normal noise.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

In [5]:
# Use the nn package to define our model as a sequence of layers.
# Sequential applies its modules in order: a linear map from D_in to H features,
# a ReLU non-linearity, then a linear map from H to D_out outputs.
# Each Linear layer holds its own learnable weight and bias.
model = torch.nn.Sequential(
          torch.nn.Linear(D_in, H),
          torch.nn.ReLU(),
          torch.nn.Linear(H, D_out),
        )

In [6]:
# Loss function: sum of squared errors over every element of the batch.
# reduction='sum' replaces the deprecated size_average=False argument and
# produces the same value (squared errors are summed, not averaged).
loss_fn = torch.nn.MSELoss(reduction='sum')

In [7]:
# Step size for the manual gradient-descent updates performed below.
learning_rate = 1e-4

## Forward propagation

In [8]:
# Forward pass. Module objects override the __call__ operator, so the model can be
# called like a function; y_pred is the model's output for the input batch x.
y_pred = model(x)

In [9]:
# Compare the predictions with the targets; the result is the scalar that
# backward() is later called on.
loss = loss_fn(y_pred, y)

## Backpropagation using autograd

From PyTorch documentation:

Each Variable has a `.creator` attribute that points to the Function of which it is an output. (In later PyTorch releases this attribute is named `.grad_fn`.)

This is an entry point to a directed acyclic graph (DAG) consisting of Function objects as nodes, and references between them being the edges.

Every time an operation is performed, a new Function representing it is instantiated, its forward() method is called, and the creators of its output Variables are set to it.

Then, by following the path from any Variable to the leaves, it is possible to reconstruct the sequence of operations that has created the data, and automatically compute the gradients.

In [10]:
# Reset the gradients of all model parameters before the backward pass;
# backward() accumulates into .grad rather than overwriting it.
model.zero_grad()

In [11]:
# Backward pass: compute the gradient of the loss with respect to every model
# parameter; the results are stored in each parameter's .grad.
loss.backward()

In [12]:
# One step of plain gradient descent on every parameter of the model.
# The update is applied in place to the raw .data tensors, using the
# gradients left in .grad by the preceding backward pass.
for param in model.parameters():
    param.data.sub_(learning_rate * param.grad.data)

## Training loop

In [13]:
# Training loop: repeat forward pass, loss, backward pass and parameter update
# for 500 iterations, printing the loss on every 10th iteration.
for t in range(500):
    # Forward pass and loss for the current parameters.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if t % 10 == 0:
        # .item() extracts the scalar loss as a Python float. Indexing with
        # loss.data[0] raises an IndexError on 0-dim tensors in current
        # PyTorch releases.
        print(t, loss.item())
    # Clear gradients left over from the previous iteration, then backpropagate.
    model.zero_grad()
    loss.backward()
    # Gradient-descent step applied to the raw parameter tensors.
    for param in model.parameters():
        param.data -= learning_rate * param.grad.data

0 598.1449584960938
10 338.0163269042969
20 200.1707000732422
30 112.33590698242188
40 60.30597686767578
50 31.86884307861328
60 16.918045043945312
70 9.090893745422363
80 4.994498252868652
90 2.8065693378448486
100 1.6137747764587402
110 0.9488677382469177
120 0.5651736855506897
130 0.3429165780544281
140 0.2116415649652481
150 0.13264800608158112
160 0.08452332764863968
170 0.054514262825250626
180 0.03551381081342697
190 0.02335045486688614
200 0.015474722720682621
210 0.01032930240035057
220 0.006937050726264715
230 0.004683402366936207
240 0.0031785054598003626
250 0.0021677673794329166
260 0.0014843856915831566
270 0.0010201847180724144
280 0.0007036924944259226
290 0.000486893201014027
300 0.0003379075205884874
310 0.00023514726490247995
320 0.0001640560949454084
330 0.00011472574988147244
340 8.040101238293573e-05
350 5.6457967730239034e-05
360 3.971722617279738e-05
370 2.7993079129373655e-05
380 1.975927261810284e-05
390 1.3967529412184376e-05
400 9.888683962344658e-06
410 7.0