In [19]:
%matplotlib inline


PyTorch: nn
-----------

A fully-connected ReLU network with one hidden layer, trained to predict y from x
by minimizing squared Euclidean distance.

This implementation uses the nn package from PyTorch to build the network.
PyTorch autograd makes it easy to define computational graphs and take gradients,
but raw autograd can be a bit too low-level for defining complex neural networks;
this is where the nn package can help. The nn package defines a set of Modules,
each of which you can think of as a neural network layer that produces output
from input and may have some trainable weights.



In [20]:
import torch

N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [21]:
# Random input batch of shape (N, D_in) and random targets of shape (N, D_out).
# x is drawn first, then y, so the random-number stream is consumed in that order.
x, y = torch.randn(N, D_in), torch.randn(N, D_out)

In [22]:
# Linear(D_in -> H), ReLU, Linear(H -> D_out), applied in that order when
# model(...) is called. The layers are constructed in the same order in which
# they are listed.
layers = [
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)

In [23]:
# Squared error summed over every element of the prediction. reduction='sum'
# is the supported spelling of the deprecated size_average=False argument and
# yields the same value without the deprecation warning.
loss_fn = torch.nn.MSELoss(reduction='sum')

In [24]:
# Step size used by the manual gradient-descent update in the training loop.
learning_rate = 0.0001

In [25]:
# Manual gradient descent for 500 iterations.
for t in range(500):
    # Discard gradients left over from the previous iteration so backward()
    # below does not accumulate onto them.
    model.zero_grad()

    # Forward pass, then report the scalar loss for this iteration.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Backward pass: fills param.grad for each parameter of the model.
    loss.backward()

    # In-place update of every parameter; no_grad() keeps the update itself
    # from being tracked by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

0 627.1444091796875
1 582.899658203125
2 544.015869140625
3 509.1175842285156
4 477.9831237792969
5 450.1801452636719
6 424.7296142578125
7 401.2441101074219
8 379.7007141113281
9 359.6974182128906
10 340.947998046875
11 323.2514343261719
12 306.60595703125
13 290.9416809082031
14 276.1319274902344
15 262.08612060546875
16 248.7198028564453
17 235.97113037109375
18 223.85635375976562
19 212.2568359375
20 201.1905517578125
21 190.6453399658203
22 180.5921630859375
23 170.97608947753906
24 161.82162475585938
25 153.08729553222656
26 144.7654571533203
27 136.8440704345703
28 129.3097686767578
29 122.13715362548828
30 115.31373596191406
31 108.84214782714844
32 102.71412658691406
33 96.8945083618164
34 91.38057708740234
35 86.1625747680664
36 81.23050689697266
37 76.55181884765625
38 72.12983703613281
39 67.94912719726562
40 63.99406814575195
41 60.26537322998047
42 56.746421813964844
43 53.43491744995117
44 50.31966781616211
45 47.382225036621094
46 44.620018005371094
47 42.01705169677734

386 8.842569513944909e-05
387 8.572710066800937e-05
388 8.31089710118249e-05
389 8.05722302175127e-05
390 7.811709656380117e-05
391 7.573455513920635e-05
392 7.342486060224473e-05
393 7.118927169358358e-05
394 6.902030145283788e-05
395 6.691562884952873e-05
396 6.487917562481016e-05
397 6.290405872277915e-05
398 6.0988702898612246e-05
399 5.913676432101056e-05
400 5.73384968447499e-05
401 5.559482815442607e-05
402 5.390615842770785e-05
403 5.2266572311054915e-05
404 5.067816164228134e-05
405 4.914070086670108e-05
406 4.765290213981643e-05
407 4.620828985935077e-05
408 4.480624193092808e-05
409 4.344836270320229e-05
410 4.2129584471695125e-05
411 4.0853290556697175e-05
412 3.961513357353397e-05
413 3.8414607843151316e-05
414 3.725215356098488e-05
415 3.612581349443644e-05
416 3.503084008116275e-05
417 3.3972009987337515e-05
418 3.294257840025239e-05
419 3.1947663956088945e-05
420 3.098241359111853e-05
421 3.0046800020500086e-05
422 2.9138107493054122e-05
423 2.8255888537387364e-05
424 2

In [11]:
import torch

# Problem dimensions.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

# Random Tensors holding the inputs, shape (N, D_in), and the target outputs,
# shape (N, D_out).
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

# Build the model from the nn package as a sequence of layers. Each Linear
# Module computes its output with a linear function of its input and holds
# internal Tensors for its weight and bias.
hidden_layer = torch.nn.Linear(D_in, H)
activation = torch.nn.ReLU()
output_layer = torch.nn.Linear(H, D_out)

# nn.Sequential is itself a Module: it contains the Modules above and applies
# them one after another to produce its output.
model = torch.nn.Sequential(hidden_layer, activation, output_layer)

# The nn package also contains definitions of popular loss functions; here we
# use MSELoss. reduction='sum' adds up the squared errors over all elements
# instead of averaging them, so the loss is a sum of squared errors rather than
# a mean. It replaces the deprecated size_average=False argument, which
# produced the same value but emitted a deprecation warning.
loss_fn = torch.nn.MSELoss(reduction='sum')

# Step size for the manual gradient-descent update below.
learning_rate = 0.0001

for t in range(500):
    # Forward pass. Module objects are callable: handing the model a Tensor of
    # input data returns a Tensor of output data.
    y_pred = model(x)

    # The loss function takes the predicted and true values of y and returns
    # a Tensor holding the loss; .item() extracts it as a Python number.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Reset the gradients before the backward pass.
    model.zero_grad()

    # Backward pass: compute the gradient of the loss with respect to every
    # learnable parameter of the model.
    loss.backward()

    # Gradient-descent step. Each parameter is a Tensor whose gradient is
    # available as param.grad; the in-place subtraction runs under no_grad()
    # so that the update is not tracked by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

0 666.7636108398438
1 622.267822265625
2 583.4746704101562
3 548.838134765625
4 517.470458984375
5 489.04974365234375
6 463.03948974609375
7 439.0096435546875
8 416.5318908691406
9 395.5527648925781
10 375.8348388671875
11 357.1593017578125
12 339.50140380859375
13 322.8538818359375
14 307.0242614746094
15 291.9519348144531
16 277.56005859375
17 263.84686279296875
18 250.7209930419922
19 238.13414001464844
20 226.05108642578125
21 214.5466766357422
22 203.5541534423828
23 193.00645446777344
24 182.90863037109375
25 173.25546264648438
26 164.02273559570312
27 155.23492431640625
28 146.8634796142578
29 138.90750122070312
30 131.34246826171875
31 124.14666748046875
32 117.30436706542969
33 110.82870483398438
34 104.68427276611328
35 98.87447357177734
36 93.37075805664062
37 88.1475601196289
38 83.2155990600586
39 78.5649185180664
40 74.17158508300781
41 70.01920318603516
42 66.10405731201172
43 62.40451431274414
44 58.91514205932617
45 55.6313362121582
46 52.53571701049805
47 49.619979858

394 0.0010284815216436982
395 0.0010101848747581244
396 0.0009922432946041226
397 0.0009746552677825093
398 0.000957385404035449
399 0.0009404448792338371
400 0.0009238187340088189
401 0.0009075133129954338
402 0.0008915187208913267
403 0.000875817786436528
404 0.0008604184258729219
405 0.000845308939460665
406 0.0008304849616251886
407 0.0008159271674230695
408 0.0008016427163966
409 0.0007876295712776482
410 0.000773887149989605
411 0.0007603972335346043
412 0.0007471491117030382
413 0.0007341512828134
414 0.0007213791832327843
415 0.000708859006408602
416 0.000696566014084965
417 0.0006844979943707585
418 0.0006726449937559664
419 0.0006610219134017825
420 0.0006495954585261643
421 0.0006383885047398508
422 0.0006273876060731709
423 0.0006165775121189654
424 0.0006059728912077844
425 0.0005955637898296118
426 0.0005853352486155927
427 0.0005752900615334511
428 0.0005654352135024965
429 0.0005557535332627594
430 0.0005462477565743029
431 0.0005369069403968751
432 0.000527738593518734