In [1]:
%matplotlib inline


PyTorch: Tensors and autograd
-------------------------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
Tensors, and uses PyTorch autograd to compute gradients.


A PyTorch Tensor represents a node in a computational graph. If ``x`` is a
Tensor that has ``x.requires_grad=True`` then ``x.grad`` is another Tensor
holding the gradient of ``x`` with respect to some scalar value.



In [17]:
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

b1 = torch.zeros(1, H, device=device, dtype=dtype, requires_grad = True)
b2  = torch.zeros(1, D_out, device=device, dtype = dtype, requires_grad = True)

learning_rate = 1e-6
for t in range(1500):
    h = x.mm(w1) + b1 
    h_relu = h.clamp(min=0)
    y_pred  = h_relu.mm(w2) + b2
    
    loss= (y_pred-y).pow(2).sum()
    print(t,loss.item())
    loss.backward()
    with torch.no_grad():
        w1 -=learning_rate*w1.grad
        w2 -=learning_rate*w2.grad
        b1 -= learning_rate*b1.grad
        b2 -= learning_rate*b2.grad
        
        w1.grad.zero_()
        w2.grad.zero_()
        b1.grad.zero_()
        b2.grad.zero_()

0 24901286.0
1 16850528.0
2 13260525.0
3 11517751.0
4 10568420.0
5 9847779.0
6 9056603.0
7 8059603.0
8 6902358.5
9 5668491.0
10 4498525.5
11 3466785.0
12 2627236.75
13 1971370.5
14 1479250.875
15 1116065.5
16 852806.0625
17 661433.375
18 522416.3125
19 420276.90625
20 344525.90625
21 287318.03125
22 243385.34375
23 208967.96875
24 181525.703125
25 159254.5625
26 140857.671875
27 125432.2109375
28 112326.3359375
29 101053.0234375
30 91259.2734375
31 82683.578125
32 75131.96875
33 68440.5234375
34 62485.65234375
35 57160.0234375
36 52370.30078125
37 48049.8828125
38 44144.4609375
39 40609.6484375
40 37401.86328125
41 34485.3828125
42 31827.8828125
43 29400.916015625
44 27182.33203125
45 25153.31640625
46 23294.876953125
47 21591.65234375
48 20027.720703125
49 18589.708984375
50 17266.083984375
51 16046.81640625
52 14922.974609375
53 13887.578125
54 12932.263671875
55 12049.15234375
56 11232.84375
57 10477.1806640625
58 9777.2099609375
59 9128.2734375
60 8527.044921875
61 7969.59375
62 74

401 0.0010025949450209737
402 0.0009713753242976964
403 0.0009420713759027421
404 0.0009130589314736426
405 0.0008855042979121208
406 0.0008585070609115064
407 0.0008333779987879097
408 0.0008091702475212514
409 0.0007841050392016768
410 0.0007622455013915896
411 0.0007415512809529901
412 0.0007196614169515669
413 0.0006988393142819405
414 0.0006785001023672521
415 0.0006593863945454359
416 0.0006413039518520236
417 0.0006246500997804105
418 0.0006067823269404471
419 0.0005908791208639741
420 0.0005750629352405667
421 0.0005584871978498995
422 0.0005444757989607751
423 0.0005296313320286572
424 0.0005152819212526083
425 0.0005032008630223572
426 0.0004898115294054151
427 0.0004767726350110024
428 0.0004644859000109136
429 0.00045271829003468156
430 0.0004415042931213975
431 0.0004297017294447869
432 0.00041877166950143874
433 0.0004083411768078804
434 0.0003986919473391026
435 0.0003890586958732456
436 0.00037931278347969055
437 0.00036988136707805097
438 0.0003617485344875604
439 0.00

733 1.3818362276651897e-05
734 1.3784412658424117e-05
735 1.3733804735238664e-05
736 1.3586135537480004e-05
737 1.3541040971176699e-05
738 1.3451377526507713e-05
739 1.3436659173748922e-05
740 1.3445136573864147e-05
741 1.3415437933872454e-05
742 1.3331860827747732e-05
743 1.3316514923644718e-05
744 1.3293680240167305e-05
745 1.3197703992773313e-05
746 1.3160478374629747e-05
747 1.3088609193800949e-05
748 1.2996130863029975e-05
749 1.2894930478069e-05
750 1.281433469557669e-05
751 1.2756204341712873e-05
752 1.2738371879095212e-05
753 1.2666573638853151e-05
754 1.26167360576801e-05
755 1.2575385881063994e-05
756 1.248563603439834e-05
757 1.2438798876246437e-05
758 1.2392884855216835e-05
759 1.2321821486693807e-05
760 1.2256205081939697e-05
761 1.2214606613270007e-05
762 1.216380678670248e-05
763 1.2177116332168225e-05
764 1.2094600606360473e-05
765 1.2082776265742723e-05
766 1.1960225492657628e-05
767 1.1928942512895446e-05
768 1.1887425898748916e-05
769 1.1750997146009468e-05
770 1.171

1063 4.9154327825817745e-06
1064 4.877148057857994e-06
1065 4.888069724984234e-06
1066 4.8652859732101206e-06
1067 4.858086413150886e-06
1068 4.823149538424332e-06
1069 4.831123078474775e-06
1070 4.803531282959739e-06
1071 4.808991889149183e-06
1072 4.778060883836588e-06
1073 4.772608463099459e-06
1074 4.750703737954609e-06
1075 4.754942892759573e-06
1076 4.752959284815006e-06
1077 4.749707841256168e-06
1078 4.764776349475142e-06
1079 4.754353085445473e-06
1080 4.7041453399288e-06
1081 4.703801096184179e-06
1082 4.695410552812973e-06
1083 4.670300768339075e-06
1084 4.6523073251591995e-06
1085 4.639343842427479e-06
1086 4.642261046683416e-06
1087 4.653812993637985e-06
1088 4.643427018891089e-06
1089 4.6391101022891235e-06
1090 4.623102086043218e-06
1091 4.641225586965447e-06
1092 4.611070380633464e-06
1093 4.623273525794502e-06
1094 4.625665951607516e-06
1095 4.623508630174911e-06
1096 4.605958565662149e-06
1097 4.575566435960354e-06
1098 4.558487944450462e-06
1099 4.543690010905266e-06

1453 2.4853829927451443e-06
1454 2.4949827093223576e-06
1455 2.4909527382988017e-06
1456 2.4863863927748753e-06
1457 2.485768618498696e-06
1458 2.4933838176366407e-06
1459 2.4905816644604784e-06
1460 2.485412551322952e-06
1461 2.477911039022729e-06
1462 2.4705868781893514e-06
1463 2.4664839202159783e-06
1464 2.461492385918973e-06
1465 2.4646126348670805e-06
1466 2.469161017870647e-06
1467 2.455815092616831e-06
1468 2.4465184651489835e-06
1469 2.446282678647549e-06
1470 2.4438888885924825e-06
1471 2.4353428216272732e-06
1472 2.4172297798941145e-06
1473 2.4115529413393233e-06
1474 2.416849383735098e-06
1475 2.41973089032399e-06
1476 2.403423877694877e-06
1477 2.395275714661693e-06
1478 2.39680275626597e-06
1479 2.39150358538609e-06
1480 2.3899540337879444e-06
1481 2.382052343818941e-06
1482 2.382564161962364e-06
1483 2.3712195797997992e-06
1484 2.363425664952956e-06
1485 2.354332764298306e-06
1486 2.3538027562608477e-06
1487 2.3560853605886223e-06
1488 2.349263013456948e-06
1489 2.349168

In [None]:
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the a scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()