# Tensors
## Warm-up: numpy

In [1]:
import numpy as np

In [2]:
# Problem sizes shared by every cell of the numpy warm-up.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [3]:
# Draw a random batch of inputs and matching random targets from a
# standard normal distribution (inputs first, then targets).
x = np.random.standard_normal(size=(N, D_in))
y = np.random.standard_normal(size=(N, D_out))

In [4]:
# Start both weight matrices from standard-normal noise:
# w1 maps input -> hidden, w2 maps hidden -> output.
w1 = np.random.standard_normal(size=(D_in, H))
w2 = np.random.standard_normal(size=(H, D_out))

In [5]:
# Step size for plain gradient descent.
learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss over every element of the batch.
    residual = y_pred - y
    loss = np.sum(np.square(residual))
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # In-place gradient descent step on both weight matrices.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 43414513.68199094
1 37844462.00575921
2 30612354.846733294
3 20616209.409339987
4 11980425.994109895
5 6529784.775456433
6 3766733.6453003846
7 2421671.8933325876
8 1732961.3182803805
9 1335788.658336803
10 1077190.1327452173
11 892233.3800708354
12 751311.8531789068
13 639804.0026317876
14 549451.3942314943
15 475109.6114590941
16 413260.45172621217
17 361350.894265544
18 317369.51993749785
19 279872.5270659276
20 247728.67195320898
21 220009.25229923637
22 195998.32963726242
23 175152.35755935087
24 156935.98197084444
25 140974.3478306926
26 126920.77168090628
27 114532.13922521268
28 103562.52333019035
29 93817.95434823356
30 85146.13127546325
31 77410.36287473673
32 70484.4868552319
33 64274.373010622745
34 58692.23418680877
35 53665.3223347125
36 49130.5469773877
37 45029.424148649385
38 41309.647819614715
39 37937.362055745674
40 34875.74546354277
41 32090.583006065244
42 29554.94977245085
43 27241.96240076028
44 25129.392215439228
45 23199.33976536934
46 21432.884938281277
47 

371 0.0008898749649607761
372 0.0008497732040998902
373 0.0008114958022187072
374 0.0007749503174139291
375 0.0007400545532694805
376 0.0007067326773393982
377 0.0006749187412482824
378 0.0006445397893138446
379 0.0006155410880632976
380 0.0005878540136855162
381 0.0005614122482039768
382 0.0005361699423063201
383 0.0005120617579073694
384 0.0004890406664853197
385 0.00046706108364832764
386 0.00044607557880822383
387 0.00042603170706754633
388 0.0004068941778075208
389 0.0003886170198938337
390 0.0003711655178497001
391 0.0003544999026636443
392 0.0003385918349110316
393 0.0003233949205197467
394 0.0003088833731301706
395 0.00029502565825508605
396 0.00028178949474531927
397 0.00026915249238248536
398 0.0002570852045403999
399 0.00024555811298353854
400 0.00023454918916484998
401 0.00022403648766081485
402 0.0002139979849955439
403 0.00020440896763193614
404 0.0001952528875552922
405 0.00018650797263492758
406 0.00017815511253164152
407 0.00017017728323546574
408 0.0001625584636812448

## PyTorch: Tensors

In [6]:
import torch

In [7]:
# All Tensors below are created with this element type on this device.
dtype = torch.float32
# To run on a GPU instead, swap the string below for "cuda:0".
device = torch.device("cpu")

In [8]:
# Problem sizes shared by every cell of the hand-written PyTorch example.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [9]:
# Random batch of inputs and matching random targets, created directly
# on the chosen device with the chosen element type.
x = torch.randn((N, D_in), device=device, dtype=dtype)
y = torch.randn((N, D_out), device=device, dtype=dtype)

In [10]:
# Start both weight matrices from standard-normal noise:
# w1 maps input -> hidden, w2 maps hidden -> output.
w1 = torch.randn((D_in, H), device=device, dtype=dtype)
w2 = torch.randn((H, D_out), device=device, dtype=dtype)

In [11]:
# Step size for plain gradient descent.
learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU (clamp at zero) -> linear layer.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, extracted from the Tensor as a Python number.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # In-place gradient descent step on both weight matrices.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 24772378.0
1 20629562.0
2 20524780.0
3 21575188.0
4 21720548.0
5 19619578.0
6 15465898.0
7 10660554.0
8 6684835.5
9 3994766.25
10 2401097.75
11 1505269.125
12 1005617.0625
13 718238.5625
14 544080.125
15 431413.625
16 353596.9375
17 296552.0625
18 252654.546875
19 217650.65625
20 189004.3125
21 165102.796875
22 144911.859375
23 127697.1796875
24 112932.5
25 100186.6484375
26 89128.0859375
27 79495.4296875
28 71070.890625
29 63687.9453125
30 57194.140625
31 51469.9765625
32 46406.64453125
33 41918.953125
34 37932.609375
35 34382.90625
36 31216.146484375
37 28385.01953125
38 25849.390625
39 23573.9453125
40 21528.595703125
41 19687.162109375
42 18026.1796875
43 16526.517578125
44 15169.1396484375
45 13940.5126953125
46 12825.9951171875
47 11813.408203125
48 10891.591796875
49 10052.349609375
50 9286.30078125
51 8586.5927734375
52 7946.96142578125
53 7361.7626953125
54 6824.90625
55 6332.0537109375
56 5879.259765625
57 5462.82763671875
58 5079.39306640625
59 4725.99365234375
60 4400.012

438 0.0001608720631338656
439 0.00015737389912828803
440 0.00015372535563074052
441 0.0001505247491877526
442 0.00014786679821554571
443 0.00014493227354250848
444 0.0001418747560819611
445 0.00013917800970375538
446 0.00013620543177239597
447 0.00013411205145530403
448 0.00013117995695210993
449 0.0001288261410081759
450 0.00012632270227186382
451 0.0001239219564013183
452 0.00012149481335654855
453 0.00011917004303541034
454 0.00011690682731568813
455 0.00011481934052426368
456 0.0001123168331105262
457 0.00011063869169447571
458 0.00010857035522349179
459 0.00010681426647352055
460 0.00010457448661327362
461 0.00010265065066050738
462 0.00010109469440067187
463 9.91038978099823e-05
464 9.751823381520808e-05
465 9.558840974932536e-05
466 9.379936091136187e-05
467 9.235143079422414e-05
468 9.0666493633762e-05
469 8.926478767534718e-05
470 8.773391891736537e-05
471 8.63505556480959e-05
472 8.513705688528717e-05
473 8.363128290511668e-05
474 8.209467341657728e-05
475 8.066710142884403e-

# Autograd
## PyTorch: Tensors and autograd

In [12]:
import torch

# All Tensors below are created with this element type on this device.
dtype = torch.float32
# To run on a GPU instead, swap the string below for "cuda:0".
device = torch.device("cpu")

# Problem sizes for the autograd example.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [13]:
# Random Tensors holding the inputs and the targets.
# requires_grad is left at its default (False): the backward pass does not
# need gradients with respect to the data itself.
x = torch.randn((N, D_in), device=device, dtype=dtype)
y = torch.randn((N, D_out), device=device, dtype=dtype)

In [14]:
# Random Tensors holding the weights.
# requires_grad=True asks autograd to record operations on these Tensors so
# that the backward pass can compute gradients with respect to them.
w1 = torch.randn((D_in, H), device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn((H, D_out), device=device, dtype=dtype, requires_grad=True)

In [15]:
# Step size for plain gradient descent.
learning_rate = 1e-6
for t in range(500):
    # Forward pass: the same Tensor operations as the hand-written version.
    # No intermediate values are kept around, because the backward pass is
    # no longer implemented by hand.
    hidden = torch.clamp(torch.mm(x, w1), min=0)
    y_pred = torch.mm(hidden, w2)

    # Sum-of-squares loss. The result of the full reduction is a
    # 0-dimensional Tensor; loss.item() extracts its value as a Python number.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Let autograd run the backward pass. Afterwards w1.grad and w2.grad hold
    # the gradient of the loss with respect to w1 and w2 (the Tensors that
    # were created with requires_grad=True).
    loss.backward()

    # Gradient descent step. The update itself must not be recorded by
    # autograd, hence torch.no_grad(). Alternatives are to operate on
    # weight.data / weight.grad.data (which share storage with the Tensor but
    # do not track history), or to use torch.optim.SGD.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        # Clear the gradient once it has been applied.
        w1.grad.zero_()

        w2 -= learning_rate * w2.grad
        w2.grad.zero_()

0 26578464.0
1 20167920.0
2 17551826.0
3 16191083.0
4 14820937.0
5 12922709.0
6 10516121.0
7 7985546.5
8 5738670.5
9 3990049.25
10 2745529.75
11 1907381.25
12 1356862.125
13 997426.1875
14 759682.9375
15 598472.125
16 485453.46875
17 403235.75
18 341206.09375
19 292847.46875
20 254055.984375
21 222199.546875
22 195534.25
23 172936.84375
24 153588.734375
25 136869.546875
26 122319.8203125
27 109605.328125
28 98442.3671875
29 88608.2265625
30 79912.296875
31 72199.9296875
32 65343.359375
33 59235.328125
34 53783.46875
35 48901.015625
36 44522.6015625
37 40601.08984375
38 37110.00390625
39 33958.1875
40 31109.029296875
41 28529.982421875
42 26191.283203125
43 24067.5078125
44 22137.72265625
45 20381.61328125
46 18781.43359375
47 17321.619140625
48 15987.609375
49 14767.7021484375
50 13650.380859375
51 12626.3349609375
52 11687.2119140625
53 10825.6298828125
54 10033.8125
55 9305.1513671875
56 8634.501953125
57 8016.587890625
58 7447.212890625
59 6921.97802734375
60 6437.26708984375
61 598

430 9.206765389535576e-05
431 9.024390601553023e-05
432 8.859017543727532e-05
433 8.722264465177432e-05
434 8.537065878044814e-05
435 8.392015297431499e-05
436 8.232764957938343e-05
437 8.105809683911502e-05
438 7.94925435911864e-05
439 7.802910840837285e-05
440 7.677542453166097e-05
441 7.563186227343976e-05
442 7.427493983414024e-05
443 7.316356641240418e-05
444 7.169900345616043e-05
445 7.027059473330155e-05
446 6.949556700419635e-05
447 6.780779949622229e-05
448 6.678658974124119e-05
449 6.543927884195e-05
450 6.455463881138712e-05
451 6.361982377711684e-05
452 6.253940227907151e-05
453 6.15866738371551e-05
454 6.04744054726325e-05
455 5.9840513131348416e-05
456 5.876978684682399e-05
457 5.808545392937958e-05
458 5.737055107601918e-05
459 5.6493765441700816e-05
460 5.5723998229950666e-05
461 5.452211553347297e-05
462 5.391304875956848e-05
463 5.276838783174753e-05
464 5.211309689912014e-05
465 5.130857243784703e-05
466 5.0662667490541935e-05
467 4.982099198969081e-05
468 4.91453429

## PyTorch: Defining new autograd functions

Here we define our own custom autograd function for performing the ReLU nonlinearity, and use it to implement our two-layer network.

In [16]:
import torch

In [17]:
class MyReLU(torch.autograd.Function):
    """
    ReLU written as a custom autograd Function.

    A custom Function subclasses torch.autograd.Function and supplies static
    forward and backward methods that operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Return the input Tensor clamped from below at zero.

        ctx is the context object shared with backward. The input is stashed
        on it through ctx.save_for_backward so that backward can later tell
        which elements were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Turn the gradient of the loss with respect to the output into the
        gradient of the loss with respect to the input: it is passed through
        unchanged, except that it is zeroed wherever the input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        was_negative = saved_input < 0
        # masked_fill returns a new Tensor, so grad_output itself is untouched.
        return grad_output.masked_fill(was_negative, 0)

In [18]:
# Element type and device used for every Tensor in this example.
dtype = torch.float32
device = torch.device("cpu")

# Batch size, input dimension, hidden dimension, output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random inputs and targets; no gradients are needed for the data.
x = torch.randn((N, D_in), device=device, dtype=dtype)
y = torch.randn((N, D_out), device=device, dtype=dtype)

# Random weights, tracked by autograd.
w1 = torch.randn((D_in, H), device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn((H, D_out), device=device, dtype=dtype, requires_grad=True)

In [19]:
# Step size for plain gradient descent.
learning_rate = 1e-6

# A custom Function is invoked through its apply method; alias it as 'relu'.
relu = MyReLU.apply

for t in range(500):
    # Forward pass: the ReLU in the middle is our custom autograd operation.
    hidden = relu(torch.mm(x, w1))
    y_pred = torch.mm(hidden, w2)

    # Sum-of-squares loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Autograd computes the backward pass, calling MyReLU.backward on the way.
    loss.backward()

    # Gradient descent step, kept out of the autograd graph.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        # Clear the gradient once it has been applied.
        w1.grad.zero_()

        w2 -= learning_rate * w2.grad
        w2.grad.zero_()

0 39629808.0
1 40648360.0
2 44289320.0
3 41116256.0
4 28968924.0
5 15225198.0
6 6885422.5
7 3296491.0
8 1923972.125
9 1347165.125
10 1048878.125
11 858894.125
12 720451.5
13 612412.75
14 525235.0625
15 453488.6875
16 393874.0625
17 343894.625
18 301716.3125
19 265845.875
20 235168.796875
21 208769.703125
22 185973.203125
23 166232.09375
24 149033.171875
25 133981.765625
26 120762.96875
27 109102.8984375
28 98780.9453125
29 89623.8828125
30 81479.703125
31 74212.1484375
32 67719.8125
33 61901.39453125
34 56676.75
35 51973.859375
36 47735.03515625
37 43902.671875
38 40436.73046875
39 37291.48046875
40 34431.1875
41 31825.23828125
42 29449.671875
43 27279.48046875
44 25296.07421875
45 23477.36328125
46 21809.998046875
47 20279.34765625
48 18872.177734375
49 17579.4296875
50 16388.04296875
51 15287.9052734375
52 14271.6220703125
53 13332.1064453125
54 12462.884765625
55 11656.40625
56 10909.51171875
57 10216.49609375
58 9572.498046875
59 8974.5625
60 8418.150390625
61 7900.578125
62 7418.6

393 0.0030918086413294077
394 0.002976670628413558
395 0.0028713743668049574
396 0.002770046005025506
397 0.0026737030129879713
398 0.0025809339713305235
399 0.0024908198975026608
400 0.002405998995527625
401 0.002323525957763195
402 0.002242419868707657
403 0.002164732199162245
404 0.002091712551191449
405 0.0020211751107126474
406 0.0019535755272954702
407 0.0018887927290052176
408 0.0018262672238051891
409 0.001764113549143076
410 0.0017060128739103675
411 0.0016520515782758594
412 0.0015988274244591594
413 0.0015451000072062016
414 0.001497262273915112
415 0.001448614289984107
416 0.001401175162754953
417 0.0013580115046352148
418 0.0013135425979271531
419 0.0012722195824608207
420 0.0012329412857070565
421 0.0011955610243603587
422 0.0011586337350308895
423 0.0011226162314414978
424 0.0010887605603784323
425 0.0010561287635937333
426 0.0010243365541100502
427 0.0009923926554620266
428 0.0009646022226661444
429 0.0009363094577565789
430 0.000908941263332963
431 0.000881978427059948

## TensorFlow: Static Graphs

In [None]:
import tensorflow as tf
import numpy as np

# Step 1: describe the computation as a graph. Nothing is evaluated until the
# graph is run inside a session further down.
# NOTE(review): tf.placeholder, tf.random_normal and tf.Session look like
# TensorFlow 1.x graph-mode APIs -- confirm the installed version provides them.

N, D_in, H, D_out = 64, 1000, 100, 10

# Placeholders stand in for the input and target data fed in at run time.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Variables hold the weights, initialised from random normal values.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: linear layer -> ReLU (maximum with zero) -> linear layer.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Sum-of-squares loss expressed with TensorFlow Tensor operations.
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Ask TensorFlow for the gradient of the loss with respect to each weight.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Gradient descent expressed as assign operations on the Variables.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Step 2: run the graph.
with tf.Session() as sess:
    # Initialise the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())

    # numpy arrays with the actual values for the inputs x and the targets y.
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    feed = {x: x_value, y: y_value}

    # Fetching the assign operations together with the loss is what applies
    # the weight update on every run; only the loss value is kept.
    fetches = [loss, new_w1, new_w2]
    for _ in range(500):
        loss_value = sess.run(fetches, feed_dict=feed)[0]
        print(loss_value)

# nn module
## PyTorch: nn

In [20]:
import torch

# Batch size, input dimension, hidden dimension, output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random inputs and targets.
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

In [21]:
# Describe the model as a sequence of layers with the nn package.
# nn.Sequential is itself a Module: it holds other Modules and applies them
# one after another to produce its output. Each Linear Module computes a
# linear function of its input and owns the Tensors for its weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

In [22]:
# Popular loss functions are also defined in the nn package. We use MSELoss;
# with reduction='sum' it returns the sum of the squared errors instead of
# their mean.
loss_fn = torch.nn.MSELoss(reduction='sum')

In [23]:
# Step size for plain gradient descent.
learning_rate = 1e-4
for t in range(500):
    # Forward pass. Modules override __call__, so the model is used like a
    # function: a Tensor of input data goes in, a Tensor of output data
    # comes out.
    y_pred = model(x)

    # The loss function takes the predicted and the true values of y and
    # returns the loss as a Tensor.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients left over from the previous iteration.
    model.zero_grad()

    # Backward pass. The parameters of each Module are Tensors with
    # requires_grad=True, so this fills in the gradient of the loss for every
    # learnable parameter of the model.
    loss.backward()

    # Gradient descent step. Each parameter is a Tensor whose gradient is
    # available as .grad; the update is kept out of the autograd graph.
    with torch.no_grad():
        for param in model.parameters():
            step = learning_rate * param.grad
            param -= step

0 690.8640747070312
1 641.4505004882812
2 598.3305053710938
3 560.519287109375
4 526.6614379882812
5 496.0784912109375
6 468.2153625488281
7 442.6169738769531
8 419.0066833496094
9 396.9976806640625
10 376.5498352050781
11 357.24591064453125
12 338.8869934082031
13 321.56817626953125
14 305.19366455078125
15 289.73699951171875
16 275.09130859375
17 261.08197021484375
18 247.67987060546875
19 234.79110717773438
20 222.41888427734375
21 210.59019470214844
22 199.3061065673828
23 188.48292541503906
24 178.15560913085938
25 168.2890167236328
26 158.89634704589844
27 149.94821166992188
28 141.448974609375
29 133.37496948242188
30 125.71249389648438
31 118.43006134033203
32 111.51254272460938
33 104.95935821533203
34 98.7691879272461
35 92.92034149169922
36 87.38578796386719
37 82.17219543457031
38 77.2551040649414
39 72.62371063232422
40 68.26313018798828
41 64.1644515991211
42 60.29682540893555
43 56.6583251953125
44 53.236000061035156
45 50.019752502441406
46 46.9993896484375
47 44.168369

364 0.0003731976612471044
365 0.0003623511001933366
366 0.0003518201701808721
367 0.00034161683288402855
368 0.00033170534879900515
369 0.00032208883203566074
370 0.0003127490053884685
371 0.00030370274907909334
372 0.0002949211047962308
373 0.00028638445655815303
374 0.0002781058428809047
375 0.0002700686745811254
376 0.0002622753381729126
377 0.00025470141554251313
378 0.0002473567146807909
379 0.00024022370052989572
380 0.00023330077237915248
381 0.000226580974413082
382 0.00022005365462973714
383 0.0002137190313078463
384 0.00020757167658302933
385 0.00020160406711511314
386 0.00019581051310524344
387 0.00019018526654690504
388 0.00018473269301466644
389 0.0001794305571820587
390 0.00017428566934540868
391 0.00016928887635003775
392 0.0001644496078370139
393 0.0001597360969753936
394 0.00015516194980591536
395 0.00015072331007104367
396 0.00014641694724559784
397 0.00014222839672584087
398 0.00013817561557516456
399 0.0001342290051979944
400 0.000130392552819103
401 0.0001266813051

## PyTorch: optim

In [24]:
import torch

# Batch size, input dimension, hidden dimension, output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random inputs and targets.
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

# Two-layer network and a summed squared-error loss.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

In [25]:
# Let an Optimizer from the optim package update the weights of the model for
# us. We pick Adam here; the package contains many other optimization
# algorithms. The first constructor argument tells the optimizer which
# Tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [26]:
for t in range(500):
    # Forward pass and loss for the current parameters.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # .backward() accumulates into the gradient buffers instead of
    # overwriting them, so the optimizer first zeroes the gradients of all
    # the variables it will update (the learnable weights of the model).
    # See the docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: gradient of the loss with respect to the parameters.
    loss.backward()

    # One optimizer step updates the parameters from their gradients.
    optimizer.step()

0 642.04931640625
1 625.51171875
2 609.4165649414062
3 593.7894287109375
4 578.7017211914062
5 564.059814453125
6 549.8132934570312
7 535.9879760742188
8 522.5380859375
9 509.4613037109375
10 496.7398681640625
11 484.4208068847656
12 472.4285583496094
13 460.79901123046875
14 449.4683532714844
15 438.43157958984375
16 427.6879577636719
17 417.2011413574219
18 407.0575256347656
19 397.2349548339844
20 387.70098876953125
21 378.4269104003906
22 369.37408447265625
23 360.5307922363281
24 351.9003601074219
25 343.4801940917969
26 335.2879333496094
27 327.3047180175781
28 319.5074768066406
29 311.8905029296875
30 304.4466552734375
31 297.2027587890625
32 290.1555480957031
33 283.26361083984375
34 276.5408935546875
35 269.953857421875
36 263.5067443847656
37 257.2037658691406
38 251.04000854492188
39 245.031494140625
40 239.12982177734375
41 233.37950134277344
42 227.76345825195312
43 222.27316284179688
44 216.89907836914062
45 211.6426239013672
46 206.49365234375
47 201.44349670410156
48 19

385 0.0002475139917805791
386 0.00023704090563114733
387 0.00022698621614836156
388 0.0002173466928070411
389 0.00020810076966881752
390 0.0001992367469938472
391 0.00019073695875704288
392 0.00018258563068229705
393 0.00017476698849350214
394 0.00016727759793866426
395 0.00016009144019335508
396 0.00015320048260036856
397 0.00014659378211945295
398 0.00014027548604644835
399 0.00013420797768048942
400 0.00012839690316468477
401 0.0001228212786372751
402 0.00011749019904527813
403 0.00011237709986744449
404 0.00010747463966254145
405 0.00010278417175868526
406 9.828596375882626e-05
407 9.397581015946344e-05
408 8.98527869139798e-05
409 8.590127981733531e-05
410 8.211765816668049e-05
411 7.849193934816867e-05
412 7.502181688323617e-05
413 7.170149910962209e-05
414 6.85203995089978e-05
415 6.547443626914173e-05
416 6.255564221646637e-05
417 5.9770674852188677e-05
418 5.7098390243481845e-05
419 5.4544449085369706e-05
420 5.2101368055446073e-05
421 4.975920819560997e-05
422 4.7517740313196

## PyTorch: Custom nn Modules

In [27]:
import torch

In [28]:
class TwoLayerNet(torch.nn.Module):
    """Fully connected network: Linear -> ReLU (clamp at zero) -> Linear."""

    def __init__(self, D_in, H, D_out):
        """
        Build the two nn.Linear modules (D_in -> H and H -> D_out) and store
        them as attributes of the module.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map a Tensor of input data to a Tensor of output data.

        forward may combine the Modules created in the constructor with
        arbitrary operators on Tensors; here the ReLU is a plain clamp.
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)

In [29]:
# Batch size, input dimension, hidden dimension, output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random inputs and targets.
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

In [30]:
# Instantiate the Module class defined above.
model = TwoLayerNet(D_in, H, D_out)

# Loss function and Optimizer. model.parameters() hands the SGD constructor
# the learnable parameters of the two nn.Linear modules that are members of
# the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4)

In [31]:
for t in range(500):
    # Forward pass: run the inputs through the model.
    y_pred = model(x)

    # Loss for the current predictions.
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Clear the old gradients, ...
    optimizer.zero_grad()
    # ... compute fresh ones with a backward pass, ...
    loss.backward()
    # ... and let the optimizer update the weights.
    optimizer.step()

0 687.9739379882812
1 632.7424926757812
2 586.0691528320312
3 545.788818359375
4 510.4735412597656
5 479.3359375
6 451.3263244628906
7 425.842529296875
8 402.4744873046875
9 380.70013427734375
10 360.49188232421875
11 341.5550537109375
12 323.717041015625
13 306.86920166015625
14 290.88720703125
15 275.7586975097656
16 261.39630126953125
17 247.7628936767578
18 234.75465393066406
19 222.34075927734375
20 210.52670288085938
21 199.31910705566406
22 188.6430206298828
23 178.42991638183594
24 168.73593139648438
25 159.49371337890625
26 150.716552734375
27 142.4001007080078
28 134.49697875976562
29 126.9930648803711
30 119.87416076660156
31 113.10725402832031
32 106.69337463378906
33 100.6213607788086
34 94.88345336914062
35 89.46338653564453
36 84.33936309814453
37 79.49376678466797
38 74.91572570800781
39 70.60164642333984
40 66.53530883789062
41 62.70233917236328
42 59.094024658203125
43 55.69192886352539
44 52.48746109008789
45 49.472293853759766
46 46.631187438964844
47 43.96364974975

402 0.00010804465273395181
403 0.00010492584260646254
404 0.00010189301974605769
405 9.89614927675575e-05
406 9.610476263333112e-05
407 9.333362686447799e-05
408 9.064500773092732e-05
409 8.803322270978242e-05
410 8.55020189192146e-05
411 8.304051880259067e-05
412 8.0649129813537e-05
413 7.833296695025638e-05
414 7.607967563671991e-05
415 7.38958697183989e-05
416 7.176892540883273e-05
417 6.971298716962337e-05
418 6.771062908228487e-05
419 6.576567102456465e-05
420 6.387910252669826e-05
421 6.20486680418253e-05
422 6.027404742781073e-05
423 5.854295159224421e-05
424 5.6863907957449555e-05
425 5.5238273489521816e-05
426 5.365265315049328e-05
427 5.211993266129866e-05
428 5.06282776768785e-05
429 4.9177786422660574e-05
430 4.777254434884526e-05
431 4.640624683815986e-05
432 4.5084514567861333e-05
433 4.379566598800011e-05
434 4.254183659213595e-05
435 4.132980029680766e-05
436 4.014990190626122e-05
437 3.900225055986084e-05
438 3.7891408283030614e-05
439 3.680988811538555e-05
440 3.57615

## PyTorch: Control Flow + Weight Sharing

In [32]:
import random
import torch

In [33]:
class DynamicNet(torch.nn.Module):
    """Network whose depth is chosen at random on every forward pass."""

    def __init__(self, D_in, H, D_out):
        """
        Build the three nn.Linear instances used by the forward pass:
        input (D_in -> H), middle (H -> H) and output (H -> D_out).
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Apply the input layer, then the middle layer 0, 1, 2 or 3 times
        (picked at random on each call), then the output layer. Every hidden
        representation goes through a ReLU written as a clamp at zero.

        Each forward pass builds its own dynamic computation graph, so
        ordinary Python control flow such as loops and conditionals can be
        used to define the model.

        Reusing the same Module several times within one graph is perfectly
        safe. This is a big improvement over Lua Torch, where each Module
        could be used only once.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        repeats = random.randint(0, 3)
        for _ in range(repeats):
            # The same middle layer, and therefore the same weights, each time.
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
        return self.output_linear(hidden)

In [34]:
# Batch size, input dimension, hidden dimension, output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random inputs and targets.
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

In [35]:
# Instantiate the Module class defined above.
model = DynamicNet(D_in, H, D_out)

# Loss function and Optimizer. This strange model is tough to train with
# vanilla stochastic gradient descent, so momentum is added.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4, momentum=0.9)

In [36]:
for t in range(500):
    # Forward pass: run the inputs through the model (whose depth is chosen
    # at random on every call).
    y_pred = model(x)

    # Loss for the current predictions.
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Clear the old gradients, ...
    optimizer.zero_grad()
    # ... compute fresh ones with a backward pass, ...
    loss.backward()
    # ... and let the optimizer update the weights.
    optimizer.step()

0 676.0377197265625
1 677.1350708007812
2 671.57763671875
3 702.5586547851562
4 652.101318359375
5 670.83154296875
6 669.3819580078125
7 667.8381958007812
8 657.4067993164062
9 402.6274108886719
10 364.0065612792969
11 663.7554931640625
12 280.6117248535156
13 237.92755126953125
14 622.92431640625
15 661.0291748046875
16 659.625244140625
17 112.13117218017578
18 93.5862808227539
19 574.6084594726562
20 68.06990051269531
21 633.0633544921875
22 57.63670349121094
23 642.089599609375
24 636.6403198242188
25 628.9006958007812
26 618.8046875
27 605.9906616210938
28 420.797119140625
29 390.26739501953125
30 555.2371215820312
31 309.1468200683594
32 266.9026794433594
33 479.7393798828125
34 194.2961883544922
35 166.33834838867188
36 140.99935913085938
37 332.25115966796875
38 315.4895324707031
39 90.81635284423828
40 202.75917053222656
41 63.53085708618164
42 310.1190490722656
43 83.42826080322266
44 65.37623596191406
45 72.27685546875
46 293.7470397949219
47 55.88072967529297
48 469.68994140

404 1.5405898094177246
405 1.881367564201355
406 1.6151447296142578
407 1.175499677658081
408 1.098007082939148
409 0.6045849323272705
410 0.9340944290161133
411 0.42089077830314636
412 0.6256698369979858
413 0.7100461721420288
414 5.903141975402832
415 5.064917087554932
416 2.620669364929199
417 2.2712063789367676
418 14.262487411499023
419 9.403079986572266
420 1.943732500076294
421 12.671951293945312
422 2.619797468185425
423 1.4061644077301025
424 0.8265479803085327
425 0.359214186668396
426 0.30962902307510376
427 2.9495677947998047
428 0.46400436758995056
429 0.3388005793094635
430 1.9049357175827026
431 6.185429573059082
432 1.1780707836151123
433 4.979336738586426
434 4.357420921325684
435 3.436495065689087
436 1.4161800146102905
437 0.2911151647567749
438 2.7259929180145264
439 1.1038415431976318
440 3.320280075073242
441 1.2559436559677124
442 1.5529234409332275
443 0.4434523284435272
444 0.3068230450153351
445 2.131815195083618
446 10.429633140563965
447 0.6284857988357544
4