In [1]:
# -*- coding: utf-8 -*-
import numpy as np

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic regression problem: Gaussian inputs and Gaussian targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Gaussian starting values for the two weight matrices.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squared-errors loss, reported once per iteration.
    residual = y_pred - y
    loss = np.sum(np.square(residual))
    print(t, loss)

    # Backward pass, written out by hand with the chain rule.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 33891717.8977
1 28983168.561
2 27933437.4906
3 26302922.6791
4 22404957.9724
5 16513159.548
6 10771937.7655
7 6493708.30246
8 3886658.6433
9 2429032.55469
10 1638144.53629
11 1193397.32867
12 926569.583053
13 752542.680106
14 629496.055164
15 536568.394765
16 463026.550353
17 403016.553263
18 353040.038165
19 310762.502442
20 274604.979909
21 243540.044891
22 216703.949847
23 193217.097796
24 172771.239161
25 154899.173173
26 139213.723744
27 125381.922012
28 113163.691345
29 102336.914151
30 92723.5143458
31 84152.6484489
32 76495.4996688
33 69640.3290984
34 63488.4796023
35 57954.0333755
36 52969.0744497
37 48469.5116062
38 44402.1148898
39 40719.1536673
40 37377.0764116
41 34344.4682794
42 31588.1865609
43 29078.879045
44 26791.908264
45 24703.1385929
46 22794.5248466
47 21047.9752182
48 19448.6681084
49 17983.412649
50 16640.1034816
51 15407.7677527
52 14275.2616935
53 13233.6135286
54 12274.5530985
55 11390.9668923
56 10576.5049223
57 9825.12598606
58 9131.64944889
59 8490.95737

434 1.6278254757e-05
435 1.55141530428e-05
436 1.47860203041e-05
437 1.40922121915e-05
438 1.34309890262e-05
439 1.28010354965e-05
440 1.22007085116e-05
441 1.16286604106e-05
442 1.10835745482e-05
443 1.05642037428e-05
444 1.0069154061e-05
445 9.59746664624e-06
446 9.14792294261e-06
447 8.71953302156e-06
448 8.3112515338e-06
449 7.92219374578e-06
450 7.55143937335e-06
451 7.19822134906e-06
452 6.86146830758e-06
453 6.54057390177e-06
454 6.23469041544e-06
455 5.9431841552e-06
456 5.66541367243e-06
457 5.40060750883e-06
458 5.14823705596e-06
459 4.90771194017e-06
460 4.67851520326e-06
461 4.4600509429e-06
462 4.25181018312e-06
463 4.05332193964e-06
464 3.86412821831e-06
465 3.68379342663e-06
466 3.5119276425e-06
467 3.34810225196e-06
468 3.19194856038e-06
469 3.04312544795e-06
470 2.90124997554e-06
471 2.76602914209e-06
472 2.63710635965e-06
473 2.51422994054e-06
474 2.39708687608e-06
475 2.28543570806e-06
476 2.17900088961e-06
477 2.07753082742e-06
478 1.9808216989e-06
479 1.88862515264

In [2]:
# -*- coding: utf-8 -*-

import torch


# Tensor type applied to every tensor created below (CPU, 32-bit float).
dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic regression problem: Gaussian inputs and Gaussian targets.
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Gaussian starting values for the two weight matrices.
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squared-errors loss, reported once per iteration.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    print(t, loss)

    # Backward pass, written out by hand with the chain rule.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.clone()
    grad_h.masked_fill_(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

0 28578787.805672407
1 23144470.492491677
2 22088825.44570624
3 21878877.43967569
4 20603790.011794906
5 17333075.62793944
6 12914133.999134868
7 8615055.228438344
8 5412532.691988396
9 3351614.7239160887
10 2148140.5164239584
11 1458179.4389256313
12 1057475.075253766
13 812814.2297156407
14 653683.6828760277
15 542780.1608544566
16 460512.92653499625
17 396362.1984006115
18 344489.3878442518
19 301493.02908052783
20 265242.16501357884
21 234312.4815978462
22 207694.20690107357
23 184680.86192842427
24 164627.17755845282
25 147114.94758192712
26 131745.6536555179
27 118226.04535831368
28 106302.57962420398
29 95759.32452622041
30 86404.4764476755
31 78086.79385915119
32 70685.9460970696
33 64082.48316304915
34 58177.303317343256
35 52884.147539953236
36 48138.744485443225
37 43874.93705926131
38 40033.935640708936
39 36568.82331977335
40 33440.87616242067
41 30610.963892938773
42 28049.79348450285
43 25728.405282061052
44 23620.066246202423
45 21704.472017561144
46 19963.30737984783
4

475 0.00012858207532025212
476 0.000126665130635209
477 0.0001243611169835096
478 0.00012238949215331862
479 0.00012067006246008205
480 0.00011841608469953524
481 0.00011673385687532845
482 0.00011490523929763574
483 0.00011295143247347372
484 0.00011130597260136654
485 0.00010979244039084624
486 0.00010809508349418628
487 0.00010674452862749831
488 0.00010493332200513586
489 0.0001035636677389068
490 0.00010222906604145932
491 0.00010073431939701394
492 9.909067989756493e-05
493 9.770281089534205e-05
494 9.629643559273815e-05
495 9.479646551058263e-05
496 9.325774749052718e-05
497 9.201066369385336e-05
498 9.09658353590026e-05
499 8.976672998234347e-05


In [3]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and targets. They are left with the
# default requires_grad=False, so autograd does not compute gradients with
# respect to them during the backward pass.
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Create random Tensors for the weights. requires_grad_() marks them so that
# autograd records the operations applied to them and fills in w1.grad and
# w2.grad during the backward pass. (The legacy Variable wrapper is no longer
# needed: Tensors carry the autograd state themselves.)
w1 = torch.randn(D_in, H).type(dtype).requires_grad_()
w2 = torch.randn(H, D_out).type(dtype).requires_grad_()

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; these
    # are exactly the same operations used in the hand-written forward pass,
    # but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print the loss. loss is a zero-dimensional Tensor, so
    # loss.item() is used to extract its value as a Python number; indexing
    # it with [0] is an error on current PyTorch releases.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call computes the
    # gradient of loss with respect to every Tensor with requires_grad=True.
    # After this call w1.grad and w2.grad hold the gradient of the loss with
    # respect to w1 and w2 respectively.
    loss.backward()

    # Update the weights using gradient descent. The update is wrapped in
    # torch.no_grad() because the weights require gradients but the update
    # itself must not be recorded by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating the weights; backward()
        # accumulates into .grad rather than overwriting it.
        w1.grad.zero_()
        w2.grad.zero_()

0 30269094.0
1 30576084.0
2 33222616.0
3 32739016.0
4 26866454.0
5 17413696.0
6 9532378.0
7 4887147.0
8 2700072.25
9 1709078.5
10 1233189.375
11 969908.4375
12 800560.375
13 677816.5625
14 581997.8125
15 504152.03125
16 439299.6875
17 384723.625
18 338395.71875
19 298742.875
20 264718.6875
21 235340.0
22 209898.28125
23 187763.671875
24 168415.6875
25 151438.234375
26 136503.109375
27 123386.3359375
28 111758.9921875
29 101426.640625
30 92226.2890625
31 84007.5625
32 76644.1015625
33 70036.7109375
34 64091.87890625
35 58751.6015625
36 53936.7734375
37 49578.78515625
38 45629.13671875
39 42046.78515625
40 38787.8046875
41 35820.01171875
42 33112.30078125
43 30639.716796875
44 28376.859375
45 26304.353515625
46 24405.25
47 22661.48046875
48 21059.15625
49 19585.44921875
50 18227.748046875
51 16976.376953125
52 15821.939453125
53 14755.947265625
54 13769.853515625
55 12857.767578125
56 12013.259765625
57 11231.12109375
58 10505.6171875
59 9832.2958984375
60 9206.984375
61 8626.01171875
62

453 0.0006245505064725876
454 0.0006090705865062773
455 0.0005922876298427582
456 0.0005778101040050387
457 0.0005636606947518885
458 0.0005490193143486977
459 0.000536551175173372
460 0.0005226604989729822
461 0.0005101810675114393
462 0.0004972296301275492
463 0.00048568256897851825
464 0.00047402718337252736
465 0.0004631818737834692
466 0.0004523120878729969
467 0.0004413827555254102
468 0.0004309870710130781
469 0.0004212847852613777
470 0.0004112116002943367
471 0.0004009234544355422
472 0.00039237021701410413
473 0.0003836678515654057
474 0.0003754005883820355
475 0.000366498512448743
476 0.00035772283445112407
477 0.000350287213223055
478 0.0003422509762458503
479 0.00033550450461916625
480 0.0003284380945842713
481 0.0003214858006685972
482 0.0003138696774840355
483 0.0003075090644415468
484 0.00030107024940662086
485 0.0002947782340925187
486 0.00028915968141518533
487 0.00028331505018286407
488 0.000277456158073619
489 0.0002716432500164956
490 0.0002665550564415753
491 0.00

In [4]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable


class MyReLU(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.

    forward and backward are static methods that receive a context object
    ``ctx``; the Function is invoked through ``MyReLU.apply(tensor)`` rather
    than by instantiating the class (instance-style Functions with non-static
    forward are rejected by current PyTorch releases).
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return a
        Tensor containing the output. ctx is a context object used to stash
        information for the backward pass; Tensors are cached on it with
        ctx.save_for_backward.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        # The gradient is blocked wherever the forward input was negative.
        grad_input[input < 0] = 0
        return grad_input


# Run on the GPU when CUDA is available and fall back to the CPU otherwise,
# so the cell no longer crashes on machines without a GPU.
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and targets (no gradients needed).
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Create random Tensors for the weights and mark them as requiring gradients.
w1 = torch.randn(D_in, H).type(dtype).requires_grad_()
w2 = torch.randn(H, D_out).type(dtype).requires_grad_()

# To use our Function we call its apply method; alias it as relu.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss. loss is a zero-dimensional Tensor, so item()
    # is used to read its value as a Python number.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent, without recording the update
    # itself in the autograd graph.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

0 33915004.0
1 30993030.0
2 33943712.0
3 36120344.0
4 32835284.0
5 23374626.0
6 13329000.0
7 6593905.5
8 3307368.25
9 1868034.125
10 1233564.875
11 920019.75
12 738580.6875
13 616557.125
14 525425.1875
15 453094.625
16 393755.375
17 344163.28125
18 302267.0
19 266566.9375
20 236003.890625
21 209636.40625
22 186748.734375
23 166817.0625
24 149377.984375
25 134080.4375
26 120601.625
27 108698.2578125
28 98181.03125
29 88858.328125
30 80557.28125
31 73150.1875
32 66520.3203125
33 60580.2109375
34 55248.53125
35 50453.19140625
36 46131.1875
37 42235.5546875
38 38714.78515625
39 35527.7578125
40 32636.28125
41 30010.30078125
42 27630.349609375
43 25468.0234375
44 23498.28125
45 21704.357421875
46 20064.529296875
47 18564.169921875
48 17190.482421875
49 15930.939453125
50 14774.9111328125
51 13714.1240234375
52 12738.96875
53 11841.0546875
54 11013.9609375
55 10252.4541015625
56 9549.263671875
57 8899.9521484375
58 8299.7158203125
59 7744.578125
60 7230.904296875
61 6755.544921875
62 6314.82

In [5]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np

# Part 1: describe the computation as a static graph. Nothing is computed yet.

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Placeholders stand in for the inputs and targets; concrete arrays are
# supplied through feed_dict each time the graph is run.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Weights live in Variables, which keep their values from one run of the
# graph to the next. They start from random normal values.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: linear layer -> ReLU -> linear layer. These calls only add
# nodes to the graph.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Sum-of-squared-errors loss.
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Ask TensorFlow for the gradient of the loss with respect to each weight.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Gradient-descent update. In TensorFlow the act of updating the weights is
# itself part of the computational graph (in PyTorch it happens outside the
# graph), so new_w1 and new_w2 must be evaluated for the weights to change.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Part 2: open a session and actually execute the graph.
with tf.Session() as sess:
    # One run to give the Variables w1 and w2 their initial values.
    sess.run(tf.global_variables_initializer())

    # Concrete numpy data for the inputs x and the targets y.
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)

    # The same tensors are fetched and the same data is fed on every step,
    # so both are prepared once up front.
    fetches = [loss, new_w1, new_w2]
    feed = {x: x_value, y: y_value}
    for _ in range(500):
        # Each run evaluates the loss and both weight updates; the fetched
        # values come back as numpy arrays and only the loss is reported.
        loss_value, _, _ = sess.run(fetches, feed_dict=feed)
        print(loss_value)

3.52353e+07
3.24723e+07
3.88377e+07
5.41699e+07
6.34345e+07
4.96237e+07
2.27111e+07
6.83702e+06
2.35823e+06
1.32802e+06
1.00174e+06
825919.0
698366.0
596840.0
513929.0
445202.0
387829.0
339509.0
298600.0
263730.0
233862.0
208147.0
185923.0
166563.0
149705.0
134953.0
121993.0
110550.0
100420.0
91421.6
83403.2
76236.4
69821.1
64056.8
58866.2
54186.3
49952.5
46114.2
42627.0
39451.9
36556.4
33906.8
31483.4
29264.5
27232.6
25367.9
23652.4
22072.3
20615.2
19269.8
18026.4
16876.3
15811.0
14823.1
13906.3
13054.6
12263.5
11528.1
10843.4
10205.3
9610.14
9054.63
8535.72
8050.59
7596.89
7172.08
6774.21
6401.28
6051.64
5723.5
5415.43
5126.07
4854.06
4598.27
4357.71
4131.29
3918.08
3717.24
3527.89
3349.27
3180.79
3021.78
2871.65
2729.84
2595.88
2469.28
2349.5
2236.21
2129.51
2028.46
1932.76
1842.18
1756.3
1674.86
1597.67
1524.39
1454.86
1388.89
1326.23
1266.7
1210.12
1156.35
1105.21
1056.57
1010.31
966.27
924.336
884.41
846.378
810.146
775.617
742.704
711.332
681.425
652.906
625.704
599.727
574.927
