In [10]:
# -*- coding: utf-8 -*-
import numpy as np

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

In [18]:
for t in range(500):
    #Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)
    
    #Compute and print loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    if t == 0:
        print(grad_h)
    grad_h[h < 0] = 0
    if t == 0:
        print(grad_h)
    grad_w1 = x.T.dot(grad_h)
    
    #Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2    

0 5.330988933571941e-19
[[ 7.09590563e-11  9.06420612e-11 -1.85893992e-11 ...  2.19841028e-11
   2.47407083e-10  2.16737071e-10]
 [-2.69350925e-11 -2.45654927e-11 -9.30235776e-12 ... -4.13482856e-11
  -3.80354066e-12 -2.04965548e-11]
 [ 2.42319887e-11  2.21774185e-11 -1.26664580e-11 ...  3.39003473e-12
   4.86403124e-11  6.48156674e-11]
 ...
 [-2.45655634e-11 -2.64338476e-11  6.19530477e-12 ... -1.61032928e-11
  -8.12449761e-11 -5.79916808e-11]
 [-3.09302199e-11 -3.65884145e-11  1.28275350e-11 ...  3.31018098e-11
  -1.90180192e-11 -8.43540608e-11]
 [-8.32792974e-11 -2.18586303e-11  1.91934061e-12 ...  3.84560371e-11
  -2.40171289e-11 -1.00751162e-10]]
[[ 7.09590563e-11  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   2.47407083e-10  2.16737071e-10]
 [-2.69350925e-11 -2.45654927e-11  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00 -2.04965548e-11]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  3.39003473e-12
   0.00000000e+00  0.00000000e+00]
 ...
 [-2.45655634e-11  0

336 6.507606867442087e-23
337 6.507523463351027e-23
338 6.437853794097548e-23
339 6.402545746619744e-23
340 6.342281547147856e-23
341 6.27151908521426e-23
342 6.213996817875277e-23
343 6.194325079686405e-23
344 6.141438017509169e-23
345 6.062484628600724e-23
346 6.033708726284121e-23
347 6.039053556511689e-23
348 5.997794429416931e-23
349 5.919866427335367e-23
350 5.905833097195502e-23
351 5.889979994215711e-23
352 5.826697492049084e-23
353 5.738213446448956e-23
354 5.723809846878745e-23
355 5.68721153634732e-23
356 5.62457178394846e-23
357 5.585199948560433e-23
358 5.56420700679116e-23
359 5.5569323512333e-23
360 5.493095132169995e-23
361 5.442959886999622e-23
362 5.424752076896354e-23
363 5.385692770249309e-23
364 5.359788689557367e-23
365 5.353410968540595e-23
366 5.303959297229094e-23
367 5.2576159551290626e-23
368 5.2434745294747675e-23
369 5.190385078869321e-23
370 5.161223887333996e-23
371 5.1558496300524364e-23
372 5.146035023263105e-23
373 5.1153684241185923e-23
374 5.05598504

PyTorch: Tensors

In [27]:
# -*- coding: utf-8 -*-
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)
learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    
    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 37123412.0
1 38480784.0
2 38740004.0
3 32223896.0
4 21065716.0
5 11067200.0
6 5423455.0
7 2863833.5
8 1776339.75
9 1268853.625
10 990830.75
11 810822.4375
12 679706.1875
13 577653.875
14 495344.59375
15 427581.15625
16 371109.78125
17 323686.5
18 283564.375
19 249399.15625
20 220152.46875
21 195007.78125
22 173283.75
23 154421.28125
24 137972.125
25 123581.328125
26 110962.734375
27 99850.9765625
28 90027.5078125
29 81315.1875
30 73570.4140625
31 66667.703125
32 60501.0234375
33 54984.578125
34 50036.66015625
35 45588.82421875
36 41584.5390625
37 37974.58984375
38 34714.703125
39 31763.654296875
40 29093.4609375
41 26670.96875
42 24469.99609375
43 22468.681640625
44 20649.1953125
45 18992.03125
46 17479.703125
47 16098.84375
48 14836.958984375
49 13682.708984375
50 12627.7548828125
51 11660.671875
52 10773.533203125
53 9959.255859375
54 9211.861328125
55 8525.615234375
56 7893.9658203125
57 7312.8837890625
58 6777.85986328125
59 6284.951171875
60 5830.4521484375
61 5411.18017578125
6

406 0.00023111789778340608
407 0.00022536524920724332
408 0.0002204615157097578
409 0.0002144852769561112
410 0.00020991216297261417
411 0.00020568034960888326
412 0.00020090446923859417
413 0.000196476248675026
414 0.00019200397946406156
415 0.00018773350166156888
416 0.0001835375151131302
417 0.00017915973148774356
418 0.0001755996490828693
419 0.00017196817498188466
420 0.0001678237022133544
421 0.00016435263387393206
422 0.00016041799972299486
423 0.00015721093222964555
424 0.00015359005192294717
425 0.0001506133849034086
426 0.00014761884813196957
427 0.00014445399574469775
428 0.00014159096463117748
429 0.00013848562957718968
430 0.00013606379798147827
431 0.00013315302203409374
432 0.00013108858547639102
433 0.00012833272921852767
434 0.00012585252989083529
435 0.00012294713815208524
436 0.00012068678915966302
437 0.00011857304343720898
438 0.00011614798131631687
439 0.00011357567564118654
440 0.00011160563008161262
441 0.000109863503894303
442 0.00010802938777487725
443 0.00010

Autograd

PyTorch: Tensors and autograd