In [None]:
%matplotlib inline


PyTorch: Tensors
----------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation uses PyTorch tensors to manually compute the forward pass,
loss, and backward pass.

A PyTorch Tensor is basically the same as a numpy array: it does not know
anything about deep learning or computational graphs or gradients, and is just
a generic n-dimensional array to be used for arbitrary numeric computation.

The biggest difference between a numpy array and a PyTorch Tensor is that
a PyTorch Tensor can run on either CPU or GPU. To run operations on the GPU,
just create the Tensor on a CUDA device by passing a `device=` argument, as the code below does.



In [1]:
import torch

In [2]:
# Seed PyTorch's global RNG so the random data and weights created in later
# cells are the same on every Restart & Run All.
torch.manual_seed(0)

# All tensors in this notebook use 32-bit floats.
dtype = torch.float

# Use the first CUDA GPU when one is usable, otherwise fall back to the CPU.
# Hard-coding "cuda:0" makes every later tensor allocation fail on machines
# without a working CUDA setup.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [3]:
# Problem sizes used by every cell below.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [7]:
# Random inputs and targets drawn from a standard normal distribution.
# x has shape (N, D_in) and y has shape (N, D_out); x is drawn first.
_data_kwargs = {"device": device, "dtype": dtype}
x = torch.randn((N, D_in), **_data_kwargs)
y = torch.randn((N, D_out), **_data_kwargs)

In [8]:
# Randomly initialised weight matrices (the network has no bias terms):
#   w1 maps input -> hidden, shape (D_in, H)
#   w2 maps hidden -> output, shape (H, D_out)
w1 = torch.randn((D_in, H), dtype=dtype, device=device)
w2 = torch.randn((H, D_out), dtype=dtype, device=device)

In [9]:
learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, converted to a Python float for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, layer by layer.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)
    

0 25430750.0
1 18651760.0
2 15518368.0
3 13651152.0
4 11995230.0
5 10223884.0
6 8292902.0
7 6429390.5
8 4786884.0
9 3480668.0
10 2503443.5
11 1807470.875
12 1322302.875
13 988209.625
14 756692.875
15 594602.4375
16 478759.15625
17 394008.65625
18 330292.21875
19 281138.65625
20 242329.046875
21 210949.90625
22 185072.921875
23 163390.125
24 144988.703125
25 129195.2265625
26 115543.078125
27 103660.65625
28 93249.4296875
29 84085.75
30 75993.25
31 68805.8359375
32 62406.9375
33 56695.69140625
34 51590.78515625
35 47021.95703125
36 42917.48046875
37 39220.16015625
38 35886.44140625
39 32875.859375
40 30153.322265625
41 27688.7421875
42 25454.17578125
43 23422.625
44 21571.943359375
45 19884.197265625
46 18344.283203125
47 16936.39453125
48 15648.9326171875
49 14468.853515625
50 13387.1298828125
51 12394.240234375
52 11482.7880859375
53 10644.3076171875
54 9872.5703125
55 9161.7939453125
56 8506.9814453125
57 7903.310546875
58 7346.25244140625
59 6831.767578125
60 6356.0576171875
61 5916

403 0.0003198553458787501
404 0.00031190848676487803
405 0.0003039414878003299
406 0.00029741958132945
407 0.00029020931106060743
408 0.00028359616408124566
409 0.0002770372375380248
410 0.0002706176892388612
411 0.0002643824846018106
412 0.0002587932685855776
413 0.0002529686607886106
414 0.0002471245243214071
415 0.0002408486179774627
416 0.00023566624440718442
417 0.00023099346435628831
418 0.0002262917550979182
419 0.00022103090304881334
420 0.00021632704010698944
421 0.0002121465076925233
422 0.00020664316252805293
423 0.00020227435743436217
424 0.00019824695482384413
425 0.00019442153279669583
426 0.00019003600755240768
427 0.00018616659508552402
428 0.00018245360115543008
429 0.00017881653911899775
430 0.00017536692030262202
431 0.00017212000966537744
432 0.0001687521580606699
433 0.0001654422376304865
434 0.0001620792900212109
435 0.00015910551883280277
436 0.0001561250101076439
437 0.00015295993944164366
438 0.00015022318984847516
439 0.00014711296535097063
440 0.0001441792992

In [10]:
import torch


# Tensor settings shared by every allocation in this cell.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Problem sizes: batch size N, input dimension D_in,
# hidden dimension H, output dimension D_out.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs of shape (N, D_in) and targets of shape (N, D_out).
x = torch.randn((N, D_in), device=device, dtype=dtype)
y = torch.randn((N, D_out), device=device, dtype=dtype)

# Random initial weights; the network has no bias terms.
w1 = torch.randn((D_in, H), device=device, dtype=dtype)
w2 = torch.randn((H, D_out), device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: hidden pre-activation, ReLU, then the output layer.
    h = x @ w1
    h_relu = torch.clamp(h, min=0)
    y_pred = h_relu @ w2

    # Sum of squared errors, extracted as a Python float and printed.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Manual backprop: gradients of the loss with respect to w2, then w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.t() @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.t()
    # Zero the gradient wherever the ReLU input was negative.
    grad_h = torch.where(h < 0, torch.zeros_like(grad_h_relu), grad_h_relu)
    grad_w1 = x.t() @ grad_h

    # In-place gradient-descent update of both weight matrices.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

0 27803592.0
1 23134196.0
2 22641948.0
3 23141872.0
4 22391908.0
5 19324854.0
6 14522271.0
7 9672631.0
8 5974700.5
9 3627883.0
10 2271229.25
11 1510407.375
12 1075430.125
13 814723.25
14 647766.25
15 533420.25
16 450096.0
17 386220.90625
18 335321.75
19 293603.4375
20 258767.0
21 229241.609375
22 203915.703125
23 182022.375
24 162986.71875
25 146339.015625
26 131716.265625
27 118820.7421875
28 107426.65625
29 97327.8046875
30 88346.3828125
31 80329.78125
32 73161.46875
33 66740.953125
34 60980.87890625
35 55792.4609375
36 51108.96875
37 46878.1875
38 43045.4609375
39 39569.80078125
40 36413.92578125
41 33546.8203125
42 30935.4296875
43 28554.107421875
44 26379.09375
45 24388.76171875
46 22565.322265625
47 20895.298828125
48 19364.55859375
49 17958.673828125
50 16666.16015625
51 15477.0009765625
52 14382.2802734375
53 13373.251953125
54 12442.3212890625
55 11582.8427734375
56 10789.923828125
57 10055.8974609375
58 9376.7216796875
59 8747.9697265625
60 8165.1611328125
61 7624.90869140625

391 0.0006221561343409121
392 0.0006042773020453751
393 0.0005853329203091562
394 0.0005685716168954968
395 0.000550613971427083
396 0.0005353621090762317
397 0.0005193236866034567
398 0.0005044841091148555
399 0.0004898031475022435
400 0.00047604896826669574
401 0.00046175214811228216
402 0.00044923368841409683
403 0.0004369726812001318
404 0.0004257158434484154
405 0.00041319834417663515
406 0.0004022488137707114
407 0.00039075722452253103
408 0.0003801585698965937
409 0.0003704993869177997
410 0.0003607084508985281
411 0.00035171068157069385
412 0.000341628969181329
413 0.00033256839378736913
414 0.0003239458310417831
415 0.0003157758037559688
416 0.0003080582246184349
417 0.0003001517034135759
418 0.0002924554282799363
419 0.000285578949842602
420 0.0002776942274067551
421 0.00027184566715732217
422 0.00026487658033147454
423 0.00025879021268337965
424 0.0002524453157093376
425 0.0002468163729645312
426 0.00024084300093818456
427 0.0002347449481021613
428 0.00022989246645011008
429