## Learning PyTorch with Examples (1)

The code is identical to the [PyTorch tutorial](https://pytorch.org/tutorials/beginner/pytorch_with_examples.html).

### Warmup: NumPy

Before trying PyTorch directly, we will implement a simple neural network using NumPy.

In [1]:
import numpy as np

# Problem size: N samples per batch, D_in input features,
# H hidden units and D_out output features.
N = 64
D_in, H, D_out = 1000, 100, 10

# Random training data: inputs (N, D_in) and targets (N, D_out).
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two linear layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum of squared errors over the whole batch.
    loss = np.sum((y_pred - y) ** 2)
    print(t, loss)

    # Backward pass: apply the chain rule by hand, layer by layer.
    grad_y_pred = 2.0 * (y_pred - y)
    # (H, N) dot (N, D_out) => (H, D_out)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)

    # (N, D_out) dot (D_out, H) => (N, H)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)

    # (D_in, N) dot (N, H) => (D_in, H)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step, updating the weight arrays in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 32557342.0017742
1 33936275.1533782
2 39766433.10262542
3 43136284.36050465
4 37077686.33736059
5 23775772.53028822
6 11659246.879426442
7 5191286.727171885
8 2545547.2232502163
9 1531195.64591746
10 1095568.4242044631
11 866634.7261295571
12 719784.0305691361
13 612101.1663855724
14 527351.3849335301
15 458185.20470855443
16 400722.04699009715
17 352448.243312605
18 311538.0092572969
19 276619.1223502651
20 246592.12954510853
21 220601.8057213456
22 198020.34467547387
23 178314.91639595682
24 161036.9523575441
25 145816.23397140365
26 132357.3964985999
27 120427.2286994028
28 109787.5346325582
29 100275.48140594066
30 91753.8938192091
31 84101.55058441116
32 77212.12512336602
33 70992.42197892262
34 65369.22155603588
35 60274.099004744014
36 55654.82070509515
37 51455.73055388755
38 47626.96428449915
39 44130.24223505327
40 40935.36890777509
41 38007.266904513264
42 35322.499192669275
43 32857.96247879347
44 30589.276473425147
45 28502.07528014917
46 26578.50243808698
47 24802.47072

421 0.012531544677355778
422 0.012161273285564165
423 0.011802114671602483
424 0.011453682420040329
425 0.011115698084866087
426 0.010787878530163383
427 0.010469854847494206
428 0.01016147433689348
429 0.009862272005672655
430 0.009571960019259827
431 0.009290327847060726
432 0.00901713164193054
433 0.008752047724819467
434 0.00849492688620852
435 0.008245428937839323
436 0.008003430298794412
437 0.007768609130621363
438 0.007540773470229842
439 0.007319714264497491
440 0.007105253506871894
441 0.006897150788513396
442 0.0066952301268196675
443 0.006499284796938148
444 0.006309184867396447
445 0.0061247332678963425
446 0.005945733659397105
447 0.005772019992240038
448 0.005603452837173277
449 0.005439904003827888
450 0.00528122521405768
451 0.005127183462248205
452 0.004977693856940348
453 0.004832673314762054
454 0.004691898654173209
455 0.0045552831097328056
456 0.004422684439966293
457 0.004293998099243783
458 0.004169129477975313
459 0.004047921172637366
460 0.00393029293711653
46

### PyTorch: Tensors

Why PyTorch: it introduces the concept of a **Tensor**, which can utilize GPUs to accelerate its computations.

In [8]:
import torch

# All tensors below are created with this dtype on this device.
dtype = torch.float
device = torch.device("cpu")

# Problem size: N samples per batch, D_in input features,
# H hidden units and D_out output features.
N = 64
D_in, H, D_out = 1000, 100, 10

# Random training data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights of the two linear layers.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear (torch.mm is matrix multiplication).
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum of squared errors; item() turns the one-element tensor
    # into a plain Python number.
    loss = torch.sum((y_pred - y) ** 2).item()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, layer by layer.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)

    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative;
    # masked_fill returns a new tensor and leaves grad_h_relu untouched.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)

    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, updating the weight tensors in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 40215344.0
1 39363040.0
2 41843640.0
3 38549212.0
4 27324412.0
5 14593624.0
6 6686123.0
7 3181555.75
8 1812421.5
9 1233028.625
10 937526.0
11 753491.125
12 622139.1875
13 521339.75
14 441036.15625
15 375842.0625
16 322298.53125
17 277950.15625
18 240888.640625
19 209741.734375
20 183420.84375
21 161025.015625
22 141874.59375
23 125417.9375
24 111226.6640625
25 98949.0
26 88273.4921875
27 78952.1328125
28 70782.5078125
29 63602.4453125
30 57275.5234375
31 51689.12109375
32 46737.80078125
33 42336.71484375
34 38416.5625
35 34915.3125
36 31781.9375
37 28970.8515625
38 26443.998046875
39 24169.443359375
40 22120.88671875
41 20269.751953125
42 18594.884765625
43 17076.54296875
44 15698.875
45 14446.7958984375
46 13306.62109375
47 12267.556640625
48 11318.939453125
49 10452.2783203125
50 9660.265625
51 8934.8330078125
52 8269.8525390625
53 7660.11767578125
54 7099.71240234375
55 6584.4521484375
56 6110.40478515625
57 5673.7373046875
58 5271.55078125
59 4900.35546875
60 4557.72216796875
61 

405 0.0002756573085207492
406 0.000269326992565766
407 0.0002634899574331939
408 0.0002574683749116957
409 0.00025087734684348106
410 0.00024501755251549184
411 0.00023955281358212233
412 0.00023350170522462577
413 0.0002284915535710752
414 0.0002228644152637571
415 0.00021818579989485443
416 0.00021244020899757743
417 0.0002082886639982462
418 0.00020374688028823584
419 0.0001994311169255525
420 0.00019561604131013155
421 0.00019124838581774384
422 0.00018693922902457416
423 0.00018305810226593167
424 0.0001792814291547984
425 0.00017540593398734927
426 0.00017151853535324335
427 0.00016791073721833527
428 0.00016472768038511276
429 0.00016117551422212273
430 0.0001583093689987436
431 0.000155545276356861
432 0.00015267952403519303
433 0.00014995175297372043
434 0.00014652672689408064
435 0.00014352568541653454
436 0.00014103433932177722
437 0.00013812139513902366
438 0.00013604141713585705
439 0.0001330582017544657
440 0.00013121135998517275
441 0.00012888021592516452
442 0.000126286

### Autograd

- The PyTorch autograd package computes backward passes automatically.
- If `x` is a Tensor with `x.requires_grad=True`, then after a backward pass `x.grad` is another Tensor that holds the gradient with respect to `x`.

In [10]:
import torch

# All tensors below are created with this dtype on this device.
dtype = torch.float
device = torch.device('cpu')

# Problem size: N samples per batch, D_in input features,
# H hidden units and D_out output features.
N = 64
D_in, H, D_out = 1000, 100, 10

# Random training data; no gradients are requested for these.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights are created with requires_grad=True so that backward()
# fills in w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear, written as one expression
    # because the intermediate values are not needed by hand any more.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum of squared errors, kept as a tensor so it can be differentiated.
    loss = torch.sum((y_pred - y) ** 2)
    print(t, loss.item())

    # Let autograd compute the gradients of the loss.
    loss.backward()

    # The weight update itself must not be tracked by autograd.
    with torch.no_grad():
        # Gradient-descent step in place, then reset the gradient of each
        # weight before the next iteration's backward pass.
        w1.sub_(learning_rate * w1.grad)
        w1.grad.zero_()

        w2.sub_(learning_rate * w2.grad)
        w2.grad.zero_()

0 35168256.0
1 33282198.0
2 33473942.0
3 30371536.0
4 23095916.0
5 14405005.0
6 7955375.0
7 4303159.5
8 2518321.25
9 1654290.875
10 1206019.375
11 943351.5
12 769800.5
13 643725.875
14 546563.375
15 468453.8125
16 404348.78125
17 350928.46875
18 306013.96875
19 267973.65625
20 235546.484375
21 207765.59375
22 183904.96875
23 163271.4375
24 145342.234375
25 129714.2421875
26 116051.3515625
27 104057.7578125
28 93500.640625
29 84185.609375
30 75936.0390625
31 68621.2109375
32 62104.98828125
33 56295.82421875
34 51112.078125
35 46475.046875
36 42314.55078125
37 38575.1171875
38 35219.140625
39 32196.044921875
40 29467.759765625
41 27001.4609375
42 24767.34765625
43 22740.46484375
44 20899.771484375
45 19224.7265625
46 17698.115234375
47 16307.07421875
48 15037.0458984375
49 13875.939453125
50 12813.7353515625
51 11840.3994140625
52 10948.51171875
53 10130.123046875
54 9378.5732421875
55 8688.3408203125
56 8053.25244140625
57 7468.87548828125
58 6930.5107421875
59 6434.255859375
60 5977.36

427 0.00017302404739893973
428 0.00016942857473623008
429 0.0001655536616453901
430 0.00016227700689341873
431 0.0001587531151017174
432 0.00015614056610502303
433 0.00015262242231983691
434 0.00014931618352420628
435 0.00014655390987172723
436 0.0001435426966054365
437 0.00014124462904874235
438 0.00013864925131201744
439 0.00013586608110927045
440 0.00013346466585062444
441 0.00013048235268797725
442 0.00012877526751253754
443 0.00012652478471864015
444 0.00012418543337844312
445 0.00012205239909235388
446 0.00011971734784310684
447 0.00011784865637309849
448 0.00011581536818994209
449 0.00011398997594369575
450 0.00011185796756763011
451 0.00010965978435706347
452 0.0001080979491234757
453 0.00010583855328150094
454 0.00010389810631750152
455 0.00010234052751911804
456 0.00010055405437014997
457 9.89523614407517e-05
458 9.743669943418354e-05
459 9.572684211889282e-05
460 9.458776912651956e-05
461 9.277810022467747e-05
462 9.131677506957203e-05
463 8.970637281890959e-05
464 8.8156339

### Defining new autograd functions

A primitive autograd operator is really two functions that operate on Tensors.
- `forward()`: computes output tensors from input tensors.
- `backward()`: receives the gradients of the output tensors and computes the gradients of the input tensors.

We define our own autograd operator by defining a subclass of `torch.autograd.Function`.

In [13]:
import torch

class MyReLU(torch.autograd.Function):
    """Custom autograd ReLU with hand-written forward and backward passes.

    Use it through ``MyReLU.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, input_):
        """Return ``input_`` with every value below zero replaced by zero.

        The input is stashed on ``ctx`` so that ``backward`` can tell which
        positions were negative.
        """
        ctx.save_for_backward(input_)
        return torch.clamp(input_, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient with respect to the input of ``forward``.

        The incoming gradient is passed through unchanged, except that it is
        set to zero wherever the saved input was strictly negative.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill is out-of-place, so grad_output itself is never modified.
        return grad_output.masked_fill(saved_input < 0, 0)
    
# All tensors below are created with this dtype on this device.
dtype = torch.float
device = torch.device('cpu')

# Problem size: N samples per batch, D_in input features,
# H hidden units and D_out output features.
N = 64
D_in, H, D_out = 1000, 100, 10

# Random training data; no gradients are requested for these.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights are created with requires_grad=True so that backward()
# fills in w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# A custom autograd Function is invoked through its apply method; the alias
# does not change between iterations, so it is bound once up front.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> custom ReLU -> linear.
    y_pred = torch.mm(relu(torch.mm(x, w1)), w2)

    # Sum of squared errors, kept as a tensor so it can be differentiated.
    loss = torch.sum((y_pred - y) ** 2)
    print(t, loss.item())

    # Autograd calls MyReLU.backward as part of this pass.
    loss.backward()

    # The weight update itself must not be tracked by autograd.
    with torch.no_grad():
        # Gradient-descent step in place, then reset the gradient of each
        # weight before the next iteration's backward pass.
        w1.sub_(learning_rate * w1.grad)
        w1.grad.zero_()

        w2.sub_(learning_rate * w2.grad)
        w2.grad.zero_()

0 28984088.0
1 26258994.0
2 27854508.0
3 29318956.0
4 27533566.0
5 21232484.0
6 13595867.0
7 7537240.5
8 4060210.0
9 2322048.0
10 1493268.875
11 1075161.25
12 840750.0
13 691282.6875
14 584952.6875
15 503122.5
16 437110.90625
17 382325.75
18 336303.21875
19 297094.71875
20 263400.21875
21 234313.578125
22 209090.203125
23 187107.4375
24 167855.515625
25 150936.734375
26 136034.15625
27 122882.3359375
28 111222.65625
29 100851.4140625
30 91602.21875
31 83342.21875
32 75955.296875
33 69326.359375
34 63359.82421875
35 57979.91796875
36 53118.9609375
37 48719.4765625
38 44733.45703125
39 41114.51171875
40 37827.21875
41 34836.58984375
42 32111.978515625
43 29625.533203125
44 27352.7734375
45 25274.82421875
46 23371.435546875
47 21627.609375
48 20028.544921875
49 18558.884765625
50 17207.70703125
51 15964.4814453125
52 14819.4072265625
53 13763.876953125
54 12790.30078125
55 11892.9228515625
56 11063.9140625
57 10297.3828125
58 9588.083984375
59 8931.720703125
60 8323.888671875
61 7760.4697

391 0.00025373129756189883
392 0.00024681963259354234
393 0.00024086833582259715
394 0.00023429060820490122
395 0.0002273684076499194
396 0.00022189815354067832
397 0.00021590606775134802
398 0.00021063047461211681
399 0.0002045946312136948
400 0.0001998818916035816
401 0.00019477812747936696
402 0.0001902967633213848
403 0.0001854044385254383
404 0.000181041716132313
405 0.00017641621525399387
406 0.00017147342441603541
407 0.00016746632172726095
408 0.00016369450895581394
409 0.0001597984228283167
410 0.00015609436377417296
411 0.00015231412544380873
412 0.00014917005319148302
413 0.0001460008352296427
414 0.000142776858410798
415 0.0001390567485941574
416 0.00013624269922729582
417 0.00013326096814125776
418 0.00013013130228500813
419 0.0001272287336178124
420 0.00012479237921070307
421 0.00012199304183013737
422 0.00011913834896404296
423 0.00011653749970719218
424 0.00011448859731899574
425 0.00011227426875848323
426 0.0001101934103644453
427 0.00010748939530458301
428 0.000105389