## Learning PyTorch with Examples (1)

The code is identical to that of the official [PyTorch tutorial](https://pytorch.org/tutorials/beginner/pytorch_with_examples.html).

### Warmup: NumPy

Before trying PyTorch directly, we will implement a simple neural network using NumPy.

In [1]:
import numpy as np

# Batch size, input width, hidden width and output width of the network.
N = 64
D_in, H, D_out = 1000, 100, 10

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two linear layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum of squared errors, reported every 20th iteration.
    residual = y_pred - y
    loss = np.square(residual).sum()
    if t % 20 == 0:
        print(t, loss)

    # Backward pass: hand-derived gradients of the loss w.r.t. both weights.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)  # (H, N) x (N, D_out) -> (H, D_out)

    # Gradient flowing back through the ReLU: it is blocked wherever the
    # pre-activation was negative.  np.dot returns a fresh array, so it can
    # be masked in place.
    grad_h = np.dot(grad_y_pred, w2.T)  # (N, D_out) x (D_out, H) -> (N, H)
    grad_h[h < 0] = 0

    grad_w1 = np.dot(x.T, grad_h)  # (D_in, N) x (N, H) -> (D_in, H)

    # Plain gradient-descent step.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 31900382.275380343
20 216525.89680977212
40 27922.71699749924
60 6152.411313553863
80 1837.8875162748384
100 662.9635080426337
120 267.0818121612483
140 114.65948277077416
160 51.146208705379266
180 23.382495645755704
200 10.868462006957207
220 5.111242712521417
240 2.4243507134483684
260 1.157142111453086
280 0.5549743741841484
300 0.2671512299468225
320 0.128972451138516
340 0.06240927262403543
360 0.030257205879535523
380 0.014692555163470617
400 0.007143893346126569
420 0.003477607268483101
440 0.0016945763002369043
460 0.0008264629315300131
480 0.000403398652010207


### PyTorch: Tensors

Why PyTorch: it introduces the concept of a **Tensor**, which can utilize a GPU to accelerate its computation.

In [2]:
import torch

dtype = torch.float
device = torch.device("cpu")

# Batch size, input width, hidden width and output width of the network.
N = 64
D_in, H, D_out = 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights of the two linear layers.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear (torch.mm is matrix multiplication).
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum of squared errors; .item() extracts the Python number from the
    # resulting one-element tensor.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    if t % 20 == 0:
        print(t, loss)

    # Backward pass: hand-derived gradients of the loss w.r.t. both weights.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)

    # Gradient flowing back through the ReLU: zero wherever the
    # pre-activation was negative.
    grad_h = torch.mm(grad_y_pred, w2.t()).masked_fill(h < 0, 0)

    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 33310936.0
20 241361.09375
40 29715.150390625
60 5819.16015625
80 1404.984375
100 379.70257568359375
120 109.82647705078125
140 33.211524963378906
160 10.366438865661621
180 3.313112735748291
200 1.078428030014038
220 0.35614708065986633
240 0.11905832588672638
260 0.040213409811258316
280 0.013779652304947376
300 0.004883614834398031
320 0.001871271408163011
340 0.0008093949290923774
360 0.0003989505930803716
380 0.00022095948224887252
400 0.0001338231231784448
420 8.757281466387212e-05
440 6.118955207057297e-05
460 4.4693417294183746e-05
480 3.455495243542828e-05


### Autograd

- The PyTorch autograd package supports automatic computation of backward passes.
- If `x` is a Tensor that has `x.requires_grad=True`, then after a backward pass `x.grad` is another Tensor that holds the gradient of some scalar value (e.g. the loss) with respect to `x`.

In [3]:
import torch

dtype = torch.float
device = torch.device('cpu')

# Batch size, input width, hidden width and output width of the network.
N = 64
D_in, H, D_out = 1000, 100, 10

# Random inputs and random regression targets (no gradients needed for these).
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights are flagged with requires_grad=True so that autograd tracks them.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    hidden = torch.mm(x, w1).clamp(min=0)
    y_pred = torch.mm(hidden, w2)

    # Sum of squared errors, kept as a tensor so it can be back-propagated.
    loss = (y_pred - y).pow(2).sum()
    if t % 20 == 0:
        print(t, loss.item())

    # Autograd fills in w1.grad and w2.grad.
    loss.backward()

    # The update itself must not be tracked by autograd.  Gradients are
    # zeroed after each step so the next backward() starts from zero.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            weight.grad.zero_()

0 36565664.0
20 221575.828125
40 27674.568359375
60 5455.25390625
80 1302.4947509765625
100 341.1291198730469
120 94.07901000976562
140 26.78451919555664
160 7.795941352844238
180 2.3056395053863525
200 0.6908807754516602
220 0.20928654074668884
240 0.06400889158248901
260 0.0198016706854105
280 0.0062724994495511055
300 0.002143196063116193
320 0.000842496519908309
340 0.00038975474308244884
360 0.0002074890653602779
380 0.00012354753562249243
400 8.00056877778843e-05
420 5.549774505198002e-05
440 4.073899981449358e-05
460 3.123254646197893e-05
480 2.468096135999076e-05


### Defining new autograd functions

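Under the hood, each primitive autograd operator is really two functions that operate on Tensors.
- `forward()`: computes output tensors from input tensors.
- `backward()`: receives the gradient of the output tensors with respect to some scalar value and computes the gradient of the input tensors with respect to that same scalar.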

We define our own autograd operator by defining a subclass of `torch.autograd.Function`.

In [4]:
import torch

class MyReLU(torch.autograd.Function):
    """ReLU implemented as a custom autograd function.

    Use it through ``MyReLU.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, input_):
        """Return ``input_`` with every negative entry replaced by zero.

        The input is stashed on ``ctx`` so that ``backward`` can tell which
        entries were negative.
        """
        ctx.save_for_backward(input_)
        return torch.clamp(input_, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient with respect to the input of ``forward``.

        ``grad_output`` is passed through unchanged, except that it is set
        to zero wherever the saved input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        return grad_output.masked_fill(saved_input < 0, 0)
    
dtype = torch.float
device = torch.device('cpu')

# Batch size, input width, hidden width and output width of the network.
N = 64
D_in, H, D_out = 1000, 100, 10

# Random inputs and random regression targets (no gradients needed for these).
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights are flagged with requires_grad=True so that autograd tracks them.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# Custom autograd functions are invoked through their ``apply`` method.
# The alias does not change between iterations, so it is bound once here
# rather than on every pass through the loop.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> custom ReLU -> linear.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Sum of squared errors, kept as a tensor so it can be back-propagated.
    loss = (y_pred - y).pow(2).sum()
    if t % 20 == 0:
        print(t, loss.item())

    # Autograd fills in w1.grad and w2.grad.
    loss.backward()

    # The update itself must not be tracked by autograd.  Gradients are
    # zeroed after each step so the next backward() starts from zero.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        w1.grad.zero_()
        w2.grad.zero_()

0 34426288.0
20 228508.71875
40 33925.78515625
60 8762.6279296875
80 2869.01025390625
100 1067.605224609375
120 428.014892578125
140 179.7257843017578
160 77.81365966796875
180 34.41069030761719
200 15.454386711120605
220 7.022711753845215
240 3.2210559844970703
260 1.4885547161102295
280 0.6921794414520264
300 0.3236069679260254
320 0.15201005339622498
340 0.07172486931085587
360 0.034040845930576324
380 0.016309797763824463
400 0.00796152651309967
420 0.004013003781437874
440 0.0021224436350166798
460 0.0011807273840531707
480 0.0006944339256733656
