PyTorch provides two main features:

1. An n-dimensional Tensor, similar to a NumPy array but able to run on GPUs
2. Automatic differentiation for building and training neural networks

NumPy implementation of a simple neural network

In [3]:
import numpy as np

# Network dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets drawn from a standard normal.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Weight matrices of the two linear layers, also standard-normal.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss, reported on every step.
    loss = np.sum(np.square(y_pred - y))
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the output back to w1.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # The ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied to the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 29567787.738610502
1 24457765.147065353
2 22659605.8826482
3 20876017.786632977
4 17926053.166493855
5 13784792.658249406
6 9627475.332427816
7 6245486.32385573
8 3957971.239617762
9 2536621.770930306
10 1699221.564872202
11 1202452.5261491346
12 899854.4979643864
13 705530.9641489021
14 573325.2522519502
15 477984.4204233486
16 405813.7688010348
17 348910.5373758633
18 302646.43417220964
19 264243.56659488997
20 231884.20427245984
21 204333.74148629763
22 180651.77076087677
23 160233.3945872188
24 142477.147804356
25 126984.78904275344
26 113420.79218311023
27 101502.03534443589
28 91007.38179749422
29 81731.11435985743
30 73518.33448503519
31 66233.57529309057
32 59753.16610717938
33 53983.1721642143
34 48829.96527533239
35 44222.529949841315
36 40095.36811382796
37 36394.1852193477
38 33068.98821346843
39 30079.300746631227
40 27384.50598861856
41 24958.714476061643
42 22772.344341866203
43 20795.2204913157
44 19004.905277962065
45 17381.204079249394
46 15908.143105318792
47 14570

363 5.103116891708409e-05
364 4.842356281712531e-05
365 4.595142561641726e-05
366 4.360915090324755e-05
367 4.13862063469515e-05
368 3.9277815785248995e-05
369 3.7278466337585265e-05
370 3.5382615996263405e-05
371 3.358438049182483e-05
372 3.187959300429893e-05
373 3.026186873948641e-05
374 2.872779167098572e-05
375 2.7272276769362002e-05
376 2.5891959648536885e-05
377 2.458234439674e-05
378 2.3339833557484153e-05
379 2.216077908640371e-05
380 2.1042222952467133e-05
381 1.9980925388783588e-05
382 1.8974060005472825e-05
383 1.8018642000848312e-05
384 1.711203950042868e-05
385 1.6251574190151847e-05
386 1.543495275499156e-05
387 1.466028908662597e-05
388 1.3924799978571695e-05
389 1.322678041080549e-05
390 1.2564220408224946e-05
391 1.193524683757781e-05
392 1.1338150961045892e-05
393 1.0771325506880275e-05
394 1.0233283064218375e-05
395 9.722606159873932e-06
396 9.237682106487305e-06
397 8.777392819580389e-06
398 8.340499105188244e-06
399 7.925386446577256e-06
400 7.531250167373526e-06


Behind the scenes, Tensors can keep track of a computational graph and gradients, but they’re also useful as a generic tool for scientific computing.

In [4]:
import torch

In [5]:
# Keep every tensor on the CPU and in 32-bit floating point.
device = torch.device("cpu")
dtype = torch.float32  # torch.float is an alias of torch.float32

In [6]:
# Random input and target tensors, one row per sample in the batch.
x = torch.randn((N, D_in), device=device, dtype=dtype)
y = torch.randn((N, D_out), device=device, dtype=dtype)

# Weights of the two layers, drawn from a standard normal distribution.
w1 = torch.randn((D_in, H), device=device, dtype=dtype)
w2 = torch.randn((H, D_out), device=device, dtype=dtype)

In [7]:
learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss as a Python float; report every 100th step.
    loss = torch.sum(torch.pow(y_pred - y, 2)).item()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass: hand-written chain rule, from the output back to w1.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # Zero the gradient wherever the ReLU input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0.0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied to the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

99 645.451171875
199 4.03122091293335
299 0.038073353469371796
399 0.0006547098746523261
499 6.61577214486897e-05


Autograd

In [8]:
import torch

In [9]:
# Tensors in this section live on the CPU and use 32-bit floats.
device = torch.device("cpu")
dtype = torch.float32  # same dtype object as torch.float

In [10]:
# Batch size, input dimension, hidden dimension and output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

In [11]:
# Random input and target tensors; no gradients are needed for the data.
x = torch.randn((N, D_in), device=device, dtype=dtype)
y = torch.randn((N, D_out), device=device, dtype=dtype)

In [12]:
# Random weights. requires_grad=True asks autograd to track operations on
# these tensors so that backward() can fill in w1.grad and w2.grad.
w1 = torch.randn((D_in, H), device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn((H, D_out), device=device, dtype=dtype, requires_grad=True)

In [15]:
# Step size for gradient descent. It is set here so that this cell does not
# depend on whatever value of `learning_rate` another cell last left in the
# kernel (a different cell in this notebook uses 1e-4).
learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear -> ReLU -> linear. No intermediate values are kept
    # by hand; the backward pass is derived by autograd.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Sum-of-squares loss, kept as a Tensor so it can be differentiated.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Compute the gradient of the loss with respect to every tensor that has
    # requires_grad=True; the results land in w1.grad and w2.grad.
    loss.backward()

    # The weight update itself must not be recorded by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights: backward()
        # accumulates into .grad instead of overwriting it.
        w1.grad.zero_()
        w2.grad.zero_()

99 402.6940612792969
199 2.1221063137054443
299 0.026554986834526062
399 0.0006959157763049006
499 9.091220272239298e-05


Define new autograd functions

In [16]:
import torch

In [17]:
class MyReLU(torch.autograd.Function):
    """
    Custom autograd Function implementing ReLU with a hand-written backward
    pass. Call it through ``MyReLU.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. Tensors needed by the
        backward pass are cached with the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the
        loss with respect to the output, and return the gradient of the loss
        with respect to the input.
        """
        # saved_tensors is a tuple of the Tensors handed to save_for_backward
        # in forward(); here it holds exactly one element.
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        # ReLU has zero slope wherever its input was negative.
        grad_input[input < 0] = 0
        return grad_input

PyTorch high-level API: torch.nn, similar to Keras

In [18]:
import torch

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target Tensors.
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

# Build the model from the nn package. nn.Sequential is a Module that holds
# other Modules and feeds each one's output into the next. Every Linear Module
# applies a linear function to its input and owns its weight and bias Tensors.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# Squared error summed (not averaged) over all output elements.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4

for t in range(500):
    # Forward pass and loss; report every 100th step.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Clear the old gradients, then backpropagate to fill each param.grad.
    model.zero_grad()
    loss.backward()

    # Gradient-descent step on every parameter, outside autograd tracking.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

99 1.8192226886749268
199 0.02704755589365959
299 0.0008998168050311506
399 4.299845022615045e-05
499 2.5010826902871486e-06
