In [0]:
import numpy as np
import torch

In [2]:
# Report the PyTorch version, the active CUDA device name, and CUDA availability.
print(torch.__version__)
# torch.cuda.get_device_name() raises when no CUDA device is usable, so only
# query it after confirming availability. With CUDA present the three printed
# lines are the same as before.
cudaAvailable = torch.cuda.is_available()
print(torch.cuda.get_device_name() if cudaAvailable else "No CUDA device available")
print(cudaAvailable)

1.1.0
Tesla T4
True


## Two-layer network example: Manual learning

In [0]:
# Shared configuration, data and initial weights for the two-layer examples.
dtype = torch.float
# Prefer the GPU, but fall back to the CPU so the tensors below can still be
# created on machines without a usable CUDA device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Batch size and the input / hidden / output widths of the network.
batchSize, dimIn, dimHidden, dimOut = 64, 1000, 100, 10

# Random input and output data
X = torch.randn(batchSize, dimIn, device=device, dtype=dtype)
Y = torch.randn(batchSize, dimOut, device=device, dtype=dtype)

# Randomly initialized weights (no requires_grad: gradients are derived by hand)
W1 = torch.randn(dimIn, dimHidden, device=device, dtype=dtype)
W2 = torch.randn(dimHidden, dimOut, device=device, dtype=dtype)

In [4]:
# Gradient descent on the two-layer network with hand-derived gradients.
lr = 1e-6
numEpochs = 500
for epoch in range(numEpochs):
    # Forward pass. The pre-activation H is kept because the backward pass
    # needs it for the ReLU mask; the activation is named so it can be reused.
    H = torch.mm(X, W1)
    HRelu = torch.relu(H)
    YPred = torch.mm(HRelu, W2)

    # Sum-of-squares loss, extracted as a plain Python number.
    residual = YPred - Y
    loss = residual.pow(2).sum().item()
    if epoch % 10 == 0:
        print("Epoch", epoch, ", loss =", loss)

    # Manual backprop: walk the forward pass in reverse.
    gradYPred = 2.0 * residual               # d(loss) / d(YPred)
    gradW2 = torch.mm(HRelu.t(), gradYPred)  # d(loss) / d(W2)
    gradH = torch.mm(gradYPred, W2.t())      # gradient arriving at the ReLU output
    gradH.masked_fill_(H < 0, 0)             # ReLU blocks the gradient where H < 0
    gradW1 = torch.mm(X.t(), gradH)          # d(loss) / d(W1)

    # In-place gradient-descent step.
    W1.sub_(lr * gradW1)
    W2.sub_(lr * gradW2)

Epoch 0 , loss = 30110434.0
Epoch 10 , loss = 1881811.25
Epoch 20 , loss = 232516.0625
Epoch 30 , loss = 78475.6484375
Epoch 40 , loss = 32140.7578125
Epoch 50 , loss = 14692.7177734375
Epoch 60 , loss = 7237.71728515625
Epoch 70 , loss = 3731.4755859375
Epoch 80 , loss = 1986.7626953125
Epoch 90 , loss = 1084.8441162109375
Epoch 100 , loss = 604.2651977539062
Epoch 110 , loss = 341.5616760253906
Epoch 120 , loss = 195.31419372558594
Epoch 130 , loss = 112.7406234741211
Epoch 140 , loss = 65.587646484375
Epoch 150 , loss = 38.40731430053711
Epoch 160 , loss = 22.619701385498047
Epoch 170 , loss = 13.38724136352539
Epoch 180 , loss = 7.9582672119140625
Epoch 190 , loss = 4.74953556060791
Epoch 200 , loss = 2.8446455001831055
Epoch 210 , loss = 1.7094639539718628
Epoch 220 , loss = 1.0303103923797607
Epoch 230 , loss = 0.6226578950881958
Epoch 240 , loss = 0.3773045241832733
Epoch 250 , loss = 0.22920745611190796
Epoch 260 , loss = 0.13956552743911743
Epoch 270 , loss = 0.085173964500427

## Learning with `autograd`

In [5]:
# Fresh random weights, flagged so autograd records operations on them.
W1 = torch.randn(dimIn, dimHidden, device=device, dtype=dtype).requires_grad_()
W2 = torch.randn(dimHidden, dimOut, device=device, dtype=dtype).requires_grad_()

# X and Y are reused as-is: they were created without `requires_grad`,
# so no gradients are tracked for them.

lr = 1e-6
numEpochs = 500
for epoch in range(numEpochs):
    # Forward pass. No intermediates are kept by hand; autograd records
    # what the backward pass needs.
    YPred = torch.mm(torch.relu(torch.mm(X, W1)), W2)

    # Sum-of-squares loss. It stays a (0-dim) Tensor so it can be
    # backpropagated; loss.item() extracts the Python number for printing.
    loss = torch.sum((YPred - Y) ** 2)
    if epoch % 10 == 0:
        print("Epoch", epoch, ", loss =", loss.item())

    # Backward pass: fills `W1.grad` and `W2.grad` with the gradient of the
    # loss with respect to W1 and W2.
    loss.backward()

    # The parameter update itself must not be recorded by autograd.
    with torch.no_grad():
        for weight in (W1, W2):
            weight.sub_(lr * weight.grad)
            # `backward` accumulates into `.grad`, so reset it for the
            # next iteration.
            weight.grad.zero_()

Epoch 0 , loss = 41813200.0
Epoch 10 , loss = 953174.3125
Epoch 20 , loss = 229398.171875
Epoch 30 , loss = 79504.2734375
Epoch 40 , loss = 32678.109375
Epoch 50 , loss = 14811.6865234375
Epoch 60 , loss = 7142.232421875
Epoch 70 , loss = 3595.380859375
Epoch 80 , loss = 1865.2724609375
Epoch 90 , loss = 989.3023681640625
Epoch 100 , loss = 533.7219848632812
Epoch 110 , loss = 291.90008544921875
Epoch 120 , loss = 161.42820739746094
Epoch 130 , loss = 90.09970092773438
Epoch 140 , loss = 50.67060470581055
Epoch 150 , loss = 28.68417739868164
Epoch 160 , loss = 16.32929801940918
Epoch 170 , loss = 9.339933395385742
Epoch 180 , loss = 5.364675521850586
Epoch 190 , loss = 3.0928311347961426
Epoch 200 , loss = 1.7889349460601807
Epoch 210 , loss = 1.0377182960510254
Epoch 220 , loss = 0.6035274863243103
Epoch 230 , loss = 0.35177552700042725
Epoch 240 , loss = 0.20550459623336792
Epoch 250 , loss = 0.12023797631263733
Epoch 260 , loss = 0.07052139937877655
Epoch 270 , loss = 0.041394375264

## Custom `autograd`

In [6]:
class MyReLU(torch.autograd.Function):
    """ReLU implemented as a custom autograd Function with a hand-written backward."""

    @staticmethod
    def forward(ctx, input_):
        """Compute the ReLU output Tensor from the input Tensor.

        ctx: context object used to stash tensors needed by the backward pass.
        input_: the Tensor to rectify; it is saved on `ctx` for `backward`.
        """
        ctx.save_for_backward(input_)
        return torch.nn.functional.relu(input_)

    @staticmethod
    def backward(ctx, gradOutput):
        """Compute the gradient of the loss with respect to the input.

        gradOutput: Tensor holding the gradient of the loss wrt the output.
        Returns a new Tensor equal to `gradOutput` with the entries zeroed
        wherever the saved input was negative; `gradOutput` is not modified.
        """
        (savedInput,) = ctx.saved_tensors
        # Out-of-place masked fill: same result as cloning and then zeroing
        # the masked entries.
        return gradOutput.masked_fill(savedInput < 0, 0)

# Fresh random weights tracked by autograd.
W1 = torch.randn(dimIn, dimHidden, device=device, dtype=dtype).requires_grad_()
W2 = torch.randn(dimHidden, dimOut, device=device, dtype=dtype).requires_grad_()

lr = 1e-6
numEpochs = 500
for epoch in range(numEpochs):
    # Forward pass, with the custom MyReLU Function in place of torch.relu.
    hidden = MyReLU.apply(torch.mm(X, W1))
    YPred = torch.mm(hidden, W2)

    # Sum-of-squares loss, kept as a Tensor for backpropagation.
    loss = torch.sum((YPred - Y) ** 2)
    if epoch % 10 == 0:
        print("Epoch", epoch, ", loss:", loss.item())

    # Backpropagate; autograd calls MyReLU.backward for the activation.
    loss.backward()

    # Gradient-descent step outside of autograd tracking, then reset the
    # accumulated gradients for the next iteration.
    with torch.no_grad():
        for weight in (W1, W2):
            weight.sub_(lr * weight.grad)
            weight.grad.zero_()

Epoch 0 , loss: 37581768.0
Epoch 10 , loss: 996981.25
Epoch 20 , loss: 220346.15625
Epoch 30 , loss: 77763.125
Epoch 40 , loss: 33184.27734375
Epoch 50 , loss: 15686.3330078125
Epoch 60 , loss: 7895.3759765625
Epoch 70 , loss: 4162.25
Epoch 80 , loss: 2274.115234375
Epoch 90 , loss: 1279.881591796875
Epoch 100 , loss: 739.3388671875
Epoch 110 , loss: 437.3294677734375
Epoch 120 , loss: 263.8274841308594
Epoch 130 , loss: 161.9019775390625
Epoch 140 , loss: 100.83695983886719
Epoch 150 , loss: 63.615699768066406
Epoch 160 , loss: 40.57720184326172
Epoch 170 , loss: 26.128400802612305
Epoch 180 , loss: 16.96102523803711
Epoch 190 , loss: 11.088306427001953
Epoch 200 , loss: 7.297678470611572
Epoch 210 , loss: 4.827805519104004
Epoch 220 , loss: 3.2080435752868652
Epoch 230 , loss: 2.140023708343506
Epoch 240 , loss: 1.4322056770324707
Epoch 250 , loss: 0.9614217281341553
Epoch 260 , loss: 0.6469417810440063
Epoch 270 , loss: 0.4363449215888977
Epoch 280 , loss: 0.29484623670578003
Epoch 

## Using `torch.nn`

In [7]:
# The same two-layer network, assembled from torch.nn building blocks.
layers = [
    torch.nn.Linear(dimIn, dimHidden),
    torch.nn.ReLU(),
    torch.nn.Linear(dimHidden, dimOut),
]
model = torch.nn.Sequential(*layers).to(device)

# reduction='sum' makes the loss a sum of squared errors.
lossFunc = torch.nn.MSELoss(reduction='sum')

lr = 1e-4
numEpochs = 500
for epoch in range(numEpochs):
    # Forward pass and loss.
    YPred = model(X)
    loss = lossFunc(YPred, Y)
    if epoch % 10 == 0:
        print("Epoch", epoch, ", loss:", loss.item())

    # Clear the gradients left over from the previous iteration, then
    # backpropagate to fill `.grad` on every parameter of `model`.
    model.zero_grad()
    loss.backward()

    # Manual gradient-descent step, kept out of autograd's record.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(lr * param.grad)

Epoch 0 , loss: 667.0211181640625
Epoch 10 , loss: 372.93719482421875
Epoch 20 , loss: 225.90145874023438
Epoch 30 , loss: 132.2462921142578
Epoch 40 , loss: 75.00849914550781
Epoch 50 , loss: 42.240482330322266
Epoch 60 , loss: 24.232006072998047
Epoch 70 , loss: 14.303144454956055
Epoch 80 , loss: 8.689802169799805
Epoch 90 , loss: 5.4155497550964355
Epoch 100 , loss: 3.4450552463531494
Epoch 110 , loss: 2.227128505706787
Epoch 120 , loss: 1.4571682214736938
Epoch 130 , loss: 0.9619020223617554
Epoch 140 , loss: 0.6394496560096741
Epoch 150 , loss: 0.42782217264175415
Epoch 160 , loss: 0.2882079482078552
Epoch 170 , loss: 0.19537928700447083
Epoch 180 , loss: 0.13322681188583374
Epoch 190 , loss: 0.0913565456867218
Epoch 200 , loss: 0.06296852231025696
Epoch 210 , loss: 0.04365168884396553
Epoch 220 , loss: 0.0304095558822155
Epoch 230 , loss: 0.021283671259880066
Epoch 240 , loss: 0.014960355125367641
Epoch 250 , loss: 0.01055900938808918
Epoch 260 , loss: 0.0074802786111831665
Epoc

## Using `optim`

In [11]:
# Same model as before; the parameter update is delegated to an optimizer.
layers = [
    torch.nn.Linear(dimIn, dimHidden),
    torch.nn.ReLU(),
    torch.nn.Linear(dimHidden, dimOut),
]
model = torch.nn.Sequential(*layers).to(device)

lossFunc = torch.nn.MSELoss(reduction='sum')

lr = 1e-4
numEpochs = 500
# Adam is handed the model's parameters and updates them in `step()`.
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
for epoch in range(numEpochs):
    # Forward pass and loss.
    YPred = model(X)
    loss = lossFunc(YPred, Y)
    if epoch % 10 == 0:
        print("Epoch", epoch, ", loss:", loss.item())

    # `backward` accumulates gradients rather than overwriting them, so
    # clear them first.
    optimizer.zero_grad()

    # Backpropagate, then let the optimizer update the parameters.
    loss.backward()
    optimizer.step()

Epoch 0 , loss: 666.832763671875
Epoch 10 , loss: 515.1885375976562
Epoch 20 , loss: 402.29302978515625
Epoch 30 , loss: 315.60009765625
Epoch 40 , loss: 248.55567932128906
Epoch 50 , loss: 194.6269073486328
Epoch 60 , loss: 150.83323669433594
Epoch 70 , loss: 115.55943298339844
Epoch 80 , loss: 87.15130615234375
Epoch 90 , loss: 64.56743621826172
Epoch 100 , loss: 46.84429168701172
Epoch 110 , loss: 33.12757873535156
Epoch 120 , loss: 22.792102813720703
Epoch 130 , loss: 15.274070739746094
Epoch 140 , loss: 10.000965118408203
Epoch 150 , loss: 6.410922527313232
Epoch 160 , loss: 4.0355095863342285
Epoch 170 , loss: 2.5035464763641357
Epoch 180 , loss: 1.5394915342330933
Epoch 190 , loss: 0.945873498916626
Epoch 200 , loss: 0.5833356380462646
Epoch 210 , loss: 0.3631114363670349
Epoch 220 , loss: 0.2295474112033844
Epoch 230 , loss: 0.14826789498329163
Epoch 240 , loss: 0.09816903620958328
Epoch 250 , loss: 0.06667209416627884
Epoch 260 , loss: 0.04640299081802368
Epoch 270 , loss: 0.0

## Custom `nn`

Subclass `torch.nn.Module` to define more complex models.

In [13]:
class TwoLayerNet(torch.nn.Module):
    """Two-layer network: Linear -> ReLU -> Linear."""

    def __init__(self, dimIn, dimHidden, dimOut):
        """Create the two linear layers (dimIn -> dimHidden -> dimOut)."""
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(dimIn, dimHidden)
        self.linear2 = torch.nn.Linear(dimHidden, dimOut)

    def forward(self, X):
        """Map an input Tensor to the predicted output Tensor.

        Applies `linear1`, then ReLU, then `linear2`.
        """
        hidden = torch.nn.functional.relu(self.linear1(X))
        return self.linear2(hidden)

# Train the custom module with plain SGD on the sum-of-squares loss.
model = TwoLayerNet(dimIn, dimHidden, dimOut)
model = model.to(device)
lossFunc = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4)
for epoch in range(500):
    # Forward pass and loss.
    YPred = model(X)
    loss = lossFunc(YPred, Y)
    if epoch % 10 == 0:
        print("Epoch", epoch, ", loss:", loss.item())

    # Clear stale gradients, backpropagate, then take an optimizer step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch 0 , loss: 645.6306762695312
Epoch 10 , loss: 359.6498107910156
Epoch 20 , loss: 208.09262084960938
Epoch 30 , loss: 111.8485107421875
Epoch 40 , loss: 57.78961944580078
Epoch 50 , loss: 30.421998977661133
Epoch 60 , loss: 16.76714515686035
Epoch 70 , loss: 9.630976676940918
Epoch 80 , loss: 5.703324317932129
Epoch 90 , loss: 3.4544947147369385
Epoch 100 , loss: 2.1289784908294678
Epoch 110 , loss: 1.3320127725601196
Epoch 120 , loss: 0.8451871275901794
Epoch 130 , loss: 0.5428599119186401
Epoch 140 , loss: 0.3532104790210724
Epoch 150 , loss: 0.23301978409290314
Epoch 160 , loss: 0.15587753057479858
Epoch 170 , loss: 0.10552483052015305
Epoch 180 , loss: 0.07225219160318375
Epoch 190 , loss: 0.05001062527298927
Epoch 200 , loss: 0.03493734449148178
Epoch 210 , loss: 0.024613944813609123
Epoch 220 , loss: 0.017473118379712105
Epoch 230 , loss: 0.012489109300076962
Epoch 240 , loss: 0.008983037434518337
Epoch 250 , loss: 0.0064963968470692635
Epoch 260 , loss: 0.004720793105661869


## Control flow + weight sharing: An example of dynamic graphs

On each forward pass, apply a shared-weight hidden layer a randomly chosen number of times (between 0 and 3).

In [16]:
import random

class DynamicNet(torch.nn.Module):
    """Network whose depth is drawn at random on every forward pass.

    The same `midLinear` layer (shared weights) is applied between the input
    and output layers a random number of times.
    """

    def __init__(self, dimIn, dimHidden, dimOut):
        """Create the input, shared middle, and output linear layers."""
        super(DynamicNet, self).__init__()
        self.inLinear = torch.nn.Linear(dimIn, dimHidden)
        self.midLinear = torch.nn.Linear(dimHidden, dimHidden)
        self.outLinear = torch.nn.Linear(dimHidden, dimOut)

    def forward(self, X):
        """Return the prediction for X using 0 to 3 passes through `midLinear`."""
        # Clamping at a minimum of 0 acts as the ReLU activation.
        hidden = torch.clamp(self.inLinear(X), min=0)
        # random.randint is inclusive on both ends: 0, 1, 2 or 3 repeats.
        numRepeats = random.randint(0, 3)
        while numRepeats > 0:
            hidden = torch.clamp(self.midLinear(hidden), min=0)
            numRepeats -= 1
        return self.outLinear(hidden)

# Train the randomly-deep network on the sum-of-squares loss.
model = DynamicNet(dimIn, dimHidden, dimOut)
model = model.to(device)
lossFunc = torch.nn.MSELoss(reduction='sum')
# Vanilla SGD has a hard time training this dynamic network,
# hence the momentum term.
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4, momentum=0.9)
for epoch in range(500):
    # Forward pass (the depth is re-drawn on every call) and loss.
    YPred = model(X)
    loss = lossFunc(YPred, Y)
    if epoch % 10 == 0:
        print("Epoch", epoch, ", loss:", loss.item())

    # Clear stale gradients, backpropagate, then take an optimizer step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch 0 , loss: 625.861083984375
Epoch 10 , loss: 570.870361328125
Epoch 20 , loss: 565.5037841796875
Epoch 30 , loss: 351.8943786621094
Epoch 40 , loss: 99.09777069091797
Epoch 50 , loss: 125.12979125976562
Epoch 60 , loss: 170.66844177246094
Epoch 70 , loss: 111.08098602294922
Epoch 80 , loss: 30.483501434326172
Epoch 90 , loss: 11.141931533813477
Epoch 100 , loss: 17.901830673217773
Epoch 110 , loss: 17.76694679260254
Epoch 120 , loss: 20.74614715576172
Epoch 130 , loss: 18.20951271057129
Epoch 140 , loss: 13.469329833984375
Epoch 150 , loss: 4.302995204925537
Epoch 160 , loss: 3.0961363315582275
Epoch 170 , loss: 9.583621978759766
Epoch 180 , loss: 2.4331865310668945
Epoch 190 , loss: 2.3768906593322754
Epoch 200 , loss: 0.8927303552627563
Epoch 210 , loss: 8.006782531738281
Epoch 220 , loss: 1.1096906661987305
Epoch 230 , loss: 1.0338538885116577
Epoch 240 , loss: 1.7363452911376953
Epoch 250 , loss: 20.106325149536133
Epoch 260 , loss: 4.408821105957031
Epoch 270 , loss: 17.52338