In [1]:
import torch

In [2]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
# Trainable parameters of the scalar linear model y = w * x + b.
# requires_grad_() flags each leaf tensor so autograd tracks operations on it.
w = torch.tensor([0.1]).requires_grad_()
b = torch.tensor([2.0]).requires_grad_()

In [5]:
# A single training example: input x = 0 with target y = 1.
x, y = torch.tensor([0.0]), torch.tensor([1.0])

Forward pass: compute predicted y using operations on Tensors.

Since w and b have requires_grad=True, operations involving these Tensors will cause PyTorch to build a computational graph, allowing automatic computation of gradients.

Since we are no longer implementing the backward pass by hand, we don't need to keep references to intermediate values.

In [6]:
# Linear regression forward pass: scale the input by w, then shift by b.
y_pred = w * x + b

In [7]:
# Show the target and the model output on consecutive lines.
print(
    f'True label: {y}',
    f'\nPredicted: {y_pred}',
)

True label: tensor([1.]) 
Predicted: tensor([2.], grad_fn=<AddBackward0>)


In [8]:
# Squared error between the prediction and the target.
loss = torch.pow(y_pred - y, 2)
loss

tensor([1.], grad_fn=<PowBackward0>)

In [9]:
# Backward pass: autograd walks the graph behind `loss` and stores the
# gradients in the .grad attribute of the tensors created with requires_grad=True.
loss.backward()
# With x = 0, y = 1 and b = 2 from the earlier cells, y_pred = 2, so
# dloss/db = 2 * (y_pred - y) = 2 and dloss/dw = 2 * (y_pred - y) * x = 0.
print(f'Gradient b: {b.grad}')
print(f'Gradient w: {w.grad}')

Gradient b: tensor([2.])
Gradient w: tensor([0.])


In [10]:
# Manually zero the gradients after running the backward pass.
# backward() adds into .grad rather than overwriting it, so stale values
# would otherwise be mixed into the next backward pass.
# zero_() works in place and returns the tensor, which is why the cell
# displays the zeroed gradient of b (the last expression).
w.grad.zero_()
b.grad.zero_()

tensor([0.])

In [11]:
# Problem sizes: batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets drawn from a standard normal distribution.
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

In [12]:
# Two-layer fully connected network: Linear -> ReLU -> Linear.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=D_in, out_features=H),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=H, out_features=D_out),
)

In [13]:
# Mean squared error. reduction='mean' (the default) averages over all
# elements; the other accepted values are 'sum' and 'none'.
loss_fn = torch.nn.MSELoss(reduction='mean')

In [14]:
# Forward pass: run the whole batch x through the network.
y_pred = model(x)
# Display the prediction and the target for the first sample side by side.
y_pred[0], y[0]

(tensor([ 0.1859, -0.0155, -0.1976, -0.0818,  0.0182, -0.4138, -0.3370, -0.0842,
         -0.2544,  0.4052], grad_fn=<SelectBackward0>),
 tensor([-0.1718, -0.0733, -2.4441,  0.1204,  0.4349, -0.0407,  1.1851,  1.1307,
         -0.2867, -0.7277]))

In [15]:
# Score the predictions against the targets with loss_fn.
loss = loss_fn(y_pred, y)
# .item() pulls the value out of the tensor as a plain Python number for display.
loss.item()

1.0749353170394897

In [16]:
# Backward pass: compute the gradient of the loss for the model parameters;
# the update step below reads them back through param.grad.
loss.backward()

In [17]:
# Update the weights with one step of plain gradient descent.
# Each parameter is a Tensor whose gradient lives in its .grad attribute.
# The update runs under torch.no_grad() so that it is not recorded in the
# computational graph. The parameter is modified in place directly instead of
# through .data, which would hide the write from autograd's version tracking.
learning_rate = 1e-1
with torch.no_grad():
    for param in model.parameters():
        if param.grad is None:
            # No gradient was computed for this parameter; nothing to apply.
            continue
        param -= learning_rate * param.grad
# Clear the gradients so the next backward pass does not accumulate onto them.
model.zero_grad()

In [18]:
import torch

class TwoLayerNet(torch.nn.Module):
    """Fully connected network with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        D_in: Number of input features of the first linear layer.
        H: Number of hidden units (output of the first layer, input of the second).
        D_out: Number of output features of the second linear layer.
    """

    def __init__(self, D_in, H, D_out):
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Compute the network output for input tensor ``x``.

        ``x`` is passed to ``linear1``, so its last dimension must be ``D_in``.
        Returns the output of ``linear2``.
        """
        # ReLU has no parameters, so the functional form is used rather than
        # constructing a new torch.nn.ReLU module on every forward call.
        h_relu = torch.relu(self.linear1(x))
        return self.linear2(h_relu)

In [19]:
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

# Fresh random inputs and targets for the custom-module example.
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

In [20]:

# Instantiate the custom module with the sizes defined above.
model = TwoLayerNet(D_in=D_in, H=H, D_out=D_out)

In [21]:
# Mean squared error, averaged over all elements ('mean' is the default reduction).
loss_fn = torch.nn.MSELoss(reduction='mean')

In [22]:
# Optimization algorithm: stochastic gradient descent over every model
# parameter, with a learning rate of 0.1.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.1)

In [23]:
# Forward pass: calling the module on the batch x produces the predictions.
y_pred = model(x)

In [24]:
# Score the predictions against the targets with loss_fn.
loss = loss_fn(y_pred, y)
# .item() pulls the value out of the tensor as a plain Python number for display.
loss.item()

1.0838369131088257

In [25]:
# One optimization step; the order of the three calls matters.
# 1. Clear gradients left over from any earlier backward pass.
optimizer.zero_grad()
# 2. Compute fresh gradients of the loss for the model parameters.
loss.backward()
# 3. Let the optimizer update the parameters from those gradients.
optimizer.step()