<a href="https://colab.research.google.com/github/sanspareilsmyn/mldl_sandbox/blob/main/pytorch_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# https://towardsdatascience.com/understanding-pytorch-with-an-example-a-step-by-step-tutorial-81fc5f8c4e8e

In [1]:
import numpy as np

In [2]:
# Synthetic regression data: y = 1 + 2x plus Gaussian noise scaled by 0.1
np.random.seed(42)
x = np.random.rand(100, 1)
y = 1 + 2 * x + .1 * np.random.randn(100, 1)

# Random ordering of the 100 row indices (shuffled in place)
idx = np.arange(100)
np.random.shuffle(idx)

# 80/20 split: the first 80 shuffled indices train, the remaining 20 validate
train_idx, val_idx = idx[:80], idx[80:]

# Index features and targets with the same indices so the pairs stay aligned
x_train, x_val = x[train_idx], x[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

In [4]:
import torch
import torch.optim as optim
import torch.nn as nn

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

# The training split is still NumPy: wrap each array as a tensor, cast it to
# 32-bit float, then send it to the chosen device
x_train_tensor, y_train_tensor = (
    torch.from_numpy(array).float().to(device) for array in (x_train, y_train)
)

In [5]:
# Python types of the NumPy array and of its tensor counterpart, followed by the tensor's PyTorch type string
print(*(type(obj) for obj in (x_train, x_train_tensor)), x_train_tensor.type())

<class 'numpy.ndarray'> <class 'torch.Tensor'> torch.FloatTensor


In [7]:
# FIRST
# Draw random starting values for the parameters "a" and "b". Gradient descent
# will need their gradients, so tracking is switched on at creation time with
# requires_grad=True.
a, b = (torch.randn(1, dtype=torch.float, requires_grad=True) for _ in range(2))
print(a, b)

tensor([-1.1742], requires_grad=True) tensor([-0.1160], requires_grad=True)


In [8]:
# SECOND
# Tempting shortcut for running on a GPU: create the tensors with
# requires_grad=True and only afterwards send them to the device.
a, b = (
    torch.randn(1, dtype=torch.float, requires_grad=True).to(device)
    for _ in range(2)
)
print(a, b)
# Sorry, but NO! The to(device) "shadows" the gradient...
# NOTE(review): the recorded output below still shows requires_grad=True,
# presumably because this run was on the CPU where nothing had to be moved —
# confirm on a GPU runtime.

tensor([-0.1646], requires_grad=True) tensor([0.1607], requires_grad=True)


In [9]:
# THIRD
# Build plain tensors and send them to the device first (exactly as was done
# for the data)...
a, b = (torch.randn(1, dtype=torch.float).to(device) for _ in range(2))
# ...and only THEN switch on gradient tracking, in place, on the moved tensors
a.requires_grad_()
b.requires_grad_()
print(a, b)

tensor([1.1920], requires_grad=True) tensor([0.1483], requires_grad=True)


In [13]:
# Hyperparameters for plain full-batch gradient descent
n_epochs = 1000
lr = 1e-1

# Seed first so the starting "a" and "b" are reproducible, then create them
# directly on the target device with gradient tracking enabled
torch.manual_seed(42)
a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)

for epoch in range(n_epochs):
  # Forward pass and mean squared error over the whole training set
  yhat = a + b * x_train_tensor
  error = y_train_tensor - yhat
  loss = error.pow(2).mean()

  # Backward pass: fills a.grad and b.grad
  loss.backward()

  # Uncomment to inspect the computed gradients
  #print(a.grad)
  #print(b.grad)

  # Updating the parameters is the tricky part...

  # FIRST ATTEMPT
  # AttributeError: 'NoneType' object has no attribute 'zero_'
  # a = a - lr * a.grad
  # b = b - lr * b.grad
  # print(a)

  # SECOND ATTEMPT
  # RuntimeError: a leaf Variable that requires grad has been used in an in-place operation.
  # a -= lr * a.grad
  # b -= lr * b.grad

  # THIRD ATTEMPT
  # The in-place update has to happen under NO_GRAD so that it stays out of the
  # gradient computation (PyTorch builds its graph DYNAMICALLY as operations run)
  with torch.no_grad():
    a -= lr * a.grad
    b -= lr * b.grad

  # PyTorch is "clingy" to its computed gradients: reset them explicitly before
  # the next epoch
  a.grad.zero_()
  b.grad.zero_()

print(a, b)

tensor([1.0235], requires_grad=True) tensor([1.9690], requires_grad=True)


In [17]:
# Optimizer
# Restart from freshly seeded parameters so the result is comparable with the
# manual update loop
torch.manual_seed(42)
a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
print(a, b)

n_epochs = 1000
lr = 1e-1

# SGD now owns the update rule for both parameters
optimizer = optim.SGD(params=[a, b], lr=lr)

for epoch in range(n_epochs):
  # Forward pass and mean squared error, still written out by hand
  yhat = a + b * x_train_tensor
  error = y_train_tensor - yhat
  loss = error.pow(2).mean()

  loss.backward()

  # step() applies the update; zero_grad() resets the gradients for the next epoch
  optimizer.step()
  optimizer.zero_grad()

print(a, b)

tensor([0.3367], requires_grad=True) tensor([0.1288], requires_grad=True)
tensor([1.0235], requires_grad=True) tensor([1.9690], requires_grad=True)


In [18]:
# Loss
# Same seeded starting point as before; only the loss computation changes
torch.manual_seed(42)
a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
print(a, b)

lr = 1e-1
n_epochs = 1000

# Built-in mean squared error replaces the hand-written (error ** 2).mean()
loss_fn = nn.MSELoss(reduction='mean')

optimizer = optim.SGD([a, b], lr=lr)

for epoch in range(n_epochs):
  yhat = a + b * x_train_tensor

  # PyTorch losses are called as loss_fn(input, target): prediction first,
  # label second. MSE is symmetric, so the value and gradients are unchanged,
  # but this order stays correct if the loss is swapped for an asymmetric one.
  loss = loss_fn(yhat, y_train_tensor)

  loss.backward()
  optimizer.step()
  optimizer.zero_grad()

print(a, b)

tensor([0.3367], requires_grad=True) tensor([0.1288], requires_grad=True)
tensor([1.0235], requires_grad=True) tensor([1.9690], requires_grad=True)


In [19]:
# Model
class ManualLinearRegression(nn.Module):
  """Linear model ``a + b * x`` with two learnable one-element parameters."""

  def __init__(self):
    super().__init__()
    # Wrapping the tensors in nn.Parameter registers them with the module, so
    # they show up in parameters() and state_dict(); "a" is drawn before "b"
    self.a = nn.Parameter(torch.randn(1, dtype=torch.float))
    self.b = nn.Parameter(torch.randn(1, dtype=torch.float))

  def forward(self, x):
    """Return the prediction ``a + b * x`` for the input tensor ``x``."""
    intercept, slope = self.a, self.b
    return intercept + slope * x

In [21]:
# Seed before building the model so its random initial parameters are reproducible
torch.manual_seed(42)

model = ManualLinearRegression().to(device)
print(model.state_dict())

lr = 1e-1
n_epochs = 1000

loss_fn = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=lr)

# In PyTorch, train() does NOT perform a training step: it only puts the module
# in training mode. Nothing inside the loop changes the mode, so set it once.
model.train()

for epoch in range(n_epochs):
  yhat = model(x_train_tensor)

  # PyTorch losses are called as loss_fn(input, target): prediction first,
  # label second (identical result for MSE, correct for asymmetric losses too)
  loss = loss_fn(yhat, y_train_tensor)
  loss.backward()
  optimizer.step()
  optimizer.zero_grad()

print(model.state_dict())

OrderedDict([('a', tensor([0.3367])), ('b', tensor([0.1288]))])
OrderedDict([('a', tensor([1.0235])), ('b', tensor([1.9690]))])


In [22]:
class LayerLinearRegression(nn.Module):
  """Linear regression expressed with a single ``nn.Linear(1, 1)`` layer."""

  def __init__(self):
    super().__init__()
    # One input feature, one output feature
    self.linear = nn.Linear(in_features=1, out_features=1)

  def forward(self, x):
    """Pass ``x`` through the linear layer and return its output."""
    prediction = self.linear(x)
    return prediction

In [23]:
# Let's make it generic.
def make_train_step(model, loss_fn, optimizer):
  """Build a function that performs one optimisation step.

  Args:
    model: module called as ``model(x)`` to produce predictions.
    loss_fn: callable invoked as ``loss_fn(prediction, target)``, i.e. the
      (input, target) order used by PyTorch's built-in losses.
    optimizer: optimizer whose ``step()`` and ``zero_grad()`` are called
      after each backward pass.

  Returns:
    A function ``train_step(x, y)`` that runs one step on the given batch
    and returns the loss as a Python float.
  """
  def train_step(x, y):
    # Put the model in training mode on every call, in case the caller
    # switched it to eval mode in between
    model.train()
    yhat = model(x)
    # Prediction first, label second: symmetric losses such as MSE are
    # unaffected, asymmetric ones receive their arguments in the right order
    loss = loss_fn(yhat, y)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # .item() hands back a plain float detached from the graph
    return loss.item()
  # Returns the function that will be called inside the train loop
  return train_step

In [24]:
# Bind the existing model, loss and optimizer into a reusable step function
train_step = make_train_step(model, loss_fn, optimizer)

# One full-batch step per epoch, keeping the loss history
losses = []
for epoch in range(n_epochs):
  loss = train_step(x_train_tensor, y_train_tensor)
  losses += [loss]
print(model.state_dict())

OrderedDict([('a', tensor([1.0235])), ('b', tensor([1.9690]))])


In [25]:
# Dataset
from torch.utils.data import Dataset, TensorDataset

class CustomDataset(Dataset):
  """Pairs a feature tensor with a target tensor, sample by sample."""

  def __init__(self, x_tensor, y_tensor):
    # Only references are kept; nothing is copied or moved
    self.x, self.y = x_tensor, y_tensor

  def __len__(self):
    """Number of samples, i.e. the length of the feature tensor."""
    return len(self.x)

  def __getitem__(self, index):
    """Return the ``(features, target)`` tuple stored at ``index``."""
    features = self.x[index]
    target = self.y[index]
    return features, target

# Rebuild the training tensors WITHOUT .to(device): note that this rebinds the
# names that earlier held device tensors
x_train_tensor, y_train_tensor = (
    torch.from_numpy(array).float() for array in (x_train, y_train)
)

# The hand-written dataset and the built-in TensorDataset are printed side by
# side so their first samples can be compared
train_data = CustomDataset(x_train_tensor, y_train_tensor)
print(train_data[0])

train_data = TensorDataset(x_train_tensor, y_train_tensor)
print(train_data[0])

(tensor([0.7713]), tensor([2.4745]))
(tensor([0.7713]), tensor([2.4745]))


In [27]:
# DataLoader
from torch.utils.data import DataLoader

# Mini-batches of 16 samples, drawn in shuffled order
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)

train_step = make_train_step(model, loss_fn, optimizer)
losses = []

for epoch in range(n_epochs):
  for x_batch, y_batch in train_loader:
    # With a DataLoader only the current mini-batch is sent to the device
    x_batch, y_batch = x_batch.to(device), y_batch.to(device)

    loss = train_step(x_batch, y_batch)
    losses.append(loss)

print(model.state_dict())


OrderedDict([('a', tensor([1.0291])), ('b', tensor([1.9716]))])


In [29]:
# Random Split
from torch.utils.data.dataset import random_split

# Wrap the complete (unsplit) NumPy data as float tensors
x_tensor, y_tensor = (torch.from_numpy(array).float() for array in (x, y))

dataset = TensorDataset(x_tensor, y_tensor)

# Let PyTorch draw the 80/20 train/validation partition
train_dataset, val_dataset = random_split(dataset, [80, 20])

# Training runs in mini-batches of 16; the 20 validation samples fit in one batch
train_loader = DataLoader(train_dataset, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=20)

In [31]:
losses = []
val_losses = []
train_step = make_train_step(model, loss_fn, optimizer)

for epoch in range(n_epochs):
  # Training: one optimisation step per mini-batch
  for x_batch, y_batch in train_loader:
    x_batch = x_batch.to(device)
    y_batch = y_batch.to(device)

    loss = train_step(x_batch, y_batch)
    losses.append(loss)

  # Validation: train_step() leaves the model in training mode, so switch to
  # eval mode once per epoch before scoring the held-out batches
  model.eval()
  # no_grad() keeps the validation forward passes out of gradient tracking
  with torch.no_grad():
    # Dedicated loop names: reusing x_val / y_val here would overwrite the
    # NumPy validation arrays created in the data generation cell
    for x_val_batch, y_val_batch in val_loader:
      x_val_batch = x_val_batch.to(device)
      y_val_batch = y_val_batch.to(device)

      yhat = model(x_val_batch)
      # Prediction first, label second, as PyTorch losses expect
      val_loss = loss_fn(yhat, y_val_batch)
      val_losses.append(val_loss.item())

print(model.state_dict())

OrderedDict([('a', tensor([1.0235])), ('b', tensor([1.9531]))])
