### Learning Rate

In [None]:
import torch

def default_device():
    """Return the preferred torch device.

    Preference order: CUDA if available, otherwise MPS if available,
    otherwise the CPU.
    """
    if torch.cuda.is_available():
        backend = 'cuda'
    elif torch.backends.mps.is_available():
        backend = 'mps'
    else:
        backend = 'cpu'
    return torch.device(backend)

# Resolved once here so later cells can share the same device object.
device = default_device()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split

In [None]:
def f(x, y):
    """Ground-truth surface to be learned: x^2 + 2*y^2 (works element-wise on tensors)."""
    return x**2 + 2 * y**2

# Seed the global torch RNG so the synthetic data (and the random split and
# weight initialisation that follow on a fresh top-to-bottom run) are
# reproducible across Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

n_samples = 1000
# Inputs are drawn uniformly from [0, 1).
X = torch.rand(n_samples)
Y = torch.rand(n_samples)
# Targets are the true surface plus Gaussian noise with standard deviation 3.
Z = f(X, Y) + 3 * torch.randn(n_samples)

# One row per sample, columns are [x, y, z] -> shape (n_samples, 3).
dataset = torch.stack([X, Y, Z], dim=1)

In [None]:
# 70 % of the samples go to training, the remainder to testing.
train_size = int(0.7 * n_samples)
test_size = n_samples - train_size

# random_split returns Subset objects: each keeps a reference to the FULL
# tensor in `.dataset` and the rows it actually owns in `.indices`.
train_dataset, test_dataset = random_split(dataset, lengths=[train_size, test_size])

# Materialise each split through its own indices. Reading `.dataset` directly
# would give every sample to both loaders and make the split meaningless.
train_rows = dataset[train_dataset.indices]
test_rows = dataset[test_dataset.indices]

# Columns 0-1 are the inputs (x, y); column 2 is the target z, kept as a
# (N, 1) column so it lines up with the model's (N, 1) output.
train_dataloader = DataLoader(
  TensorDataset(train_rows.narrow(1,0,2), train_rows.narrow(1,2,1)),
  batch_size=32, shuffle=False)
test_dataloader = DataLoader(
  TensorDataset(test_rows.narrow(1,0,2), test_rows.narrow(1,2,1)),
  batch_size=32, shuffle=False)

In [None]:
class Model(nn.Module):
    """Small fully connected regressor: 2 inputs -> 8 ReLU units -> 1 output."""

    def __init__(self):
        super().__init__()
        # Hidden layer is built before the output layer; attribute names are
        # what appear as parameter prefixes in the state dict.
        self.hidden = nn.Linear(in_features=2, out_features=8)
        self.output = nn.Linear(in_features=8, out_features=1)

    def forward(self, x):
        """Apply hidden layer + ReLU, then the linear output layer."""
        activated = self.hidden(x).relu()
        prediction = self.output(activated)
        return prediction

In [None]:
# Train the same network twice -- once with a constant learning rate and once
# with an exponentially decaying one -- and plot the loss curves of each run.
n_epochs = 100
learning_rate = 0.1

loss_fn = nn.MSELoss()

# Snapshot a single random initialisation and reuse it for both runs, so the
# learning-rate schedule is the only thing that differs between the curves.
initial_state = Model().state_dict()

for with_schedular in [False, True]:
  train_losses = []
  test_losses = []
  model = Model()
  model.load_state_dict(initial_state)
  # Run on the device selected at the top of the notebook.
  model.to(device)

  optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
  # Always constructed, but only stepped when with_schedular is True.
  scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

  for epoch in range(n_epochs):
    model.train()
    train_loss = 0
    for X_batch, Y_batch in train_dataloader:
      X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
      optimizer.zero_grad()
      Y_pred = model(X_batch)
      loss = loss_fn(Y_pred, Y_batch)
      loss.backward()
      optimizer.step()
      # loss is a per-batch mean; weight it by the batch size so a smaller
      # final batch does not count as much as a full one.
      train_loss += loss.item() * X_batch.size(0)

    # Per-sample mean over the whole training set.
    train_loss /= len(train_dataloader.dataset)
    train_losses.append(train_loss)

    model.eval()
    test_loss = 0
    with torch.no_grad():
      for X_batch, Y_batch in test_dataloader:
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
        Y_pred = model(X_batch)
        loss = loss_fn(Y_pred, Y_batch)
        test_loss += loss.item() * X_batch.size(0)

    # Per-sample mean over the whole test set.
    test_loss /= len(test_dataloader.dataset)
    test_losses.append(test_loss)

    # Decay the learning rate once per epoch, after all optimizer steps.
    if with_schedular:
      scheduler.step()

  plt.figure(figsize=(8, 4))
  plt.plot(range(n_epochs),train_losses, label='Train loss')
  plt.plot(range(n_epochs),test_losses, label='Test loss')
  plt.title('{0} lr_scheduler'.format('With' if with_schedular else 'Without'))
  plt.xlabel('Epoch')
  plt.ylabel('MSE loss')
  plt.legend()
  plt.show()
  

In [None]:
# Step decay: the learning rate is halved (gamma=0.5) every 100 epochs (step_size=100).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.5)

In [None]:
# Exponential decay: the learning rate is multiplied by gamma=0.99 every epoch.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

In [None]:
# Cosine annealing: T_max is the maximum number of iterations (scheduler steps)
# of the decay, eta_min is the minimum learning rate.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100, eta_min=0.00001)

In [None]:
# Cosine annealing with warm restarts (this is NOT a warm-up): the learning rate
# decays along a cosine curve and is reset to its initial value at each restart.
# The first cycle lasts T_0=10 epochs; every following cycle is T_mult=2 times
# longer than the previous one.
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2)

In [None]:
# Linear warm-up: the LR multiplier rises from 0 to 1 over the first 30 epochs
# and is then held at 1 (the optimizer's base learning rate). Without the cap
# the multiplier would keep growing past 1 for every epoch after the 30th.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: min(1.0, epoch / 30))