### 5.5.2 골라 쓰는 옵티마이저

In [1]:
import torch.optim as optim

In [2]:
# List every name exposed by the torch.optim module (optimizer classes,
# submodules and module dunder attributes).
dir(optim)

['ASGD',
 'Adadelta',
 'Adagrad',
 'Adam',
 'AdamW',
 'Adamax',
 'LBFGS',
 'NAdam',
 'Optimizer',
 'RAdam',
 'RMSprop',
 'Rprop',
 'SGD',
 'SparseAdam',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_functional',
 '_multi_tensor',
 'lr_scheduler',
 'swa_utils']

#### 경사 하강 옵티마이저 사용하기

In [4]:
import torch

In [5]:
# Two trainable values; model() receives them unpacked as (w, b), so this
# starts from w=1, b=0.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate=1e-5
# The optimizer is given the list of tensors it should update.
optimizer = optim.SGD([params], lr=learning_rate)

In [6]:
# Targets (t_c) and inputs (t_u), built directly as 1-D float tensors with
# eleven samples each; sample i of t_u corresponds to sample i of t_c.
t_c = torch.tensor(
    [0.5, 14.0, 15.0, 28.0, 11.0, 8.0, 3.0, -4.0, 6.0, 13.0, 21.0]
)
t_u = torch.tensor(
    [35.7, 55.9, 58.2, 81.9, 56.3, 48.9, 33.9, 21.8, 48.4, 60.4, 68.4]
)

In [7]:
def model(t_u, w, b):
    """Linear model: scale the input by ``w`` and shift it by ``b``.

    Args:
        t_u: Input value(s).
        w: Weight multiplied with ``t_u``.
        b: Bias added to the product.

    Returns:
        ``w * t_u + b``.
    """
    scaled = w * t_u
    return scaled + b

In [8]:
def loss_fn(t_p, t_c):
    """Mean squared error between predictions and targets.

    Args:
        t_p: Predicted values.
        t_c: Target values to compare against.

    Returns:
        The mean of the element-wise squared differences.
    """
    residual = t_p - t_c
    return (residual ** 2).mean()

In [9]:
# One forward pass: params is unpacked into (w, b) for model().
t_p = model(t_u, *params)
loss = loss_fn(t_p, t_c)
# Backward pass: stores d(loss)/d(params) in params.grad.
loss.backward()

In [10]:
# Apply one optimizer update using the gradient from the previous cell.
optimizer.step()

# Notebook echo: show the updated parameters.
params

tensor([ 9.5483e-01, -8.2600e-04], requires_grad=True)

In [11]:
# Rescaled copy of the inputs (multiplied by 0.1); later cells train on
# this copy with a larger learning rate.
t_un = 0.1 * t_u

In [12]:
# Start again from w=1, b=0, now with a larger learning rate and the
# rescaled inputs t_un.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
optimizer = optim.SGD([params], lr=learning_rate)

t_p = model(t_un, *params)
loss = loss_fn(t_p, t_c)

# Reset any gradient already stored on params before computing a new one.
optimizer.zero_grad()
loss.backward()
optimizer.step()

# Notebook echo: show the parameters after this single update.
params

tensor([1.7761, 0.1064], requires_grad=True)

In [13]:
def training_loop(n_epochs, optimizer, params, t_u, t_c):
    """Repeat forward pass, backward pass and optimizer step ``n_epochs`` times.

    Args:
        n_epochs: Number of update steps to run.
        optimizer: Optimizer object; only ``zero_grad()`` and ``step()`` are
            called on it here.
        params: Values unpacked positionally into ``model`` after ``t_u``.
        t_u: Inputs passed to ``model``.
        t_c: Targets passed to ``loss_fn``.

    Returns:
        The same ``params`` object that was passed in.
    """
    for step in range(n_epochs):
        epoch = step + 1  # epochs are reported 1-based

        # Forward pass: loss of the current parameters on the given data.
        loss = loss_fn(model(t_u, *params), t_c)

        # Clear the previous gradient, compute a new one, then update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report only on every 500th epoch.
        if epoch % 500 != 0:
            continue
        message = 'Epoch %d, Loss %f' % (epoch, float(loss))
        print(message)

    return params

In [14]:
# Fresh parameters (w=1, b=0) and a fresh SGD optimizer for a full run.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
optimizer = optim.SGD([params], lr=learning_rate)

# Train for 5000 epochs on the rescaled inputs; the call's return value
# (the params tensor) is echoed by the notebook.
training_loop(
    n_epochs=5000,
    optimizer=optimizer,
    params=params,
    t_u=t_un,
    t_c=t_c,
)

Epoch 500, Loss 7.860120
Epoch 1000, Loss 3.828538
Epoch 1500, Loss 3.092191
Epoch 2000, Loss 2.957698
Epoch 2500, Loss 2.933134
Epoch 3000, Loss 2.928648
Epoch 3500, Loss 2.927830
Epoch 4000, Loss 2.927679
Epoch 4500, Loss 2.927652
Epoch 5000, Loss 2.927647


tensor([  5.3671, -17.3012], requires_grad=True)

#### 다른 옵티마이저 테스트하기

In [16]:
# Same starting point, but optimized with Adam and a larger learning rate.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-1
optimizer = optim.Adam([params], lr=learning_rate)

# Note: this run is fed the original, unscaled t_u rather than t_un.
training_loop(
    n_epochs=2000,
    optimizer=optimizer,
    params=params,
    t_u=t_u,
    t_c=t_c,
)

Epoch 500, Loss 7.612900
Epoch 1000, Loss 3.086700
Epoch 1500, Loss 2.928579
Epoch 2000, Loss 2.927644


tensor([  0.5367, -17.3021], requires_grad=True)

### 5.5.3 훈련, 검증, 과적합

#### 훈련 손실 평가하기

#### 검증셋으로 일반화하기

#### 데이터셋 나누기

In [19]:
# Hold out 20% of the samples (rounded down) for validation.
n_samples = t_u.shape[0]
n_val = int(0.2 * n_samples)

# Random permutation of the sample indices 0 .. n_samples - 1.
shuffled_indices = torch.randperm(n_samples)

# Split at an explicit positive position instead of slicing with -n_val:
# when n_val is 0 (fewer than five samples), [:-0] would be empty and [-0:]
# would be everything, which puts all samples in the validation set.
n_train = n_samples - n_val
train_indices = shuffled_indices[:n_train]
val_indices = shuffled_indices[n_train:]

# Notebook echo: show both index tensors.
train_indices, val_indices

(tensor([ 7, 10,  2,  0,  3,  1,  6,  4,  8]), tensor([9, 5]))

In [20]:
# Select the training samples with the shuffled index tensor; inputs and
# targets use the same indices so the pairs stay aligned.
train_t_u = t_u[train_indices]
train_t_c = t_c[train_indices]

# Same selection for the validation samples.
val_t_u = t_u[val_indices]
val_t_c = t_c[val_indices]

# Rescale both input subsets by 0.1, the same factor used for t_un.
train_t_un = 0.1 * train_t_u
val_t_un = 0.1 * val_t_u

In [23]:
def training_loop(n_epochs, optimizer, params, train_t_u, val_t_u, train_t_c, val_t_c):
    """Train on the training split while also reporting the validation loss.

    Args:
        n_epochs: Number of update steps to run.
        optimizer: Optimizer object; only ``zero_grad()`` and ``step()`` are
            called on it here.
        params: Values unpacked positionally into ``model`` after the inputs.
        train_t_u: Training inputs.
        val_t_u: Validation inputs.
        train_t_c: Training targets.
        val_t_c: Validation targets.

    Returns:
        The same ``params`` object that was passed in.
    """
    for step in range(n_epochs):
        epoch = step + 1  # epochs are reported 1-based

        train_loss = loss_fn(model(train_t_u, *params), train_t_c)

        # The validation loss is only reported; backward() is never called
        # on it, so it does not contribute to the parameter update.
        val_loss = loss_fn(model(val_t_u, *params), val_t_c)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Report the first three epochs and then every 500th.
        is_early = epoch <= 3
        is_checkpoint = epoch % 500 == 0
        if is_early or is_checkpoint:
            message = (f'Epoch {epoch}, Training loss {train_loss.item():.4f},'
                       f'Validation loss {val_loss.item():.4f}')
            print(message)

    return params

In [24]:
# Fresh parameters (w=1, b=0) and SGD optimizer for the train/validation run.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
optimizer = optim.SGD([params], lr=learning_rate)

# Both splits use the rescaled inputs; the returned params tensor is echoed
# by the notebook.
training_loop(
    n_epochs=3000,
    optimizer=optimizer,
    params=params,
    train_t_u=train_t_un,
    val_t_u=val_t_un,
    train_t_c=train_t_c,
    val_t_c=val_t_c,
)

Epoch 1, Training loss 91.7660,Validation loss 29.0568
Epoch 2, Training loss 43.7766,Validation loss 2.3025
Epoch 3, Training loss 36.0900,Validation loss 3.5195
Epoch 500, Training loss 7.0920,Validation loss 4.6118
Epoch 1000, Training loss 3.4116,Validation loss 4.0901
Epoch 1500, Training loss 2.9273,Validation loss 3.9970
Epoch 2000, Training loss 2.8636,Validation loss 3.9759
Epoch 2500, Training loss 2.8552,Validation loss 3.9700
Epoch 3000, Training loss 2.8541,Validation loss 3.9680


tensor([  5.4240, -17.2490], requires_grad=True)

### 5.5.4 자동미분의 주의사항과 자동미분 끄기

In [25]:
def training_loop(n_epochs, optimizer, params, train_t_u, val_t_u, train_t_c, val_t_c):
    """Train on the training split; evaluate validation loss without autograd.

    Args:
        n_epochs: Number of update steps to run.
        optimizer: Optimizer object; only ``zero_grad()`` and ``step()`` are
            called on it here.
        params: Values unpacked positionally into ``model`` after the inputs.
        train_t_u: Training inputs.
        val_t_u: Validation inputs.
        train_t_c: Training targets.
        val_t_c: Validation targets.

    Returns:
        The same ``params`` object that was passed in.
    """
    for step in range(n_epochs):
        epoch = step + 1  # epochs are reported 1-based

        train_loss = loss_fn(model(train_t_u, *params), train_t_c)

        # Gradient tracking is switched off for the validation pass, so the
        # resulting loss must not require grad.
        with torch.no_grad():
            val_loss = loss_fn(model(val_t_u, *params), val_t_c)
            assert not val_loss.requires_grad

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Report the first three epochs and then every 500th.
        should_report = epoch <= 3 or epoch % 500 == 0
        if should_report:
            message = (f'Epoch {epoch}, Training loss {train_loss.item():.4f},'
                       f'Validation loss {val_loss.item():.4f}')
            print(message)

    return params

In [26]:
def cal_forward(t_u, t_c, is_train):
    """Run one forward pass and return the loss, with autograd on or off.

    Reads the module-level ``params`` (it is not an argument).

    Args:
        t_u: Inputs passed to ``model``.
        t_c: Targets passed to ``loss_fn``.
        is_train: Passed to ``torch.set_grad_enabled``; gradient tracking is
            enabled for the forward pass only when this is true.

    Returns:
        The loss computed by ``loss_fn``.
    """
    with torch.set_grad_enabled(is_train):
        prediction = model(t_u, *params)
        return loss_fn(prediction, t_c)

In [27]:
# is_train=True: gradient tracking is on, so the echoed loss has a grad_fn.
cal_forward(t_un, t_c, True)

tensor(3.0566, grad_fn=<MeanBackward0>)

In [28]:
# is_train=False: gradient tracking is off, so the echoed loss is a plain
# tensor with no grad_fn.
cal_forward(t_un, t_c, False)

tensor(3.0566)

## 5.7 연습 문제

### 1. 모델을 w2 * t_u ** 2 + w1 * t_u + b 로 다시 정의하자.

In [29]:
def model(t_u, w1, w2, b):
    """Quadratic model in ``t_u``.

    Args:
        t_u: Input value(s).
        w1: Coefficient of the linear term.
        w2: Coefficient of the squared term.
        b: Constant offset.

    Returns:
        ``w2 * t_u**2 + w1 * t_u + b``.
    """
    quadratic_term = w2 * t_u ** 2
    linear_term = w1 * t_u
    return quadratic_term + linear_term + b

#### **a.** 이 모델을 다루기 위해 훈련 루프의 어떤 부분을 변경해야 하나?

In [30]:
def training_loop(n_epochs, optimizer, params, train_t_u, val_t_u, train_t_c, val_t_c):
    """Optimize ``params`` on the training split and log both losses.

    The loop does not depend on how many values ``params`` holds: they are
    unpacked positionally into ``model``.

    Args:
        n_epochs: Number of update steps to run.
        optimizer: Optimizer object; only ``zero_grad()`` and ``step()`` are
            called on it here.
        params: Values unpacked positionally into ``model`` after the inputs.
        train_t_u: Training inputs.
        val_t_u: Validation inputs.
        train_t_c: Training targets.
        val_t_c: Validation targets.

    Returns:
        The same ``params`` object that was passed in.
    """
    for step in range(n_epochs):
        epoch = step + 1  # epochs are reported 1-based

        train_prediction = model(train_t_u, *params)
        train_loss = loss_fn(train_prediction, train_t_c)

        # The validation pass runs with gradient tracking disabled.
        with torch.no_grad():
            val_prediction = model(val_t_u, *params)
            val_loss = loss_fn(val_prediction, val_t_c)
            assert not val_loss.requires_grad

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Skip logging except for the first three epochs and every 500th.
        if epoch > 3 and epoch % 500 != 0:
            continue
        print(f'Epoch {epoch}, Training loss {train_loss.item():.4f},'
              f'Validation loss {val_loss.item():.4f}')

    return params

params 를 생성하는 부분 변경 필요

#### **b.** 모델 변경과 무관한 부분은 어디인가?

훈련 루프는 변경과 무관

#### **c.** 훈련 후 손실이 증가하는가 감소하는가?

In [31]:
# Three values now, matching model(t_u, w1, w2, b): w1=1, w2=1, b=0.
params = torch.tensor([1.0, 1.0, 0.0], requires_grad=True)
# Same learning rate and optimizer type as the earlier two-parameter SGD run.
learning_rate = 1e-2
optimizer = optim.SGD([params], lr=learning_rate)

# Same rescaled train/validation splits and epoch count as before.
training_loop(
    n_epochs=3000,
    optimizer=optimizer,
    params=params,
    train_t_u=train_t_un,
    val_t_u=val_t_un,
    train_t_c=train_t_c,
    val_t_c=val_t_c,
)

Epoch 1, Training loss 681.0538,Validation loss 652.1264
Epoch 2, Training loss 361504.2188,Validation loss 289540.2500
Epoch 3, Training loss 196336768.0000,Validation loss 158641680.0000
Epoch 500, Training loss nan,Validation loss nan
Epoch 1000, Training loss nan,Validation loss nan
Epoch 1500, Training loss nan,Validation loss nan
Epoch 2000, Training loss nan,Validation loss nan
Epoch 2500, Training loss nan,Validation loss nan
Epoch 3000, Training loss nan,Validation loss nan


tensor([nan, nan, nan], requires_grad=True)

loss 가 발산함 (2차 항 t_u ** 2 가 추가되면서 기울기가 훨씬 커져, 학습률 1e-2 가 이 모델에는 너무 크기 때문. 학습률을 1e-4 정도로 낮추면 발산하지 않고 수렴함)

#### **d.** 실제 결과가 좋아졌나 나빠졌나?

나빠짐

In [35]:
# Ten integer labels, each 0 or 1, as a 1-D int32 tensor.
labels = torch.tensor([1, 0, 0, 1, 0, 1, 1, 1, 0, 0], dtype=torch.int)

In [36]:
# Add a trailing dimension: shape (10,) becomes (10, 1), a column vector.
labels = labels.unsqueeze(-1)
labels

tensor([[1],
        [0],
        [0],
        [1],
        [0],
        [1],
        [1],
        [1],
        [0],
        [0]], dtype=torch.int32)

In [38]:
# Compare the (10, 1) column with its (1, 10) transpose; broadcasting gives
# a (10, 10) boolean matrix where entry [i, j] is True when label i equals
# label j.
mask = torch.eq(labels, labels.transpose(0, 1))
mask

tensor([[ True, False, False,  True, False,  True,  True,  True, False, False],
        [False,  True,  True, False,  True, False, False, False,  True,  True],
        [False,  True,  True, False,  True, False, False, False,  True,  True],
        [ True, False, False,  True, False,  True,  True,  True, False, False],
        [False,  True,  True, False,  True, False, False, False,  True,  True],
        [ True, False, False,  True, False,  True,  True,  True, False, False],
        [ True, False, False,  True, False,  True,  True,  True, False, False],
        [ True, False, False,  True, False,  True,  True,  True, False, False],
        [False,  True,  True, False,  True, False, False, False,  True,  True],
        [False,  True,  True, False,  True, False, False, False,  True,  True]])

In [39]:
# Main diagonal of mask as a 1-D tensor (every label equals itself, so all
# entries are True).
torch.diag(mask)

tensor([True, True, True, True, True, True, True, True, True, True])

In [40]:
# Put that diagonal back into a square matrix that is False off the diagonal.
torch.diag_embed(torch.diag(mask))

tensor([[ True, False, False, False, False, False, False, False, False, False],
        [False,  True, False, False, False, False, False, False, False, False],
        [False, False,  True, False, False, False, False, False, False, False],
        [False, False, False,  True, False, False, False, False, False, False],
        [False, False, False, False,  True, False, False, False, False, False],
        [False, False, False, False, False,  True, False, False, False, False],
        [False, False, False, False, False, False,  True, False, False, False],
        [False, False, False, False, False, False, False,  True, False, False],
        [False, False, False, False, False, False, False, False,  True, False],
        [False, False, False, False, False, False, False, False, False,  True]])

In [42]:
# Notebook echo: mask itself is unchanged by the diag / diag_embed calls.
mask

tensor([[ True, False, False,  True, False,  True,  True,  True, False, False],
        [False,  True,  True, False,  True, False, False, False,  True,  True],
        [False,  True,  True, False,  True, False, False, False,  True,  True],
        [ True, False, False,  True, False,  True,  True,  True, False, False],
        [False,  True,  True, False,  True, False, False, False,  True,  True],
        [ True, False, False,  True, False,  True,  True,  True, False, False],
        [ True, False, False,  True, False,  True,  True,  True, False, False],
        [ True, False, False,  True, False,  True,  True,  True, False, False],
        [False,  True,  True, False,  True, False, False, False,  True,  True],
        [False,  True,  True, False,  True, False, False, False,  True,  True]])

In [41]:
# XOR with the diagonal-only matrix clears the diagonal of mask and leaves
# every off-diagonal entry as it was.
mask ^ torch.diag_embed(torch.diag(mask))

tensor([[False, False, False,  True, False,  True,  True,  True, False, False],
        [False, False,  True, False,  True, False, False, False,  True,  True],
        [False,  True, False, False,  True, False, False, False,  True,  True],
        [ True, False, False, False, False,  True,  True,  True, False, False],
        [False,  True,  True, False, False, False, False, False,  True,  True],
        [ True, False, False,  True, False, False,  True,  True, False, False],
        [ True, False, False,  True, False,  True, False,  True, False, False],
        [ True, False, False,  True, False,  True,  True, False, False, False],
        [False,  True,  True, False,  True, False, False, False, False,  True],
        [False,  True,  True, False,  True, False, False, False,  True, False]])

In [44]:
# Pairs whose first element goes to `tokens` and second to `label_ids`.
array = [(0, 0), (1, 1), (2, 0), (3, 1), (4, 0), (5, 0)]

# Unzip the pairs into two parallel lists.
tokens = [first for first, _ in array]
label_ids = [second for _, second in array]
tokens, label_ids

([0, 1, 2, 3, 4, 5], [0, 1, 0, 1, 0, 0])