In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Echo the installed PyTorch version string as this cell's output.
torch.__version__

'1.7.0+cu101'

In [2]:
# Seed PyTorch's global random number generator so repeated runs are reproducible.
torch.manual_seed(seed=1)

<torch._C.Generator at 0x7fd9d5833b58>

In [3]:
# Training dataset: 8 samples x 3 features (float32) with integer class labels.
# torch.tensor(..., dtype=...) is used instead of the legacy FloatTensor /
# LongTensor constructors; the resulting values and dtypes are the same.
x_train = torch.tensor([[1, 2, 1],
                        [1, 3, 2],
                        [1, 3, 4],
                        [1, 5, 5],
                        [1, 7, 5],
                        [1, 2, 5],
                        [1, 6, 6],
                        [1, 7, 7]],
                       dtype=torch.float32)
y_train = torch.tensor([2, 2, 2, 1, 1, 1, 0, 0], dtype=torch.long)

# Test dataset: 3 samples with the same feature layout, all labelled class 2.
x_test = torch.tensor([[2, 1, 1],
                       [3, 1, 2],
                       [3, 3, 4]],
                      dtype=torch.float32)
y_test = torch.tensor([2, 2, 2], dtype=torch.long)

# Check dataset shape
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

torch.Size([8, 3]) torch.Size([3, 3])
torch.Size([8]) torch.Size([3])


## Train model
- Overfitting으로 인해 Train cost(약 0.99)에 비해 Test cost(약 1.43)가 높고, Test Accuracy는 0으로 낮게 나오는 모습을 확인할 수 있음

In [4]:
class SoftmaxClassifierModel(nn.Module):
  """Single linear layer that maps input features to one raw score per class.

  The forward pass returns unnormalized scores (logits); no softmax is
  applied inside the model.

  Args:
    n_features: Number of input features per sample. Defaults to 3.
    n_classes: Number of target classes (output scores). Defaults to 3.
  """

  def __init__(self, n_features=3, n_classes=3):
    super().__init__()
    # Weight shape is (n_classes, n_features); the layer also holds a bias.
    self.linear = nn.Linear(n_features, n_classes)

  def forward(self, x):
    """Return the class scores for ``x`` (last dimension must be n_features)."""
    return self.linear(x)


def train(model, optimizer, x_train, y_train, n_epochs=20, print_every=1):
  """Fit ``model`` with full-batch gradient steps on a cross-entropy cost.

  Every epoch runs one forward pass over all of ``x_train``, computes the
  cross-entropy between the model's scores and ``y_train``, and applies one
  optimizer step. The cost printed for an epoch is the value measured
  before that epoch's parameter update.

  Args:
    model: Callable module producing class scores for ``x_train``.
    optimizer: Optimizer already bound to ``model``'s parameters.
    x_train: Input features for the whole training set.
    y_train: Integer class labels, one per row of ``x_train``.
    n_epochs: Number of update steps to run. Defaults to 20.
    print_every: Log the cost every this many epochs. Defaults to 1
      (every epoch); 0 or None disables logging.

  Returns:
    None. Progress is reported via ``print``.
  """
  for epoch in range(n_epochs):
    prediction = model(x_train)
    cost = F.cross_entropy(prediction, y_train)

    # Gradients accumulate across backward() calls, so clear them first.
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()

    if print_every and epoch % print_every == 0:
      print(f'epoch : {epoch:5}  |  cost : {cost.item():10.6f}')

def test(model, x_test, y_test):
  prediction = model(x_test)
  cost = F.cross_entropy(prediction, y_test)

  predicted_class = prediction.max(dim=1)[1]
  correct_count = sum(predicted_class == y_test)

  print(f'Cost : {cost:8.6f}  |  Accuracy : {correct_count/len(y_test):8.6f}')

In [5]:
# Baseline run: a fresh classifier fitted with plain SGD on the raw training features.
model = SoftmaxClassifierModel()
optimizer = optim.SGD(params=model.parameters(), lr=0.1)

train(model=model, optimizer=optimizer, x_train=x_train, y_train=y_train)

epoch :     0  |  cost :   2.203667
epoch :     1  |  cost :   1.199645
epoch :     2  |  cost :   1.142985
epoch :     3  |  cost :   1.117769
epoch :     4  |  cost :   1.100901
epoch :     5  |  cost :   1.089523
epoch :     6  |  cost :   1.079872
epoch :     7  |  cost :   1.071320
epoch :     8  |  cost :   1.063325
epoch :     9  |  cost :   1.055720
epoch :    10  |  cost :   1.048378
epoch :    11  |  cost :   1.041245
epoch :    12  |  cost :   1.034285
epoch :    13  |  cost :   1.027478
epoch :    14  |  cost :   1.020813
epoch :    15  |  cost :   1.014279
epoch :    16  |  cost :   1.007872
epoch :    17  |  cost :   1.001586
epoch :    18  |  cost :   0.995419
epoch :    19  |  cost :   0.989365


In [6]:
# Report cost and accuracy of the current model on the test samples.
test(model=model, x_test=x_test, y_test=y_test)

Cost : 1.425844  |  Accuracy : 0.000000


## Learning rate
- Gradient Descent의 업데이트 식 $W \leftarrow W - \alpha \nabla_W cost(W)$ 에서 한 번에 이동하는 크기를 정하는 $\alpha$ 값
- learning rate가 너무 높으면 diverge하면서 cost가 점점 증가함(overshooting)
- learning rate가 너무 낮으면 cost가 잘 줄어들지 않아 학습 효율이 떨어짐

In [7]:
# Learning rate far too large: start from a fresh model and watch the
# printed cost jump to very large values instead of shrinking.
model = SoftmaxClassifierModel()
lr = 1e5
train(
  model,
  optimizer=optim.SGD(params=model.parameters(), lr=lr),
  x_train=x_train,
  y_train=y_train,
)

epoch :     0  |  cost :   1.280268
epoch :     1  |  cost : 976950.750000
epoch :     2  |  cost : 1279135.250000
epoch :     3  |  cost : 1198378.875000
epoch :     4  |  cost : 1098825.750000
epoch :     5  |  cost : 1968197.625000
epoch :     6  |  cost : 284763.250000
epoch :     7  |  cost : 1532260.250000
epoch :     8  |  cost : 1651503.750000
epoch :     9  |  cost : 521878.562500
epoch :    10  |  cost : 1397263.250000
epoch :    11  |  cost : 750986.375000
epoch :    12  |  cost : 918691.375000
epoch :    13  |  cost : 1487888.250000
epoch :    14  |  cost : 1582260.125000
epoch :    15  |  cost : 685818.125000
epoch :    16  |  cost : 1140048.875000
epoch :    17  |  cost : 940566.375000
epoch :    18  |  cost : 931638.250000
epoch :    19  |  cost : 1971322.625000


In [8]:
# Learning rate far too small: start from a fresh model and watch the
# printed cost barely move from one epoch to the next.
model = SoftmaxClassifierModel()
lr = 1e-5
train(
  model,
  optimizer=optim.SGD(params=model.parameters(), lr=lr),
  x_train=x_train,
  y_train=y_train,
)

epoch :     0  |  cost :   3.187324
epoch :     1  |  cost :   3.187014
epoch :     2  |  cost :   3.186703
epoch :     3  |  cost :   3.186393
epoch :     4  |  cost :   3.186082
epoch :     5  |  cost :   3.185772
epoch :     6  |  cost :   3.185461
epoch :     7  |  cost :   3.185151
epoch :     8  |  cost :   3.184840
epoch :     9  |  cost :   3.184530
epoch :    10  |  cost :   3.184220
epoch :    11  |  cost :   3.183910
epoch :    12  |  cost :   3.183599
epoch :    13  |  cost :   3.183289
epoch :    14  |  cost :   3.182979
epoch :    15  |  cost :   3.182669
epoch :    16  |  cost :   3.182359
epoch :    17  |  cost :   3.182048
epoch :    18  |  cost :   3.181738
epoch :    19  |  cost :   3.181427


- 적절한 크기의 learning rate로 시작해서, cost가 발산하면 줄이고 cost가 정체하면 높이는 방식으로 조절

In [9]:
# Re-seed so the freshly created model starts from the same initial weights each run.
torch.manual_seed(seed=1)

# Moderate learning rate: the printed cost decreases steadily.
model = SoftmaxClassifierModel()
lr = 1e-1
train(
  model,
  optimizer=optim.SGD(params=model.parameters(), lr=lr),
  x_train=x_train,
  y_train=y_train,
)

epoch :     0  |  cost :   2.203667
epoch :     1  |  cost :   1.199645
epoch :     2  |  cost :   1.142985
epoch :     3  |  cost :   1.117769
epoch :     4  |  cost :   1.100901
epoch :     5  |  cost :   1.089523
epoch :     6  |  cost :   1.079872
epoch :     7  |  cost :   1.071320
epoch :     8  |  cost :   1.063325
epoch :     9  |  cost :   1.055720
epoch :    10  |  cost :   1.048378
epoch :    11  |  cost :   1.041245
epoch :    12  |  cost :   1.034285
epoch :    13  |  cost :   1.027478
epoch :    14  |  cost :   1.020813
epoch :    15  |  cost :   1.014279
epoch :    16  |  cost :   1.007872
epoch :    17  |  cost :   1.001586
epoch :    18  |  cost :   0.995419
epoch :    19  |  cost :   0.989365


## Data Preprocessing
- 각 속성별 값의 범위(scale)가 크게 다르다면 범위가 큰 값이 과도하게 예측에 반영됨
- 속성별 범위(scale) 차이로 인한 영향을 제거하기 위해 정규화 필요

In [10]:
# Per-feature statistics, taken over the training samples (dim 0) only.
mu = x_train.mean(dim=0)
# The first feature column of x_train is constant, so its standard deviation
# is 0; adding 1 keeps the division below finite.
# NOTE(review): the +1 also enlarges the divisor of the other columns, so the
# normalized features end up with a spread below 1 (see the printed std).
sigma = 1 + x_train.std(dim=0)

# Scale both splits with the training-set statistics.
x_train_normalized = x_train.sub(mu).div(sigma)
x_test_normalized = x_test.sub(mu).div(sigma)

# Show the scaled training features, then their per-feature mean and std.
print(x_train_normalized)
print(x_train_normalized.mean(dim=0), x_train_normalized.std(dim=0))

tensor([[ 0.0000, -0.7578, -1.1267],
        [ 0.0000, -0.4387, -0.7928],
        [ 0.0000, -0.4387, -0.1252],
        [ 0.0000,  0.1994,  0.2086],
        [ 0.0000,  0.8376,  0.2086],
        [ 0.0000, -0.7578,  0.2086],
        [ 0.0000,  0.5185,  0.5425],
        [ 0.0000,  0.8376,  0.8763]])
tensor([0., 0., 0.]) tensor([0.0000, 0.6809, 0.6662])


In [11]:
# Re-seed so the freshly created model starts from the same initial weights each run.
torch.manual_seed(seed=1)

# Same model and SGD settings as the baseline, fitted on the normalized features.
model = SoftmaxClassifierModel()
optimizer = optim.SGD(params=model.parameters(), lr=0.1)

train(model=model, optimizer=optimizer, x_train=x_train_normalized, y_train=y_train)

epoch :     0  |  cost :   1.248026
epoch :     1  |  cost :   1.222619
epoch :     2  |  cost :   1.198292
epoch :     3  |  cost :   1.175012
epoch :     4  |  cost :   1.152744
epoch :     5  |  cost :   1.131453
epoch :     6  |  cost :   1.111105
epoch :     7  |  cost :   1.091664
epoch :     8  |  cost :   1.073092
epoch :     9  |  cost :   1.055356
epoch :    10  |  cost :   1.038417
epoch :    11  |  cost :   1.022243
epoch :    12  |  cost :   1.006796
epoch :    13  |  cost :   0.992044
epoch :    14  |  cost :   0.977953
epoch :    15  |  cost :   0.964491
epoch :    16  |  cost :   0.951626
epoch :    17  |  cost :   0.939327
epoch :    18  |  cost :   0.927566
epoch :    19  |  cost :   0.916316
