## Q1-1)

In [17]:
import torch
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import matplotlib.pylab as plt
import random

from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

In [2]:
# 파라미터 설정 (learning rate, training epochs, batch_size)
learning_rate = 0.1
training_epochs = 15
batch_size = 100

In [4]:
# train과 test set으로 나누어 MNIST data 불러오기
train_dataset = dsets.MNIST(root='./mnist_data/',
                            train=True,
                            transform=transforms.ToTensor(),
                            download=True)

test_dataset = dsets.MNIST(root='./mnist_data/',
                           train=False,
                           transform=transforms.ToTensor())

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./mnist_data/MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./mnist_data/MNIST/raw/train-images-idx3-ubyte.gz to ./mnist_data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./mnist_data/MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./mnist_data/MNIST/raw/train-labels-idx1-ubyte.gz to ./mnist_data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./mnist_data/MNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./mnist_data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./mnist_data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./mnist_data/MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./mnist_data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./mnist_data/MNIST/raw
Processing...
Done!




In [10]:
# dataset loader에 train과 test할당하기(batch size, shuffle, drop_last 잘 설정할 것!)
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          drop_last=True)

test_loader = DataLoader(dataset=test_dataset,
                         batch_size=batch_size,
                         shuffle=False,
                         drop_last=True)

In [13]:
# Layer 쌓기 (조건: 3개의 Layer 사용, DropOut 사용 (p=0.3), ReLU 함수 사용, Batch normalization하기)
# 각 Layer의 Hidden node 수 : 1st Layer (784,100), 2nd Layer(100,100),3rd Layer(100,10)
l1 = nn.Linear(784, 100, bias=True)
l2 = nn.Linear(100, 100, bias=True)
l3 = nn.Linear(100, 10, bias=True)
relu = nn.ReLU()
dropout = nn.Dropout(p=0.3)
bn = nn.BatchNorm1d(100)

In [14]:
# xavier initialization을 이용하여 각 layer의 weight 초기화
nn.init.xavier_uniform(l1.weight)
nn.init.xavier_uniform(l2.weight)
nn.init.xavier_uniform(l3.weight)

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


Parameter containing:
tensor([[-1.5923e-01,  1.2804e-02,  2.2385e-01,  6.6520e-02, -1.7304e-01,
         -1.7853e-01, -9.6810e-02,  2.0175e-01, -8.0244e-02,  2.1902e-01,
          1.0228e-01,  1.1648e-01, -2.9521e-02, -1.8644e-01, -2.0157e-01,
         -1.8056e-01,  1.2386e-01, -7.8582e-02, -9.9098e-02, -9.5483e-02,
         -1.0700e-02,  1.9318e-01, -1.0829e-01, -2.0169e-02, -2.0358e-01,
          1.5787e-01, -1.0330e-01,  5.1215e-02, -1.0194e-02, -8.4842e-02,
         -1.5966e-01,  1.0230e-01,  1.7901e-01,  7.9604e-02, -1.4405e-02,
          9.6878e-02, -1.2354e-01, -4.7724e-02, -1.2104e-01, -1.7828e-01,
         -1.5313e-01, -6.3761e-02, -1.8521e-01,  2.1763e-01, -1.1343e-01,
         -8.3920e-02, -8.1436e-03, -6.6567e-02,  2.1454e-01,  2.1404e-01,
         -7.5073e-02, -7.7181e-02,  1.9544e-01,  9.5197e-02,  1.0162e-01,
          5.1343e-02, -3.2681e-02, -3.5978e-02,  1.7361e-01, -1.2619e-01,
          1.7015e-01,  1.0960e-01, -9.5830e-02, -6.0812e-02, -1.7672e-01,
          3.8193

In [16]:
# torch.nn.Sequential을 이용하여 model 정의하기(쌓는 순서: linear-Batch Normalization Layer - ReLU- DropOut)
model = nn.Sequential(l1, bn, relu, dropout,
                      l2, bn, relu, dropout,
                      l3).to('cpu')

In [24]:
# Loss Function 정의하기 (CrossEntropy를 사용할 것!)
criterion = nn.CrossEntropyLoss().to('cpu')
# optimizer 정의하기 (Adam optimizer를 사용할 것!)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#cost 계산을 위한 변수 설정
train_total_batch = len(train_loader)
test_total_batch = len(test_loader)

In [22]:
# Training epoch (cost 값 초기 설정(0으로)과 model의 train 설정 꼭 할 것) 
for epoch in range(training_epochs):
    avg_cost = 0
    # train dataset을 불러오고(X,Y 불러오기), back propagation과 optimizer를 사용하여 loss를 최적화하는 코드
    for X, Y in train_loader:
        X = X.view(-1, 784).to('cpu')
        Y = Y.to('cpu')
        
        hypothesis = model(X)
        bn_loss = criterion(hypothesis, Y)
        
        optimizer.zero_grad()
        bn_loss.backward()
        optimizer.step()
    
        avg_cost += bn_loss / train_total_batch
        
        
    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

print('Learning finished')

Epoch: 0001 cost = 0.532430947
Epoch: 0002 cost = 0.421571046
Epoch: 0003 cost = 0.393104583
Epoch: 0004 cost = 0.369535118
Epoch: 0005 cost = 0.349459350
Epoch: 0006 cost = 0.351994306
Epoch: 0007 cost = 0.335494578
Epoch: 0008 cost = 0.350321680
Epoch: 0009 cost = 0.321461827
Epoch: 0010 cost = 0.319085449
Epoch: 0011 cost = 0.300809681
Epoch: 0012 cost = 0.321902364
Epoch: 0013 cost = 0.305828780
Epoch: 0014 cost = 0.305184990
Epoch: 0015 cost = 0.302269131
Learning finished


In [27]:
# test data로 모델의 정확도를 검증하는 코드 (model의 evaluation mode 설정 꼭 할 것)
# X_test 불러올 때 view를 사용하여 차원 변환할 것/ Y_test를 불러올때 labels사용
# accuracy의 초기 값 설정(0으로) 꼭 할 것

with torch.no_grad():
    model.eval() # set the model to evaluation mode
    
    loss, bn_acc = 0, 0
    for i, (X, Y) in enumerate(test_loader):
        X = X.view(-1, 784).to('cpu')
        Y = Y.to('cpu')
        
        pred = model(X)
        correct_pred = torch.argmax(pred, 1) == Y
        loss += criterion(pred, Y)
        bn_acc += correct_pred.float().mean()
    
    loss, bn_acc = loss / test_total_batch, bn_acc / test_total_batch
    print("Accuracy: ", bn_acc.item())
    
    ## Test set에서 random으로 data를 뽑아 Label과 Prediction을 비교하는 코드 
    r = random.randint(0, len(test_dataset)-1)
    X_single_data = test_dataset.test_data[r:r + 1].view(-1, 28 *28).float()
    Y_single_data = test_dataset.test_labels[r:r + 1]
    
    print('Label: ', Y_single_data.item())
    single_prediction = model(X_single_data)
    print('Prediction: ', torch.argmax(single_prediction, 1).item())

Accuracy:  0.930300235748291
Label:  5
Prediction:  5


## Q1-2) 지금까지는 Layer의 수를 바꾸거나, Batch Normalization Layer를 추가하는 등 Layer에만 변화를 주며 모델의 성능을 향상 시켰습니다.  
### 이번 문제에서는 위에서 만든 모델에서 있던 Layer 들의 Hidden node 수를 증가 또는 감소 (ex: 200, 300, 50...) 시켰을 때, train set에서의 cost와 test set에서 Accuracy가 기존 결과와 비교하였을 때 어떻게 달라졌는지 비교해주시면 됩니다.   

In [28]:
# Case 1 : increase the number of hidden nodes
# Accuracy Increased :)
l1 = nn.Linear(784, 200, bias=True)
l2 = nn.Linear(200, 300, bias=True)
l3 = nn.Linear(300, 10, bias=True)
relu = nn.ReLU()
dropout = nn.Dropout(p=0.3)
bn1 = nn.BatchNorm1d(200)
bn2 = nn.BatchNorm1d(300)

nn.init.xavier_uniform(l1.weight)
nn.init.xavier_uniform(l2.weight)
nn.init.xavier_uniform(l3.weight)

model = nn.Sequential(l1, bn1, relu, dropout,
                      l2, bn2, relu, dropout,
                      l3).to('cpu')

# Loss Function 정의하기 (CrossEntropy를 사용할 것!)
criterion = nn.CrossEntropyLoss().to('cpu')
# optimizer 정의하기 (Adam optimizer를 사용할 것!)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#cost 계산을 위한 변수 설정
train_total_batch = len(train_loader)
test_total_batch = len(test_loader)

# train
for epoch in range(training_epochs):
    avg_cost = 0
    
    for X, Y in train_loader:
        X = X.view(-1, 784).to('cpu')
        Y = Y.to('cpu')
        
        hypothesis = model(X)
        bn_loss = criterion(hypothesis, Y)
        
        optimizer.zero_grad()
        bn_loss.backward()
        optimizer.step()
    
        avg_cost += bn_loss / train_total_batch
           
    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))
print('Learning finished')

# test
with torch.no_grad():
    model.eval() # set the model to evaluation mode
    
    loss, bn_acc = 0, 0
    for i, (X, Y) in enumerate(test_loader):
        X = X.view(-1, 784).to('cpu')
        Y = Y.to('cpu')
        
        pred = model(X)
        correct_pred = torch.argmax(pred, 1) == Y
        loss += criterion(pred, Y)
        bn_acc += correct_pred.float().mean()
    
    loss, bn_acc = loss / test_total_batch, bn_acc / test_total_batch
    print("Accuracy: ", bn_acc.item())

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


Epoch: 0001 cost = 0.472852260
Epoch: 0002 cost = 0.343448043
Epoch: 0003 cost = 0.305057496
Epoch: 0004 cost = 0.278459698
Epoch: 0005 cost = 0.259482622
Epoch: 0006 cost = 0.246392906
Epoch: 0007 cost = 0.232810199
Epoch: 0008 cost = 0.227727771
Epoch: 0009 cost = 0.223600134
Epoch: 0010 cost = 0.217849076
Epoch: 0011 cost = 0.226236090
Epoch: 0012 cost = 0.211904764
Epoch: 0013 cost = 0.203420267
Epoch: 0014 cost = 0.199457690
Epoch: 0015 cost = 0.191901475
Learning finished
Accuracy:  0.9729997515678406


In [29]:
# Case 2 : decrease the number of hidden nodes
# Accuracy increased but not as much as in Case 1!
l1 = nn.Linear(784, 100, bias=True)
l2 = nn.Linear(100, 50, bias=True)
l3 = nn.Linear(50, 10, bias=True)
relu = nn.ReLU()
dropout = nn.Dropout(p=0.3)
bn1 = nn.BatchNorm1d(100)
bn2 = nn.BatchNorm1d(50)

nn.init.xavier_uniform(l1.weight)
nn.init.xavier_uniform(l2.weight)
nn.init.xavier_uniform(l3.weight)

model = nn.Sequential(l1, bn1, relu, dropout,
                      l2, bn2, relu, dropout,
                      l3).to('cpu')

# Loss Function 정의하기 (CrossEntropy를 사용할 것!)
criterion = nn.CrossEntropyLoss().to('cpu')
# optimizer 정의하기 (Adam optimizer를 사용할 것!)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#cost 계산을 위한 변수 설정
train_total_batch = len(train_loader)
test_total_batch = len(test_loader)

# train
for epoch in range(training_epochs):
    avg_cost = 0
    
    for X, Y in train_loader:
        X = X.view(-1, 784).to('cpu')
        Y = Y.to('cpu')
        
        hypothesis = model(X)
        bn_loss = criterion(hypothesis, Y)
        
        optimizer.zero_grad()
        bn_loss.backward()
        optimizer.step()
    
        avg_cost += bn_loss / train_total_batch
           
    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))
print('Learning finished')

# test
with torch.no_grad():
    model.eval() # set the model to evaluation mode
    
    loss, bn_acc = 0, 0
    for i, (X, Y) in enumerate(test_loader):
        X = X.view(-1, 784).to('cpu')
        Y = Y.to('cpu')
        
        pred = model(X)
        correct_pred = torch.argmax(pred, 1) == Y
        loss += criterion(pred, Y)
        bn_acc += correct_pred.float().mean()
    
    loss, bn_acc = loss / test_total_batch, bn_acc / test_total_batch
    print("Accuracy: ", bn_acc.item())

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


Epoch: 0001 cost = 0.518346190
Epoch: 0002 cost = 0.379162967
Epoch: 0003 cost = 0.342260003
Epoch: 0004 cost = 0.316496283
Epoch: 0005 cost = 0.312597126
Epoch: 0006 cost = 0.289735138
Epoch: 0007 cost = 0.280683070
Epoch: 0008 cost = 0.286977559
Epoch: 0009 cost = 0.275946289
Epoch: 0010 cost = 0.264384478
Epoch: 0011 cost = 0.257800817
Epoch: 0012 cost = 0.252921611
Epoch: 0013 cost = 0.243324831
Epoch: 0014 cost = 0.254261762
Epoch: 0015 cost = 0.242500201
Learning finished
Accuracy:  0.9689001441001892


**How to choose optimal number of neurons?**  
https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw  
*no clear answer or rule of thumb : find it randomly or by grid search or etc*