# Practical Session 3

## Imports

In [1]:
import torch
import dlc_practical_prologue as prologue
import time

In [2]:
# Select the compute device once. Falling back to the CPU guarantees that
# `device` is always defined, even on machines without CUDA support.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Activation function

In [3]:
def sigma(x):
    """Activation function: element-wise hyperbolic tangent of the tensor `x`."""
    return x.tanh()

In [4]:
def dsigma(x):
    """Element-wise derivative of the tanh activation, evaluated at `x`.

    Relies on the identity d/dx tanh(x) = 1 - tanh(x)^2.
    """
    tanh_x = x.tanh()
    tanh_squared = tanh_x.pow(2)
    return 1 - tanh_squared

## Loss

In [5]:
def loss(v, t):
    """Squared L2 distance between the prediction `v` and the target `t`.

    The value is computed directly as the sum of squared differences over all
    elements. This avoids the deprecated `torch.norm` and the needless
    square-root-then-square round trip (and its rounding error).

    Args:
        v: real-valued tensor holding the predicted values.
        t: real-valued tensor holding the target values (broadcastable to `v`).

    Returns:
        A 0-dim tensor equal to sum((v - t)^2).
    """
    return (v - t).pow(2).sum()

In [6]:
def dloss(v, t):
    """Gradient of the squared-error loss with respect to the prediction `v`.

    Returns a tensor with the same shape as `t - v`, equal to -2 * (t - v).
    """
    return torch.mul(t - v, -2)

## Forward and backward passes

In [7]:
def forward_pass(w1, b1, w2, b2, x):
    """Propagate a single sample through the two-layer network.

    Args:
        w1, b1: weight matrix and bias vector of the first layer.
        w2, b2: weight matrix and bias vector of the second layer.
        x: 1-D input vector (the weights are applied with a matrix-vector
            product).

    Returns:
        The tuple (x0, s1, x1, s2, x2), where x0 is the input itself, s1/s2
        are the pre-activation values of the two layers, and x1/x2 are the
        corresponding activations.
    """
    # Hidden layer: affine map followed by the activation
    s1 = w1.mv(x) + b1
    x1 = sigma(s1)

    # Output layer: same structure, fed with the hidden activations
    s2 = w2.mv(x1) + b2
    x2 = sigma(s2)

    return x, s1, x1, s2, x2

In [8]:
def backward_pass(w1, b1, w2, b2,
                  t,
                  x, s1, x1, s2, x2,
                  dl_dw1, dl_db1, dl_dw2, dl_db2):
    """Back-propagate the loss for one sample and accumulate the gradients.

    Args:
        w1, b1, b2: not used by the computation; kept so the signature stays
            unchanged for existing callers.
        w2: weight matrix of the second layer, needed to propagate the
            gradient back to the hidden activations.
        t: target vector for this sample.
        x: 1-D input vector that was fed to `forward_pass`.
        s1, x1, s2, x2: pre-activations and activations returned by
            `forward_pass` for this sample.
        dl_dw1, dl_db1, dl_dw2, dl_db2: gradient accumulators. They are
            updated IN PLACE (`+=`) with this sample's contribution.

    Returns:
        The four accumulators (dl_dw1, dl_db1, dl_dw2, dl_db2), i.e. the same
        tensor objects that were passed in.
    """
    ### Calculate derivatives
    # Output loss
    dl_dx2 = dloss(x2, t)

    # Second layer
    dl_ds2 = dl_dx2 * dsigma(s2)
    dl_dx1 = torch.mv(w2.T, dl_ds2)
    # Outer product (column vector times row vector) gives dl/dw2
    dl_dw2_x = torch.mm(dl_ds2.view(-1, 1), x1.view(1, -1))
    dl_db2_x = dl_ds2

    # First layer
    dl_ds1 = dl_dx1 * dsigma(s1)
    # Use the `x` argument here rather than a module-level variable, so the
    # function depends only on what it is given.
    dl_dw1_x = torch.mm(dl_ds1.view(-1, 1), x.view(1, -1))
    dl_db1_x = dl_ds1

    ### Add up gradients
    dl_dw1 += dl_dw1_x
    dl_db1 += dl_db1_x
    dl_dw2 += dl_dw2_x
    dl_db2 += dl_db2_x

    return dl_dw1, dl_db1, dl_dw2, dl_db2

In [9]:
def test_nn(test_input, test_target, train_input, train_target):
    """Compute the classification error of the current network on both sets.

    The network parameters are read from the module-level variables `w1`,
    `b1`, `w2` and `b2` at call time. A sample counts as correct when the
    arg-max of the network output equals the arg-max of its target vector.

    Each data set is evaluated independently and normalized by its own number
    of samples, so the two sets may have different sizes.

    Args:
        test_input, test_target: test samples and their target vectors.
        train_input, train_target: training samples and their target vectors.

    Returns:
        The tuple (train_error, test_error), both as percentages in [0, 100].
    """
    def _error_rate(inputs, targets):
        """Percentage of samples in (inputs, targets) that are misclassified."""
        n_samples = targets.size()[0]
        n_correct = 0
        for sample, target in zip(inputs, targets):
            # Only the final activation is needed for the prediction
            _, _, _, _, output = forward_pass(w1, b1, w2, b2, sample)
            if torch.argmax(output).item() == torch.argmax(target).item():
                n_correct += 1
        return (1 - n_correct / n_samples) * 100

    train_error = _error_rate(train_input, train_target)
    test_error = _error_rate(test_input, test_target)
    return train_error, test_error

## Training the network

In [10]:
### Load data
# Circumvent problems with MNIST dataset: install a global urllib opener that
# sends a browser-like User-Agent header (presumably the dataset host rejects
# the default urllib User-Agent -- verify if downloads fail).
# The standard-library module is used directly; no `six` shim is needed.
import urllib.request

opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

train_input, train_target, test_input, test_target = prologue.load_data(one_hot_labels=True, normalize=True, cifar=False)

# Move the data to the GPU when one is available
if torch.cuda.is_available():
    train_input = train_input.to(device)
    train_target = train_target.to(device)
    test_input = test_input.to(device)
    test_target = test_target.to(device)

# Transform targets: shrink the training targets by a factor 0.9
# (presumably to keep them strictly inside the output range of tanh -- confirm).
# The test targets are left as loaded; they are only used through arg-max.
train_target *= 0.9

* Using MNIST
** Reduce the data-set (use --full for the full thing)
** Use 1000 train and 1000 test samples


In [11]:
### Dimensions
# Input dimension
D1 = train_input.size()[1]
# Successive dimensions
C1 = 50
D2 = C1
C2 = train_target.size()[1]


### Training parameters
N_grad_steps = 1000
# The gradients are summed (not averaged) over all training samples, so the
# step size is divided by the number of samples.
eta = 0.1/train_input.size()[0]


### Initialization
# Parameters are drawn from a zero-mean normal distribution with a tiny
# standard deviation.
epsilon = 1e-6
w1 = torch.normal(torch.zeros(C1,D1), epsilon)
w2 = torch.normal(torch.zeros(C2,D2), epsilon)
b1 = torch.normal(torch.zeros(C1), epsilon)
b2 = torch.normal(torch.zeros(C2), epsilon)

# CUDA
if torch.cuda.is_available():
    w1 = w1.to(device)
    w2 = w2.to(device)
    b1 = b1.to(device)
    b2 = b2.to(device)

print('### Start training ###')
start = time.time()
### Training steps
for i in range(N_grad_steps):
    # Reset gradient sum. zeros_like allocates each accumulator with the same
    # shape, dtype and device as its parameter, so no extra transfer is needed.
    dl_dw1 = torch.zeros_like(w1)
    dl_db1 = torch.zeros_like(b1)
    dl_dw2 = torch.zeros_like(w2)
    dl_db2 = torch.zeros_like(b2)

    # Iterate over training samples
    for x, t in zip(train_input, train_target):

        ### Forward pass
        x0, s1, x1, s2, x2 = forward_pass(w1, b1, w2, b2, x)

        ### Backward pass
        dl_dw1, dl_db1, dl_dw2, dl_db2 = backward_pass(w1, b1, w2, b2,
                                                        t,
                                                        x, s1, x1, s2, x2,
                                                        dl_dw1, dl_db1, dl_dw2, dl_db2)

    ### Gradient step (in place, so the parameter tensors keep their identity)
    w1 -= eta * dl_dw1
    w2 -= eta * dl_dw2
    b1 -= eta * dl_db1
    b2 -= eta * dl_db2

    # Print iteration. Evaluated after the update so that "Iteration k"
    # reports the network state after exactly k gradient steps.
    if (i+1)%50 == 0:
        train_error, test_error = test_nn(test_input, test_target, train_input, train_target)
        print('Iteration {:d}: Train error: {:.02f}%  ---  Test error: {:.02f}%'.format(i+1, train_error, test_error))

end = time.time()
print('Training time: {:.02f}min'.format((end-start)/60))

### Start training ###
Iteration 50: Train error: 46.60%  ---  Test error: 52.40%
Iteration 100: Train error: 17.40%  ---  Test error: 27.10%
Iteration 150: Train error: 8.10%  ---  Test error: 19.90%
Iteration 200: Train error: 6.10%  ---  Test error: 16.60%
Iteration 250: Train error: 5.10%  ---  Test error: 17.30%
Iteration 300: Train error: 4.00%  ---  Test error: 15.10%
Iteration 350: Train error: 3.00%  ---  Test error: 17.00%
Iteration 400: Train error: 2.30%  ---  Test error: 15.80%
Iteration 450: Train error: 2.20%  ---  Test error: 16.90%
Iteration 500: Train error: 1.00%  ---  Test error: 17.20%
Iteration 550: Train error: 0.80%  ---  Test error: 16.20%
Iteration 600: Train error: 1.10%  ---  Test error: 17.00%
Iteration 650: Train error: 0.40%  ---  Test error: 15.80%
Iteration 700: Train error: 0.50%  ---  Test error: 16.50%
Iteration 750: Train error: 0.40%  ---  Test error: 16.40%
Iteration 800: Train error: 0.30%  ---  Test error: 17.10%
Iteration 850: Train error: 0.30

## Testing trained network

In [12]:
# Evaluate the trained network once more on both data sets and report the
# final error rates (in percent).
train_error, test_error = test_nn(
    test_input=test_input,
    test_target=test_target,
    train_input=train_input,
    train_target=train_target,
)

print('Train error: {:.02f}%'.format(train_error))
print('Test error: {:.02f}%'.format(test_error))

Train error: 0.10%
Test error: 17.00%
