# Simulated Annealing


In [None]:
%matplotlib inline

import torch
import torchvision
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
from torch.autograd import Variable
import random as rm

# Add the sibling folders
import sys, os
sys.path.insert(0, os.path.abspath('../..'))
import src.utils as ut

Let's define a basic NN made of two fully connected layers: a hidden layer with 200 neurons followed by an output layer with 10 neurons (one per MNIST class).

In [None]:
class Net(nn.Module):
    """Two-layer fully connected classifier for flattened 28x28 inputs.

    Architecture: Linear(784 -> 200) -> ReLU -> Linear(200 -> 10) -> softmax.

    NOTE(review): the forward pass returns probabilities, while other cells
    feed the output to ``nn.CrossEntropyLoss``, which applies log-softmax
    itself. Returning raw logits may be preferable; confirm what
    ``ut.test`` / ``ut.test_train`` expect before changing it.
    """

    def __init__(self):
        super(Net, self).__init__()

        # Hidden layer: flattened 28x28 image -> 200 units
        self.fc1 = nn.Linear(28 * 28, 200)
        # Output layer: 200 units -> 10 scores
        self.fc3 = nn.Linear(200, 10)

    def forward(self, x):
        """Return per-class probabilities of shape (batch, 10).

        Args:
            x: tensor whose first dimension is the batch and whose remaining
               dimensions flatten to 28 * 28 values per sample.
        """
        # Flatten everything but the batch dimension (-1 is inferred)
        batch_size = x.size(0)
        x = x.view(batch_size, -1)

        # Forward rule
        x = F.relu(self.fc1(x))
        x = self.fc3(x)

        # Normalise over the class dimension explicitly; relying on the
        # implicit dim is deprecated and emits a warning.
        return F.softmax(x, dim=1)

## The algorithm
The first implementation evaluates each move on the whole training set in order to avoid noisy evaluations. This way we are sure to make the best choice.
The epsilon variable sets the maximum movement in each direction (similar to a learning rate). Each movement is done by creating, for each layer, a tensor called $ \Delta(w) $.

$$ w'  = w - \epsilon \Delta(w)  $$

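where each value of the tensor $ \Delta(w) $ is sampled from the uniform distribution $ U (-1, 1) $. Note that in the implementation below the sampled perturbation is additionally multiplied element-wise by the current weights, so each step is relative to the magnitude of the weight it modifies.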

In the following implementation, if $ L(w') > L(w) $ the opposite direction is chosen.

$$ w''  = w + \epsilon \ \Delta(w)  $$



In [None]:
# Maximum relative step size of a single move (10e-2 == 0.1)
epsilon = 0.1
# Whole training set is scored at every step, hence the large minibatch
train_loader, test_loader = ut.load_dataset(minibatch=4096, dataset_name='mnist')
# Fresh, randomly initialised network on the GPU
net = Net().cuda()
        
def full_train_SA(trainloader, model, accuracy_before, gpu=True):
    """Try one random move of the weights and reverse it if it does worse.

    Every parameter tensor ``w`` is perturbed in place by
    ``move = U(-1, 1) * epsilon * w`` (``epsilon`` is read from module scope).
    If the score after the move is lower than before, the opposite move is
    applied instead (``w - move``) and kept unconditionally, even if it is
    also worse than the starting point.

    Args:
        trainloader: data loader handed to ``ut.test_train`` to score the model.
        model: network whose parameters are modified in place.
        accuracy_before: value previously returned by ``ut.test_train`` for the
            current weights, or ``None`` to compute it here. Only element
            ``[1]`` is compared (presumably the accuracy; confirm against
            ``ut.test_train``).
        gpu: kept for backward compatibility. Perturbations are now created
            directly on each parameter's own device, so the flag is not needed.

    Returns:
        The value returned by ``ut.test_train`` for the weights that were kept.
    """
    model.train()

    # Use the argument, not a same-named global, to detect the first call
    if accuracy_before is None:
        accuracy_before = ut.test_train(trainloader, model)

    inverse = []

    for param in model.parameters():
        # Random move, same shape/dtype/device as the parameter
        move = torch.empty_like(param.data).uniform_(-1, 1)
        move.mul_(epsilon).mul_(param.data)
        # Adding -2 * move after the move lands on w - move (opposite direction)
        inverse.append(move.mul(-2))
        # Move the parameters
        param.data.add_(move)

    # Evaluate the moved weights
    new_accuracy = ut.test_train(trainloader, model)

    if new_accuracy[1] < accuracy_before[1]:
        print("Wrong direction, in fact:", new_accuracy[1])
        for param, step_back in zip(model.parameters(), inverse):
            param.data.add_(step_back)

        new_accuracy = ut.test_train(trainloader, model)

    print("New accuracy: ", new_accuracy)

    return new_accuracy

# No score yet: the first call to full_train_SA computes the starting one
accuracy = None
for epoch in range(1000):
    print("Epoch: ", epoch)
    # One SA move per epoch, scored on the whole training set
    accuracy = full_train_SA(trainloader=train_loader, model=net, accuracy_before=accuracy)
    # Held-out performance after the move
    print("Validation test:", ut.test(test_loader, net))

Looking at the results, it can be seen that sometimes neither direction improves the accuracy. This technique avoids overwriting the values again if $ L(w'') > L(w') $, but at the same time it ends up keeping the worst choice.

The update rule can be changed to choose only the best movement. If both of them worsen the solution, the best one is picked according to the temperature.

In [4]:
# Half the step size of the first experiment (10e-2 / 2 == 0.05)
epsilon = 0.05
# Whole training set is scored at every step, hence the large minibatch
train_loader, test_loader = ut.load_dataset(minibatch=4096, dataset_name='mnist')
# Fresh, randomly initialised network on the GPU
net = Net().cuda()
        
def full_train_SA_best_step(trainloader, model, accuracy_before, gpu=True):
    """Try a random move and its opposite, keeping the better of the two.

    Every parameter tensor ``w`` is perturbed in place by
    ``move = U(-1, 1) * epsilon * w`` (``epsilon`` is read from module scope).
    If the move does not lower the score it is kept. Otherwise the opposite
    move (``w - move``) is evaluated too, and whichever of the two moves
    scores higher is kept, even when both are worse than ``accuracy_before``.

    Args:
        trainloader: data loader handed to ``ut.test_train`` to score the model.
        model: network whose parameters are modified in place.
        accuracy_before: score of the current weights, comparable with
            ``ut.test_train(...)[1]``.
        gpu: kept for backward compatibility. Perturbations are now created
            directly on each parameter's own device, so the flag is not needed.

    Returns:
        ``ut.test_train(...)[1]`` for the weights that were kept.
    """
    model.train()

    inverse = []

    for param in model.parameters():
        # Random move, same shape/dtype/device as the parameter
        move = torch.empty_like(param.data).uniform_(-1, 1)
        move.mul_(epsilon).mul_(param.data)
        # Adding -2 * move after the move lands on w - move (opposite direction)
        inverse.append(move.mul(-2))
        # Move the parameters
        param.data.add_(move)

    # Score of the first direction
    first_accuracy = ut.test_train(trainloader, model)[1]

    if first_accuracy < accuracy_before:
        # First direction is worse: switch to the opposite one
        for param, step_back in zip(model.parameters(), inverse):
            param.data.add_(step_back)

        second_accuracy = ut.test_train(trainloader, model)[1]

        if second_accuracy < first_accuracy:
            # Opposite direction is even worse: go back to the first move
            for param, step_back in zip(model.parameters(), inverse):
                param.data.sub_(step_back)
            new_accuracy = first_accuracy
        else:
            new_accuracy = second_accuracy
    else:
        new_accuracy = first_accuracy

    print("New accuracy: ", new_accuracy)

    return new_accuracy

# Driver loop for full_train_SA_best_step, disabled by wrapping it in a string
# literal. Being the last expression of the cell, the string itself is what
# the cell displays as its output; nothing in it is executed.
'''current_accuracy = ut.test_train(train_loader, net)[1]
for epoch in range(1000):
    print("Epoch: ", epoch)
    current_accuracy = full_train_SA_best_step(train_loader, net, current_accuracy)
    print("Validation test:", ut.test(test_loader, net))'''

'current_accuracy = ut.test_train(train_loader, net)[1]\nfor epoch in range(1000):\n    print("Epoch: ", epoch)\n    current_accuracy = full_train_SA_best_step(train_loader, net, current_accuracy)\n    print("Validation test:", ut.test(test_loader, net))'

In [None]:
# Step size for the temperature variant (10e-1 / 2 == 0.5)
# NOTE(review): this is 10x the 10e-2 / 2 used by the previous experiment; confirm it is intended
epsilon = 0.5
# Whole training set is scored at every step, hence the large minibatch
train_loader, test_loader = ut.load_dataset(minibatch=4096, dataset_name='mnist')
# Fresh, randomly initialised network on the GPU
net = Net().cuda()
        
def full_train_SA_temperature(trainloader, model, accuracy_before, temperature = 0, gpu=True):
    """Try a random move and its opposite; reject both only at temperature 0.

    Every parameter tensor ``w`` is perturbed in place by
    ``move = U(-1, 1) * epsilon * w`` (``epsilon`` is read from module scope).
    If the move does not lower the score it is kept. Otherwise the opposite
    move (``w - move``) is evaluated. When that is also worse than
    ``accuracy_before`` and ``temperature == 0``, the original weights are
    restored; in every other case the opposite move is kept.

    Args:
        trainloader: data loader handed to ``ut.test_train`` to score the model.
        model: network whose parameters are modified in place.
        accuracy_before: score of the current weights, comparable with
            ``ut.test_train(...)[1]``.
        temperature: only compared against 0. Any non-zero value makes the
            function keep the opposite move even when it worsens the score;
            no probabilistic acceptance is implemented.
        gpu: kept for backward compatibility. Perturbations are now created
            directly on each parameter's own device, so the flag is not needed.

    Returns:
        The score of the weights that were kept (``accuracy_before`` itself
        when both moves are rejected).
    """
    model.train()

    inverse = []

    for param in model.parameters():
        # Random move, same shape/dtype/device as the parameter
        move = torch.empty_like(param.data).uniform_(-1, 1)
        move.mul_(epsilon).mul_(param.data)
        # Negated move: adding it once undoes the move, twice reverses it
        inverse.append(move.mul(-1))
        # Move the parameters
        param.data.add_(move)

    # Score of the first direction
    first_accuracy = ut.test_train(trainloader, model)[1]
    if first_accuracy < accuracy_before:
        # First direction is worse: jump to w - move
        for param, undo in zip(model.parameters(), inverse):
            param.data.add_(undo.mul(2))

        second_accuracy = ut.test_train(trainloader, model)[1]

        if second_accuracy < accuracy_before and temperature == 0:
            # Both directions are worse: restore the original weights
            # (w - move) - (-move) == w
            for param, undo in zip(model.parameters(), inverse):
                param.data.sub_(undo)
            new_accuracy = accuracy_before
        else:
            new_accuracy = second_accuracy
    else:
        new_accuracy = first_accuracy

    print("New accuracy: ", new_accuracy)

    return new_accuracy

# Starting score of the freshly initialised network. It must be bound to the
# same name the loop reads and updates; otherwise the first iteration depends
# on whatever `accuracy` an earlier cell happened to leave behind.
accuracy = ut.test_train(train_loader, net)[1]
for epoch in range(1000):
    print("Epoch: ", epoch)
    accuracy = full_train_SA_temperature(train_loader, net, accuracy)
    print("Validation test:", ut.test(test_loader, net))

## Comparison with SGD 
Running the same network with the SGD algorithm to compare the results.

This function evaluates the accuracy on a single minibatch, given its inputs and labels.

In [5]:
def test_minibatch(inputs, labels, model):
    model.eval()
    correct = 0
    total = 0
    test_loss = 0

    outputs = model(Variable(inputs))
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()
    test_loss += float(F.cross_entropy(outputs, Variable(labels)).item())

    return correct / total

In [6]:
def train(trainloader, model, optimizer, criterion, gpu=True):
    """Run one pass over ``trainloader``, taking one optimizer step per batch.

    Args:
        trainloader: iterable yielding ``(inputs, labels)`` pairs.
        model: network to optimise; it is put in training mode.
        optimizer: optimizer whose ``zero_grad``/``step`` are called per batch.
        criterion: callable ``criterion(outputs, labels)`` returning the loss.
        gpu: when truthy, each batch is moved to CUDA before the forward pass.
    """
    model.train()

    for batch in trainloader:
        inputs, labels = batch

        if gpu:
            inputs = inputs.cuda()
            labels = labels.cuda()

        # Standard step: reset gradients, forward, backward, update
        optimizer.zero_grad()
        outputs = model(inputs)
        criterion(outputs, labels).backward()
        optimizer.step()

        # Drop the batch references before the next one is loaded
        del inputs, labels, outputs


In [None]:
# SGD baseline: same architecture, smaller minibatches, no momentum
train_loader, test_loader = ut.load_dataset(minibatch=512, dataset_name='mnist')
net = Net().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), momentum=0, lr=0.01)

# Epochs are numbered from 1, so this runs 19 passes over the training set
for epoch in range(1, 20):
    train(train_loader, net, optimizer, criterion, gpu=1)
    print("Epoch: ", epoch, "value:", ut.test(test_loader, net))

## Mixing SGD and SA
We have already seen that the cross entropy is a continuous approximation of the accuracy. Another approach can benefit from both methods: the network is first trained with SGD and then refined with SA.

In [7]:
# Phase 1: SGD with momentum on the cross-entropy loss
train_loader, test_loader = ut.load_dataset(minibatch=512, dataset_name='mnist')
net = Net().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), momentum=0.9, lr=0.01)

# Epochs are numbered from 1, so this runs 99 passes over the training set
for epoch in range(1, 100):
    train(train_loader, net, optimizer, criterion, gpu=1)
    # Score on the training set, then on the test set
    print(ut.test_train(train_loader, net))
    print("Epoch: ", epoch, "value:", ut.test(test_loader, net))

# Phase 2: refine the SGD solution with best-step simulated annealing,
# starting from the training score reached by SGD
accuracy = ut.test_train(train_loader, net)[1]
for epoch in range(1000):
    print("Epoch: ", epoch)
    accuracy = full_train_SA_best_step(trainloader=train_loader, model=net, accuracy_before=accuracy)
    print("Validation test:", ut.test(test_loader, net))
    



Epoch:  1 value: (0.0035228755354881286, 0.7898)
Epoch:  2 value: (0.003221932029724121, 0.8928)
Epoch:  3 value: (0.0031670210003852846, 0.9056)
Epoch:  4 value: (0.0031424849987030028, 0.9109)
Epoch:  5 value: (0.003127106690406799, 0.9163)
Epoch:  6 value: (0.0031161041021347045, 0.9194)
Epoch:  7 value: (0.003108021366596222, 0.9227)
Epoch:  8 value: (0.0031003273487091062, 0.926)
Epoch:  9 value: (0.003094895625114441, 0.9279)
Epoch:  10 value: (0.003089822828769684, 0.9287)
Epoch:  11 value: (0.003085683298110962, 0.9304)
Epoch:  12 value: (0.003081518316268921, 0.932)
Epoch:  13 value: (0.0030769414067268372, 0.934)
Epoch:  14 value: (0.0030738753199577333, 0.9354)
Epoch:  15 value: (0.0030705927491188048, 0.9361)
Epoch:  16 value: (0.0030673923015594484, 0.9376)
Epoch:  17 value: (0.003064748203754425, 0.9383)
Epoch:  18 value: (0.003061336863040924, 0.9392)
Epoch:  19 value: (0.003059269595146179, 0.9396)
Epoch:  20 value: (0.0030563995480537413, 0.942)
Epoch:  21 value: (0.00

NameError: name 'accuracy' is not defined