# Evolutionary Algorithms for learning rate schedules

In [1]:
import os
import sys
import glob
import random
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt 
from matplotlib.pyplot import imshow
from IPython.display import display, HTML
from sklearn.metrics import accuracy_score
% matplotlib inline


import torch
import torch.nn as nn
from torch.autograd import Variable
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.sampler as sampler

# Import modules every time "you run code imported using %aimport
%load_ext autoreload
%autoreload 1

# Add the src directory for functions
src_dir = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'src')
print(src_dir)
sys.path.append(src_dir)

# import my functions:
%aimport functions
from functions import*

/home/rbbidart/learn-lr/src


In [2]:
# Set params:
base_data_dir = '/home/rbbidart/project/rbbidart/learn-lr/data'
cuda = 'True'
batch_size = 64
test_batch_size = 64
seed = 101
GPU_num = 0

torch.manual_seed(seed)

if cuda:
    torch.cuda.manual_seed(seed)    
kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}

# torch.cuda.set_device(GPU_num)
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

0
Tesla P100-PCIE-12GB


In [3]:
def MNIST_data_loaders(base_data_dir, train_samples_index, valid_samples_index, batch_size=64):
    """Make train, validation, test data loaders for MNIST dataset. Limited augmnetation, nothing fancy
    
    Arguments:
        train_samples_index: index of the train samples  ???(what is this index based off???)
        valid_samples_index: index of the valid samples
        batch_size: 
    """
    class ChunkSampler(sampler.Sampler):
        """Samples elements sequentially from some offset. 
        
        Argument:
            samples_index: index of desired samples
        """
        def __init__(self, samples_index):
            self.samples_index = samples_index

        def __iter__(self):
            return iter(self.samples_index)
    
        def __len__(self):
            return len(self.samples_index)

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(base_data_dir, train=True, download=True,
                       transform=transforms.Compose([
                           transforms.RandomRotation(15),
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, **kwargs,
        sampler=ChunkSampler(train_samples_index))

    valid_loader = torch.utils.data.DataLoader(
        datasets.MNIST(base_data_dir, train=True, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, **kwargs,
        sampler=ChunkSampler(valid_samples_index))

    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(base_data_dir, train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, shuffle=True, **kwargs)
    return(train_loader, valid_loader, test_loader)


class SmallNet(nn.Module):
    def __init__(self):
        super(SmallNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x)
    
    
    def get_lr_performance(self, optimizer, scheduler, train_loader, valid_loader, epochs, verbose=False):        
        """Return the validation after training for given epochs"""
        
        def get_valid_loss():
            # Now get the validation loss
            self.eval()
            valid_loss = 0
            correct = 0
            num_data=0
            for data, target in valid_loader:
                num_data+=len(target)
                if cuda:
                    data, target = data.cuda(), target.cuda()
                data, target = Variable(data, volatile=True), Variable(target)
                output = self(data)
                valid_loss += F.nll_loss(output, target, size_average=False).data[0] # sum up batch loss
                pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability
                correct += pred.eq(target.data.view_as(pred)).cpu().sum()
            valid_loss /= num_data
            valid_acc = 100. * correct / num_data
            return valid_loss, valid_acc    
        
        for epoch in range(1, epochs + 1):
            # Every epoch step the scheduler
            scheduler.step()
            self.train()
            for batch_idx, (data, target) in enumerate(train_loader):
                if cuda:
                    data, target = data.cuda(), target.cuda()
                data, target = Variable(data), Variable(target)
                optimizer.zero_grad()
                output = self(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                optimizer.step()
            # to deubg
            if verbose:
                valid_loss, valid_acc = get_valid_loss()
                print('train loss: ', loss.data.cpu().numpy()[0])
                print('valid_loss: ', valid_loss, 'valid_acc: ', valid_acc)
       
        # final validation loss
        valid_loss, valid_acc = get_valid_loss()
        return valid_loss, valid_acc
    
    def test(self):
        self.eval()
        test_loss = 0
        correct = 0
        for data, target in test_loader:
            if cuda:
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data, volatile=True), Variable(target)
            output = self(data)
            test_loss += F.nll_loss(output, target, size_average=False).data[0] # sum up batch loss
            pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability
            correct += pred.eq(target.data.view_as(pred)).cpu().sum()
        test_acc = 100. * correct / len(test_loader.dataset)
        test_loss /= len(test_loader.dataset)
        return test_loss, test_acc

## How do you properly add another type of LR scheduler?
https://github.com/pytorch/pytorch/blob/master/torch/optim/lr_scheduler.py

In [4]:
from torch.optim import Optimizer
from torch.optim import Optimizer


class _LRScheduler(object):
    def __init__(self, optimizer, last_epoch=-1):
        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer
        if last_epoch == -1:
            for group in optimizer.param_groups:
                group.setdefault('initial_lr', group['lr'])
        else:
            for i, group in enumerate(optimizer.param_groups):
                if 'initial_lr' not in group:
                    raise KeyError("param 'initial_lr' is not specified "
                                   "in param_groups[{}] when resuming an optimizer".format(i))
        self.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
        self.step(last_epoch + 1)
        self.last_epoch = last_epoch

    def get_lr(self):
        raise NotImplementedError

    def step(self, epoch=None):
        if epoch is None:
            epoch = self.last_epoch + 1
        self.last_epoch = epoch
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr
            
class FixedLR(_LRScheduler):
    """Learning rate for each epoch corresponds to a learning rate in the list. No decay.
    Args:
        optimizer (Optimizer): Wrapped optimizer.
        schedule (list): List of learning rates. Should be the same as nuber of epochs
        last_epoch (int): The index of last epoch. Default: -1.
    Example:
        >>> # Assuming 3 epochs, schedule = [0.05, 0.005, 0.0005]
        >>> # lr = 0.05     if epoch = 1
        >>> # lr = 0.005    if epoch = 2
        >>> # lr = 0.0005   if epoch = 3
        >>> scheduler = FixedLR(optimizer, schedule = [0.05, 0.005, 0.0005])
        >>> for epoch in range(3):
        >>>     scheduler.step()
        >>>     train(...)
        >>>     validate(...)
    """

    def __init__(self, optimizer, schedule, last_epoch=-1):
        if not type(schedule) == list:
            raise ValueError('Schedule should be a list of'
                             ' floats. Got {}', schedule)
        self.schedule = schedule
        super(FixedLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        return self.schedule

## Test the Network

In [5]:
# epochs = 5
# momentum = .5
# lr = .1
# schedule = np.repeat(lr, epochs).tolist()

# indexes = list(range(60000))
# random.shuffle(indexes)
# valid_frac = .2
# train_samples_index = indexes[int(valid_frac*len(indexes)):]
# valid_samples_index = indexes[0:int(valid_frac*len(indexes))]
# train_loader, valid_loader, test_loader = MNIST_data_loaders(base_data_dir, train_samples_index, valid_samples_index, batch_size=64)

# model = SmallNet()
# model.cuda()
    
# optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
# scheduler = FixedLR(optimizer, schedule)

# model.get_lr_performance(optimizer, scheduler, train_loader, valid_loader, epochs, verbose=True)
# acc, loss = model.test()
# print(loss, acc)

In [6]:
# epochs = 5
# momentum = .5
# lr = .001
# schedule = np.repeat(lr, epochs).tolist()

# indexes = list(range(60000))
# random.shuffle(indexes)
# valid_frac = .2
# train_samples_index = indexes[int(valid_frac*len(indexes)):]
# valid_samples_index = indexes[0:int(valid_frac*len(indexes))]
# train_loader, valid_loader, test_loader = MNIST_data_loaders(base_data_dir, train_samples_index, valid_samples_index, batch_size=64)

# model = SmallNet()
# model.cuda()
    
# optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
# scheduler = FixedLR(optimizer, schedule)

# model.get_lr_performance(optimizer, scheduler, train_loader, valid_loader, epochs, verbose=True)
# acc, loss = model.test()
# print(loss, acc)

In [7]:
import numpy as np
import random
from random import randint
def create_population(num_schedules, epochs=10):
    """ Create Learning rate schedules, called population 
    
    thanks: "https://blog.coast.ai/lets-evolve-a-neural-network-with-a-genetic-algorithm-code-included-8809bece164"
    Creates learning rate schedules by randomly sampling between 10e1 and 10e-6
    Generate a random float in [0, -6], take exp of this
    
    Args: num_schedules (int): number of random schedules to create
          epochs (int): Number of epochs for learning rate schedule
    Returns: learning rate schedule
    """
    pop = []
    for _ in range(0, num_schedules):
        # Create schedule
        exponents = np.random.uniform(-6, 0, epochs)
        lr_schedule = np.power(10, exponents).tolist()
        pop.append(lr_schedule)
    return pop

def breed(sch1, sch2, way='average', num_children=2):
    """Make two children as parts of their parents.
    
    Args:
        sch1 (list): lr_schedule
        sch2 (list): lr_schedule
    """
    children = []
    for _ in range(2):
        child = []

        # Loop through the parameters and pick params for the kid.
        for idx in range(len(sch1)):
            if(way=='random'):
                child.append(random.choice([sch1[idx], sch2[idx]]))
            elif(way=='mean'):
                child.append(np.mean(np.array([sch1[idx], sch2[idx]])))
        children.append(child)
    return children

def mutate(lr_schedule):
    """Randomly mutate one learning rate randomly
    
    Args:
        lr_schedule (list): lr schedule to mutate
    """
    # Choose a random key.
    idx = randint(0, len(lr_schedule))
    
    # Mutate one of the params. Add an amount within a factor of 10 of the original
    lr_schedule[idx] = lr_schedule[idx]*random.uniform(-10, 10)
    return lr_schedule 
    
def get_population_perf(population):
    """Evaluate all the schedules for epochs
    
    Args:
        population (list of lists): list schedules which are lists of length epochs
        
    Returns:
        pop_perf: list of tuples indicating the schedule and its' accuracy
    """
    
    # Create data loaders
    indexes = list(range(60000))
    random.shuffle(indexes)
    valid_frac = .2
    train_samples_index = indexes[int(valid_frac*len(indexes)):]
    valid_samples_index = indexes[0:int(valid_frac*len(indexes))]
    train_loader, valid_loader, test_loader = MNIST_data_loaders(base_data_dir, train_samples_index, valid_samples_index, batch_size=64)

    # Create network
    model = SmallNet()
    model.cuda()
    
    epochs = len(population[0])
    momentum = .5
    
    pop_perf = []
    for curr_schedule in population:
        lr = curr_schedule[0] # is this even used?
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
        scheduler = FixedLR(optimizer=optimizer, schedule=curr_schedule)
        loss, acc = model.get_lr_performance(optimizer, scheduler, train_loader, valid_loader, epochs, verbose=False)
        pop_perf.append((acc, curr_schedule))
        
    return pop_perf

In [8]:
def evolve(pop_perf):
    """Evolve a population of learning rates. 
    Args:
        pop (list): A list of learning rates
    
    Process:
    1. Tests schedules, and then keeps the top 25%, as well as 10% chance of keeping a poor schedule.
    2. Randomly mutate kept networks with 50% prob
    3. Fill the ramaining slots in population with children, created by randomly combining the parents 
        (50%/50% change of averaging a parameter, or randomly selecting one)
        
    """
    
    # Sort on the scores.
    pop = [x[1] for x in sorted(pop_perf, key=lambda x: x[0], reverse=True)]

    # keep the best 25%
    retain_length = int(len(pop)*.25)

    # The parents are every network we want to keep.
    parents = pop[:retain_length]

    # For those we aren't keeping, randomly keep some anyway.
    for individual in pop[retain_length:]:
        if .1 > random.random():
            parents.append(individual)

    # Randomly mutate some of the networks we're keeping.
    for index, individual in enumerate(parents):
        if random.random() > .5:
            parents[index] = mutate(parents[index])

    # Now find out how many spots we have left to fill. (how many children to make )
    parents_length = len(parents)
    desired_length = len(pop) - parents_length
    children = []

    # Add children, which are bred from two remaining networks.
    while len(children) < desired_length:

        # Get a random mom and dad.
        male = random.randint(0, parents_length-1)
        female = random.randint(0, parents_length-1)

        # Assuming they aren't the same network...
        if male != female:
            male = parents[male]
            female = parents[female]

            # pick breeding method:
            if random.random() > .5:
                way='mean'
            else:
                way = 'random'
                
            # Breed them.
            babies = breed(male, female, way, num_children=2)

            # Add the children one at a time.
            for baby in babies:
                # Don't grow larger than desired length.
                if len(children) < desired_length:
                    children.append(baby)
    parents.extend(children)
    return parents

def run_genetic(generations, num_schedules, epochs):
    """Run the genetic algorithm for a given number of generations.

    Save the results as .........
    Args:
        generations (int): Number of times to evole the population
        num_schedules (int): number of schedules to use each iteration
    """
    
    # initialize schedules:
    population = create_population(num_schedules=num_schedules, epochs=epochs)

    # Evolve the generation.
    for i in range(generations):
        print('Running generation: ', i)
        pop_perf = get_population_perf(population)
        pop_perf = [x for x in sorted(pop_perf, key=lambda x: x[0], reverse=True)]
        
        # print average accuracy, best accuracy, and best schedule
        perf_only = [x[0] for x in pop_perf]
        avg = sum(perf_only)/len(perf_only)
        print('Avg acc: ', avg, 'best acc: ', pop_perf[0][0])
        print('Schedule: ',[ '%.5f' % elem for elem in pop_perf[0][1]])

        # Evolve
        population = evolve(pop_perf, epochs)
    
    # get final accuracy, and print the top 5 sorted
    pop_perf = get_population_perf(population)
    pop_perf = [x for x in sorted(pop_perf, key=lambda x: x[0], reverse=True)]

    # Print out the top 5 networks.
    print('Final Results: ', pop_perf[:5])

In [9]:
run_genetic(generations=10, num_schedules=8, epochs=5)

Running generation:  0




Avg acc:  17.341666666666665 best acc:  48.875
Schedule:  ['0.00012', '0.00276', '0.00321', '0.03626', '0.00048']
Running generation:  1
Avg acc:  57.337500000000006 best acc:  81.24166666666666
Schedule:  ['0.00006', '0.00149', '0.01328', '0.01821', '0.00025']
Running generation:  2
Avg acc:  65.828125 best acc:  84.4
Schedule:  ['0.00006', '0.00149', '0.01328', '0.01821', '0.00025']
Running generation:  3
Avg acc:  54.68854166666667 best acc:  80.28333333333333
Schedule:  ['0.00006', '0.00149', '0.01328', '0.01821', '0.00025']
Running generation:  4
Avg acc:  59.87708333333333 best acc:  82.73333333333333
Schedule:  ['0.00006', '0.00149', '0.01328', '0.01821', '0.00025']
Running generation:  5
Avg acc:  56.728125 best acc:  81.04166666666667
Schedule:  ['0.00006', '0.00149', '0.01328', '0.01821', '0.00025']
Running generation:  6
Avg acc:  79.73541666666667 best acc:  89.175
Schedule:  ['0.00016', '-0.00082', '0.01328', '0.01821', '0.00025']
Running generation:  7
Avg acc:  76.038541