# Learning rate/momentum schedules trained from scratch
The default to investigate will be resnet50 with CIFAR10

Use a number of epochs just short of where we expect training to converge, so that we reach a high accuracy without each run taking too long. (Aim for 94% accuracy.)

As a compromise between a fully flexible model for learning rates (which won't work with evolutionary algorithms) and something inflexible like a single global learning rate, we choose two learning rates for each epoch and linearly interpolate between them during the epoch.

Try:
* Find the optimal learning rate given a common momentum initialization.
* Find optimal momentum given a sensible learning rate schedule.
* Try optimizing both at the same time.

Maybe:
* Look at how the learning rates evolve from a bad initialization to a sensible one (gif)
* Look at how the optimal learning rate schedule changes based on the momentum used.


**Be careful to reinitialize the PyTorch/fastai model each time, so we don't start fine-tuning an existing model.**

In [1]:
import os
import sys
import glob
import random
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

from fastai.imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *

from torchvision import datasets, models, transforms

fast_ai_dir = '/media/rene/Data/fastai/'
sys.path.append(fast_ai_dir)

# NOTE: only NumPy's global RNG is seeded here; this seeding may be causing an error (unconfirmed).
SEED = 101
np.random.seed(SEED)

%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Add the src directory for functions
src_dir = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'src')
print(src_dir)
sys.path.append(src_dir)

# import my functions:
from genetic import*

torch.cuda.set_device(0)
print(torch.cuda.is_available())
print(torch.cuda.current_device())

/media/rene/Data/learn-lr/src
True
0


In [2]:
# Dataset root; passed to ImageClassifierData.from_paths with val_name='test'
# inside get_darknet_perf.
PATH = "/media/rene/Data/data/cifar10/"

In [3]:
def conv_layer(ni, nf, ks=3, stride=1):
    """Build a Conv2d -> BatchNorm2d -> LeakyReLU unit.

    Args:
        ni: number of input channels.
        nf: number of output channels.
        ks: square kernel size; the padding is ``ks // 2``.
        stride: stride of the convolution.

    Returns:
        An ``nn.Sequential`` holding the three layers in that order. The
        convolution has no bias term, the batch norm uses momentum 0.01 and
        the activation is an in-place LeakyReLU with negative slope 0.1.
    """
    conv = nn.Conv2d(ni, nf, kernel_size=ks, bias=False, stride=stride, padding=ks // 2)
    norm = nn.BatchNorm2d(nf, momentum=0.01)
    act = nn.LeakyReLU(negative_slope=0.1, inplace=True)
    return nn.Sequential(conv, norm, act)

class ResLayer(nn.Module):
    """Residual block: a 1x1 conv_layer to ``ni // 2`` channels, a 3x3
    conv_layer back to ``ni`` channels, and a skip connection adding the input.
    """

    def __init__(self, ni):
        super().__init__()
        half = ni // 2
        self.conv1 = conv_layer(ni, half, ks=1)
        self.conv2 = conv_layer(half, ni, ks=3)

    def forward(self, x):
        """Return ``x`` plus the output of the two stacked conv layers."""
        residual = self.conv2(self.conv1(x))
        # Out-of-place add: the in-place ``x.add_`` variant was left disabled
        # in the original code.
        return x.add(residual)

class Darknet(nn.Module):
    """Darknet-style classifier assembled from conv_layer and ResLayer units.

    Args:
        num_blocks: iterable giving the number of ResLayers in each group.
        num_classes: size of the final linear layer's output.
        nf: channel count produced by the stem convolution; it doubles after
            every group.
    """

    def make_group_layer(self, ch_in, num_blocks, stride=1):
        """Return a list with one channel-doubling conv_layer followed by
        ``num_blocks`` ResLayers operating on ``ch_in * 2`` channels.
        """
        ch_out = ch_in * 2
        group = [conv_layer(ch_in, ch_out, stride=stride)]
        group.extend(ResLayer(ch_out) for _ in range(num_blocks))
        return group

    def __init__(self, num_blocks, num_classes, nf=32):
        super().__init__()
        stack = [conv_layer(3, nf, ks=3, stride=1)]
        for group_idx, n_res in enumerate(num_blocks):
            # Every group uses stride 2 except the second one (index 1),
            # which keeps stride 1.
            group_stride = 1 if group_idx == 1 else 2
            stack.extend(self.make_group_layer(nf, n_res, stride=group_stride))
            nf *= 2
        # Head: global average pool, flatten, then the linear classifier.
        stack.extend([nn.AdaptiveAvgPool2d(1), Flatten(), nn.Linear(nf, num_classes)])
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the full sequential stack."""
        return self.layers(x)

In [4]:
def phases_linear(lr_sch, mom_sch):
    """Convert flat schedules into a list of one-epoch TrainingPhase objects.

    The values at indices ``2k`` and ``2k + 1`` of each schedule form the
    (start, end) pair for phase ``k``; both the learning rate and the momentum
    use ``DecayType.LINEAR`` between their pair, with SGD as the optimizer.

    Args:
        lr_sch: flat sequence of learning-rate values, two per phase.
        mom_sch: flat sequence of momentum values, indexed in step with
            ``lr_sch``.

    Returns:
        A list with one TrainingPhase per pair of values in ``lr_sch``.
    """
    return [
        TrainingPhase(
            epochs=1,
            opt_fn=optim.SGD,
            lr=(lr_sch[start], lr_sch[start + 1]),
            lr_decay=DecayType.LINEAR,
            momentum=(mom_sch[start], mom_sch[start + 1]),
            momentum_decay=DecayType.LINEAR,
        )
        for start in range(0, len(lr_sch), 2)
    ]

In [5]:
def get_darknet_perf(PATH, lr_sch_list, mom_sch_list, downsample, acc_dict=None, bs=512):
    """Score every (learning-rate, momentum) schedule pair with a fresh Darknet.

    For each index, the two schedules are concatenated into a tuple that is
    used as the cache key. A cached accuracy is reused; otherwise a new model
    is built, trained with ``phases_linear`` on the data under ``PATH`` and its
    accuracy on the 'test' folder is stored in the cache.

    Args:
        PATH: dataset root passed to ``ImageClassifierData.from_paths``.
        lr_sch_list: list of learning-rate schedules (each a list of values).
        mom_sch_list: list of momentum schedules, indexed in step with
            ``lr_sch_list``.
        downsample: accepted for interface compatibility; not used here.
        acc_dict: optional cache mapping schedule tuples to accuracy. It is
            updated in place when given; a new dict is created when omitted.
        bs: batch size for the data loaders.

    Returns:
        ``(pop_perf, acc_dict)`` where ``pop_perf`` is a list of
        ``[accuracy, lr_schedule, mom_schedule]`` in input order.
    """
    # A literal ``{}`` default would be one dict shared by every call that
    # omits acc_dict, silently leaking results between unrelated runs.
    if acc_dict is None:
        acc_dict = {}

    sz = 32
    pop_perf = []

    for ind, lr_sch in enumerate(lr_sch_list):
        mom_sch = mom_sch_list[ind]
        key = tuple(lr_sch + mom_sch)

        if key not in acc_dict:
            # presumably the CIFAR-10 per-channel mean and std -- TODO confirm
            stats = (np.array([ 0.4914 ,  0.48216,  0.44653]), np.array([ 0.24703,  0.24349,  0.26159]))
            tfms = tfms_from_stats(stats, sz, aug_tfms=[RandomFlip()], pad=32//8)
            data = ImageClassifierData.from_paths(PATH, val_name='test', tfms=tfms, bs=bs)

            # Build a brand-new model per schedule so no run fine-tunes the
            # weights left behind by a previous one.
            m = Darknet([1, 2, 4, 6, 3], num_classes=10, nf=32)
            learn = ConvLearner.from_model_data(m, data)
            learn.crit = nn.CrossEntropyLoss()
            learn.metrics = [accuracy]

            learn.fit_opt_sched(phases_linear(lr_sch, mom_sch))
            preds, y = learn.predict_with_targs()
            acc_dict[key] = accuracy_np(preds, y)

        pop_perf.append([acc_dict[key], lr_sch, mom_sch])

    return pop_perf, acc_dict

In [6]:
def run_genetic_darknet(PATH, out_loc, generations, epochs, init_lr_sch, init_mom_sch, downsample=1, evolve_lr=True, evolve_mom=False):
    """Evolve learning-rate and/or momentum schedules with a genetic algorithm.

    Each generation scores the population with ``get_darknet_perf``, ranks it
    by accuracy (best first), appends the ranking to the history and pickles
    the history so far into ``out_loc``. The enabled schedule kinds are then
    bred with ``evolve(..., breed_slice)``. After the last generation the
    resulting population is scored once more, the top five entries are
    printed and the history is pickled a final time.

    Args:
        PATH: dataset root forwarded to ``get_darknet_perf``.
        out_loc: directory for the pickle files; created if it is missing.
        generations: number of evolution steps to run.
        epochs: only used in the final pickle's file name. Schedules are
            expected to hold two values (start and end) per epoch.
        init_lr_sch: initial population of learning-rate schedules.
        init_mom_sch: initial population of momentum schedules.
        downsample: only every ``downsample``-th value takes part in the
            evolution; each evolved value is then repeated ``downsample``
            times to restore the full length.
        evolve_lr: evolve the learning-rate schedules when true.
        evolve_mom: evolve the momentum schedules when true.

    Returns:
        None. Results are printed and written to ``out_loc``.
    """
    bs = 512

    lr_sch = init_lr_sch
    mom_sch = init_mom_sch

    # One ranked population ([accuracy, lr_schedule, mom_schedule] entries)
    # per generation.
    history = []
    # Schedule tuple -> accuracy, so unchanged schedules are not retrained.
    acc_dict = {}
    repeats = int(downsample)

    # Fail early on an unusable output directory instead of after a full
    # generation of training.
    os.makedirs(out_loc, exist_ok=True)

    for i in range(generations):
        print('Running generation: ', i)

        pop_perf, acc_dict = get_darknet_perf(PATH, lr_sch, mom_sch, downsample, acc_dict, bs)
        pop_perf = sorted(pop_perf, key=lambda x: x[0], reverse=True)
        history.append(pop_perf)

        # Save the intermediate result every generation.
        out_file = os.path.join(out_loc, 'cifar_darknet_'+'on_gen_'+str(i))
        with open(out_file, 'wb') as f:
            pickle.dump(history, f)

        # Print average accuracy, best accuracy, and best schedule.
        perf_only = [x[0] for x in pop_perf]
        avg = sum(perf_only)/len(perf_only)
        print('Avg acc: ', avg, 'best acc: ', pop_perf[0][0])
        print('LR Schedule: ',[ '%.5f' % elem for elem in pop_perf[0][1]])

        if evolve_lr:
            # Downsample for evolution, then upsample back to full length.
            lr_perf = [[x[0], x[1][::downsample]] for x in pop_perf]
            lr_sch = evolve(lr_perf, breed_slice)
            lr_sch = [np.repeat(np.array(x), repeats).tolist() for x in lr_sch]
        if evolve_mom:
            mom_perf = [[x[0], x[2][::downsample]] for x in pop_perf]
            mom_sch = evolve(mom_perf, breed_slice)
            mom_sch = [np.repeat(np.array(x), repeats).tolist() for x in mom_sch]

    # Score the final population. get_darknet_perf returns a
    # (pop_perf, acc_dict) pair, so it must be unpacked before sorting; the
    # cache and batch size are passed on exactly as inside the loop.
    pop_perf, acc_dict = get_darknet_perf(PATH, lr_sch, mom_sch, downsample, acc_dict, bs)
    pop_perf = sorted(pop_perf, key=lambda x: x[0], reverse=True)

    # Print out the top 5 networks.
    print('Final Results: ', pop_perf[:5])

    # Index of the last generation, computed directly so it is also defined
    # when generations == 0.
    last_gen = generations - 1
    # NOTE(review): the +29 offset is kept so file names stay unchanged; it
    # looks like a leftover from resuming an earlier run -- confirm or remove.
    out_file = os.path.join(out_loc, 'cifar_dark_'+str(generations)+'_numsch_'+str(epochs)+'_on_gen_'+str(last_gen+29))
    with open(out_file, 'wb') as f:
        pickle.dump(history, f)

In [None]:
# Rerun the search with a batch size of 512 instead of 2048; the large batch
# seemed like it might be less stable.

PATH = "/media/rene/Data/data/cifar10"
out_loc = '/media/rene/Data/data/learn-lr/output/cifar_dark_10epoch_ds1'
num_schedules = 12
epochs = 10
generations = 100
downsample = 1

# Each epoch needs two schedule points; the downsample factor thins them out.
size = int(2*epochs/downsample)

# Random learning-rate population, constant 0.9 momentum for every schedule.
init_lr_sch = create_population(num_schedules, size=size, rate_range=(-2.5, -1.5))
init_mom_sch = [[.9] * size for _ in range(num_schedules)]

# Repeat every point `downsample` times to get back to full-length schedules.
init_lr_sch = [np.repeat(np.array(sch), int(downsample)).tolist() for sch in init_lr_sch]
init_mom_sch = [np.repeat(np.array(sch), int(downsample)).tolist() for sch in init_mom_sch]

run_genetic_darknet(
    PATH,
    out_loc,
    generations,
    epochs,
    init_lr_sch,
    init_mom_sch,
    downsample=downsample,
    evolve_lr=True,
    evolve_mom=False,
)

Running generation:  0


HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

 18%|█▊        | 18/98 [00:09<00:43,  1.83it/s, loss=2.02]



epoch      trn_loss   val_loss   accuracy                 
    0      1.514411   2.578099   0.1       
    1      1.196001   3.016543   0.1                      
    2      1.1194     3.412578   0.1981                   
    3      0.93135    2.507309   0.3251                    
    4      0.740798   0.966175   0.6765                    
                                                           