# Hyperparameter Optimization with PSO

In [2]:
using Flux
using MLDatasets
using Random
using LinearAlgebra

In [3]:
# Load a small slice of CIFAR-10 for quick experiments. The `1:100` argument is passed as
# the sample index range for both the training split and the test split, and `Float32`
# as the requested element type of the image arrays.
# NOTE(review): `traindata`/`testdata` look like the older MLDatasets accessor API —
# confirm against the pinned MLDatasets version before upgrading.
train_x, train_y = CIFAR10.traindata(Float32, 1:100)
test_x, test_y = CIFAR10.testdata(Float32, 1:100);

In [4]:
# Replace the label vectors with one-hot encodings over the ten class labels 0:9.
# Both globals are rebound in place, so this cell must not be re-run on its own
# without first re-running the data-loading cell.
train_y, test_y = Flux.onehotbatch(train_y, 0:9), Flux.onehotbatch(test_y, 0:9);

In [5]:
"""
    loss_and_accuracy(udata, wdata, model)

Run `model` on `udata` and score the predictions against the targets `wdata`.

Returns the tuple `(loss, accuracy)` where `loss` is the cross-entropy summed over all
samples (`agg=sum`) and `accuracy` is the fraction of samples whose `Flux.onecold`
prediction matches the `Flux.onecold` of the target. The sample count is taken from the
fourth dimension of `udata`.
"""
function loss_and_accuracy(udata, wdata, model)
    n_samples = size(udata, 4)
    predictions = model(udata)
    total_loss = Flux.crossentropy(predictions, wdata; agg=sum)
    n_correct = count(Flux.onecold(predictions) .== Flux.onecold(wdata))
    return total_loss, n_correct / n_samples
end

loss_and_accuracy (generic function with 1 method)

In [6]:
# Mini-batch iterator over the training arrays: batches of `batch_size` observations,
# with `shuffle=true` requested from the loader.
# NOTE: `train_loader` is also read as a global by `hyper_parameter_optimization`, so it
# must be defined before that function is called.
batch_size = 10
train_loader = Flux.Data.DataLoader((train_x, train_y), batchsize=batch_size, shuffle=true);

In [7]:
"""
    Particle

Mutable state of one swarm member used by `hyper_parameter_optimization`.
Constructed positionally as
`Particle(position, best_position, best_loss, velocity, idx)`.
"""
mutable struct Particle 
    # Current point in the search space; `hyper_parameterized` reads entries 1 through 15
    # as kernel sizes, channel counts, pooling windows and the dense-layer width.
    position::Vector{Int}
    # Position at which this particle recorded `best_loss`; attracts the particle in the
    # velocity update.
    best_position::Vector{Int}
    # Lowest loss recorded for this particle so far.
    best_loss::Float32
    # Per-dimension step added to `position` on every swarm update.
    velocity::Vector{Float32}
    # Creation index of the particle within the swarm; used to label training log output.
    idx::Int
end

In [8]:
"""
    hyper_parameterized(particle)

Build the CNN classifier described by `particle.position`.

The first 15 entries of `particle.position` are interpreted, in order, as

    [SF1, n1, SF2, n2, SP1, SF3, n3, SF4, n4, SP2, SF5, n5, SF6, n6, SD1]

where `SFk` is the (square) kernel size of the k-th convolution, `nk` its number of
output channels, `SP1`/`SP2` the windows of the first two max-pooling layers and `SD1`
the width of the hidden dense layer. The third pooling layer is fixed at 2x2 and every
dropout uses probability 0.2.

Returns a `Chain` that maps a 32x32x3xN image batch to a 10xN matrix of class
probabilities (the chain ends in `softmax`).
"""
function hyper_parameterized(particle)
    SF1, n1, SF2, n2, SP1, SF3, n3, SF4, n4, SP2, SF5, n5, SF6, n6, SD1 = particle.position

    # Square-kernel ReLU convolution padded with `SamePad()`, as used for every conv layer.
    samepad_conv(k, channels) = Conv((k, k), channels, relu, pad=SamePad())

    # Feature extractor, defined once so the probe below and the returned model can never
    # drift apart.
    features = (
        samepad_conv(SF1, 3 => n1),
        samepad_conv(SF2, n1 => n2),
        MaxPool((SP1, SP1)),
        Dropout(0.2),
        samepad_conv(SF3, n2 => n3),
        samepad_conv(SF4, n3 => n4),
        MaxPool((SP2, SP2)),
        Dropout(0.2),
        samepad_conv(SF5, n4 => n5),
        samepad_conv(SF6, n5 => n6),
        MaxPool((2, 2)),
        Dropout(0.2),
        Flux.flatten,
    )

    # The flattened feature length depends on the pooling windows and the last channel
    # count, so measure it by pushing a single dummy 32x32x3 image through the extractor.
    flat_size = size(Chain(features...)(zeros(Float32, 32, 32, 3, 1)), 1)

    return Chain(
        features...,
        Dense(flat_size, SD1, relu),
        Dropout(0.2),
        Dense(SD1, 10),
        softmax,
    )
end

hyper_parameterized (generic function with 1 method)

In [9]:
"""
    train(model, train_loader, train_x, train_y, test_x, test_y, model_name;
          epochs=3, learning_rate=0.001)

Train `model` in place with ADAM on the mini-batches produced by `train_loader`, then
evaluate it once on `(test_x, test_y)`.

# Arguments
- `model`: Flux model whose parameters are updated in place.
- `train_loader`: iterable yielding `(inputs, targets)` batches.
- `train_x`, `train_y`: accepted for interface compatibility; not read by this function.
- `test_x`, `test_y`: data passed to `loss_and_accuracy` after training.
- `model_name`: label interpolated into the progress message.

# Keywords
- `epochs`: number of passes over `train_loader` (default 3).
- `learning_rate`: ADAM step size (default 0.001).

# Returns
The `(loss, accuracy)` tuple from `loss_and_accuracy(test_x, test_y, model)`.
"""
function train(model, train_loader, train_x, train_y, test_x, test_y, model_name;
               epochs=3, learning_rate=0.001)
    println("Training $model_name architecture.")
    opt = ADAM(learning_rate)
    # Collect the trainable parameters once; the optimiser mutates these arrays in place,
    # so the same collection stays valid for every batch.
    ps = Flux.params(model)
    for _ in 1:epochs
        for (u, w) in train_loader
            gs = gradient(() -> Flux.Losses.crossentropy(model(u), w), ps)
            Flux.Optimise.update!(opt, ps, gs)
        end
    end
    # Only the post-training score is reported, so evaluate once instead of every epoch.
    return loss_and_accuracy(test_x, test_y, model)
end

train (generic function with 1 method)

In [14]:
"""
    _random_particle(idx)

Draw a random starting `Particle` with creation index `idx`.

The position holds 15 hyperparameters in the order

    [SF1, n1, SF2, n2, SP1, SF3, n3, SF4, n4, SP2, SF5, n5, SF6, n6, SD1]

Channel counts are sampled so that they never decrease from one convolution to the next.
The personal best starts as a copy of the position, the best loss as `Inf32`, and the
velocity as uniform random values in [0, 1).
"""
function _random_particle(idx)
    SF1 = rand(2:10)
    n1 = rand(3:48)
    SF2 = rand(2:10)
    n2 = rand(n1:100)
    SP1 = rand(2:4)
    SF3 = rand(2:10)
    n3 = rand(n2:100)
    SF4 = rand(2:10)
    n4 = rand(n3:256)
    SP2 = rand(2:4)
    SF5 = rand(2:10)
    n5 = rand(n4:256)
    SF6 = rand(2:10)
    n6 = rand(n5:256)
    SD1 = rand(10:1000)
    position = [SF1, n1, SF2, n2, SP1, SF3, n3, SF4, n4, SP2, SF5, n5, SF6, n6, SD1]
    return Particle(position, copy(position), Inf32, rand(Float32, length(position)), idx)
end

"""
    hyper_parameter_optimization(train_x, train_y, test_x, test_y, numparticles, ω, c1, c2;
                                 iterations=10, tol=0.1, loader=train_loader)

Search the 15-dimensional CNN hyperparameter space with particle swarm optimization and
return the best position found as a `Vector{Int}`.

The fitness of a position is the test loss returned by `train` for the model that
`hyper_parameterized` builds from it. Initial positions are scored with this same
fitness, so the initial pass also trains one model per particle.

# Arguments
- `train_x`, `train_y`, `test_x`, `test_y`: data forwarded to `train`.
- `numparticles`: swarm size.
- `ω`: inertia weight, how much the previous velocity carries over.
- `c1`: how strongly a particle is pulled towards its own best position.
- `c2`: how strongly a particle is pulled towards the swarm's best position.

# Keywords
- `iterations`: maximum number of swarm sweeps (default 10).
- `tol`: convergence threshold applied both to the distance between consecutive swarm
  bests and to the loss improvement between them (default 0.1).
- `loader`: mini-batch iterator handed to `train`; defaults to the global `train_loader`.
"""
function hyper_parameter_optimization(train_x, train_y, test_x, test_y, numparticles, ω, c1, c2;
                                      iterations=10, tol=0.1, loader=train_loader)
    # Per-dimension bounds, in the same order as the particle position:
    #     #1   #2  #3   #4  #5   #6   #7  #8   #9  #10  #11  #12 #13 #14 #15
    #   [SF1, n1, SF2, n2, SP1, SF3, n3, SF4, n4, SP2, SF5, n5, SF6, n6, SD1]
    lo = [2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 10]
    hi = [10, 48, 10, 100, 4, 10, 100, 10, 256, 4, 10, 256, 10, 256, 1000]

    # Single fitness function used for both the initial scoring and every later update,
    # so personal/swarm bests are always compared on the same quantity.
    evaluate(p) = train(hyper_parameterized(p), loader, train_x, train_y, test_x, test_y,
                        "model $(p.idx)")

    particles = [_random_particle(i) for i in 1:numparticles]

    # Score the initial swarm and pick the position to work towards.
    best_loss = Inf32
    swarm_best_position = Int[]
    for p in particles
        initial_loss, _ = evaluate(p)
        p.best_loss = initial_loss
        if initial_loss < best_loss
            best_loss = initial_loss
            # Copy so later in-place changes to the particle cannot alter the swarm best.
            swarm_best_position = copy(p.position)
        end
    end

    println("Particles initalized")

    for _ in 1:iterations
        for p in particles
            # Stochastic weights, uniform in [0, 1), for the cognitive and social pulls.
            r1 = rand(Float32)
            r2 = rand(Float32)
            p.velocity = ω .* p.velocity .+
                         (r1 * c1) .* (p.best_position .- p.position) .+
                         (r2 * c2) .* (swarm_best_position .- p.position)
            # Positions are integer-valued: round the real-valued move first, then clamp
            # each coordinate to its [lo, hi] range.
            p.position = clamp.(round.(Int, p.position .+ p.velocity), lo, hi)

            this_loss, this_acc = evaluate(p)
            println("Loss: $this_loss; Accuracy: $this_acc")

            this_loss < p.best_loss || continue
            p.best_loss = this_loss
            p.best_position = copy(p.position)

            this_loss < best_loss || continue
            previous_loss = best_loss
            previous_position = swarm_best_position
            best_loss = this_loss
            swarm_best_position = copy(p.position)
            println("Changing best loss to $best_loss")

            # Converged when the swarm best barely moved or barely improved.
            # `previous_loss - best_loss` is positive here, so it measures the improvement.
            if norm(swarm_best_position - previous_position) < tol ||
               previous_loss - best_loss < tol
                return swarm_best_position
            end
        end
    end
    return swarm_best_position
end

hyper_parameter_optimization (generic function with 1 method)

In [None]:
# Run the swarm search with 10 particles, inertia weight ω = 0.7 and attraction
# coefficients c1 = c2 = 2.05; `b_p` receives the best position returned by the search.
b_p = hyper_parameter_optimization(train_x, train_y,test_x, test_y, 10, 0.7, 2.05, 2.05)