# Hyperparameter Optimization with PSO

In [1]:
using Flux
using MLDatasets
using Random

In [2]:
train_x, train_y = CIFAR10.traindata(Float32, 1:50)
test_x, test_y = CIFAR10.testdata(Float32, 1:50);

In [3]:
train_y, test_y = Flux.onehotbatch(train_y, 0:9), Flux.onehotbatch(test_y, 0:9);

In [4]:
function loss_and_accuracy(udata, wdata, model)
    ndata = size(udata,4)
    ŵ = model(udata)
    loss = Flux.crossentropy(ŵ, wdata; agg=sum)
    accuracy = sum(Flux.onecold(ŵ) .== Flux.onecold(wdata)) / ndata
    return loss, accuracy
end

loss_and_accuracy (generic function with 1 method)

In [5]:
batch_size = 5
train_loader = Flux.Data.DataLoader((train_x, train_y), batchsize=batch_size, shuffle=true);

In [6]:
mutable struct Particle 
    position::Vector{Int}
    best_position::Vector{Int}
    best_accuracy::Float32
    velocity::Vector{Float32}
    idx::Int
end

In [11]:
function hyper_parameterized(particle)
    model_before_dense = Chain(
              Conv((particle.position[1],particle.position[1]), 3=>particle.position[2], relu, pad=SamePad()),
              Conv((particle.position[3],particle.position[3]), particle.position[2]=>particle.position[4], relu, pad=SamePad()),
              MaxPool((particle.position[5],particle.position[5])),
              Dropout(0.2),
              Conv((particle.position[6],particle.position[6]), particle.position[4]=>particle.position[7], relu, pad=SamePad()),
              Conv((particle.position[8],particle.position[8]), particle.position[7]=>particle.position[9], relu, pad=SamePad()),
              MaxPool((particle.position[10],particle.position[10])),
              Dropout(0.2),
              Conv((particle.position[11],particle.position[11]), particle.position[9]=>particle.position[12], relu, pad=SamePad()),
              Conv((particle.position[13],particle.position[13]), particle.position[12]=>particle.position[14], relu, pad=SamePad()),
              MaxPool((2,2)),
              Dropout(0.2),
              Flux.flatten)
    last_size = model_before_dense(rand(Float32, 32, 32, 3, 1)) |> size
    return Chain(
              Conv((particle.position[1],particle.position[1]), 3=>particle.position[2], relu, pad=SamePad()),
              Conv((particle.position[3],particle.position[3]), particle.position[2]=>particle.position[4], relu, pad=SamePad()),
              MaxPool((particle.position[5],particle.position[5])),
              Dropout(0.2),
              Conv((particle.position[6],particle.position[6]), particle.position[4]=>particle.position[7], relu, pad=SamePad()),
              Conv((particle.position[8],particle.position[8]), particle.position[7]=>particle.position[9], relu, pad=SamePad()),
              MaxPool((particle.position[10],particle.position[10])),
              Dropout(0.2),
              Conv((particle.position[11],particle.position[11]), particle.position[9]=>particle.position[12], relu, pad=SamePad()),
              Conv((particle.position[13],particle.position[13]), particle.position[12]=>particle.position[14], relu, pad=SamePad()),
              MaxPool((2,2)),
              Dropout(0.2),
              Flux.flatten,
              Dense(last_size[1], particle.position[15], relu),
              Dropout(0.2),
              Dense(particle.position[15],10),
              softmax)
end

hyper_parameterized (generic function with 1 method)

In [12]:
function train(model,train_loader,optimizer,train_x,train_y,test_x,test_y,model_name)
    test_loss = 0
    test_acc = 0
    α = 0.001 # <- stepsize; in the ML community, it is often denoted as a `learning rate η`
    #opt = optimizer(α) 
    opt = optimizer 
    K = 10
    println("Training $model_name architecture.")
    for k in 1:K
        for (u, w) in train_loader
            gs = gradient(() -> Flux.Losses.crossentropy(model(u), w), Flux.params(model)) # compute gradient
            Flux.Optimise.update!(opt, Flux.params(model), gs) # update parameters
        end
        test_loss, test_acc = loss_and_accuracy(test_x, test_y, model)

        #println("  train_loss = $train_loss, train_accuracy = $train_acc")
        #println("  test_loss = $test_loss, test_accuracy = $test_acc")
    end
    return test_loss, test_acc
end

train (generic function with 1 method)

In [24]:
function hyper_parameter_optimization(train_x, train_y, test_x, test_y, numparticles, ω, c1,c2)
    lo = [2,1,2,1,2,2,1,2,1,2,2,2,2,2,10]
    hi = [10,48,10,100,4,10,100,10,100,4,10,256,10,256,1000]
    particles = []
    best_accuracy = 0
    swarm_best_position = []
    # init particles - search space is 16 variables
        #1   #2  #3   #4  #5   #6   #7  #8   #9  #10  #11  #12 #13 #14 #15
    # [SF1, n1, SF2, n2, SP1, SF3, n3, SF4, n4, SP2, SF5, n5, SF6, n6, SD1]
    for i in 1:numparticles
        SF1 = rand(2:10)
        n1 = rand(3:48)
        SF2 = rand(2:10)
        n2 = rand(n1:100)
        SP1 = rand(2:4)
        SF3 = rand(2:10)
        n3 = rand(n2:100)
        SF4 = rand(2:10)
        n4 = rand(n3:100)
        SP2 = rand(2:4)
        SF5 = rand(2:10)
        n5 = rand(n4:256)
        SF6 = rand(2:10)
        n6 = rand(n5:256)
        SD1 = rand(10:1000)
        push!(particles, Particle([SF1,n1,SF2,n2,SP1,SF3,n3,SF4,n4,SP2,SF5,n5,SF6,n6,SD1],
                [SF1,n1,SF2,n2,SP1,SF3,n3,SF4,n4,SP2,SF5,n5,SF6,n6,SD1], 0, rand(Float32,15),i))
    end
    # find best model to work towards
    for p in particles
        m = hyper_parameterized(p)
        this_acc = 0
        for (u,w) in train_loader
            this_loss, this_acc = loss_and_accuracy(train_x,train_y, m)
        end
        p.best_accuracy = this_acc
        if p.best_accuracy > best_accuracy
            best_accuracy = p.best_accuracy
            swarm_best_position = p.position
        end
    end
    
    println("Particles initalized")
    
    for i in 1:100
        for p in particles
            # vector math for additional speed boost
            # ω - inertia weight, how much the previous velocity impacts the current position
            # c1 - how much the particle pays attention to its own best position
            # c2 - how much the particle pays attention to the swarm's best position
            r1 = rand((-1.0:1.0))
            r2 = rand((-1.0:1.0))
            p.velocity = (ω .* p.velocity) .+ r1 * c1 .* (p.best_position .- p.position) .+ r2 * c2 .* (swarm_best_position .- p.position)
            p.position = clamp.(trunc.(Int, p.position + p.velocity),1,1000)
            model = hyper_parameterized(p)
            this_loss, this_acc = train(model,train_loader,ADAM(0.01),train_x,train_y,test_x,test_y,"model $p.idx")
            println("Loss: $this_loss; Accuracy: $this_acc")
            
            if p.best_accuracy < this_acc
                p.best_accuracy = this_acc
                p.best_position = p.position
            end
                
            if best_accuracy < this_acc
                
                best_accuracy = this_acc
                swarm_best_position = p.position
                println("Changing accuracy to $best_accuracy")
            end
            
        end
    end
    
    return swarm_best_position
end

hyper_parameter_optimization (generic function with 1 method)

In [None]:
b_p = hyper_parameter_optimization(train_x, train_y,test_x, test_y, 4, 0.5, 0.3, 0.4)

Particles initalized
Training model Particle([5, 35, 4, 57, 3, 9, 88, 8, 97, 2, 4, 140, 5, 221, 850], [2, 36, 2, 50, 3, 9, 94, 8, 97, 3, 5, 118, 4, 224, 893], 0.08f0, Float32[3.322588, -0.17521325, 2.5703948, 7.243511, 0.88086027, 0.46931034, -5.1297398, 0.66945016, 0.89636767, -0.07103636, -0.40247476, 22.774555, 1.692674, -2.3450809, -42.96469], 1).idx architecture.
Loss : 653.63776; Accuracy: 0.18
Changing accuracy
0.18
Training model Particle([10, 35, 8, 68, 4, 9, 81, 9, 99, 2, 3, 174, 8, 217, 785], [10, 35, 8, 68, 4, 9, 81, 9, 99, 2, 3, 174, 8, 217, 785], 0.12f0, Float32[0.09185809, 0.23349273, 0.14723402, 0.15239114, 0.44463146, 0.18977511, 0.034646153, 0.15103954, 0.11905825, 0.454171, 0.38997853, 0.18910807, 0.43691856, 0.17298317, 0.28658605], 2).idx architecture.
