# Hyperparameter Optimization with PSO

In [11]:
using Flux
using MLDatasets
using Random
using LinearAlgebra

In [12]:
train_x, train_y = CIFAR10.traindata(Float32, 1:100)
test_x, test_y = CIFAR10.testdata(Float32, 1:100);

In [13]:
train_y, test_y = Flux.onehotbatch(train_y, 0:9), Flux.onehotbatch(test_y, 0:9);

In [14]:
function loss_and_accuracy(udata, wdata, model)
    ndata = size(udata,4)
    ŵ = model(udata)
    loss = Flux.crossentropy(ŵ, wdata; agg=sum)
    accuracy = sum(Flux.onecold(ŵ) .== Flux.onecold(wdata)) / ndata
    return loss, accuracy
end

loss_and_accuracy (generic function with 1 method)

In [15]:
batch_size = 10
train_loader = Flux.Data.DataLoader((train_x, train_y), batchsize=batch_size, shuffle=true);

In [16]:
mutable struct Particle 
    position::Vector{Int}
    best_position::Vector{Int}
    best_loss::Float32
    velocity::Vector{Float32}
    idx::Int
end

In [17]:
function hyper_parameterized(particle)
    model_before_dense = Chain(
              Conv((particle.position[1],particle.position[1]), 3=>particle.position[2], relu, pad=SamePad()),
              Conv((particle.position[3],particle.position[3]), particle.position[2]=>particle.position[4], 
              relu, pad=SamePad()),
              MaxPool((particle.position[5],particle.position[5])),
              Dropout(0.2),
              Conv((particle.position[6],particle.position[6]), particle.position[4]=>particle.position[7], 
              relu, pad=SamePad()),
              Conv((particle.position[8],particle.position[8]), particle.position[7]=>particle.position[9], 
              relu, pad=SamePad()),
              MaxPool((particle.position[10],particle.position[10])),
              Dropout(0.2),
              Conv((particle.position[11],particle.position[11]), particle.position[9]=>particle.position[12],
              relu, pad=SamePad()),
              Conv((particle.position[13],particle.position[13]), particle.position[12]=>particle.position[14],
              relu, pad=SamePad()),
              MaxPool((2,2)),
              Dropout(0.2),
              Flux.flatten)
    last_size = model_before_dense(rand(Float32, 32, 32, 3, 1)) |> size
    return Chain(
              Conv((particle.position[1],particle.position[1]), 3=>particle.position[2], relu, pad=SamePad()),
              Conv((particle.position[3],particle.position[3]), particle.position[2]=>particle.position[4],
              relu, pad=SamePad()),
              MaxPool((particle.position[5],particle.position[5])),
              Dropout(0.2),
              Conv((particle.position[6],particle.position[6]), particle.position[4]=>particle.position[7],
              relu, pad=SamePad()),
              Conv((particle.position[8],particle.position[8]), particle.position[7]=>particle.position[9],
              relu, pad=SamePad()),
              MaxPool((particle.position[10],particle.position[10])),
              Dropout(0.2),
              Conv((particle.position[11],particle.position[11]), particle.position[9]=>particle.position[12],
              relu, pad=SamePad()),
              Conv((particle.position[13],particle.position[13]), particle.position[12]=>particle.position[14],
              relu, pad=SamePad()),
              MaxPool((2,2)),
              Dropout(0.2),
              Flux.flatten,
              Dense(last_size[1], particle.position[15], relu),
              Dropout(0.2),
              Dense(particle.position[15],10),
              softmax)
end

hyper_parameterized (generic function with 1 method)

In [18]:
function train(model,train_loader,train_x,train_y,test_x,test_y,model_name)
    test_loss = 0
    test_acc = 0
    opt = ADAM(0.001) 
    K = 1
    println("Training $model_name architecture.")
    for k in 1:K
        for (u, w) in train_loader
            gs = gradient(() -> Flux.Losses.crossentropy(model(u), w), Flux.params(model)) # compute gradient
            Flux.Optimise.update!(opt, Flux.params(model), gs) # update parameters
        end
        test_loss, test_acc = loss_and_accuracy(test_x, test_y, model)
    end
    return test_loss, test_acc
end

train (generic function with 1 method)

In [19]:
function hyper_parameter_optimization(train_x, train_y, test_x, test_y, numparticles, ω, c1,c2)
    lo = [2,1,2,1,2,2,1,2,1,2,2,2,2,2,10]
    hi = [10,48,10,100,4,10,100,10,256,4,10,256,10,256,1000]
    particles = []
    best_loss = typemax(Int)
    best_loss_previous = typemax(Int)
    swarm_best_position = []
    swarm_best_previous = []
    # init particles - search space is 16 variables
        #1   #2  #3   #4  #5   #6   #7  #8   #9  #10  #11  #12 #13 #14 #15
    # [SF1, n1, SF2, n2, SP1, SF3, n3, SF4, n4, SP2, SF5, n5, SF6, n6, SD1]
    for i in 1:numparticles
        SF1 = rand(2:10)
        n1 = rand(3:48)
        SF2 = rand(2:10)
        n2 = rand(n1:100)
        SP1 = rand(2:4)
        SF3 = rand(2:10)
        n3 = rand(n2:100)
        SF4 = rand(2:10)
        n4 = rand(n3:256)
        SP2 = rand(2:4)
        SF5 = rand(2:10)
        n5 = rand(n4:256)
        SF6 = rand(2:10)
        n6 = rand(n5:256)
        SD1 = rand(10:1000)
        push!(particles, Particle([SF1,n1,SF2,n2,SP1,SF3,n3,SF4,n4,SP2,SF5,n5,SF6,n6,SD1],
                [SF1,n1,SF2,n2,SP1,SF3,n3,SF4,n4,SP2,SF5,n5,SF6,n6,SD1], 0, rand(Float32,15),i))
    end
    # find best model to work towards
    for p in particles
        m = hyper_parameterized(p)
        this_loss = 0
        for (u,w) in train_loader
            this_loss, this_acc = loss_and_accuracy(train_x,train_y, m)
        end
        p.best_loss = this_loss
        if p.best_loss < best_loss
            best_loss = p.best_loss
            best_loss_previous = best_loss
            swarm_best_position = p.position 
            swarm_best_previous = p.position
        end
    end
    
    println("Particles initalized")
    
    for i in 1:10
        for p in particles
            # vector math for additional speed boost
            # ω - inertia weight, how much the previous velocity impacts the current position
            # c1 - how much the particle pays attention to its own best position
            # c2 - how much the particle pays attention to the swarm's best position
            r1 = rand((0.0:1.0))
            r2 = rand((0.0:1.0))
            p.velocity = (ω .* p.velocity) .+ r1 * c1 .* (p.best_position .- p.position) .+ r2 * c2 .* (swarm_best_position .- p.position)
            p.position = round.(Int,p.position + p.velocity)
            # ensure each position is between its hi and low
            for (i,param) in enumerate(p.position)
                p.position[i] = clamp(param, lo[i], hi[i])
            end
            model = hyper_parameterized(p)
            this_loss, this_acc = train(model,train_loader,train_x,train_y,test_x,test_y,"model $p.idx")
            println("Loss: $this_loss; Accuracy: $this_acc")
            if p.best_loss > this_loss
                p.best_loss = this_loss
                p.best_position = p.position
                if best_loss > this_loss
                    best_loss_previous = best_loss
                    swarm_best_previous = swarm_best_position
                    best_loss = this_loss
                    swarm_best_position = p.position
                    println("Changing loss to $best_loss")
                    if norm(swarm_best_position - swarm_best_previous) < 0.1
                        return swarm_best_position
                    end
                    if best_loss_previous - best_loss < 0.01
                        return swarm_best_position
                    end
                end
            end
        end
    end
    return swarm_best_position
end

hyper_parameter_optimization (generic function with 1 method)

In [20]:
b_p = hyper_parameter_optimization(train_x, train_y,test_x, test_y, 20, 0.7, 2.05, 2.05)

Particles initalized
Training model Particle([8, 33, 4, 75, 3, 5, 80, 2, 137, 4, 3, 203, 5, 203, 126], [8, 33, 3, 74, 3, 5, 80, 2, 137, 4, 3, 203, 5, 203, 126], 230.12793f0, Float32[0.48987728, 0.22725996, 0.55913395, 0.51881176, 0.14660399, 0.3447053, 0.08895339, 0.10942507, 0.21142632, 0.6682682, 0.1428383, 0.36139002, 0.16933616, 0.16572776, 0.086658105], 1).idx architecture.
Loss: 231.61865; Accuracy: 0.06
Training model Particle([2, 1, 3, 100, 2, 7, 62, 10, 48, 2, 2, 60, 10, 123, 324], [9, 28, 9, 32, 4, 3, 98, 4, 240, 4, 10, 247, 2, 247, 849], 230.23412f0, Float32[-9.983774, -34.749645, -5.8906016, 86.794876, -3.4001224, 4.13949, -36.487823, 10.903911, -192.30624, -1.67127, -16.375069, -186.50935, 14.901881, -124.38314, -524.6235], 2).idx architecture.
Loss: 230.69138; Accuracy: 0.12
Training model Particle([4, 8, 8, 82, 2, 9, 89, 5, 93, 2, 2, 215, 10, 227, 764], [4, 8, 8, 82, 2, 8, 89, 4, 93, 2, 2, 214, 10, 227, 764], 230.64026f0, Float32[0.20496464, 0.4871254, 0.3611396, 0.48717

Loss: 231.41985; Accuracy: 0.06
Training model Particle([9, 32, 10, 21, 4, 2, 100, 5, 256, 4, 7, 256, 4, 256, 1000], [9, 28, 9, 32, 4, 3, 98, 4, 240, 4, 10, 247, 2, 247, 849], 230.23412f0, Float32[7.361358, 31.025248, 8.1765785, -78.643585, 1.7199143, -5.3023567, 48.258522, -4.6672626, 258.98563, 2.930111, 4.937452, 252.79346, -5.9686832, 167.1318, 709.01355], 2).idx architecture.
Loss: 230.8057; Accuracy: 0.06
Training model Particle([2, 1, 10, 47, 2, 2, 48, 10, 140, 2, 2, 37, 10, 108, 906], [4, 8, 8, 82, 2, 8, 89, 4, 93, 2, 2, 214, 10, 227, 764], 230.64026f0, Float32[-3.9565248, -14.009012, 2.3027978, -34.508976, 0.19377494, -16.02405, -40.889767, 4.553831, 47.477432, 0.11513202, 0.121432915, -177.98448, 0.0464881, -118.758675, 141.56403], 3).idx architecture.
Loss: 230.87689; Accuracy: 0.06
Training model Particle([2, 4, 6, 60, 2, 10, 49, 10, 1, 4, 2, 2, 10, 43, 10], [10, 14, 6, 80, 4, 2, 93, 7, 227, 2, 6, 239, 4, 244, 871], 230.49797f0, Float32[-8.32343, -3.9137213, 0.012161844, -8

Loss: 232.32248; Accuracy: 0.08
Training model Particle([2, 1, 6, 54, 2, 10, 36, 10, 1, 4, 2, 2, 10, 2, 10], [2, 4, 6, 60, 2, 10, 49, 10, 1, 4, 2, 2, 10, 43, 10], 230.34276f0, Float32[-5.826401, -2.739605, 0.008513291, -5.783968, -1.9936975, 3.166422, -12.757192, 2.0765579, -81.11768, 1.2265638, -3.8952997, -83.26543, 5.3206196, -57.98699, -278.9135], 4).idx architecture.
Loss: 230.36398; Accuracy: 0.06
Training model Particle([3, 6, 5, 100, 3, 3, 98, 10, 256, 3, 5, 256, 7, 256, 191], [4, 14, 7, 75, 4, 3, 75, 9, 227, 4, 7, 243, 5, 245, 635], 230.27213f0, Float32[1.3131136, 5.3749027, -5.095605, 56.90792, 1.4790748, 0.6552086, 38.029076, 4.857275, 537.23206, 1.4711256, 3.116774, 562.1467, -2.879788, 374.76257, -809.0526], 5).idx architecture.
Loss: 230.57484; Accuracy: 0.06
Training model Particle([2, 1, 10, 100, 2, 2, 100, 10, 256, 2, 2, 2, 10, 53, 1000], [7, 44, 5, 50, 2, 5, 52, 3, 55, 3, 7, 205, 9, 216, 575], 230.32668f0, Float32[-9.738533, -64.29428, 5.9409637, 21.653324, 0.05888197

Loss: 231.4385; Accuracy: 0.11
Training model Particle([2, 1, 10, 100, 2, 2, 100, 10, 256, 2, 2, 2, 10, 6, 1000], [7, 44, 5, 50, 2, 5, 52, 3, 55, 3, 7, 205, 9, 216, 575], 230.32668f0, Float32[-6.816973, -45.005997, 4.1586747, 15.157327, 0.041217383, -2.9750774, 17.141747, 5.1048737, 61.401165, -2.7898097, -5.0201035, -76.235306, 1.0792885, -47.07994, 259.19272], 6).idx architecture.
Loss: 230.7808; Accuracy: 0.08
Training model Particle([2, 1, 8, 49, 2, 2, 57, 6, 103, 2, 2, 54, 10, 136, 1000], [4, 10, 10, 81, 3, 2, 81, 10, 129, 2, 5, 201, 9, 201, 83], 230.07841f0, Float32[-3.9336178, -17.809288, -1.8514107, -32.196632, -2.023445, 0.019196188, -23.9409, -3.885426, -26.010067, 0.050733306, -6.090938, -146.95906, 2.0788236, -65.441574, 1537.652], 7).idx architecture.
Loss: 230.26245; Accuracy: 0.16
Training model Particle([2, 1, 3, 100, 2, 2, 79, 6, 1, 2, 2, 2, 10, 2, 1000], [8, 6, 9, 6, 4, 2, 65, 9, 200, 2, 7, 248, 3, 254, 736], 230.25975f0, Float32[-8.492813, -7.1410546, -2.6858475, 82.

Loss: 230.77852; Accuracy: 0.06
Training model Particle([2, 1, 2, 100, 2, 2, 83, 5, 1, 2, 2, 2, 10, 2, 1000], [8, 6, 9, 6, 4, 2, 65, 9, 200, 2, 7, 248, 3, 254, 736], 230.25975f0, Float32[-5.944969, -4.9987383, -1.8800932, 57.410957, -1.8463964, 0.0014789809, 4.0222597, -0.9394578, -84.31694, 0.11021349, -6.8687267, -119.456505, 7.075684, -87.26578, 97.49868], 8).idx architecture.
Loss: 230.69975; Accuracy: 0.06
Training model Particle([5, 5, 10, 100, 2, 4, 79, 8, 71, 4, 3, 148, 9, 191, 950], [7, 17, 10, 37, 3, 4, 96, 8, 150, 4, 6, 233, 7, 242, 508], 230.27014f0, Float32[0.21595958, -8.084816, -0.11480868, 84.31691, -1.5733129, 0.95069146, -21.386658, 0.09825302, -103.12311, 0.91932434, -2.2730422, -107.71123, 0.83636206, -65.06918, 491.74533], 9).idx architecture.
Loss: 231.0652; Accuracy: 0.06
Training model Particle([2, 1, 10, 53, 2, 2, 66, 10, 144, 2, 2, 83, 10, 132, 922], [2, 1, 10, 100, 2, 2, 87, 10, 127, 3, 2, 256, 9, 256, 749], 230.44376f0, Float32[-5.67019, -25.867828, 1.536623

Loss: 230.62454; Accuracy: 0.06
Training model Particle([2, 1, 10, 100, 2, 2, 100, 10, 200, 3, 2, 256, 10, 256, 691], [2, 1, 10, 100, 2, 2, 87, 10, 127, 3, 2, 256, 9, 256, 749], 230.44376f0, Float32[-3.969133, -18.10748, 1.0756364, 111.463196, -0.49894875, -1.1721804, 60.693684, -0.24411513, 56.096146, 1.2744547, -0.46346834, 407.6414, 0.07086046, 276.69574, -231.3526], 10).idx architecture.
Loss: 231.65933; Accuracy: 0.06
Training model Particle([2, 1, 10, 50, 2, 2, 46, 8, 119, 2, 2, 10, 10, 86, 1000], [10, 44, 6, 71, 3, 10, 87, 2, 114, 4, 5, 194, 7, 213, 19], 230.2248f0, Float32[-12.230724, -77.121796, 9.307219, -22.878849, -3.028553, -12.267069, -53.570187, 5.8311663, 7.436803, -3.0751073, -8.689142, -246.05156, 8.73954, -169.86534, 997.90436], 11).idx architecture.
Loss: 232.5801; Accuracy: 0.06
Training model Particle([2, 1, 9, 81, 2, 2, 70, 10, 118, 2, 2, 135, 10, 145, 819], [10, 20, 8, 28, 3, 6, 64, 2, 110, 4, 3, 112, 7, 243, 874], 230.2443f0, Float32[-11.370061, -27.20039, -0.5

Loss: 233.13815; Accuracy: 0.06
Training model Particle([10, 21, 7, 1, 3, 6, 54, 2, 98, 4, 3, 79, 7, 256, 952], [10, 20, 8, 28, 3, 6, 64, 2, 110, 4, 3, 112, 7, 243, 874], 230.2443f0, Float32[8.440957, 19.909727, -2.4218752, -121.68549, 1.0985162, 4.2216635, -15.601852, -13.23281, -20.138214, 2.0980413, 1.0812361, -55.85783, -3.0932665, 238.55562, 133.2801], 12).idx architecture.
Loss: 231.13795; Accuracy: 0.06
Training model Particle([10, 48, 10, 75, 4, 3, 81, 8, 133, 4, 4, 239, 2, 256, 1000], [9, 47, 7, 87, 4, 3, 89, 5, 192, 4, 4, 224, 2, 255, 977], 230.31912f0, Float32[0.51633275, 26.907316, 3.594589, -24.644606, 1.3725513, -1.332598, -18.76044, 6.3661194, -98.997574, 1.3268105, -1.9519024, -16.678967, -5.26534, 83.548615, 65.128716], 13).idx architecture.
Loss: 231.10068; Accuracy: 0.06
Training model Particle([2, 1, 9, 78, 2, 2, 48, 10, 13, 2, 2, 64, 10, 90, 832], [7, 22, 3, 48, 3, 10, 93, 4, 233, 3, 10, 248, 5, 248, 877], 230.25237f0, Float32[-17.074713, -69.30824, 6.734283, 28.37

Loss: 230.4905; Accuracy: 0.11
Training model Particle([2, 1, 10, 71, 2, 2, 63, 10, 54, 2, 2, 63, 10, 144, 817], [7, 22, 3, 48, 3, 10, 93, 4, 233, 3, 10, 248, 5, 248, 877], 230.25237f0, Float32[-11.952299, -48.51577, 4.713998, -6.7840796, -3.5029023, -5.351222, 14.524546, 3.9110646, 40.700886, -3.5008678, -5.354971, -1.3275421, 11.986386, 54.01873, -14.999927], 14).idx architecture.
Loss: 231.69507; Accuracy: 0.06
Training model Particle([2, 1, 6, 1, 2, 2, 44, 10, 50, 2, 2, 2, 10, 249, 1000], [7, 14, 8, 63, 3, 3, 97, 2, 160, 2, 10, 235, 2, 255, 602], 230.28036f0, Float32[-4.144595, -13.408216, -1.6527641, -34.33189, -0.4728545, -2.5941727, -22.9254, 6.361057, -45.62784, -1.1848342, -10.063481, -117.71297, 9.961267, -2.9380229, 372.60278], 15).idx architecture.
Loss: 232.83676; Accuracy: 0.06
Training model Particle([2, 1, 10, 72, 2, 2, 67, 9, 117, 2, 2, 63, 10, 173, 566], [6, 18, 8, 64, 4, 9, 72, 8, 122, 3, 8, 170, 9, 171, 197], 230.26302f0, Float32[-6.6004124, -33.944847, 3.582705, -0

Loss: 235.09282; Accuracy: 0.06
Training model Particle([6, 12, 8, 55, 4, 10, 73, 7, 114, 2, 8, 216, 10, 164, 199], [6, 18, 8, 64, 4, 9, 72, 8, 122, 3, 8, 170, 9, 171, 197], 230.26302f0, Float32[3.5797114, 11.088607, -1.5921065, -16.688686, 2.8927727, 8.443937, 5.5784345, -2.468172, -2.5482183, -0.054590393, 5.6612053, 152.79118, 0.060514927, -8.52564, -366.9501], 16).idx architecture.
Loss: 230.44623; Accuracy: 0.11
Training model Particle([8, 24, 2, 49, 3, 8, 100, 2, 256, 4, 9, 256, 7, 256, 1000], [8, 23, 3, 61, 3, 8, 94, 5, 245, 4, 9, 247, 7, 254, 936], 230.22604f0, Float32[6.36746, 23.029396, -11.785618, -18.09472, 1.0682554, 6.3595424, 99.00762, -14.178686, 562.31525, 2.1131227, 7.337971, 461.21042, -3.1217365, 334.1444, 404.8275], 17).idx architecture.
Loss: 230.97896; Accuracy: 0.06
Training model Particle([2, 48, 4, 100, 4, 10, 100, 2, 119, 4, 10, 256, 8, 256, 10], [2, 28, 7, 96, 3, 9, 100, 5, 117, 3, 9, 201, 9, 216, 343], 230.25664f0, Float32[0.027882561, 53.324253, -4.7491045

Loss: 233.61794; Accuracy: 0.06
Training model Particle([2, 48, 2, 100, 4, 10, 100, 2, 121, 4, 10, 256, 7, 256, 10], [2, 28, 7, 96, 3, 9, 100, 5, 117, 3, 9, 201, 9, 216, 343], 230.25664f0, Float32[0.019517792, 37.326977, -3.3243732, 25.795664, 1.2632687, 5.369075, 23.0305, -3.151546, 1.5118864, 1.2625374, 5.154017, 52.339973, -1.2575376, 32.64156, -612.0788], 18).idx architecture.
Loss: 234.3531; Accuracy: 0.11
Training model Particle([2, 1, 10, 1, 2, 2, 1, 7, 1, 2, 2, 2, 10, 2, 1000], [4, 11, 6, 74, 2, 5, 80, 9, 146, 3, 2, 156, 9, 186, 593], 230.076f0, Float32[-15.578364, -55.297016, 20.656555, -116.83, -0.093391396, -14.536813, -101.88231, -1.8030692, -493.987, -5.737888, 0.013837415, -448.07748, 2.3144782, -300.57687, 2610.2532], 19).idx architecture.
Loss: 230.32901; Accuracy: 0.08
Training model Particle([2, 1, 9, 100, 2, 2, 97, 8, 1, 2, 2, 150, 10, 256, 841], [2, 1, 9, 65, 2, 2, 69, 8, 116, 2, 2, 129, 10, 169, 833], 229.2433f0, Float32[-0.14826487, -2.9512684, 1.9092578, 9.11105,

15-element Vector{Int64}:
   2
   1
   9
  65
   2
   2
  69
   8
 116
   2
   2
 129
  10
 169
 833