In [3]:
using Flux
using MLDatasets
using Random

In [4]:
# Load CIFAR-10 as Float32 arrays, restricted to the index range 1:5000 for both the
# training split and the test split. Each call returns an (images, labels) pair.
train_x, train_y = CIFAR10.traindata(Float32, 1:5000)
test_x, test_y = CIFAR10.testdata(Float32, 1:5000);

In [5]:
println("Size of each image: ", size(train_x))
println("Label of 50th training datapoint: ", train_y[50])
# train_x is a 4D array: 32x32 pixels x 3 color channels x 5000 samples, so each individual
# training point is a 3D array (a 32x32 image with 3 color channels). Note that the first
# println reports the size of the whole array, not the size of a single image.

Size of each image: (32, 32, 3, 5000)
Label of 50th training datapoint: 0


In [6]:
# CIFAR-10 is a multi-class classification problem with 10 classes (labels 0 through 9),
# so, just like for MNIST, the integer labels are replaced by their one-hot encoding:
# one row per class, one column per sample.

train_y = Flux.onehotbatch(train_y, 0:9)
test_y = Flux.onehotbatch(test_y, 0:9)
nclasses = size(train_y, 1)
println("number of classes: ", nclasses)

number of classes: 10


In [7]:
# VGG-style CNN with 3 convolutional blocks. Each block is two 3x3 convolutions with ReLU
# and "same" padding, a 2x2 max-pool, and dropout (p = 0.2). The blocks are followed by a
# two-layer dense classifier whose 10 outputs are turned into probabilities by softmax.
# NOTE(review): to_model rebuilds this exact architecture from a flat parameter vector
# using hard-coded layer sizes, so any change made here must be mirrored there.
model_VGG3 = Chain(
              # Block 1: 3 input channels -> 32 feature maps
              Conv((3,3), 3=>32, relu, pad=SamePad()),
              Conv((3,3), 32=>32, relu, pad=SamePad()),
              MaxPool((2,2)),
              Dropout(0.2),
              # Block 2: 32 -> 64 feature maps
              Conv((3,3), 32=>64, relu, pad=SamePad()),
              Conv((3,3), 64=>64, relu, pad=SamePad()),
              MaxPool((2,2)),
              Dropout(0.2),
              # Block 3: 64 -> 128 -> 256 feature maps (unlike blocks 1 and 2, the second
              # convolution widens the channel count again)
              Conv((3,3), 64=>128, relu, pad=SamePad()),
              Conv((3,3), 128=>256, relu, pad=SamePad()),
              MaxPool((2,2)),
              Dropout(0.2),
              # Classifier head. 4096 = 4 * 4 * 256, which presumably comes from the 32x32
              # input being halved by each of the three 2x2 pools (32 -> 16 -> 8 -> 4).
              Flux.flatten,
              Dense(4096,128,relu),
              Dropout(0.2),
              Dense(128,10),
              softmax)

Chain(
  Conv((3, 3), 3 => 32, relu, pad=1),   [90m# 896 parameters[39m
  Conv((3, 3), 32 => 32, relu, pad=1),  [90m# 9_248 parameters[39m
  MaxPool((2, 2)),
  Dropout(0.2),
  Conv((3, 3), 32 => 64, relu, pad=1),  [90m# 18_496 parameters[39m
  Conv((3, 3), 64 => 64, relu, pad=1),  [90m# 36_928 parameters[39m
  MaxPool((2, 2)),
  Dropout(0.2),
  Conv((3, 3), 64 => 128, relu, pad=1),  [90m# 73_856 parameters[39m
  Conv((3, 3), 128 => 256, relu, pad=1),  [90m# 295_168 parameters[39m
  MaxPool((2, 2)),
  Dropout(0.2),
  Flux.flatten,
  Dense(4096, 128, relu),               [90m# 524_416 parameters[39m
  Dropout(0.2),
  Dense(128, 10),                       [90m# 1_290 parameters[39m
  NNlib.softmax,
)[90m                   # Total: 16 arrays, [39m960_298 parameters, 3.666 MiB.

In [8]:
# Sanity check: run the untrained model on the whole training array. The result has one
# column per sample and one row per class (softmax output).
model_VGG3(train_x)

10×5000 Matrix{Float32}:
 0.09785    0.0975845  0.0954044  …  0.0971324  0.0974197  0.0982076
 0.102264   0.102372   0.101639      0.102131   0.101265   0.102551
 0.0970284  0.0972183  0.0961078     0.0962253  0.0961777  0.0975096
 0.0970968  0.0975925  0.0989338     0.0982039  0.0983154  0.0993957
 0.103986   0.104447   0.104582      0.103529   0.104394   0.102354
 0.09727    0.0957423  0.0976759  …  0.0979593  0.0982497  0.0985923
 0.103954   0.106033   0.106241      0.104876   0.105206   0.102793
 0.100826   0.100111   0.0985247     0.0987964  0.0983051  0.0976457
 0.0992934  0.0988542  0.100623      0.100458   0.10077    0.0992553
 0.100432   0.100044   0.100269      0.100689   0.0998976  0.101695

In [8]:
# Evaluate `model` on the inputs `udata` against the one-hot targets `wdata`.
#
# Returns the tuple (loss, accuracy) where
#   * loss     is the cross-entropy between the model output and `wdata`, summed
#              (agg=sum) rather than averaged;
#   * accuracy is the fraction of samples whose most likely predicted class equals the
#              class encoded in `wdata`. The sample count is read from the 4th dimension
#              of `udata`.
function loss_and_accuracy(udata, wdata, model)
    predictions = model(udata)

    total_loss = Flux.crossentropy(predictions, wdata; agg=sum)

    # onecold maps each column back to a class index, so equal indices are correct hits
    hits = count(Flux.onecold(predictions) .== Flux.onecold(wdata))

    return total_loss, hits / size(udata, 4)
end

loss_and_accuracy (generic function with 1 method)

In [None]:
# Iterate over the training set in mini-batches of `batch_size` (images, one-hot labels)
# pairs; shuffle=true asks the loader to randomise the sample order.
batch_size = 8
train_loader = Flux.Data.DataLoader((train_x, train_y), batchsize=batch_size, shuffle=true);

In [None]:
# Train `model` with gradient-based optimisation and record metrics after every epoch.
#
# Arguments
#   model        - Flux model; its parameters are updated in place.
#   train_loader - iterable yielding (inputs, one-hot targets) mini-batches.
#   optimizer    - an already constructed Flux optimiser (e.g. ADAM(0.001)); the learning
#                  rate is whatever the optimiser was built with.
#   train_x, train_y, test_x, test_y
#                - full data sets used only for the end-of-epoch evaluation.
#   model_name   - label used in the progress messages.
#   epochs       - (keyword, default 1) number of passes over `train_loader`.
#
# Returns (train_losses, train_accuracy, test_losses, test_accuracy), each holding one
# entry per epoch. Losses are the summed cross-entropy reported by loss_and_accuracy.
function train(model,train_loader,optimizer,train_x,train_y,test_x,test_y,model_name; epochs=1)
    train_losses = []
    train_accuracy = []
    test_losses = []
    test_accuracy = []

    # Collect the trainable parameters once instead of rebuilding them for every batch.
    ps = Flux.params(model)

    for k in 1:epochs
        for (u, w) in train_loader
            gs = gradient(() -> Flux.Losses.crossentropy(model(u), w), ps) # compute gradient
            Flux.Optimise.update!(optimizer, ps, gs) # update parameters
        end
        println("Epoch $k for $model_name architecture.")
        train_loss, train_acc = loss_and_accuracy(train_x, train_y,  model)
        test_loss, test_acc = loss_and_accuracy(test_x, test_y, model)

        println("  train_loss = $train_loss, train_accuracy = $train_acc")
        println("  test_loss = $test_loss, test_accuracy = $test_acc")

        push!(train_losses, train_loss)
        push!(train_accuracy, train_acc)
        push!(test_losses, test_loss)
        push!(test_accuracy, test_acc)
    end
    return train_losses, train_accuracy, test_losses, test_accuracy
end

In [7]:
# Rebuild the 3-block VGG network from a flat parameter vector.
#
# `vector` must hold the parameters in the order Flux.params(model_VGG3) yields them:
# for each layer its weight array (column-major) followed by its bias, 960_298 numbers
# in total. The architecture is hard-coded to mirror model_VGG3; a general version would
# have to store the layer shapes of the original model.
#
# Slices are taken with a running cursor rather than hand-written index ranges, so each
# block starts exactly where the previous one ended (no overlapping or skipped entries).
# The head is Dense(128 => 10) followed by softmax, the same as in model_VGG3, so the
# output is a probability distribution suitable for crossentropy.
#
# Throws DimensionMismatch if `vector` is too short to fill every layer.
function to_model(vector)
    nparams = 960_298
    length(vector) >= nparams || throw(DimensionMismatch(
        "to_model needs at least $nparams parameters, got $(length(vector))"))

    cursor = Ref(0)

    # Take the next prod(dims) entries of `vector` and shape them as `dims`.
    function next_block(dims...)
        n = prod(dims)
        start = cursor[] + 1
        cursor[] += n
        return reshape(vector[start:(start + n - 1)], dims)
    end

    # 3x3 "same"-padded ReLU convolution; weight is read first, then bias.
    function conv3x3(ch)
        weight = next_block(3, 3, first(ch), last(ch))
        bias = next_block(last(ch))
        return Conv((3, 3), ch, relu; pad=SamePad(), weight=weight, bias=bias)
    end

    # Dense layer; Flux stores the weight as an (out, in) matrix.
    function dense(ch, σ=identity)
        weight = next_block(last(ch), first(ch))
        bias = next_block(last(ch))
        return Dense(weight, bias, σ)
    end

    # Layers are built one statement at a time so the cursor advances in parameter order.
    conv1 = conv3x3(3 => 32)      #     896 parameters
    conv2 = conv3x3(32 => 32)     #   9_248
    conv3 = conv3x3(32 => 64)     #  18_496
    conv4 = conv3x3(64 => 64)     #  36_928
    conv5 = conv3x3(64 => 128)    #  73_856
    conv6 = conv3x3(128 => 256)   # 295_168
    fc1 = dense(4096 => 128, relu) # 524_416
    fc2 = dense(128 => 10)         #   1_290

    return Chain(
        conv1,
        conv2,
        MaxPool((2,2)),
        Dropout(0.2),
        conv3,
        conv4,
        MaxPool((2,2)),
        Dropout(0.2),
        conv5,
        conv6,
        MaxPool((2,2)),
        Dropout(0.2),
        Flux.flatten,
        fc1,
        Dropout(0.2),
        fc2,
        softmax)
end

to_model (generic function with 1 method)

In [8]:
# State of one particle in the particle swarm optimiser (see train_PSO).
# The struct is mutable because train_PSO reassigns its fields on every step.
mutable struct Particle 
    position::Vector{Float32}       # current flattened parameter vector of this particle
    best_position::Vector{Float32}  # position at which this particle scored best_accuracy
    best_accuracy::Float32          # highest accuracy this particle has recorded so far
    velocity::Vector{Float32}       # current step added to position on each update
end

In [9]:
# Optimise the parameters of `model` with particle swarm optimisation (no gradients).
#
# Every particle is a flattened copy of the model parameters. On each step a particle's
# velocity is updated from its inertia, its own best position and the swarm's best
# position; the particle is moved, turned back into a network with to_model, and scored
# by its accuracy on (train_x, train_y).
#
# Arguments
#   model        - starting Flux model; must have the architecture to_model rebuilds.
#   train_loader - only its length is used: one swarm update is done per mini-batch.
#                  NOTE(review): the batches themselves are ignored and every candidate is
#                  scored on the full training set, which is expensive - confirm intent.
#   train_x, train_y, test_x, test_y - data used for scoring and for the epoch metrics.
#   numparticles - number of particles in the swarm.
#   ω            - inertia weight: how much the previous velocity carries over.
#   c1           - how strongly a particle is pulled toward its own best position.
#   c2           - how strongly a particle is pulled toward the swarm's best position.
#   epochs       - (keyword, default 3) number of passes over `train_loader`.
#
# Returns (train_losses, train_accuracy, test_losses, test_accuracy, model) where the
# first four hold one entry per epoch and `model` is the best network found (the input
# model if no particle ever beat it).
function train_PSO(model,train_loader,train_x,train_y,test_x,test_y, numparticles, ω, c1,c2; epochs=3)
    train_losses = []
    train_accuracy = []
    test_losses = []
    test_accuracy = []

    _, best_accuracy = loss_and_accuracy(train_x, train_y,  model)

    # Flatten every parameter array, in Flux.params order, into one Float32 vector.
    # vec is column-major, which is the layout to_model's reshape expects.
    start_position = convert(Vector{Float32},
                             reduce(vcat, [vec(p) for p in Flux.params(model)]))
    d = length(start_position)
    swarm_best_position = copy(start_position)

    # Keep the whole update in Float32 so it matches the Particle fields.
    inertia, cognitive, social = Float32(ω), Float32(c1), Float32(c2)

    # init particles: each one gets its own copies so no two particles share storage
    particles = [Particle(copy(start_position), copy(start_position), best_accuracy,
                          rand(Float32, d)) for _ in 1:numparticles]

    println("Initalized particles!")

    for _ in 1:epochs
        for _ in train_loader
            for p in particles
                # Fresh uniform random factors in [0, 1) for every coordinate, as in the
                # standard PSO update. (rand((-1.0, 1.0), d) would instead pick each entry
                # from the two values -1.0 and 1.0.)
                r1 = rand(Float32, d)
                r2 = rand(Float32, d)
                p.velocity = inertia .* p.velocity .+
                    cognitive .* r1 .* (p.best_position .- p.position) .+
                    social .* r2 .* (swarm_best_position .- p.position)

                # Keep every coordinate in [-1, 1]; negative weights must stay
                # representable, so the lower bound is -1 rather than 0.
                p.position = clamp.(p.position .+ p.velocity, -1.0f0, 1.0f0)

                moved_model = to_model(p.position)
                this_loss, this_acc = loss_and_accuracy(train_x,train_y, moved_model)

                if p.best_accuracy < this_acc
                    p.best_accuracy = this_acc
                    p.best_position = copy(p.position)
                end

                if best_accuracy < this_acc
                    best_accuracy = this_acc
                    swarm_best_position = copy(p.position)
                    model = moved_model
                    println("New accuracy: $this_acc")
                end
            end
        end

        train_loss, train_acc = loss_and_accuracy(train_x, train_y,  model)
        test_loss, test_acc = loss_and_accuracy(test_x, test_y, model)

        println("  train_loss = $train_loss, train_accuracy = $train_acc")
        println("  test_loss = $test_loss, test_accuracy = $test_acc")

        push!(test_losses, test_loss)
        push!(test_accuracy, test_acc)
        push!(train_losses, train_loss)
        push!(train_accuracy, train_acc)
    end
    return train_losses, train_accuracy, test_losses, test_accuracy, model
end

train_PSO (generic function with 1 method)

In [15]:
# Run PSO training on model_VGG3 with 10 particles, inertia ω = 0.5, c1 = 0.3, c2 = 0.4.
# The fifth return value is the model train_PSO ends with.
# NOTE: model_VGG3, train_loader and the data arrays must already be defined in this
# session; re-run the cells that create them first.
vgg3_train_loss, 
vgg3_train_accuracy, 
vgg3_test_loss, 
vgg3_test_accuracy,trained_model = train_PSO(model_VGG3, train_loader, train_x, train_y,test_x, test_y, 10, 0.5, 0.3, 0.4);

LoadError: UndefVarError: model_VGG3 not defined

In [None]:
# Gradient-based baseline: train model_VGG3 with the ADAM optimiser (learning rate 0.001).
# NOTE: this assigns to the same vgg3_* names as the PSO run, overwriting its results.
vgg3_train_loss, 
vgg3_train_accuracy, 
vgg3_test_loss, 
vgg3_test_accuracy = train(model_VGG3, train_loader, ADAM(0.001),train_x, train_y,test_x, test_y, "VGG3");

In [None]:
# TODO: 
# Generalize to_model function (difficult, not necessary)
# Run the model with several different optimizers and different hyperparameters
# Run PSO with different hyperparameters
# Build a table of accuracy and loss, and plot them