In [1]:
using Pkg
using Flux
using MLDatasets
using Random

In [2]:
train_x, train_y = CIFAR10.traindata(Float32, 1:5000)
test_x, test_y = CIFAR10.testdata(Float32, 1:5000);

In [3]:
println("Size of each image: ", size(train_x))
println("Label of 50th training datapoint: ", train_y[50])
# So here we can see that each training point is a 3D array - a 32x32 image with 3 color channels

Size of each image: (32, 32, 3, 5000)
Label of 50th training datapoint: 0


In [4]:
# Since this is a multi-class classification problem, we can use one hot encoding, just like the MNIST dataset.
# There's 10 classes just like mnist, so we encode from 0 to 9

train_y, test_y = Flux.onehotbatch(train_y, 0:9), Flux.onehotbatch(test_y, 0:9)
nclasses = length(train_y[:,1])
println("number of classes: ", nclasses)

number of classes: 10


In [5]:
# 3 VGG Block
model_VGG3 = Chain(
              Conv((3,3), 3=>32, relu, pad=SamePad()),
              Conv((3,3), 32=>32, relu, pad=SamePad()),
              MaxPool((2,2)),
              Dropout(0.2),
              Conv((3,3), 32=>64, relu, pad=SamePad()),
              Conv((3,3), 64=>64, relu, pad=SamePad()),
              MaxPool((2,2)),
              Dropout(0.2),
              Conv((3,3), 64=>128, relu, pad=SamePad()),
              Conv((3,3), 128=>256, relu, pad=SamePad()),
              MaxPool((2,2)),
              Dropout(0.2),
              Flux.flatten,
              Dense(4096,128,relu),
              Dropout(0.2),
              Dense(128,10),
              softmax)

Chain(
  Conv((3, 3), 3 => 32, relu, pad=1),   [90m# 896 parameters[39m
  Conv((3, 3), 32 => 32, relu, pad=1),  [90m# 9_248 parameters[39m
  MaxPool((2, 2)),
  Dropout(0.2),
  Conv((3, 3), 32 => 64, relu, pad=1),  [90m# 18_496 parameters[39m
  Conv((3, 3), 64 => 64, relu, pad=1),  [90m# 36_928 parameters[39m
  MaxPool((2, 2)),
  Dropout(0.2),
  Conv((3, 3), 64 => 128, relu, pad=1),  [90m# 73_856 parameters[39m
  Conv((3, 3), 128 => 256, relu, pad=1),  [90m# 295_168 parameters[39m
  MaxPool((2, 2)),
  Dropout(0.2),
  Flux.flatten,
  Dense(4096, 128, relu),               [90m# 524_416 parameters[39m
  Dropout(0.2),
  Dense(128, 10),                       [90m# 1_290 parameters[39m
  NNlib.softmax,
)[90m                   # Total: 16 arrays, [39m960_298 parameters, 3.666 MiB.

In [6]:
function loss_and_accuracy(udata, wdata, model)

    ndata = size(udata,4)

    ŵ = model(udata)
    loss = Flux.crossentropy(ŵ, wdata; agg=sum)
    accuracy = sum(Flux.onecold(ŵ) .== Flux.onecold(wdata)) / ndata
    return loss, accuracy
end

loss_and_accuracy (generic function with 1 method)

In [17]:
function loss(udata, wdata, model)

    ndata = size(udata,4)

    ŵ = model(udata)
    loss = Flux.crossentropy(ŵ, wdata; agg=sum)
    return loss
end

loss (generic function with 1 method)

In [8]:
batch_size = 128
train_loader = Flux.Data.DataLoader((train_x, train_y), batchsize=batch_size, shuffle=true);

In [9]:
function train(model,train_loader,optimizer,train_x,train_y,test_x,test_y,model_name)
    train_losses = []
    train_accuracy = []
    test_losses = []
    test_accuracy = []
    α = 0.001 # <- stepsize; in the ML community, it is often denoted as a `learning rate η`
    #opt = optimizer(α) 
    opt = optimizer 
    K = 1
    for k in 1:K
        for (u, w) in train_loader
            gs = gradient(() -> Flux.Losses.crossentropy(model(u), w), Flux.params(model)) # compute gradient
            Flux.Optimise.update!(opt, Flux.params(model), gs) # update parameters
        end
        println("Epoch $k for $model_name architecture.")
        train_loss, train_acc = loss_and_accuracy(train_x, train_y,  model)

        test_loss, test_acc = loss_and_accuracy(test_x, test_y, model)

        println("  train_loss = $train_loss, train_accuracy = $train_acc")
        println("  test_loss = $test_loss, test_accuracy = $test_acc")
        
        push!(test_losses, test_loss)
        push!(test_accuracy, test_acc)
        push!(train_losses, train_loss)
        push!(train_accuracy, train_acc)
    end
    return train_losses, train_accuracy, test_losses, test_accuracy
end

train (generic function with 1 method)

In [10]:
#https://www.researchgate.net/post/How-to-adjust-deep-learning-parameters-using-Particle-swarm-optimization-PSO
#= You need to (1) define your fitness function to be the accuracy of the model. 
(2) initiate a population where each point is a vector containing the number of hidden layers and the 
number of features per layer (or any other network parameters based on your architecture). 
For each of these you need to train the model and evaluate the fitness function (calculate accuracy). 
(3) iterate using position and velocity updates (regular pso) until you satisfy the stopping conditions.

I am not entirely sure how efficient this whole process will be though. 
(1) You still need to set the parametrs of pso itself, and (
2) depending on your network architecture, the time it takes to train, and the parameters of that training, 
it might be very time consuming, and (3) there is no guarantee of converging to an optimal or near-optimal solution. 
On the bright side, since the fitness function itself is the accuracy,
you can always know if the model is doing better.=#

In [11]:
function to_model(vector)
    # while of course we could make something fancy out of this, for now 
    # let's just have it encode back to our original model
    # if we were building a library or something we'd store more information about the
    # original model to not have to rebuild it like this
    return Chain(
              Conv((3,3), 3=>32, relu, pad=SamePad(), weight=reshape(vector[1:864],(3,3,3,32)), bias=vector[865:896]),# 896 - 32 bias 864
              Conv((3,3), 32=>32, relu, pad=SamePad(),weight=reshape(vector[897:10112],(3,3,32,32)), bias=vector[10112:10143]),# 9248 - 32 bias 9216
              MaxPool((2,2)),
              Dropout(0.2),
              Conv((3,3), 32=>64, relu, pad=SamePad(), weight=reshape(vector[10144:28575], (3,3,32,64)), bias=vector[28575:28638]), # 18496 - 64 = 18432 
              Conv((3,3), 64=>64, relu, pad=SamePad(), weight=reshape(vector[28639:65502], (3,3,64,64)), bias=vector[65503:65566]), # 36928 - 64 = 36864
              MaxPool((2,2)),
              Dropout(0.2),
              Conv((3,3), 64=>128, relu, pad=SamePad(),weight=reshape(vector[65568:139295], (3,3,64,128)), bias=vector[139296:139423]), # 73856 - 128 = 73728
              Conv((3,3), 128=>256, relu, pad=SamePad(),weight=reshape(vector[139424:434335], (3,3,128,256)), bias=vector[434336:434591]), # 295168 - 256 = 294912
              MaxPool((2,2)),
              Dropout(0.2),
              Flux.flatten,
              Dense(reshape(vector[434592:958879],(128,4096)),vector[958880:959007],relu), # 524416 - 128 bias = 524288
              Dropout(0.2),
              Dense(reshape(vector[959008:960287], (10,128)),vector[960288:960297],relu)) # 1290 - 10 bias = 1280
end

to_model (generic function with 1 method)

In [12]:
mutable struct Particle 
    position::Vector{Float32}
    best_position::Vector{Float32}
    best_loss::Float32
    velocity::Vector{Float32}
end

In [18]:
function train_PSO(model,train_loader,train_x,train_y,test_x,test_y, numparticles, K)
    train_losses = []
    train_accuracy = []
    test_losses = []
    test_accuracy = []
    particles = []
    best_loss = loss(train_x, train_y,  model)
    
    params = Flux.params(model)
    params_flattened = []
    
    for p in params
        flattened = flatten(p)
        final = reshape(flattened, (1, size(flattened, 1) * size(flattened,2)))
        append!(params_flattened, final)
    end
    
    swarm_best_position = params_flattened
    
    #init particles
    for n in 1:numparticles
        push!(particles, Particle(params_flattened,params_flattened,best_loss,rand(Float32,size(params_flattened))))
    end
    d = size(params_flattened)
    println("Init particles")
    
    for k in 1:K
        for (u, w) in train_loader
            train_loss = loss(train_x, train_y,  model)
            for p in particles
                p.velocity = rand((-1.0,1.0),d) .* p.velocity .+ (p.best_position .- p.position) .+ swarm_best_position .- p.position
                #losses are going negative, need to make sure that these values are always between -1 and 1
                p.position = p.position .+ p.velocity
            
                moved_model = to_model(p.position)
                this_loss = loss(train_x,train_y, moved_model)
                # why is loss negative?
                if p.best_loss > this_loss
                    p.best_loss = this_loss
                    p.best_position = p.position
                end
                
                if best_loss > this_loss
                    best_loss = this_loss
                    swarm_best_position = p.position
                    model = moved_model
                    println("loss changed: ")
                    println(best_loss)
                end
            end
        end

        train_loss, train_acc = loss_and_accuracy(train_x, train_y,  model)

        test_loss, test_acc = loss_and_accuracy(test_x, test_y, model)

        println("  train_loss = $train_loss, train_accuracy = $train_acc")
        println("  test_loss = $test_loss, test_accuracy = $test_acc")
        
        push!(test_losses, test_loss)
        push!(test_accuracy, test_acc)
        push!(train_losses, train_loss)
        push!(train_accuracy, train_acc)
    end
    return train_losses, train_accuracy, test_losses, test_accuracy, trained_model
end

train_PSO (generic function with 1 method)

In [None]:
vgg3_train_loss, 
vgg3_train_accuracy, 
vgg3_test_loss, 
vgg3_test_accuracy,trained_model = train_PSO(model_VGG3, train_loader, train_x, train_y,test_x, test_y, 10,3);

Init particles
loss changed: 
-16948.178
loss changed: 
-34652.25
loss changed: 
-43505.97
loss changed: 
-46002.613
loss changed: 
-50752.344
loss changed: 
-58597.17
loss changed: 
-59744.438
loss changed: 
-71237.08
loss changed: 
-118723.445
loss changed: 
-122621.09
loss changed: 
-201062.6
loss changed: 
-220559.83
loss changed: 
-243602.94
loss changed: 
-252246.66


In [109]:
vgg3_train_loss, 
vgg3_train_accuracy, 
vgg3_test_loss, 
vgg3_test_accuracy = train(model_VGG3, train_loader, ADAM(0.001),train_x, train_y,test_x, test_y, "VGG3");

LoadError: UndefVarError: train not defined