In [1]:
using SimpleChains
using MLDatasets, Printf
using Flux
using Flux.Data: DataLoader
using Flux.Optimise: Optimiser, WeightDecay
using Flux: onehotbatch, onecold
using Flux.Losses: logitcrossentropy
using Statistics, Random
using CUDA 

In [2]:
num_image_classes = 10
learning_rate = 3e-4
num_epochs = 10

10

In [3]:
function get_data(split)
    x, y = MLDatasets.MNIST(split)[:]
    (reshape(x, 28 * 28 * 1, :), UInt32.(y .+ 1))
end

function display_loss(accuracy, loss)
    @printf("    training accuracy %.2f, loss %.4f\n", 100 * accuracy, loss)
end

display_loss (generic function with 1 method)

In [4]:
xtrain, ytrain = get_data(:train);
xtest, ytest = get_data(:test);

In [7]:

# simplechains model and loss
function sc_model() 
    return SimpleChain(
                (static(28 * 28)),
                TurboDense(SimpleChains.relu, 32),
                TurboDense(identity, 10))
end

sc_model (generic function with 1 method)

In [37]:
model = sc_model()
sc_loss = SimpleChains.add_loss(model, LogitCrossEntropyLoss(ytrain))

for i in 1:2
    println("SimpleChains run #$i")
    @time "  gradient buffer allocation" G = SimpleChains.alloc_threaded_grad(sc_loss)
    @time "  parameter initialization" p = SimpleChains.init_params(model)
    
    @time "  forward pass" model(xtrain, p)
    g = similar(p);
    @time "  valgrad!" valgrad!(g, sc_loss, xtrain, p)

    opt = SimpleChains.ADAM(learning_rate)
    @time "  train $(num_epochs) epochs" SimpleChains.train_batched!(G, p, sc_loss, xtrain, opt, num_epochs)

    @time "  compute training accuracy and loss" train_acc, train_loss = SimpleChains.accuracy_and_loss(sc_loss, xtrain, ytrain, p)
    display_loss(train_acc, train_loss)

    @time "  compute test accuracy and loss" test_acc, test_loss = SimpleChains.accuracy_and_loss(sc_loss, xtest, ytest, p)
    display_loss(test_acc, test_loss)

    println("------------------------------------")
end

SimpleChains run #1
  gradient buffer allocation: 0.000010 seconds (3 allocations: 596.781 KiB)
  parameter initialization: 0.000038 seconds (1 allocation: 99.500 KiB)
  forward pass: 0.045913 seconds (1 allocation: 32 bytes)


  valgrad!: 0.391498 seconds (32.56 k allocations: 1.800 MiB, 5.05% compilation time)


  train 10 epochs: 0.398857 seconds
  compute training accuracy and loss: 0.045554 seconds (17 allocations: 416 bytes)
    training accuracy 93.92, loss 0.2161
  compute test accuracy and loss: 0.008143 seconds (17 allocations: 416 bytes)
    training accuracy 93.88, loss 0.2170
------------------------------------
SimpleChains run #2
  gradient buffer allocation: 0.000008 seconds (3 allocations: 596.781 KiB)
  parameter initialization: 0.000039 seconds (1 allocation: 99.500 KiB)
  forward pass: 0.039509 seconds (1 allocation: 32 bytes)


  valgrad!: 0.407642 seconds (1 allocation: 16 bytes)


  train 10 epochs: 0.394469 seconds
  compute training accuracy and loss: 0.050877 seconds (22 allocations: 576 bytes)
    training accuracy 94.06, loss 0.2135
  compute test accuracy and loss: 0.007717 seconds (17 allocations: 416 bytes)
    training accuracy 93.89, loss 0.2151
------------------------------------


In [12]:
## Flux Model

In [13]:
use_cuda = CUDA.functional()
batchsize = 0
device = if use_cuda
  @info "Flux training on GPU"
  batchsize = 2048 
  gpu
else
  @info "Flux training on CPU"
  batchsize = 96 * Threads.nthreads()
  cpu
end

┌ Info: Flux training on GPU
└ @ Main c:\Users\heyia\Desktop\current-code\285\project\main\main.ipynb:4


gpu (generic function with 1 method)

In [32]:
function create_loader(x, y, batch_size, shuffle)
    y = onehotbatch(y, 1:num_image_classes)
    DataLoader((device(x), device(y)), batchsize = batch_size, shuffle = shuffle)
  end

train_loader = create_loader(xtrain, ytrain, batchsize, true)
test_loader = create_loader(xtest, ytest, batchsize, false)

5-element DataLoader(::Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, OneHotArrays.OneHotMatrix{UInt32, CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}}}, batchsize=2048)
  with first element:
  (784×2048 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, 10×2048 OneHotMatrix(::CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}) with eltype Bool,)

In [33]:
function flux_model()
    return Chain(
        Dense(28*28, 32, Flux.relu),
        Dense(32, 10)
        ) |> device
end

flux_loss(ŷ, y) = logitcrossentropy(ŷ, y)

flux_loss (generic function with 1 method)

In [34]:
function eval_accuracy_loss(loader, model, device)
  l = 0.0f0
  acc = 0
  ntot = 0
  for (x, y) in loader
    x, y = x |> device, y |> device
    ŷ = model(x)
    l += flux_loss(ŷ, y) * size(x)[end]
    acc += sum(onecold(ŷ |> cpu) .== onecold(y |> cpu))
    ntot += size(x)[end]
  end
  return (acc = acc / ntot, loss = l / ntot)
end

eval_accuracy_loss (generic function with 1 method)

In [35]:
function train!(model, train_loader, opt)
    ps = Flux.params(model)
    for _ = 1:num_epochs
      for (x, y) in train_loader
        x = device(x)
        y = device(y)
        gs = Flux.gradient(ps) do
          ŷ = model(x)
          flux_loss(ŷ, y)
        end
        Flux.Optimise.update!(opt, ps, gs)
      end
    end
  end

train! (generic function with 1 method)

In [36]:
for run in 1:2
    println("Flux run #$run")
    @time "  create model" model = flux_model()
    opt = ADAM(learning_rate)
    @time "  train $num_epochs epochs" train!(model, train_loader, opt)
    @time "  compute training loss" train_acc, train_loss = eval_accuracy_loss(test_loader, model, device)
    display_loss(train_acc, train_loss)
    @time "  compute test loss" test_acc, test_loss = eval_accuracy_loss(train_loader, model, device)
    display_loss(test_acc, test_loss)
    println("------------------------------------")
end

Flux run #1
  create model: 0.001560 seconds (129 allocations: 204.828 KiB)


  train 10 epochs: 1.003846 seconds (809.96 k allocations: 61.771 MiB, 4.86% gc time, 16.57% compilation time: 11% of which was recompilation)
  compute training loss: 0.027564 seconds (21.54 k allocations: 1.859 MiB, 87.71% compilation time)
    training accuracy 90.14, loss 0.3989
  compute test loss: 0.024723 seconds (13.71 k allocations: 6.758 MiB)
    training accuracy 89.55, loss 0.4159
------------------------------------
Flux run #2
  create model: 0.000185 seconds (78 allocations: 201.422 KiB)


  train 10 epochs: 0.526895 seconds (529.26 k allocations: 47.701 MiB, 3.31% gc time)
  compute training loss: 0.002859 seconds (2.31 k allocations: 918.641 KiB)
    training accuracy 89.73, loss 0.4048
  compute test loss: 0.020799 seconds (13.71 k allocations: 6.758 MiB)
    training accuracy 89.11, loss 0.4225
------------------------------------


In [85]:
model = flux_model()

Chain(
  Dense(784 => 32, relu),               [90m# 25_120 parameters[39m
  Dense(32 => 10),                      [90m# 330 parameters[39m
) [90m                  # Total: 4 arrays, [39m25_450 parameters, 576 bytes.

In [86]:
count = 0
for layer in model
    count += sum(length, Flux.params(layer))
    # println(count)
    # println(Flux.params(layer))
end

In [87]:
length(Flux.params(model[1]))

2

In [90]:
for layer in model
    p = Flux.params(layer)
    size(p)
end

MethodError: MethodError: no method matching size(::Zygote.Params{Zygote.Buffer{Any, Vector{Any}}})
Closest candidates are:
  size(!Matched::Union{LinearAlgebra.QR, LinearAlgebra.QRCompactWY, LinearAlgebra.QRPivoted}) at C:\Users\heyia\AppData\Local\Programs\Julia-1.8.2\share\julia\stdlib\v1.8\LinearAlgebra\src\qr.jl:581
  size(!Matched::Union{LinearAlgebra.QR, LinearAlgebra.QRCompactWY, LinearAlgebra.QRPivoted}, !Matched::Integer) at C:\Users\heyia\AppData\Local\Programs\Julia-1.8.2\share\julia\stdlib\v1.8\LinearAlgebra\src\qr.jl:580
  size(!Matched::Union{LinearAlgebra.Cholesky, LinearAlgebra.CholeskyPivoted}) at C:\Users\heyia\AppData\Local\Programs\Julia-1.8.2\share\julia\stdlib\v1.8\LinearAlgebra\src\cholesky.jl:514
  ...

In [81]:
size(model[1].weight), size(model[1].bias), 


((10,), (10, 32))

In [None]:
 size(model[2].weight), size(model[2].bias),

(32, 784)

In [84]:
for i=1:2
    w = size(model[i].weight)
    b = size(model[i].bias)
    println("$w $b")
end

In [61]:
T = typeof(model[1])
for (name, typ) in zip(fieldnames(T), T.types)
    println("type of the fieldname $name is $typ")
end

type of the fieldname weight is CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}
type of the fieldname bias is CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}
type of the fieldname σ is typeof(NNlib.relu)
