Just update to Flux.jl 0.14 without breakage, keeping CUDA as a strong dep (#88)

- Update to Flux.jl 0.14 without breakage here, keeping CUDA as a strong dep
- Add a `use_gpu=true` model kwarg to allow the user to opt out of using `Flux.gpu`, which now warns if a GPU isn't available
- Move to a PrecompileTools-based precompilation approach, which is more robust. This avoids downloading the weights files (200MB) by loading dummy data, but the conv modelling isn't fully precompiled because of the dummy data
r3tex · Jul 17, 2023 · 64f45f2 · 64f45f2 · IanButterworth · Jul 17, 2023
1 parent b28ac4b
commit 64f45f2
Show file tree

Hide file tree

Showing 12 changed files with 81 additions and 138 deletions.
diff --git a/.github/workflows/RunTests.yml b/.github/workflows/RunTests.yml
@@ -21,11 +21,11 @@ jobs:
             julia-arch: x86
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - uses: julia-actions/setup-julia@latest
         with:
           version: ${{ matrix.julia-version }}
-      - uses: julia-actions/julia-runtest@master
+      - uses: julia-actions/julia-runtest@v1
         with:
           coverage: false
       # - uses: julia-actions/julia-processcoverage@v1

diff --git a/.github/workflows_disabled/SnoopCompile.yml b/.github/workflows_disabled/SnoopCompile.yml
diff --git a/Project.toml b/Project.toml
@@ -1,29 +1,34 @@
 name = "ObjectDetector"
 uuid = "3dfc1049-5314-49cf-8447-288dfd02f9fb"
 authors = ["Robert Luciani"]
-version = "0.2.9"
+version = "0.2.10"
 
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
 ImageDraw = "4381153b-2b60-58ae-a1ba-fd683676385f"
 ImageFiltering = "6a3955dd-da59-5b1f-98d4-e7296123deb5"
 ImageTransformations = "02fcd773-0e25-5acc-982a-7f6622650795"
 LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
-
+cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
 [compat]
 BenchmarkTools = "0.4, 0.5, 0.6, 0.7, 1.0"
-Flux = "0.12, 0.13"
+CUDA = "4"
+Flux = "0.12, 0.13, 0.14"
 ImageCore = "0.8, 0.9"
 ImageDraw = "0.2"
 ImageFiltering = "0.6, 0.7"
 ImageTransformations = "0.8, 0.9"
 LazyArtifacts = "1.3"
 PrettyTables = "2.0"
+PrecompileTools = "1"
+cuDNN = "1"
 julia = "1.3"
 
 [extras]

diff --git a/deps/SnoopCompile/precompile/precompile_ObjectDetector.jl b/deps/SnoopCompile/precompile/precompile_ObjectDetector.jl
diff --git a/deps/SnoopCompile/snoopBenchmark.jl b/deps/SnoopCompile/snoopBenchmark.jl
diff --git a/deps/SnoopCompile/snoopCompile.jl b/deps/SnoopCompile/snoopCompile.jl
diff --git a/dev/compilation/compiler.jl b/dev/compilation/compiler.jl
diff --git a/dev/compilation/precompile.jl b/dev/compilation/precompile.jl
diff --git a/src/ObjectDetector.jl b/src/ObjectDetector.jl
@@ -2,6 +2,8 @@ module ObjectDetector
 export YOLO
 export prepareImage, prepareImage!, resizekern, sizethatfits, emptybatch, drawBoxes
 
+using CUDA
+import cuDNN # not used but needed to load Flux CUDA Exts in Flux 0.14+
 import Flux.gpu
 import Flux.cpu
 export gpu, cpu
@@ -16,6 +18,7 @@ using ImageCore
 using BenchmarkTools
 using PrettyTables
 using ImageDraw
+using PrecompileTools
 
 abstract type AbstractModel end
 function getModelInputSize end
@@ -28,7 +31,16 @@ import .YOLO
 
 include("utils.jl")
 
-include("../deps/SnoopCompile/precompile/precompile_ObjectDetector.jl")
-_precompile_()
+@setup_workload begin
+    @compile_workload begin
+        # don't use GPU here because GPU compilation of Conv requires realistic weights not dummy weights
+        yolomod = YOLO.v3_COCO(dummy=true, silent=true, use_gpu=false)
+        batch = emptybatch(yolomod)
+        res = yolomod(batch)
+        res = nothing
+        batch = nothing
+        yolomod = nothing
+    end
+end
 
 end #module
diff --git a/src/utils.jl b/src/utils.jl
@@ -5,7 +5,12 @@ Create an empty batched input array on the GPU if available.
 """
 function emptybatch(model::T) where {T<:AbstractModel}
     modelInputSize = getModelInputSize(model)
-    gpu(zeros(Float32, modelInputSize...))
+    batch = zeros(Float32, modelInputSize...)
+    if YOLO.uses_gpu(model)
+        gpu(batch)
+    else
+        batch
+    end
 end
 
 """

diff --git a/src/yolo/pretrained.jl b/src/yolo/pretrained.jl
@@ -1,32 +1,33 @@
 using Pkg.Artifacts
 ## YOLOV2
-function v2_COCO(;batch=1, silent=false, w=608, h=608, cfgchanges=[(:net, 1, :width, w), (:net, 1, :height, h)])
-    yolo(joinpath(models_dir,"yolov2-608.cfg"), joinpath(artifact"yolov2-COCO", "yolov2-COCO.weights"), batch, silent=silent, cfgchanges=cfgchanges)
+function v2_COCO(;batch=1, silent=false, w=608, h=608, cfgchanges=[(:net, 1, :width, w), (:net, 1, :height, h)], kwargs...)
+    yolo(joinpath(models_dir,"yolov2-608.cfg"), joinpath(artifact"yolov2-COCO", "yolov2-COCO.weights"), batch; silent, cfgchanges, kwargs...)
 end
-v2_608_COCO(;batch=1, silent=false, cfgchanges=nothing) = v2_COCO(w=608, h=608, batch=batch, silent=silent, cfgchanges=cfgchanges)
+v2_608_COCO(;cfgchanges=nothing, kwargs...) = v2_COCO(;w=608, h=608, cfgchanges, kwargs...)
 
 ## YOLOV2-tiny
-function v2_tiny_COCO(;batch=1, silent=false, w=416, h=416, cfgchanges=[(:net, 1, :width, w), (:net, 1, :height, h)])
-    yolo(joinpath(models_dir,"yolov2-tiny.cfg"), joinpath(artifact"yolov2-tiny-COCO", "yolov2-tiny-COCO.weights"), batch, silent=silent, cfgchanges=cfgchanges)
+function v2_tiny_COCO(;batch=1, silent=false, w=416, h=416, cfgchanges=[(:net, 1, :width, w), (:net, 1, :height, h)], kwargs...)
+    yolo(joinpath(models_dir,"yolov2-tiny.cfg"), joinpath(artifact"yolov2-tiny-COCO", "yolov2-tiny-COCO.weights"), batch; silent, cfgchanges, kwargs...)
 end
-v2_tiny_416_COCO(;batch=1, silent=false, cfgchanges=nothing) = v2_tiny_COCO(w=416, h=416, batch=batch, silent=silent, cfgchanges=cfgchanges)
+v2_tiny_416_COCO(;cfgchanges=nothing, kwargs...) = v2_tiny_COCO(;w=416, h=416, cfgchanges, kwargs...)
 
 ## YOLOV3
-function v3_COCO(;batch=1, silent=false, w=416, h=416, cfgchanges=[(:net, 1, :width, w), (:net, 1, :height, h)])
-    yolo(joinpath(models_dir,"yolov3-416.cfg"), joinpath(artifact"yolov3-COCO", "yolov3-COCO.weights"), batch, silent=silent, cfgchanges=cfgchanges)
+function v3_COCO(;batch=1, silent=false, w=416, h=416, cfgchanges=[(:net, 1, :width, w), (:net, 1, :height, h)], dummy::Bool=false, kwargs...)
+    weightsfile = dummy ? nothing : joinpath(artifact"yolov3-COCO", "yolov3-COCO.weights")
+    yolo(joinpath(models_dir,"yolov3-416.cfg"), weightsfile, batch; silent, cfgchanges, kwargs...)
 end
-v3_320_COCO(;batch=1, silent=false, cfgchanges=nothing) = v3_COCO(w=320, h=320, batch=batch, silent=silent, cfgchanges=cfgchanges)
-v3_416_COCO(;batch=1, silent=false, cfgchanges=nothing) = v3_COCO(w=416, h=416, batch=batch, silent=silent, cfgchanges=cfgchanges)
-v3_608_COCO(;batch=1, silent=false, cfgchanges=nothing) = v3_COCO(w=608, h=608, batch=batch, silent=silent, cfgchanges=cfgchanges)
+v3_320_COCO(;cfgchanges=nothing, kwargs...) = v3_COCO(;w=320, h=320, cfgchanges, kwargs...)
+v3_416_COCO(;cfgchanges=nothing, kwargs...) = v3_COCO(;w=416, h=416, cfgchanges, kwargs...)
+v3_608_COCO(;cfgchanges=nothing, kwargs...) = v3_COCO(;w=608, h=608, cfgchanges, kwargs...)
 
 ## YOLOV3 SPP
-function v3_SPP_COCO(;batch=1, silent=false, w=608, h=608, cfgchanges=[(:net, 1, :width, w), (:net, 1, :height, h)])
-    yolo(joinpath(models_dir,"yolov3-spp.cfg"), joinpath(artifact"yolov3-spp-COCO", "yolov3-spp-COCO.weights"), batch, silent=silent, cfgchanges=cfgchanges)
+function v3_SPP_COCO(;batch=1, silent=false, w=608, h=608, cfgchanges=[(:net, 1, :width, w), (:net, 1, :height, h)], kwargs...)
+    yolo(joinpath(models_dir,"yolov3-spp.cfg"), joinpath(artifact"yolov3-spp-COCO", "yolov3-spp-COCO.weights"), batch; silent, cfgchanges, kwargs...)
 end
-v3_spp_608_COCO(;batch=1, silent=false, cfgchanges=nothing) = v3_SPP_COCO(w=608, h=608, batch=batch, silent=silent, cfgchanges=cfgchanges)
+v3_spp_608_COCO(;cfgchanges=nothing, kwargs...) = v3_SPP_COCO(;w=608, h=608, cfgchanges, kwargs...)
 
 ## YOLOV3-tiny
-function v3_tiny_COCO(;batch=1, silent=false, w=416, h=416, cfgchanges=[(:net, 1, :width, w), (:net, 1, :height, h)])
-    yolo(joinpath(models_dir,"yolov3-tiny.cfg"), joinpath(artifact"yolov3-tiny-COCO", "yolov3-tiny-COCO.weights"), batch, silent=silent, cfgchanges=cfgchanges)
+function v3_tiny_COCO(;batch=1, silent=false, w=416, h=416, cfgchanges=[(:net, 1, :width, w), (:net, 1, :height, h)], kwargs...)
+    yolo(joinpath(models_dir,"yolov3-tiny.cfg"), joinpath(artifact"yolov3-tiny-COCO", "yolov3-tiny-COCO.weights"), batch; silent, cfgchanges, kwargs...)
 end
-v3_tiny_416_COCO(;batch=1, silent=false, cfgchanges=nothing) = v3_tiny_COCO(w=416, h=416, batch=batch, silent=silent, cfgchanges=cfgchanges)
+v3_tiny_416_COCO(;cfgchanges=nothing, kwargs...) = v3_tiny_COCO(;w=416, h=416, cfgchanges, kwargs...)
diff --git a/src/yolo/yolo.jl b/src/yolo/yolo.jl
@@ -6,9 +6,10 @@ import ..AbstractModel, ..getModelInputSize
 
 const models_dir = joinpath(@__DIR__, "models")
 
+using CUDA
+import cuDNN # not used but needed to load Flux CUDA Exts in Flux 0.14+
 using Flux
-import Flux.gpu
-using Flux.CUDA
+
 using LazyArtifacts
 
 const CU_FUNCTIONAL = Ref{Bool}(false)
@@ -74,19 +75,20 @@ end
 
 Read the YOLO binary weights
 """
-function readweights(bytes::IOBuffer, kern::Int, ch::Int, fl::Int, bn::Bool)
+function readweights(bytes::Union{IOBuffer,Nothing}, kern::Int, ch::Int, fl::Int, bn::Bool)
+    dummy = isnothing(bytes)
     if bn
-        bb = reinterpret(Float32, read(bytes, fl*4))
-        bw = reinterpret(Float32, read(bytes, fl*4))
-        bm = reinterpret(Float32, read(bytes, fl*4))
-        bv = reinterpret(Float32, read(bytes, fl*4))
+        bb = dummy ? ones(Float32, fl) : reinterpret(Float32, read(bytes, fl*4))
+        bw = dummy ? ones(Float32, fl) : reinterpret(Float32, read(bytes, fl*4))
+        bm = dummy ? ones(Float32, fl) : reinterpret(Float32, read(bytes, fl*4))
+        bv = dummy ? ones(Float32, fl) : reinterpret(Float32, read(bytes, fl*4))
         cb = zeros(Float32, fl)
-        cw = reshape(reinterpret(Float32, read(bytes, kern*kern*ch*fl*4)), kern, kern, ch, fl)
+        cw = dummy ? ones(Float32, kern, kern, ch, fl) : reshape(reinterpret(Float32, read(bytes, kern*kern*ch*fl*4)), kern, kern, ch, fl)
         cw = Float32.(flip(cw))
         return cw, cb, bb, bw, bm, bv
     else
-        cb = reinterpret(Float32, read(bytes, fl*4))
-        cw = reshape(reinterpret(Float32, read(bytes, kern*kern*ch*fl*4)), kern, kern, ch, fl)
+        cb = dummy ? ones(Float32, fl) : reinterpret(Float32, read(bytes, fl*4))
+        cw = dummy ? ones(Float32, kern, kern, ch, fl) : reshape(reinterpret(Float32, read(bytes, kern*kern*ch*fl*4)), kern, kern, ch, fl)
         cw = Float32.(flip(cw))
         return cw, cb, 0.0, 0.0, 0.0, 0.0
     end
@@ -202,6 +204,9 @@ function assertdimconform(cfgvec::Vector{Pair{Symbol,Dict{Symbol,T}}}) where {T}
     return true
 end
 
+gpu(x, use::Bool) = use ? Flux.gpu(x) : x
+uses_gpu(model::T) where {T<:AbstractModel} = model.uses_gpu
+
 ########################################################
 ##### THE YOLO OBJECT AND CONSTRUCTOR ##################
 ########################################################
@@ -210,9 +215,12 @@ mutable struct yolo <: AbstractModel
     chain::Array{Any, 1}                     # This holds chains of weights and functions
     W::Dict{Int64, T} where T <: DenseArray  # This holds arrays that the model writes to
     out::Array{Dict{Symbol, Any}, 1}         # This holds values and arrays needed for inference
+    uses_gpu::Bool                           # Whether the gpu was requested to be used
 
     # The constructor takes the official YOLO config files and weight files
-    yolo(cfgfile::String, weightfile::String, batchsize::Int = 1; silent::Bool = false, cfgchanges=nothing) = begin
+    yolo(cfgfile::String, weightfile::Union{Nothing,String}, batchsize::Int = 1; silent::Bool = false, cfgchanges=nothing, use_gpu::Bool=true) = begin
+        # load dummy weights (avoids download for precompilation)
+        dummy = isnothing(weightfile)
         # read the config file and return [:layername => Dict(:setting => value), ...]
         # the first 'layer' is not a real layer, and has overarching YOLO settings
         cfgvec = cfgread(cfgfile)
@@ -226,10 +234,18 @@ mutable struct yolo <: AbstractModel
         cfg = cfgvec[1][2]
         yoloversion = any(first.(cfgvec) .== :region) ? 2 : 3 #v2 calls the last stage "region", v3 uses "yolo"
         cfg[:yoloversion] = yoloversion
-        weightbytes = IOBuffer(read(weightfile)) # read weights file sequentially like byte stream
+        weightbytes = if dummy
+            nothing # readweights knows to make up dummy weights if this is nothing
+        else
+            IOBuffer(read(weightfile)) # read weights file sequentially like byte stream
+        end
         # these settings are populated as the network is constructed below
         # some settings are re-read later for the last part of construction
-        maj, min, subv, im1, im2 = reinterpret(Int32, read(weightbytes, 4*5))
+        maj, min, subv, im1, im2 = if dummy
+            ones(Int32, 5)
+        else
+            reinterpret(Int32, read(weightbytes, 4*5))
+        end
         cfg[:darknetversion] = VersionNumber("$maj.$min.$subv")
         cfg[:batchsize] = batchsize
         cfg[:output] = []
@@ -248,8 +264,8 @@ mutable struct yolo <: AbstractModel
                 act     = ACT[block[:activation]]
                 bn      = haskey(block, :batch_normalize)
                 cw, cb, bb, bw, bm, bv = readweights(weightbytes, kern, ch[end], filters, bn)
-                push!(stack, gpu(Conv(cw, cb; stride = stride, pad = pad, dilation = 1)))
-                bn && push!(stack, gpu(BatchNorm(identity, bb, bw, bm, bv, 1f-5, 0.1f0, true, true, nothing, length(bb))))
+                push!(stack, gpu(Conv(cw, cb; stride = stride, pad = pad, dilation = 1), use_gpu))
+                bn && push!(stack, gpu(BatchNorm(identity, bb, bw, bm, bv, 1f-5, 0.1f0, true, true, nothing, length(bb)), use_gpu))
                 push!(stack, let; _act(x) = act.(x) end)
                 push!(fn, Chain(stack...))
                 push!(ch, filters)
@@ -314,7 +330,7 @@ mutable struct yolo <: AbstractModel
         # PART 2 - THE SKIPS
         ####################
         # Create test image. Note that darknet is row-major, so width-first
-        testimgs = [gpu(rand(Float32, cfg[:width], cfg[:height], cfg[:channels], batchsize))]
+        testimgs = [gpu(rand(Float32, cfg[:width], cfg[:height], cfg[:channels], batchsize), use_gpu)]
         # find all skip-layers and all YOLO layers
         needout = sort(vcat(0, [l[1] for l in filter(f -> typeof(f) <: Tuple, fn)], findall(x -> x == nothing, fn) .- 1))
         chainstack = Flux.Chain[] # layers that just feed forward can be grouped together in chains
@@ -413,7 +429,7 @@ mutable struct yolo <: AbstractModel
             out[i][:ignore] = get(cfg[:output][i], :ignore_thresh, 0.3) # for ignoring detections of same object (overlapping)
         end
 
-        return new(cfg, chainstack, W, out)
+        return new(cfg, chainstack, W, out, use_gpu)
     end
 end