In [1]:
# Use package versions builtin to this repository.
import Pkg, Random
Pkg.activate(@__DIR__)
Pkg.instantiate()

# Load Flux and PlotlyJS for sweet interactive graphics
using Flux, WAV, FFTW, PlotlyJS, DSP, Statistics, Printf

[32m[1m  Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[?25l[2K[?25h

┌ Info: Recompiling stale cache file /Users/sabae/.julia/compiled/v1.0/PlotlyJS/1r9Ld.ji for PlotlyJS [f0f68f2c-4968-5e81-91da-67840de0976a]
└ @ Base loading.jl:1184


We are going to build a convolutional audio classification engine, that takes audio samples, computes spectral representations of them, feeds that into a convolutional network, and classifies the audio samples.

Let's start with the dataset.  This was downloaded/extracted from the [Kaggle cats and dogs audio classification task](https://www.kaggle.com/mmoreaux/audio-cats-and-dogs/version/5).  We're going to begin by defining some utility functions for processing the data:

In [2]:
# Helper function to convert raw time-series data to a spectral representation
# with a given FFT length.
function stft(x, N; spectral_ROI=0.3, window = hanning(N))
    # Calculate indices where our frames start
    frame_starts = 1:N:(length(x) - N + 1)
    
    # Calculate the RFFT over the first dimension, windowed by `window`.
    frames = [rfft(x[idx:idx+N-1,:] .* window) for idx in frame_starts]
    
    # Only keep a portion of the spectrum
    Nkeep = round(Int, div(N,2)*spectral_ROI)
    frames = [frame[1:Nkeep] for frame in frames]
    
    # Return a matrix smooshing them all together, adding one more axis than there was:
    return hcat(frames...)'
end

stft (generic function with 1 method)

In [3]:
# Let's take a quick look at a cat and dog datapoint:
cat_data = wavread(joinpath("cats_dogs_train", "cat_1.wav"))[1];
dog_data = wavread(joinpath("cats_dogs_train", "dog_barking_1.wav"))[1];

D_cat = stft(cat_data, 512)
D_dog = stft(dog_data, 512);

In [4]:
plot(heatmap(z=log.(abs.(D_cat))), Layout(title="Cat Sample #1"))

In [5]:
plot(heatmap(z=log.(abs.(D_dog))), Layout(title="Dog Sample #1"))

## Data Loader

We're going to need to load these files in a randomized order, and obtain labels stating whether this is a "dog" file or a "cat" file.  Let's go ahead and build this data iterator:

In [6]:
import Base: show, length

struct CatDogDataset
    cat_data::Vector{Vector{Float64}}
    dog_data::Vector{Vector{Float64}}
    
    function CatDogDataset(dirpath::String)
        # Get list of files in data directory
        files = readdir(dirpath)
        
        # Split into cat and dog files
        cat_files = [joinpath(dirpath, f) for f in files if startswith(f, "cat_")]
        dog_files = [joinpath(dirpath, f) for f in files if startswith(f, "dog_")]
        
        # Helper function to load raw timeseries data from a .wav file,
        # sum across channels (if there are any) and return the resultant
        # 1-dimensional vector
        load_data(filename) = mean(wavread(filename)[1], dims=2)[:, 1]
                                
        # Load the file data, converting to a single channel if we need to.
        cat_data = [mean(wavread(f)[1], dims=2)[:,1] for f in cat_files]
        dog_data = [mean(wavread(f)[1], dims=2)[:,1] for f in dog_files]
        return new(cat_data, dog_data)
    end
end

# When we ask for the "length" of a dataset, return the minimum length
# of the classes.
function length(d::CatDogDataset)
    return min(length(d.cat_data), length(d.dog_data))
end
                            
function show(io::IO, d::CatDogDataset)
    num_cats = length(d.cat_data)
    num_dogs = length(d.dog_data)
    print(io, "CatDogDataset with $num_cats cat samples, $num_dogs dog samples")
end

train_dataset = CatDogDataset("cats_dogs_train")
test_dataset = CatDogDataset("cats_dogs_test")
                            
println("Training Dataset: $(train_dataset)")
println("Testing Dataset: $(test_dataset)");

Training Dataset: CatDogDataset with 125 cat samples, 74 dog samples
Testing Dataset: CatDogDataset with 39 cat samples, 39 dog samples


We next create a data sampler that shuffles, augments and batches our data:

In [7]:
import Base: iterate

# Next, define a batched datasampler that pulls from the CatDogDataset,
# implementing useful features such as data augmentation, stratified sampling,
# shuffling, etc....
struct RandomDataSampler
    # The dataset we'll be pulling from
    dataset::CatDogDataset
    
    # How many samples (total, split evenly among classes) to return from iterate()
    batch_size::Int
    
    # How much random noise to add
    noise_amnt::Float64
    
    # How much of the beginning/ending to allow truncation to throw away (in terms of overall length)?
    truncate_amnt::Float64
    
    function RandomDataSampler(dataset, batch_size, noise_amnt=0.01, truncate_amnt=0.1)
        # Don't allow the user to truncate the entire file away.
        @assert truncate_amnt < 0.5
        return new(dataset, batch_size, noise_amnt, truncate_amnt)
    end
end

length(ds::RandomDataSampler) = ceil(Int, length(ds.dataset)/ds.batch_size)

# Implement iteration protocol to yield a new batch of data,
# sampling completely at random, but stratifying so that we end
# up with equal cat and dog samples within each minibatch.
function iterate(ds::RandomDataSampler, batch_idx=1)
    # Stop after the total number of batches (rounding up, so we might repeat a little bit)
    if batch_idx > length(ds)
        return nothing
    end

    # Load random cats and dogs
    random_cats = rand(ds.dataset.cat_data, div(ds.batch_size,2))
    random_dogs = rand(ds.dataset.dog_data, div(ds.batch_size,2))
    
    # Randomly truncate timeseries by lopping off starting and ending pieces
    function truncate(data)
        start_trunc = rand(1:round(Int,ds.truncate_amnt*length(data)))
        end_trunc   = rand(0:round(Int,ds.truncate_amnt*length(data)))
        return data[start_trunc:end - end_trunc]
    end
    random_cats = truncate.(random_cats)
    random_dogs = truncate.(random_dogs)

    # Get maximum length, so that we can zero-pad everything to be the same length
    max_len = maximum(vcat(length.(random_cats), length.(random_dogs)))
    function zero_pad(data)
        return vcat(data, zeros(max_len - length(data)))
    end
    
    # When we zero pad, we can then smoosh together into a matrix, via hcat()
    random_cats = hcat(zero_pad.(random_cats))
    random_dogs = hcat(zero_pad.(random_dogs))
    
    # And now we smooth these matrices together into one big matrix to get X
    X = hcat(random_cats..., random_dogs...)
    
    # Finally, we augment X with noise:
    X .+= randn(size(X))
    
    # Y is a matrix of one-hot activated labels.
    Y = zeros(Int, 2, ds.batch_size)
    
    # The first half of the batch is cats, the second half is dogs
    Y[1, 1:div(ds.batch_size,2)] .= 1
    Y[2, div(ds.batch_size,2)+1:end] .= 1
    
    # Return X and Y, and (batch_idx + 1) so we keep track of the batch index
    # as for loops call iterate() again and again
    return (X, Y), batch_idx + 1
end

function show(io::IO, ds::RandomDataSampler)
    print(io, "RandomDataSampler with batch size $(ds.batch_size) and epoch length $(ceil(Int, length(ds.dataset)/ds.batch_size))")
end;

We also need a more "boring" data sampler for our test set that doesn't do any augmentation, returns "batches" of size 1, etc...

In [8]:
# Also a boring "SequentialDataSampler" that always gives a single batch
# at a time, does no augmentation, etc....
struct SequentialDataSampler
    # The dataset we'll be pulling from
    dataset::CatDogDataset
end

# Implement iteration protocal, just returning raw timeseries
function iterate(ds::SequentialDataSampler, batch_idx=1)
    # Stop after the total number of examples
    if batch_idx > length(ds)
        return nothing
    end
    
    num_cats = length(ds.dataset.cat_data)

    # If we're in the first half of our dataset, yield cats, directly
    # generating Y as appropriate.  If we're in the second half of our
    # dataset, yield the dogs.
    if batch_idx <= num_cats
        X = ds.dataset.cat_data[batch_idx][:,:]
        Y = Int64[1; 0]
    else
        X = ds.dataset.dog_data[batch_idx - num_cats][:,:]
        Y = Int64[0; 1]
    end
    
    return (X, Y), batch_idx + 1
end

function length(ds::SequentialDataSampler)
    num_cats = length(ds.dataset.cat_data)
    num_dogs = length(ds.dataset.dog_data)

    return num_cats + num_dogs
end

function show(io::IO, ds::SequentialDataSampler)
    print(io, "SequentialDataSampler with $(length(ds)) samples")
end;

In [9]:
train_sampler = RandomDataSampler(train_dataset, 8)
val_sampler = SequentialDataSampler(test_dataset)

SequentialDataSampler with 78 samples

In [10]:
# Show data sizes as an example of how to iterate over this data sampler
for (X, Y) in train_sampler
    @show size(X), size(Y)
end

(size(X), size(Y)) = ((200786, 8), (2, 8))
(size(X), size(Y)) = ((166806, 8), (2, 8))
(size(X), size(Y)) = ((223581, 8), (2, 8))
(size(X), size(Y)) = ((162622, 8), (2, 8))
(size(X), size(Y)) = ((165751, 8), (2, 8))
(size(X), size(Y)) = ((180488, 8), (2, 8))
(size(X), size(Y)) = ((183890, 8), (2, 8))
(size(X), size(Y)) = ((175695, 8), (2, 8))
(size(X), size(Y)) = ((185885, 8), (2, 8))
(size(X), size(Y)) = ((144275, 8), (2, 8))


## Model definition

We are going to take these timeseries and push them through the Short-Time-Fourier-Transform to get a time-frequency representation that will then be run through a recurrent network that takes in a column at a time to eventually produce an output vector that will be used to classify whether this is a cat or dog:

In [11]:
struct RecurrentModel
    feature_transformer
    recurrent_stacks
    classifier
    
    function RecurrentModel(NFFT, rnn_state_size, num_rnns; spectral_ROI=0.25)
        # Our "feature transformer" takes each timeseries, calculates the STFT and constructs an
        # image off of that
        xformer = (batch) -> begin
            STFT = (x) -> stft(x, NFFT; spectral_ROI=spectral_ROI)
            frames = [abs.(STFT(batch[:, idx])) for idx in 1:size(batch,2)]
            cat(frames...; dims=3)
        end

        recurrent_stacks = Chain(
            # Next, LSTM to go from NFFT down to rnn_state_size
            LSTM(round(Int, spectral_ROI*(NFFT/2+1)), rnn_state_size),
            # Keep on recurring within rnn_state_size for as many RNNs as we need
            (LSTM(rnn_state_size, rnn_state_size) for _ in 1:(num_rnns-1))...,
        )

        classifier = Chain(
            # Then convert from rnn_state_size down to our output class size
            Dense(rnn_state_size, 2),

            # And finally softmax it
            softmax,
        )

        return new(xformer, recurrent_stacks, classifier)
    end
end
Flux.@treelike RecurrentModel


# Forward pass definition
function (model::RecurrentModel)(X)
    # First, convert timeseries to temporal-spectral representation
    X = model.feature_transformer(X)
    
    # Reset the RNNs, then feed in a column of the image at a time
    Flux.reset!(model.recurrent_stacks)
    rnn_output = model.recurrent_stacks(X[1,:,:])
    for idx in 2:size(X,1)
        rnn_output = model.recurrent_stacks(X[idx,:,:])
    end
    
    # Finally, take that rnn output and classify it:
    classification = model.classifier(rnn_output)
    return classification
end

model = RecurrentModel(512, 16, 3)

RecurrentModel(getfield(Main, Symbol("##24#29")){Float64,Int64}(0.25, 512), Chain(Recur(LSTMCell(64, 16)), Recur(LSTMCell(16, 16)), Recur(LSTMCell(16, 16))), Chain(Dense(16, 2), NNlib.softmax))

In [12]:
# Show example output for an example batch
X, Y = first(train_sampler)
model(X)

Tracked 2×8 Array{Float64,2}:
 0.405853  0.417981  0.395743  0.399749  …  0.404967  0.404571  0.403309
 0.594147  0.582019  0.604257  0.600251     0.595033  0.595429  0.596691

## Training Loop

Alright, we're ready to train it up!  Let's write ourselves a training loop, very reminiscent to what we've seen before, but doing nice things like keeping track of training loss, so that we can plot it.

In [13]:
function accuracy(y_hat, y)
    ay_hat = Flux.onecold(y_hat)
    ay = Flux.onecold(y)
    return 100.0*sum(ay .== ay_hat)/size(y,2)
end

function train!(model, opt, train_data, val_data; num_epochs = 10)
    num_params = sum([prod(size(p)) for p in params(model)])
    println("Training model with $(num_params) parameters over $(num_epochs) epochs")
    train_losses = Float64[]
    val_losses = Float64[]
    
    for epoch_idx in 1:num_epochs
        for (x, y) in train_data
            y_hat = model(x)

            # Calculate the loss and backpropagate it
            loss = Flux.logitcrossentropy(y_hat, y)
            Flux.Optimise.@interrupts Flux.back!(loss)

            # Save training loss on a minibatch-by-minibatch basis
            push!(train_losses, Flux.Tracker.data(loss))

            # Update the weights by taking an optimizer step
            opt()
        end
        
        
        # At the end of each epoch, run our entire validation set through.

        # We know the shape of val_ys and val_y_hats
        val_ys = zeros(2, length(val_data))
        val_y_hats = zeros(2, length(val_data))
        
        batch_idx = 1
        for (x, y) in val_data
            val_y_hats[:, batch_idx] = Flux.Tracker.data(model(x))
            val_ys[:, batch_idx] = y
            batch_idx += 1
        end
        
        # finally, calculate overall validation loss:
        push!(val_losses, Flux.logitcrossentropy(val_y_hats, val_ys))
        println("[$epoch_idx]: val loss $(val_losses[end]), acc: $(@sprintf "%.2f" accuracy(val_y_hats, val_ys))")
    end
    
    # Return the losses so we can see how well we've converged
    return train_losses, val_losses
end

train! (generic function with 1 method)

In [14]:
models = Dict()
losses = Dict()

Dict{Any,Any} with 0 entries

In [None]:
for NFFT in (512, 256), state_size in (256, 128, 64), rnns in (3, 2)
    model_params = (NFFT,state_size,rnns)
    println("model hyperparameters: $(model_params)")
    
    models[model_params] = RecurrentModel(model_params...)
    opt = Flux.Optimise.ADAM(params(models[model_params]), 1e-2)
    losses[model_params] = train!(models[model_params], opt, train_sampler, val_sampler; num_epochs=100)
end

model hyperparameters: (512, 256, 3)
Training model with 1381378 parameters over 100 epochs
[1]: val loss 0.7054885774719337, acc: 50.00
[2]: val loss 0.7007106320897505, acc: 50.00
[3]: val loss 0.6957208777226257, acc: 50.00
[4]: val loss 0.6937366728230908, acc: 50.00
[5]: val loss 0.6933157782014882, acc: 50.00
[6]: val loss 0.6932803268087843, acc: 50.00
[7]: val loss 0.6932786542436985, acc: 50.00
[8]: val loss 0.6932802571828806, acc: 50.00
[9]: val loss 0.6932828380878043, acc: 50.00
[10]: val loss 0.6932831130316522, acc: 50.00
[11]: val loss 0.693282457115193, acc: 50.00
[12]: val loss 0.6932829848977003, acc: 41.03
[13]: val loss 0.693283766700495, acc: 44.87
[14]: val loss 0.6932839539589776, acc: 38.46
[15]: val loss 0.6932839041425455, acc: 42.31
[16]: val loss 0.6932845686676978, acc: 41.03
[17]: val loss 0.6932842881502881, acc: 42.31
[18]: val loss 0.6932846874255585, acc: 41.03
[19]: val loss 0.6932853257079444, acc: 42.31
[20]: val loss 0.6932857633483709, acc: 41.03

In [16]:
losses

Dict{Any,Any} with 0 entries

In [17]:
# Show losses for a specific set of parameters
model_params = (1024, 32, 4)
plot([
    scatter(;
        x=(0:(length(losses[model_params][1])-1))./length(train_sampler),
        y=losses[model_params][1],
        name="Train Losses",
    ),
    scatter(;
        x=1:length(losses[model_params][2]),
        y=losses[model_params][2],
        name="Validation Loss",
    ),
])

KeyError: KeyError: key (1024, 32, 4) not found