In [62]:
using Flux, Statistics
using Flux: onehotbatch, onecold, crossentropy, throttle
using Base.Iterators: repeated, partition
using Printf, BSON
using CSV
using DataFrames
using Tables

# In this model, we use an autoencoder to compress the information in the OTUs into
# a lower dimension. The target dimension is 100, which is comparable to the preliminary
# model. I also plan to try both undercomplete and denoising AEs. If time permits,
# I also want to try a sparse AE; however, that will be in another notebook, as the model
# would be different.

In [63]:
# Build one fold of autoencoder data from the rows of `data` selected by `idx`.
#
# Arguments:
#   data - matrix whose rows are indexed by the entries of `idx`
#   idx  - collection of row indices that make up this fold
#
# Returns a tuple `(x, y)` in which both entries are the Float32 adjoint
# (columns of `data` × selected rows) of the chosen rows. Input and target are
# the same data, since no label is attached here.
function make_fold(data, idx)
    # size(data, 2) yields the column count without copying a row, and it also
    # works when `data` has no rows at all.
    fold = Array{Float32, 2}(undef, length(idx), size(data, 2))
    for (row, src) in enumerate(idx)
        # Element-wise assignment converts every value to Float32; the view
        # avoids allocating a temporary copy of the source row.
        fold[row, :] .= view(data, src, :)
    end
    # Flip the layout so that each selected row becomes one column.
    return (fold', fold')
end

make_fold (generic function with 1 method)

In [64]:
# Split the rows of `otu_batch` into consecutive folds for k-fold
# cross-validation. The dataset is very small, so 10 folds are used by default.
#
# Arguments:
#   otu_batch - matrix whose rows are distributed over the folds
#   k         - requested number of folds (keyword, default 10)
#
# Returns a vector holding one `make_fold` result per fold.
#
# Every fold covers `n ÷ k + 1` consecutive rows (the last fold may be shorter),
# where `n` is the number of rows. NOTE: for small `n` this chunk size yields
# fewer than `k` folds, e.g. n = 90 with k = 10 gives 9 folds of 10 rows each.
function k_fold_partition(otu_batch; k::Integer=10)
    k > 0 || throw(ArgumentError("k must be positive, got $k"))
    # size(…, 1) counts the rows without copying a column.
    n_samples = size(otu_batch, 1)
    fold_size = n_samples ÷ k + 1
    return [make_fold(otu_batch, rows) for rows in partition(1:n_samples, fold_size)]
end

k_fold_partition (generic function with 1 method)

In [65]:
# Autoencoder: the encoder maps the 2394 inputs down to a 100-dimensional code
# and the decoder mirrors the layer sizes back up to 2394 outputs.
model = let
    # Encoder: 2394 -> 1000 -> 400 -> 100; the last layer is given no activation.
    encoder = Chain(Dense(2394, 1000, σ), Dense(1000, 400, σ), Dense(400, 100))
    # Decoder: 100 -> 400 -> 1000 -> 2394, with σ on every layer.
    decoder = Chain(Dense(100, 400, σ), Dense(400, 1000, σ), Dense(1000, 2394, σ))
    Chain(encoder, decoder)
end

Chain(Chain(Dense(2394, 1000, σ), Dense(1000, 400, σ), Dense(400, 100)), Chain(Dense(100, 400, σ), Dense(400, 1000, σ), Dense(1000, 2394, σ)))

In [66]:
# Run cross-validation of the global `model` over the folds in `whole_set`.
#
# For every fold `k` the model parameters are re-drawn from a standard normal
# distribution, the model is trained with ADAM on all other folds, and fold `k`
# is used to compute a validation loss after each epoch.
#
# Arguments:
#   whole_set - indexable collection of folds; each fold is splatted into
#               `loss(x, y)`, so it must hold an input and a target
#   epochs    - maximum number of epochs per fold (keyword, default 100)
#   patience  - number of validation-loss increases tolerated before training
#               on the current fold stops (keyword, default 3)
#   lr        - ADAM learning rate (keyword, default 0.001)
#
# Returns `nothing`; progress and the final validation loss are printed.
function train(whole_set; epochs::Integer=100, patience::Integer=3, lr::Real=0.001)
    loss(x, y) = crossentropy(model(x), y)

    # Iterate over the folds that actually exist instead of assuming exactly 10.
    for k in eachindex(whole_set)
        println("Start training on the ", k, "th fold...")
        train_set = gpu(whole_set[Not(k)])
        test_set = gpu(whole_set[k])

        # Reset all the parameters in place; `randn.()` is evaluated once per
        # element, so every weight receives its own sample.
        for p in Flux.params(model)
            p .= randn.()
        end

        opt = ADAM(lr)
        # NOTE(review): increases are counted over the whole run of a fold, not
        # only consecutive ones — confirm this is the intended stopping rule.
        loss_inc = 0
        # Inf guarantees the first epoch is never counted as an increase.
        last_val_loss = Inf

        for epoch_idx in 1:epochs
            Flux.train!(loss, Flux.params(model), train_set, opt)
            val_loss = loss(test_set...)
            if val_loss > last_val_loss
                loss_inc += 1
            end
            last_val_loss = val_loss
            if loss_inc >= patience
                println("Overfitting, force quit at epoch ", epoch_idx)
                break
            end
        end
        println("Finished training the ", k, "th folder..." )
        println("The final validation loss is: ", last_val_loss)
    end
    return nothing
end

train (generic function with 1 method)

In [67]:
# This cell loads the data, converts it into a plain matrix, partitions it into
# cross-validation folds, reports the loss of the freshly reset model on every
# fold, and finally starts the training.

# Load data from the CSV file
data = CSV.read("../preliminary-model/processed-data/otu-yield-per-plant.csv", DataFrame)
data_arr = Matrix(data)
# Only columns 2 through 2395 are kept: 2394 columns, matching the model input size.
otu = data_arr[:, 2:2395]
whole_set = k_fold_partition(otu)

# Re-draw every model parameter in place; `randn.()` is evaluated once per
# element, so every weight receives its own sample.
for p in Flux.params(model)
    p .= randn.()
end
loss(x, y) = crossentropy(model(x),y)

print("The untrained loss for each folder is: ")
# Walk over the folds that exist rather than assuming there are exactly 10.
for fold in whole_set
    print(loss(fold...), ", ")
end

println("Start training...")
train(whole_set)

The untrained loss for each folder is: 18.289469, 19.105185, 19.363474, 18.959044, 20.85761, 19.659395, 20.16257, 19.77652, 19.373173, 18.625883, Start training...
Start training on the 1th fold...
Finished training the 1th folder...
The final validation loss is: 0.98374456
Start training on the 2th fold...
Overfitting, force quit at epoch 87
Finished training the 2th folder...
The final validation loss is: 1.3892647
Start training on the 3th fold...
Overfitting, force quit at epoch 51
Finished training the 3th folder...
The final validation loss is: 1.2635081
Start training on the 4th fold...
Finished training the 4th folder...
The final validation loss is: 3.2856605
Start training on the 5th fold...
Finished training the 5th folder...
The final validation loss is: 0.5024388
Start training on the 6th fold...
Overfitting, force quit at epoch 96
Finished training the 6th folder...
The final validation loss is: 5.8994045
Start training on the 7th fold...
Finished training the 7th folder.