In [33]:
using Flux, Statistics
using Flux: onehotbatch, onecold, crossentropy, throttle, params 
using Base.Iterators: repeated, partition
using Printf, BSON
using CSV
using DataFrames
using Tables

In [63]:
# Load the object stored under the key `model` in "AENN_4.bson" into a variable
# named `model`. A later cell applies `model[1]` to each OTU row to obtain the
# encoded features.
BSON.@load "AENN_4.bson" model

In [50]:
"""
    make_fold(data, label, idx)

Build one fold from the samples of `data` selected by `idx`.

# Arguments
- `data`: matrix with one sample per row and one feature per column.
- `label`: per-sample labels, indexable by `idx`; they are one-hot encoded over
  the classes `0:1`.
- `idx`: the row indices that belong to this fold.

# Returns
A tuple `(x, y)`: `x` is the selected rows converted to `Float32` and transposed,
so each column is one sample; `y` is the one-hot encoding of `label[idx]`.
"""
function make_fold(data, label, idx)
    # Ask the matrix for its column count directly: indexing `data[1, :]` would
    # copy a row just to measure it and fails when `data` has no rows.
    data_batch = Matrix{Float32}(undef, length(idx), size(data, 2))
    for (row, src) in enumerate(idx)
        data_batch[row, :] = data[src, :]
    end
    label_batch = onehotbatch(label[idx], 0:1)
    # Transpose so that samples run along columns, matching `label_batch`.
    return (data_batch', label_batch)
end

make_fold (generic function with 2 methods)

In [51]:
# The dataset is small, so evaluation uses k-fold cross-validation (intended: 10 folds)
# instead of a single train/test split.
function k_fold_partition(data, label)
    n_rows = size(data, 1)
    # Consecutive chunks of `n_rows ÷ 10 + 1` rows; the final chunk may be shorter, and
    # the number of chunks therefore depends on `n_rows`.
    chunk = n_rows ÷ 10 + 1
    folds = map(partition(1:n_rows, chunk)) do rows
        make_fold(data, label, rows)
    end

    return folds
end

k_fold_partition (generic function with 2 methods)

In [64]:
# Classifier: 4 inputs -> two 4-unit ReLU hidden layers -> 2 outputs, then softmax.
@info("Feed-Forward NN construction")
ff_model = Chain(
    Dense(4 => 4, relu),
    Dense(4 => 4, relu),
    Dense(4 => 2),
    softmax,
)

┌ Info: Feed-Forward NN construction
└ @ Main In[64]:2


Chain(
  Dense(4 => 4, relu),                  # 20 parameters
  Dense(4 => 4, relu),                  # 20 parameters
  Dense(4 => 2),                        # 10 parameters
  NNlib.softmax,
)                   # Total: 6 arrays, 50 parameters, 584 bytes.

In [70]:
"""
    train(whole_set)

Cross-validate the global `ff_model` on the folds in `whole_set` and return the
mean of the per-fold best accuracies.

Each element of `whole_set` is an `(x, y)` tuple as produced by `make_fold`. For
every fold `k`, the model parameters are re-randomised, the model is trained on
all other folds, and accuracy is measured on fold `k` after every epoch.

Note: the held-out fold's accuracy also drives the learning-rate schedule and the
early stop, and the value recorded per fold is the best one seen over the epochs,
so the returned figure is an optimistic estimate.

# Throws
- `ArgumentError` if `whole_set` is empty.
"""
function train(whole_set)
    isempty(whole_set) && throw(ArgumentError("whole_set must contain at least one fold"))
    # One slot per fold actually present, rather than assuming exactly 10.
    fold_accuracy = zeros(Float32, length(whole_set))
    for k in eachindex(fold_accuracy)
        # Fold k is held out; every other fold is used for training.
        train_set = whole_set[Not(k)]
        test_set = whole_set[k]
        best_acc = 0.0
        # Re-randomise every parameter array in place so folds do not share weights.
        # `randn.()` is evaluated once per element, not once per array.
        for p in Flux.params(ff_model)
            p .= randn.()
        end
        # Epoch index at which the accuracy last matched or beat the best so far.
        last_improvement = 0
        # Cross-entropy between the softmax output and the one-hot labels.
        loss(x, y) = crossentropy(ff_model(x), y)
        # Fraction of samples whose predicted class equals the labelled class.
        accuracy(x, y) = mean(onecold(ff_model(x)) .== onecold(y))
        # A larger learning rate (0.1) gave far worse accuracy and sometimes the same
        # output for different inputs; a smaller one (0.001) made little difference.
        opt = ADAM(0.01)
        # At most 200 epochs per fold.
        for epoch_idx in 1:200
            Flux.train!(loss, Flux.params(ff_model), train_set, opt)
            acc = accuracy(test_set...)
            # Effectively perfect on the held-out fold: nothing left to gain.
            if acc >= 0.999
                best_acc = acc
                break
            end
            # Ties count as progress, so a plateau at the best value keeps
            # resetting the patience counter.
            if acc >= best_acc
                best_acc = acc
                last_improvement = epoch_idx
            end
            # 20 epochs without progress: cut the learning rate tenfold (not below
            # roughly 1e-6) and restart the patience counter to give it a chance.
            if epoch_idx - last_improvement >= 20 && opt.eta > 1e-6
                opt.eta /= 10.0
                last_improvement = epoch_idx
            end
            # 50 epochs without progress, reachable only once the rate can no
            # longer be reduced: give up on this fold.
            if epoch_idx - last_improvement >= 50
                break
            end
        end
        fold_accuracy[k] = best_acc
    end
    # Average of the per-fold best accuracies.
    return mean(fold_accuracy)
end


train (generic function with 1 method)

In [66]:
# Read the per-plant table and flatten it into a plain matrix.
data = CSV.read("../processed-data/otu-yield-per-plant.csv", DataFrame)
data_arr = Matrix(data)
# Columns 2 through 2395 are used as the OTU predictors, column 2396 as the label.
otu, label = data_arr[:, 2:2395], data_arr[:, 2396]
@info("Finish loading the data...")

┌ Info: Finish loading the data...
└ @ Main In[66]:6


In [67]:
# Encode every sample with the first stage of the loaded model (`model[1]`).
# The row count comes from `otu` itself, so the output buffer and the loop bounds
# always agree and no variable defined outside this notebook's visible cells is used.
# The width 4 matches the input size of the first `Dense` layer of `ff_model`.
code = Array{Float32}(undef, size(otu, 1), 4)
for i in axes(otu, 1)
    code[i, :] .= model[1](otu[i, :])
end
@info ("Finish running the AE model ")

┌ Info: Finish running the AE model 
└ @ Main In[67]:5


In [72]:
# Build the folds from the encoded features, cross-validate, and report the result.
whole_set = k_fold_partition(code, label)
accuracy = whole_set |> train
println("The mean accuracy is ", accuracy)

The mean accuracy is 0.72424245
