In [1]:
using Flux, Statistics
using Flux: onehotbatch, onecold, crossentropy, throttle
using Base.Iterators: repeated, partition
using Printf, BSON
using CSV
using DataFrames

# In this model, we test how randomly select OTUs would work in the same model. 
# This model will only work for yield_per_plant as outcome, but I would assume that
# it will also work for others with minor modification.
# I will randomly select 100 out of 2395 OTUs as predictors and feed it into the NN
# For the hidden layer, I would only add one hidden layer. As discussed in many websites
# I viewed, under very rare circumstances would a second layer improve the performance.
# Also, with 2 or more layer, the NN would be harder to train, and it is bad with the 
# small amount of data we have.
# For the number of neurons, I read about a general rule of thumb that "the optimal size 
# of the hidden layer is between the input size and the output size:, so I set the hidden 
# size to 52.
# For output, I split the values of yield_per_plant into 4 categories, based on quantile. 
# This might be problematic.

In [2]:
##### This cell import the processed data for yield_per_plant and transform the outcome data.
# It also transform the data into array type so it's easier to work with.

@info("Loading data from the .csv file")
# Load data from the CSV file
data = CSV.read("./processed-data/otu-yield-per-plant.csv", DataFrame)
data_arr = Matrix(data)

@info("Data Processing...")
# Split the array into otu and label
otu = data_arr[:, 2:2395]
label = data_arr[:, 2396]
# Transform the array into 4 categories based on quantile
for i in 1:length(label)
    if label[i] <= 751
        label[i] = 0
    elseif label[i] <= 1068
        label[i] = 1
    elseif label[i] <= 1444
        label[i] = 2
    else
        label[i] = 3
    end
end

# transform the label into onehot encoding
# label_batch = onehotbatch(label, 0:3)

# select 50 unique numbers between 1 to 2394 (the 50 unique predictor)
# if not unique, select again
rand_num = zeros(Int, 100)
while true
    rand_num = rand((1:2394),100)
    if length(unique(rand_num)) == 100
        break
    end
end

# select the corresponding OTUs
otu_batch = otu[:,rand_num]

a = Flux.Data.DataLoader(otu_batch, batchsize=22)

println("Mean of all otu values: ", mean(otu_batch))

@info("Loading Dataset...Done")


┌ Info: Loading data from the .csv file
└ @ Main In[2]:4
┌ Info: Data Processing...
└ @ Main In[2]:9


Mean of all otu values: 0.0013581864052314813


┌ Info: Loading Dataset...Done
└ @ Main In[2]:46


In [3]:
# model construction
@info("Feed-Forward NN construction")
model = Chain(
    # Input 100 predictors and feed into 52 neurons in the hidden layer
    # use ReLu as activation function
    Dense(100, 52, relu),
    # feed 27 neurons to the output, which consists of 4 categories
    Dense(52, 4),
    # use softmax as the activation function
    softmax
)

┌ Info: Feed-Forward NN construction
└ @ Main In[3]:2


Chain(Dense(100, 52, relu), Dense(52, 4), softmax)

In [4]:
# Since we have very small amount of dataset,I am going to use 10-fold validation here

@info("Dividing data into 10 folds...")
# This function divide the data into 10 part and combine the otu with labels
# and return them in a batch
# So each batch has 22 tuples of 50 otus and an encoded label
function make_fold(data, label, idx)
    # batch for otu, 50*22
    data_batch = Array{Float32, 2}(undef, length(idx), length(otu_batch[1,:]))
    for i in 1:length(idx)
        data_batch[i,:] = data[idx[i],:]
    end
    # batch for label, 1(onehot encoding)*22
    label_batch = onehotbatch(label[idx], 0:3)
    return (data_batch', label_batch)
end

# partition the whole dataset into 10 folds
fold_idx = partition(1:length(otu_batch[:,1]), length(otu_batch[:,1])÷10+1)
# call make_fold and store the 10 folds
whole_set = [make_fold(otu_batch, label, i) for i in fold_idx]
println("Checking folded data...")
println("Number of folds:",length(whole_set))
println("Type of each fold:", typeof(whole_set[1]))
println("Number of data entries each fold contains:", length(whole_set[1][1][1,:]))
println("Number of predictors for each entry:", length(whole_set[1][1][:,1]))
@info("10-folder split, Done...")


┌ Info: Dividing data into 10 folds...
└ @ Main In[4]:3


Checking folded data...
Number of folds:10
Type of each fold:Tuple{LinearAlgebra.Adjoint{Float32,Array{Float32,2}},Flux.OneHotMatrix{Array{Flux.OneHotVector,1}}}
Number of data entries each fold contains:22
Number of predictors for each entry:100


┌ Info: 10-folder split, Done...
└ @ Main In[4]:27


In [5]:
# Train the model in this cell

# record the accuracy for each folder
mean_accuracy = zeros(Float32,10)
# loop for 10 folds
for k in 1:10
    # set the kth folder as testing set
    train_set = gpu(whole_set[Not(k)])
    test_set = gpu(whole_set[k])
    best_acc = 0.0
    
    # set to terminate the epoches if not improved for too long
    last_improvement = 0
    # crossentropy as loss function
    loss(x, y) = crossentropy(model(x),y)
    # take mean of accuracy
    accuracy(x, y) = mean(onecold(model(x)) .== onecold(y))
    # when learning rate is larger(0.1), the accuracy is far worse and some time result in the
    # same value with different input
    # when it's smaller(0.001), the result does not differ much
    opt = ADAM(0.01)
    
    @info("Beginning training loop...")

    # 100 epoches for each folder
    for epoch_idx in 1:100
        Flux.train!(loss, params(model), train_set, opt)
        acc = accuracy(test_set...)

        if acc >= 0.999
            best_acc = acc
            @info("Reached our target accuracy of 99.9%...terminate.")
            break
        end
        # update the best accuracy and last improvement
        if acc >= best_acc
            best_acc = acc
            last_improvement = epoch_idx
        end
        # no improvement for too long
        if epoch_idx - last_improvement >= 15 && opt.eta > 1e-6
            opt.eta /= 10.0
            # After dropping learning rate, give it a few epochs to improve
            last_improvement = epoch_idx
        end
        if epoch_idx - last_improvement >= 30
            @warn("Terminate before end of epoches")
            break
        end
    
    end
    # save the best accuracy for this folder
    mean_accuracy[k] = best_acc
    @warn("Current Folder Terminate, best accuracy is $(best_acc)")
end
# output the average accuracy for 10 folders
println("Average accuracy for 10 folder:",mean(mean_accuracy))


┌ Info: Beginning training loop...
└ @ Main In[5]:23
└ @ Main In[5]:54
┌ Info: Beginning training loop...
└ @ Main In[5]:23
└ @ Main In[5]:54
┌ Info: Beginning training loop...
└ @ Main In[5]:23
└ @ Main In[5]:54
┌ Info: Beginning training loop...
└ @ Main In[5]:23
└ @ Main In[5]:54
┌ Info: Beginning training loop...
└ @ Main In[5]:23
└ @ Main In[5]:54
┌ Info: Beginning training loop...
└ @ Main In[5]:23
└ @ Main In[5]:54
┌ Info: Beginning training loop...
└ @ Main In[5]:23
└ @ Main In[5]:54
┌ Info: Beginning training loop...
└ @ Main In[5]:23
└ @ Main In[5]:54
┌ Info: Beginning training loop...
└ @ Main In[5]:23
└ @ Main In[5]:54
┌ Info: Beginning training loop...
└ @ Main In[5]:23
└ @ Main In[5]:54


Average accuracy for 10 folder:0.5929293
