In [7]:
using Flux, Statistics
using Flux: onehotbatch, onecold, crossentropy, throttle, params 
using Base.Iterators: repeated, partition
using Printf, BSON
using CSV
using DataFrames
using Tables

##########################################################################################
#                                                                                        #
#                                  BRIEF INTRODUCTION                                    #
#                                                                                        #
##########################################################################################
# In this model, we use an Autoencoder to compress the information in the OTUs to        #
# a lower dimension.                                                                     #
##########################################################################################
# Currently the target dimension is 50, but this number is due to change in the future.  #
##########################################################################################
# The AE implemented in this notebook is an undercomplete autoencoder. The size of the   #
# code layer is chosen by hand, and the bottleneck forces compression to that dimension. #
# If overfitting occurred (it does not for dimensions 50 and 100), I would use a         #
# denoising autoencoder instead: the same model, but trained to reconstruct the clean    #
# input from a copy of the input that has random noise added to it.                      #
##########################################################################################
# If time permits, I would also try the sparse AutoEncoder, which adds a regularizer to  #
# the loss function and introduces sparsity. In this model, I would be able to let the   #
# model decide what is the optimal dimension for all necessary information.              #
##########################################################################################

In [8]:
"""
    make_fold(data, idx)

Build one fold for K-fold cross-validation.

# Arguments
- `data`: the whole, un-partitioned dataset, indexed as `data[sample, feature]`.
- `idx`: the row indices of `data` that belong to this fold.

# Returns
A tuple `(inputs, labels)`. Both entries are the transposed `Float32` batch, so each
column is one sample. They are identical because an autoencoder is trained to
reproduce its own input.
"""
function make_fold(data, idx)
    # Ask for the column count directly: `length(data[1, :])` copies a whole row and
    # throws on a dataset with zero rows.
    n_features = size(data, 2)
    # One row per selected sample; values are converted to Float32 on assignment.
    data_batch = Matrix{Float32}(undef, length(idx), n_features)
    for (row, sample) in enumerate(idx)
        data_batch[row, :] = view(data, sample, :)
    end
    # Transpose so that samples are laid out along columns.
    fold = data_batch'
    return (fold, fold)
end

make_fold (generic function with 1 method)

In [9]:
"""
    k_fold_partition(otu_batch; k=10)

Split the rows of `otu_batch` into `k` contiguous folds for K-fold cross-validation.

The folds are balanced: their sizes differ by at most one row, with the larger folds
first. A fixed chunk size of `n ÷ 10 + 1` does not always give 10 folds (20 rows give
only 7, for example), so the fold boundaries are computed explicitly instead.

# Arguments
- `otu_batch`: the whole dataset, one sample per row.

# Keywords
- `k`: the number of folds (default 10). When there are fewer than `k` rows, one
  single-row fold is produced per row.

# Returns
A vector with one `make_fold(otu_batch, rows)` result per fold.

# Throws
- `ArgumentError` if `k < 1`.
"""
function k_fold_partition(otu_batch; k::Integer=10)
    k >= 1 || throw(ArgumentError("k must be at least 1, got $k"))
    n_samples = size(otu_batch, 1)
    # Never ask for more folds than there are rows; every fold stays non-empty.
    n_folds = min(k, n_samples)
    # `extra` folds receive one additional row so that all rows are used.
    base, extra = n_folds == 0 ? (0, 0) : divrem(n_samples, n_folds)
    fold_idx = map(1:n_folds) do fold
        first_row = (fold - 1) * base + min(fold - 1, extra) + 1
        last_row = fold * base + min(fold, extra)
        return first_row:last_row
    end
    # Materialise every fold as an (inputs, labels) tuple.
    return [make_fold(otu_batch, rows) for rows in fold_idx]
end

k_fold_partition (generic function with 1 method)

In [10]:
#=
Undercomplete autoencoder.
The encoder narrows the 2394 inputs down to a 50-dimensional code through three Dense
layers (2394 -> 800 -> 200 -> 50), and the decoder mirrors it back up
(50 -> 200 -> 800 -> 2394).
Compressing in several steps rather than one radical step should in theory be more
stable, at the price of much slower computation.
Every layer uses the sigmoid activation, which online sources recommend over ReLU here,
except the code layer, which has no activation.
=#
model = let n_in = 2394, n_code = 50
    # Encoder: input -> code
    encoder = Chain(
        Dense(n_in, 800, σ),
        Dense(800, 200, σ),
        Dense(200, n_code),
    )
    # Decoder: code -> reconstruction
    decoder = Chain(
        Dense(n_code, 200, σ),
        Dense(200, 800, σ),
        Dense(800, n_in, σ),
    )
    Chain(encoder, decoder)
end

Chain(
  Chain(
    Dense(2394 => 800, σ),              [90m# 1_916_000 parameters[39m
    Dense(800 => 200, σ),               [90m# 160_200 parameters[39m
    Dense(200 => 50),                   [90m# 10_050 parameters[39m
  ),
  Chain(
    Dense(50 => 200, σ),                [90m# 10_200 parameters[39m
    Dense(200 => 800, σ),               [90m# 160_800 parameters[39m
    Dense(800 => 2394, σ),              [90m# 1_917_594 parameters[39m
  ),
) [90m                  # Total: 12 arrays, [39m4_174_844 parameters, 15.927 MiB.

In [32]:
"""
    train(whole_set)

Run K-fold cross-validation of the global `model` over the folds in `whole_set`.

For every fold `k`:
1. fold `k` is the validation set and all other folds form the training set;
2. all parameters of `model` are re-drawn from a standard normal distribution;
3. the model is trained with ADAM (initial learning rate 0.001) and MSE loss for at
   most 300 epochs;
4. after each epoch the validation loss is computed; an improvement becomes the
   fold's best loss and resets the stall counter, anything else increments it;
5. after 5 consecutive epochs without improvement the learning rate is divided by 10
   and the counter is reset, as long as the learning rate is still above 1e-6;
6. once the learning rate can no longer be reduced, 10 consecutive epochs without
   improvement end the fold early to prevent overfitting.

Whenever a validation loss beats the best loss seen across all folds, `model`,
`epoch_idx` and `uni_best` are saved to "AENN.bson".

# Arguments
- `whole_set`: a collection of `(inputs, labels)` folds, e.g. from `k_fold_partition`.

# Returns
The lowest validation loss reached across all folds (`Inf32` if no epoch ever produced
a finite improvement, e.g. for an empty `whole_set`).
"""
function train(whole_set)
    max_epochs = 300     # upper bound on epochs per fold
    lr_patience = 5      # stalled epochs before the learning rate is reduced
    stop_patience = 10   # stalled epochs before the fold is abandoned
    min_eta = 1e-6       # the learning rate is not reduced once it is at or below this

    # Here the loss function is MSE (cross-entropy was also tried).
    loss(x, y) = Flux.Losses.mse(model(x), y)

    # Start from +Inf rather than an arbitrary finite bound, so the best model is
    # recorded and saved even when every loss is >= 1.
    uni_best = Inf32
    # Iterate over the folds actually present instead of assuming there are exactly 10.
    for k in eachindex(whole_set)
        println("Start training on the ", k, "th fold...")
        # Every fold except the k-th is used for training; the k-th for validation.
        train_set = [whole_set[i] for i in eachindex(whole_set) if i != k]
        test_set = whole_set[k]
        # Reset all parameters in place with independent standard-normal draws.
        foreach(p -> p .= randn.(), Flux.params(model))
        # Number of consecutive epochs without a validation improvement.
        loss_inc = 0
        # The lowest validation loss so far in this fold.
        best_loss = Inf32

        # The optimizer is Adam with a learning rate of 0.001.
        opt = ADAM(0.001)
        for epoch_idx in 1:max_epochs
            # Train the network for one epoch.
            Flux.train!(loss, params(model), train_set, opt)
            # Validation loss for this epoch.
            val_loss = loss(test_set...)
            if val_loss >= best_loss
                loss_inc += 1
            else
                best_loss = val_loss
                loss_inc = 0
                if best_loss < uni_best
                    uni_best = best_loss
                    # The variable names are the keys stored in the BSON file.
                    BSON.@save "AENN.bson" model epoch_idx uni_best
                end
            end
            # Plateau: lower the learning rate and give the model another chance.
            if loss_inc >= lr_patience && opt.eta > min_eta
                opt.eta /= 10.0
                loss_inc = 0
            end
            # Only reachable once the learning rate cannot be lowered any further.
            if loss_inc >= stop_patience
                println("Force exit to prevent overfit at epoch: ", epoch_idx)
                break
            end
        end
        println("Finished training the ", k, "th folder..." )
        println("The final validation loss is: ", best_loss)
        println("------------------------------------")
    end
    return uni_best
end

train (generic function with 1 method)

In [33]:
#=
Driver script: load the OTU table, split it into cross-validation folds, print the loss
of a freshly randomised model on every fold as a reference, then run the training.
=#

# Load data from the CSV file and transform them into an array
data = CSV.read("../processed-data/otu-yield-per-plant.csv", DataFrame)
data_arr = Matrix(data)
# Only select the OTUs: columns 2 through 2395, i.e. 2394 features per sample
otu = data_arr[:, 2:2395]
# Partition them into folds
whole_set = k_fold_partition(otu)

# Reset the parameters of the model in place with standard-normal draws to get the
# untrained loss, just as a reference
foreach(p -> p .= randn.(), Flux.params(model))
loss(x, y) = Flux.Losses.mse(model(x), y)
print("The untrained loss for each folder is: ")
# Walk over the folds that exist instead of assuming there are exactly 10
for fold in whole_set
    print(loss(fold...), ", ")
end
# Terminate the line so the next message does not get glued onto it
println()

# Start the training
println("Start training...")
best_loss = train(whole_set)
println("The lowest loss among all models is: ", best_loss)

##########################################################################################
#                                                                                        #
#                                      CONCLUSION                                        #
#                                                                                        #
##########################################################################################
# No major problems occurred during the training process, and overfitting did not        #
# occur. However, whether the model is good enough to compress the necessary             #
# information is still to be discussed. More on this later.                              #
##########################################################################################
# For the model itself, I tried both 100 and 50 as target dimension. Based on the result,#
# 100 and 50 have similar validation loss, which indicates that 50 encoded neurons has   #
# as much information as 100 encoded neurons. This is good.                              #
##########################################################################################
# I tried MSE and cross-entropy as loss functions.                                       #
# For Cross-entropy:                                                                     #
# The loss of the untrained model is around 20, and the trained model has a loss         #
# between 0.5 and 1.5, with some outliers at 3 or 5. I found that the loss               #
# converges towards 1.5 rapidly, within the first 3 epochs, in most cases. However, it   #
# is known that Cross-entropy is good for classification, and MSE is better for          #
# regression. Here we have a regression problem.                                         #
# For MSE:                                                                               #
# The loss for untrained model is much smaller, around 0.47 to 0.48. After the training, #
# the loss decreased to 0.13 to 0.16, and is very stable with no outlier.                #
##########################################################################################
# The big question for now is how to analyse the quality of the model.                   #
# For classification problems, the universal method is accuracy by comparing the one-hot #
# encoded output. This cannot be done in this case, as accuracy would be 0 (having       #
# exactly the same value for input and output is very bad).                              #
# The method I'm using now is to check the loss. The loss did decrease a lot, but I      #
# do not know if it is good enough, as there is no reference for comparison. One option  #
# is to normalize all the values so that the loss is more universally comparable.        #
##########################################################################################
# Next step for AutoEncoder:                                                             #
# I will put the code layer into a feedforward NN, just to compare its performance       #
# against the random selection model. If time permits, I want to implement the variants  #
# I mentioned in the beginning of this notebook. However, I do want to focus on the BNN  #
# as that is the main challenge. Thus, if the performance is better in the feedforward   #
# NN, then I would move forward to BNN, and maybe visit this part later in the project.  #
##########################################################################################

The untrained loss for each folder is: 0.46488762, 0.4618976, 0.46499193, 0.46676403, 0.4609492, 0.46118698, 0.4636053, 0.46428007, 0.46435606, 0.46840236, Start training...
Start training on the 1th fold...
Force exit to prevent overfit at epoch: 126
Finished training the 1th folder...
The final validation loss is: 0.14770779
------------------------------------
Start training on the 2th fold...
Force exit to prevent overfit at epoch: 98
Finished training the 2th folder...
The final validation loss is: 0.15950239
------------------------------------
Start training on the 3th fold...
Force exit to prevent overfit at epoch: 171
Finished training the 3th folder...
The final validation loss is: 0.15170877
------------------------------------
Start training on the 4th fold...
Finished training the 4th folder...
The final validation loss is: 0.13265596
------------------------------------
Start training on the 5th fold...
Finished training the 5th folder...
The final validation loss is: 0.1