In [1]:
using Flux, Statistics
using Flux: onehotbatch, onecold, crossentropy, throttle
using Base.Iterators: repeated, partition
using Printf, BSON
using CSV
using DataFrames
using Tables

##########################################################################################
#                                                                                        #
#                                  BRIEF INTRODUCTION                                    #
#                                                                                        #
##########################################################################################
# In this model, we use an Autoencoder to compress the information in the OTUs to        #
# a lower dimension.                                                                     #
##########################################################################################
# Currently the target dimension is 50, but this number is subject to change.            #
##########################################################################################
# The AE implemented in this notebook is the UnderComplete AutoEncoder. The reduced      #
# dimension is self-defined and the model forces the compression to the target dimension.#
# If overfitting occurs (which is not the case for dimensions 50 and 100), I would use a #
# De-noising AutoEncoder instead, which is the same model but its output layer has random#
# noise added to it.                                                                     #
##########################################################################################
# If time permits, I would also try the sparse AutoEncoder, which adds a regularizer to  #
# the loss function and introduces sparsity. In this model, I would be able to let the   #
# model decide what is the optimal dimension for all necessary information.              #
##########################################################################################

In [2]:
#=
This function builds one fold for K-fold Cross-Validation.
INPUT:
data: the whole un-partitioned dataset, one sample per row
idx : the row indices of the samples that belong to this fold
RETURN:
A tuple (inputs, labels) for this fold. Both entries are the transposed
Float32 batch (one sample per column); for an autoencoder the inputs and
the labels are identical.
An empty `idx` (or a dataset without rows) yields an empty batch with
size(data, 2) rows and 0 columns instead of an error.
=#
function make_fold(data, idx)
    # Normalise the indices to a plain vector so ranges, tuples and vectors all work
    rows = vec(collect(idx))
    # Select all rows of the fold at once and convert the elements to Float32
    data_batch = convert(Matrix{Float32}, data[rows, :])
    # Transpose so that every column is one sample
    return (data_batch', data_batch')
end

make_fold (generic function with 1 method)

In [3]:
#=
This function partitions the whole dataset into folds for K-fold Cross-Validation.
INPUT:
otu_batch: the whole dataset, one sample per row
k        : (keyword, default 10) the requested number of folds
RETURN:
A vector of folds as produced by make_fold. Exactly min(k, number of rows)
folds are returned; they are contiguous, cover every row exactly once, keep
the original row order and differ in size by at most one row.
(The previous fixed chunk size of n÷10+1 could yield fewer than 10 folds,
e.g. 9 folds for 50 rows.)
=#
function k_fold_partition(otu_batch; k::Integer=10)
    k >= 1 || throw(ArgumentError("k must be at least 1, got $k"))
    n = size(otu_batch, 1)
    # Never create more folds than there are rows, so no fold is empty
    nfolds = min(k, n)
    # Fold i covers rows (i-1)*n÷nfolds + 1 through i*n÷nfolds
    whole_set = [
        make_fold(otu_batch, (((i - 1) * n) ÷ nfolds + 1):((i * n) ÷ nfolds))
        for i in 1:nfolds
    ]

    return whole_set
end

k_fold_partition (generic function with 1 method)

In [4]:
#=
Model of the UnderComplete AutoEncoder.
The encoder and the decoder each use three Dense layers instead of a single one,
so the information is compressed (2394 -> 800 -> 200 -> 50) and restored
(50 -> 200 -> 800 -> 2394) in several steps rather than in one radical step,
which in theory would be more stable.
The disadvantage is that the computation is much slower.
The code layer is of dimension 50 and has no activation function.
All other layers use the sigmoid function, which online sources recommend
more than Relu.
=#
model = let
    # Encoder: 2394 inputs down to the 50-dimensional code
    encoder = Chain(
        Dense(2394, 800, σ),
        Dense(800, 200, σ),
        Dense(200, 50),
    )
    # Decoder: mirror image of the encoder, sigmoid on the output layer as well
    decoder = Chain(
        Dense(50, 200, σ),
        Dense(200, 800, σ),
        Dense(800, 2394, σ),
    )
    Chain(encoder, decoder)
end

Chain(Chain(Dense(2394, 800, σ), Dense(800, 200, σ), Dense(200, 50)), Chain(Dense(50, 200, σ), Dense(200, 800, σ), Dense(800, 2394, σ)))

In [5]:
#=
This function runs K-fold Cross-Validation of the global `model` on `whole_set`.
For every fold k, fold k is used as the validation set and all other folds are
used as the training set. The parameters of `model` are re-drawn from randn
before each fold, the model is trained for at most 200 epochs with ADAM(0.001),
and training of the fold stops early once the validation loss has failed to
decrease 5 times.
INPUT:
whole_set: the folds returned by k_fold_partition; each fold is an
           (input, target) tuple
RETURN:
nothing. Progress and the lowest validation loss seen in each fold are printed.
=#
function train(whole_set)
    # Iterate over the folds that actually exist instead of assuming exactly 10
    for k in eachindex(whole_set)
        println("Start training on the ", k, "th fold...")
        train_set = gpu(whole_set[Not(k)])
        test_set = gpu(whole_set[k])
        # NOTE(review): only the data is passed through `gpu`; the global model is
        # not moved here. Confirm both live on the same device when a GPU is used.

        # reset all the parameters so every fold starts from a fresh model
        Flux.loadparams!(model, map(p -> p .= randn.(), Flux.params(model)))

        # NOTE(review): `crossentropy` is the categorical cross-entropy; confirm it
        # is the intended reconstruction loss for sigmoid outputs.
        loss(x, y) = crossentropy(model(x), y)
        loss_inc = 0
        best_loss = Inf32
        last_val_loss = Inf32

        opt = ADAM(0.001)

        for epoch_idx in 1:200
            Flux.train!(loss, params(model), train_set, opt)
            val_loss = loss(test_set...)
            # NOTE(review): this counter is cumulative over the whole fold and is
            # never reset after an improvement; confirm that is the intended rule.
            if val_loss >= last_val_loss
                loss_inc += 1
            end
            # Track the true minimum; previously any epoch-over-epoch decrease
            # overwrote the value, even when it was worse than an earlier epoch.
            best_loss = min(best_loss, val_loss)
            last_val_loss = val_loss
            if loss_inc >= 5
                println("Overfitting, force quit at epoch ", epoch_idx)
                break
            end
        end
        println("Finished training the ", k, "th folder..." )
        println("The final validation loss is: ", best_loss)
        println("------------------------------------")
    end
    return nothing
end

train (generic function with 1 method)

In [6]:
# This cell loads the data and converts it into the format the model expects.
# It then partitions the data into folds, prints the loss of the untrained
# model on every fold and starts the cross-validated training.

# Load data from the CSV file
data = CSV.read("../preliminary-model/processed-data/otu-yield-per-plant.csv", DataFrame)
data_arr = Matrix(data)
# Columns 2 through 2395 give the 2394 features expected by the model's first layer
otu = data_arr[:, 2:2395]
whole_set = k_fold_partition(otu)

# Re-draw every model parameter from randn to get the untrained baseline
Flux.loadparams!(model, map(p -> p .= randn.(), Flux.params(model)))
loss(x, y) = crossentropy(model(x),y)

print("The untrained loss for each folder is: ")
# Iterate over the folds that exist rather than assuming there are exactly 10
for i in eachindex(whole_set)
    print(loss(whole_set[i]...), ", ")
end
# Terminate the line so the next status message starts on its own line
println()

println("Start training...")
train(whole_set)

The untrained loss for each folder is: 20.07151, 19.86875, 21.555265, 22.037834, 22.264091, 23.456688, 23.189516, 24.2721, 24.593622, 23.640228, Start training...
Start training on the 1th fold...
Overfitting, force quit at epoch 85
Finished training the 1th folder...
The final validation loss is: 1.6548467
------------------------------------
Start training on the 2th fold...
Finished training the 2th folder...
The final validation loss is: 1.1548356
------------------------------------
Start training on the 3th fold...
Overfitting, force quit at epoch 185
Finished training the 3th folder...
The final validation loss is: 1.1771184
------------------------------------
Start training on the 4th fold...
Overfitting, force quit at epoch 110
Finished training the 4th folder...
The final validation loss is: 1.7424244
------------------------------------
Start training on the 5th fold...
Overfitting, force quit at epoch 114
Finished training the 5th folder...
The final validation loss is: 0.