In [1]:
using Flux, Statistics
using Flux: onehotbatch, onecold, crossentropy, throttle, params 
using Base.Iterators: repeated, partition
using Printf, BSON
using CSV
using DataFrames
using Tables

In [4]:
# Read the original table (first row is a header) and the generated table (no header).
original_dat = CSV.read(
    "../processed-data/otu-yield-per-plant.csv", DataFrame; header=true
)
generated_dat = CSV.read("../processed-data/generated-data.csv", DataFrame; header=false)

# Drop the column names and stack the two tables row-wise.
original = Matrix(original_dat)
generated = Matrix(generated_dat)
full_data = [original; generated]

# Columns 2 through 60 become the network input; column 62 is used as the label.
otu = full_data[:, 2:60]
label = full_data[:, 62]

# (label, number of rows) pairs, in order of first appearance of each label.
# NOTE(review): the message below assumes the first label encountered is the
# low-yield class and the second the high-yield class -- confirm against the data.
percentage = [(lbl, count(==(lbl), label)) for lbl in unique(label)]
println(
    "Number of low-yield samples: ",
    last(percentage[1]),
    "; Number of high-yield samples: ",
    last(percentage[2]),
)

Number of low-yield samples: 222; Number of high-yield samples: 222


In [5]:
"""
    make_fold(data, idx)

Build one cross-validation fold from the rows of `data` selected by `idx`.

The selected rows are copied, in the order given by `idx`, into a `Float32` matrix
which is then transposed so that every column is one sample. The same transposed
matrix is returned twice, as input and as target, because the network trained on
these folds is asked to reproduce its own input.

# Arguments
- `data`: matrix with one sample per row.
- `idx`: indices of the rows that belong to this fold.

# Returns
A tuple `(x, y)` whose entries both wrap the same `Float32` matrix of size
`(size(data, 2), length(idx))`.
"""
function make_fold(data, idx)
    # `size(data, 2)` instead of `length(data[1, :])`: no throw-away copy of the first
    # row, and no BoundsError when `data` has no rows at all.
    batch = Matrix{Float32}(undef, length(idx), size(data, 2))
    for (row, src) in enumerate(idx)
        # The view avoids a temporary row; assignment converts each value to Float32.
        batch[row, :] = @view data[src, :]
    end
    samples = batch'
    return (samples, samples)
end

make_fold (generic function with 1 method)

In [6]:
"""
    k_fold_partition(otu_batch; nfolds=10)

Split the rows of `otu_batch` into consecutive chunks and turn each chunk into a fold
with `make_fold`.

Rows are taken in their existing order (nothing is shuffled) in chunks of
`cld(nrows, nfolds)` rows; the last chunk may be shorter.

# Arguments
- `otu_batch`: matrix with one sample per row.

# Keywords
- `nfolds`: requested number of folds (default `10`).

# Returns
A vector with one `make_fold` result per chunk. It has exactly `nfolds` entries
whenever the row count is a multiple of `nfolds`. For small row counts that are not,
ceiling-sized chunks can still produce fewer entries (e.g. 25 rows give 9 chunks of
3), so callers should iterate over the result instead of assuming its length. An
empty matrix yields an empty vector.

# Throws
- `ArgumentError` if `nfolds` is not positive.
"""
function k_fold_partition(otu_batch; nfolds::Integer=10)
    nfolds > 0 || throw(ArgumentError("nfolds must be positive, got $nfolds"))
    nrows = size(otu_batch, 1)
    # Ceiling division. The earlier `nrows ÷ 10 + 1` gave the same chunk size except
    # when nrows was a multiple of 10, where it was one too large and therefore
    # produced fewer than 10 folds (90 rows -> 9 folds of 10).
    # `max(..., 1)` keeps `partition` valid for an empty matrix.
    chunk = max(cld(nrows, nfolds), 1)
    return [make_fold(otu_batch, rows) for rows in partition(1:nrows, chunk)]
end

k_fold_partition (generic function with 1 method)

In [38]:
# Autoencoder: 59 inputs are squeezed through a 32-unit sigmoid layer into a 5-value
# code (no activation on the code layer), then expanded back through a 32-unit sigmoid
# layer to 59 sigmoid outputs.
model = let n_in = 59, n_hidden = 32, n_code = 5
    encoder = Chain(Dense(n_in, n_hidden, σ), Dense(n_hidden, n_code))
    decoder = Chain(Dense(n_code, n_hidden, σ), Dense(n_hidden, n_in, σ))
    Chain(encoder, decoder)
end

Chain(
  Chain(
    Dense(59 => 32, σ),                 [90m# 1_920 parameters[39m
    Dense(32 => 5),                     [90m# 165 parameters[39m
  ),
  Chain(
    Dense(5 => 32, σ),                  [90m# 192 parameters[39m
    Dense(32 => 59, σ),                 [90m# 1_947 parameters[39m
  ),
) [90m                  # Total: 8 arrays, [39m4_224 parameters, 17.125 KiB.

In [39]:
"""
    train(whole_set)

Cross-validate the global `model` over the folds in `whole_set` and return the lowest
validation loss reached in any fold.

For each fold `k` the model parameters are re-drawn, the model is trained on all other
folds with ADAM (initial learning rate 0.001) using mean squared error, and the loss on
fold `k` is evaluated after every epoch.

Per-fold schedule:
- after 5 consecutive epochs without a new best validation loss, the learning rate is
  divided by 10 and the counter restarts, as long as the rate is still above `1e-8`;
- once the rate is no longer above `1e-8`, 10 consecutive epochs without a new best
  end the fold;
- a fold never runs longer than 10000 epochs.

# Arguments
- `whole_set`: vector of `(x, y)` folds, e.g. the result of `k_fold_partition`.

# Returns
The lowest validation loss over all folds, or the starting value `1` if no fold ever
got below it.

# Side effects
Mutates the global `model`, prints progress, and writes `model`, the epoch number and
the loss to "AENN.bson" every time a new overall best is reached.
"""
function train(whole_set)
    # Overall best across folds. Kept at the original starting value of 1 (a checkpoint
    # is only written once a fold beats it), but as a float so the variable does not
    # switch from Int to the loss type halfway through.
    uni_best = 1.0f0
    # Iterate over the folds that actually exist instead of assuming there are 10.
    for k in eachindex(whole_set)
        println("Start training on the ", k, "th fold...")
        # Hold out fold k for validation, train on the rest.
        train_set = whole_set[Not(k)]
        test_set = whole_set[k]
        # Re-draw every parameter in place. `p .= randn.()` calls `randn` once per
        # element, so no extra copy step is needed afterwards.
        foreach(p -> p .= randn.(), Flux.params(model))
        # Reconstruction loss: mean squared error between model output and target.
        loss(x, y) = Flux.Losses.mse(model(x), y)
        # Number of consecutive epochs without a new best validation loss.
        loss_inc = 0
        # Lowest validation loss seen so far in this fold.
        best_loss = 1000.0f0

        opt = ADAM(0.001)
        for epoch_idx in 1:10000
            Flux.train!(loss, params(model), train_set, opt)
            val_loss = loss(test_set...)
            if val_loss >= best_loss
                loss_inc += 1
            else
                best_loss = val_loss
                loss_inc = 0
                # Checkpoint only when this beats every earlier fold as well.
                if best_loss < uni_best
                    uni_best = best_loss
                    BSON.@save "AENN.bson" model epoch_idx uni_best
                end
            end
            # Plateau: shrink the learning rate and give the model another 5 epochs.
            if loss_inc >= 5 && opt.eta > 1e-8
                opt.eta /= 10.0
                loss_inc = 0
            end

            # Only reachable once the learning rate can no longer be reduced.
            if loss_inc >= 10
                println("Force exit to prevent overfit at epoch: ", epoch_idx)
                break
            end
        end
        println("Finished training the ", k, "th folder...")
        println("The final validation loss is: ", best_loss)
        println("------------------------------------")
    end
    return uni_best
end

train (generic function with 1 method)

In [40]:
whole_set = k_fold_partition(otu)
# Re-draw the parameters in place so the losses printed below are those of an
# untrained network; they serve only as a reference point.
foreach(p -> p .= randn.(), Flux.params(model))
loss(x, y) = Flux.Losses.mse(model(x), y)
print("The untrained loss for each folder is: ")
# Visit the folds that actually exist rather than assuming there are exactly 10.
for fold in whole_set
    print(loss(fold...), ", ")
end
# End the loss line so the next message starts on a line of its own.
println()

# Start the training
println("Start training...")
best_loss = train(whole_set)
println("The lowest loss among all models is: ", best_loss)

The untrained loss for each folder is: 0.4032061, 0.40256885, 0.4030643, 0.4028901, 0.401919, 0.40236816, 0.40230292, 0.40226257, 0.40237787, 0.40295884, Start training...
Start training on the 1th fold...
Force exit to prevent overfit at epoch: 1623
Finished training the 1th folder...
The final validation loss is: 2.0491874e-5
------------------------------------
Start training on the 2th fold...
Force exit to prevent overfit at epoch: 1424
Finished training the 2th folder...
The final validation loss is: 4.5302775e-5
------------------------------------
Start training on the 3th fold...
Force exit to prevent overfit at epoch: 1997
Finished training the 3th folder...
The final validation loss is: 3.443326e-5
------------------------------------
Start training on the 4th fold...
Finished training the 4th folder...
The final validation loss is: 2.2111126e-5
------------------------------------
Start training on the 5th fold...
Force exit to prevent overfit at epoch: 1178
Finished traini

In [41]:
# Read the `model` entry back from the checkpoint file that `train` writes, replacing
# the in-memory `model` with the saved one.
BSON.@load "AENN.bson" model


In [48]:
# Rows 1 and 5 of `otu`, each followed by what the loaded model outputs for it.
a = otu[1, :]
b = model(a)
d = otu[5, :]
e = model(d)
# One column per vector, in the order a, b, d, e; written without a header row.
c = [a b d e]
CSV.write("../processed-data/plot.csv", Tables.table(c); writeheader=false)

"../processed-data/plot.csv"