In [None]:
import Pkg 
Pkg.activate("./..")

In [None]:
using Distributed

In [None]:
# Start three local worker processes, but only when no workers exist yet.
# Without the guard, every re-run of this cell would add three more workers.
if nprocs() == 1
    addprocs(3)
end

In [None]:
# Show the number of available processes (main process plus workers).
nprocs()

In [None]:
# Show the number of Julia threads in the current process.
Threads.nthreads()

In [None]:
@everywhere begin 
    import QuantumGrav as QG
    import Arrow
    import Distributions
    import JLD2
    import ProgressMeter: @showprogress
    import Flux
end

Generate some dummy data first. This only serves to demonstrate how the `Dataset` type is used together with the `Flux.DataLoader` type, so the details of the data generation don't matter here. 

In [None]:
# Generate the dummy dataset used by the demo cells below (fixed seed,
# 128 datapoints, dimension 2) and keep it in the global `data`.
data = QG.DataGeneration.generate_data_for_manifold(;
    num_datapoints=128, dimension=2, seed=329478
)

In [None]:
# Show the `:manifold` entry of the generated data.
data[:manifold]

Create a bunch of files, here all containing the same data, just for demonstration.

In [None]:
# Directory the demo files are written to; the datasets below read from it too.
dir = tempdir()

# Write the same table to ten Arrow files. The paths are built from `dir`
# (instead of calling `tempdir()` again) so the location that is written to
# and the location the dataset reads from cannot drift apart.
for i in 1:10
    Arrow.write(joinpath(dir, "testdata$(i).arrow"), data)
end


In [None]:
# Store ten copies of `data` in a single JLD2 file; every entry of copy `i`
# is written under the key prefix `chunk<i>/`.
JLD2.jldopen(joinpath(dir, "testdata.jld2"), "w") do jld
    for chunk in 1:10, key in keys(data)
        jld["chunk$(chunk)/$(key)"] = data[key]
    end
end

Create a dataset from the files written above. The dataset uses lazy loading to fetch data on demand, and caches some of it as a compromise between memory usage and speed. 

In [None]:
# Dataset over the files in `dir`, opened in "arrow" mode with a cache size of 5.
dset = QG.DataLoader.Dataset(dir; mode="arrow", cache_size=5)

In [None]:
# Dataset over the files in `dir`, opened in "jld2" mode with a cache size of 5.
dsetjld = QG.DataLoader.Dataset(dir; mode="jld2", cache_size=5)

Use the created dataset with a Flux dataloader (itself based on `MLUtils.jl`). We use shuffling and confirm that the data is reordered in the first batch. This can now be used to write a training loop.

In [None]:
# Collect the `manifold` field of the first 32 observations, in dataset order.
collect(obs.manifold for obs in dset[1:32])

In [None]:
# Loader over `dset` that yields shuffled batches of 32 observations.
shuffle_loader = Flux.DataLoader(dset; shuffle=true, batchsize=32)

In [None]:
# Take ONE batch from the shuffling loader and read the `manifold` field of
# each of its 32 observations. The batch is fetched once up front: every call
# to `first(shuffle_loader)` starts a fresh (reshuffled) iteration, so calling
# it inside the comprehension would mix entries from 32 separately drawn
# batches and load the data 32 times.
d = let batch = first(shuffle_loader)
    [batch[i].manifold for i in 1:32]
end

Data is shuffled. Yay! We can do the same thing without shuffling, and should get the data in the order it is in the dataset

In [None]:
# Loader over `dset` that yields batches of 32 observations without shuffling.
deterministic_loader = Flux.DataLoader(dset; shuffle=false, batchsize=32)

In [None]:
# Fetch the first batch once and read the `manifold` field of its 32
# observations; fetching inside the comprehension would reload the same batch
# 32 times.
d = let batch = first(deterministic_loader)
    [batch[i].manifold for i in 1:32]
end

It's ordered now. Yay!

We can use some features from `MLUtils.jl` to split the data into, for example, train, validation and test loaders. Note that the split fractions need to be in (0, 1) (exclusive interval!) and that the `splitobs` function returns one additional split after the requested ones: it is empty if the split fractions sum to 1, otherwise it contains the leftover observations.

In [None]:
# Split `dset` at the fractions (0.5, 0.3, 0.2), keep the first three parts
# and wrap each of them in a shuffling loader with batch size 16.
train_loader, valid_loader, test_loader = map(
    Flux.splitobs(dset; at=(0.5, 0.3, 0.2))[1:3]
) do part
    Flux.DataLoader(part; batchsize=16, shuffle=true)
end

In [None]:
# Display the training loader.
train_loader

In [None]:
# Display the validation loader.
valid_loader

In [None]:
# Display the test loader.
test_loader

data loaders should cover the full dataset

In [None]:
# Total number of batches over all three loaders, times the batch size of 16.
16 * sum(length, (train_loader, valid_loader, test_loader))

In [None]:
# The loaders must be able to hold at least the whole dataset; `<=` rather
# than `==` because the last batch may be smaller than the batch size.
length(dset) <= 16 * sum(length, (train_loader, valid_loader, test_loader))

For k-fold cross validation, we can use the `kfolds` function

In [None]:
# Print the training/validation sizes of each of the 5 folds of `dset`.
foreach(Flux.kfolds(dset, 5)) do (train_fold, val_fold)
    println("Training set size: ", length(train_fold))
    println("Validation set size: ", length(val_fold))
end

By default the folds are created using static splits. Use `shuffleobs` to randomly assign observations to the folds.

In [None]:
# Same as the static folds, but the observations are shuffled first so that
# they are assigned to the 5 folds at random.
let shuffled = Flux.shuffleobs(dset)
    for (train_fold, val_fold) in Flux.kfolds(shuffled, 5)
        println("Training set size: ", length(train_fold))
        println("Validation set size: ", length(val_fold))
    end
end

There is a lot more than this. Go to the `MLUtils.jl` documentation to learn more. 

## Actual data generation for manifold-like csets 

In [None]:
# Drop every reference to the demo data so the garbage collector can reclaim
# it. `deterministic_loader` wraps `dset` and `d` holds values read from it, so
# both are cleared as well; otherwise the dataset would stay reachable.
data = nothing
train_loader = nothing
valid_loader = nothing
test_loader = nothing
shuffle_loader = nothing
deterministic_loader = nothing
d = nothing
dset = nothing
dsetjld = nothing
GC.gc() # Force garbage collection to free memory

In [None]:
# Samplers passed as `choose_num_events`: each maps `dim` (presumably the
# manifold dimension) to a uniform distribution whose bounds scale with a
# power of ten in `dim`.
# Deliberately kept as anonymous functions bound to plain variables.
large = dim -> Distributions.Uniform(0.5 * 10^(dim + 1), 2 * 10^(dim + 1))
interm = dim -> Distributions.Uniform(3 * 10^dim, 7 * 10^dim)
small = dim -> Distributions.Uniform(0.5 * 10^dim, 2.5 * 10^dim)
tiny = dim -> Distributions.Uniform(0.1 * 10^dim, 0.25 * 10^dim)

JLD2 in a single process

In [None]:
# Show the number of processes and the number of threads as a tuple.
Distributed.nprocs(), Threads.nthreads()

In [None]:
# Output directory for the JLD2 example. `mkpath` is used instead of `mkdir`
# so that re-running this cell (or a directory left over from an earlier
# session) does not raise an error.
dir = joinpath(tempdir(), "testdata")
mkpath(dir)

In [None]:
# Generate 2^2 chunks of data in this process and store them in one compressed
# JLD2 file; every entry of chunk `i` is written under the key `chunk<i>/<name>`.
JLD2.jldopen(joinpath(dir, "manifold_like_small.jld2"), "w"; compress=true) do file
    @showprogress for i in 1:2^2
        # Offset the seed by the chunk index. NOTE(review): assumes `seed` fully
        # determines the generated data, in which case the previous constant
        # seed made every chunk an identical copy — confirm.
        data = QG.DataGeneration.generate_data_for_manifold(;
            dimension=2,
            seed=329478 + i,
            num_datapoints=128,
            choose_num_events=small,
        )

        for k in keys(data)
            file["chunk$(i)/$(k)"] = data[k]
        end
    end
end

Arrow files with multiple processes 

In [None]:
# Output directory for the distributed run (machine-specific absolute path).
dir = joinpath("/", "mnt", "dataLinux", "machinelearning_data", "QuantumGrav", "interm")

# `mkpath` creates missing parent directories and is a no-op when the directory
# already exists, so no explicit `isdir` check is needed.
mkpath(dir)

# Generate 2^4 chunks on the worker processes, one zstd-compressed Arrow file
# per chunk.
@showprogress @distributed for i in 1:2^4
    chunkfile = joinpath(dir, "chunk$(i).arrow")

    # Refuse to overwrite the output of an earlier run.
    if isfile(chunkfile)
        throw(ArgumentError("File chunk$(i).arrow already exists in $dir."))
    end

    # Offset the seed by the chunk index. NOTE(review): assumes `seed` fully
    # determines the generated data, in which case the previous constant seed
    # made every chunk an identical copy — confirm.
    data = QG.DataGeneration.generate_data_for_manifold(;
        dimension=2,
        seed=329478 + i,
        num_datapoints=2^12,
        choose_num_events=interm,
    )

    Arrow.write(chunkfile, data; compress=:zstd)
end