In [1]:
import Pkg 
Pkg.activate("./..")

[32m[1m  Activating[22m[39m project at `~/Development/QuantumGrav`


In [2]:
import QuantumGrav as QG
import Flux 
import DataFrames
import CausalSets
import Arrow
import JLD2

Generate some dummy data first. This is only there to demonstrate the usage of the `Dataset` type with the `Flux.Dataloader` type, so the details of data generation don´t matter here. 

In [3]:
data = QG.DataGeneration.generate_data_for_manifold(
    dimension = 2,
    seed = 329478,
    num_datapoints = 128,
)

Dict{Symbol, Vector} with 17 entries:
  :past_relations         => Vector{Vector{Int8}}[[[0, 0, 0, 0, 0, 0, 0, 0, 0, …
  :n                      => Float32[977.0, 718.0, 739.0, 833.0, 1028.0, 872.0,…
  :chains_4               => Float32[1.64535f9, 4.31597f8, 4.9879f8, 8.88973f8,…
  :chains_3               => Float32[2.67767f7, 9.87888f6, 1.11307f7, 1.66091f7…
  :chains_10              => Float32[5.56135f16, 2.30276f15, 2.39353f15, 1.4609…
  :link_matrix            => SparseMatrixCSC{Float32, Int32}[sparse(Int32[1, 1,…
  :chain_dimension_4      => Float32[1.98833, 2.01977, 2.01107, 1.98175, 2.0086…
  :coords                 => Vector{Vector{Float32}}[[[-0.474722, 0.0141404], […
  :relation_dimension     => Float32[1.97364, 2.02043, 1.99069, 1.98331, 2.0235…
  :relation_count         => Float32[243390.0, 126919.0, 137486.0, 175655.0, 25…
  :cardinality_abundances => Vector{Float32}[[977.0, 5518.0, 4344.0, 3915.0, 36…
  :dimension              => Float32[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,

In [4]:
data[:manifold]

128-element Vector{String}:
 "hypercylinder"
 "hypercylinder"
 "minkowski"
 "hypercylinder"
 "antiDeSitter"
 "antiDeSitter"
 "minkowski"
 "torus"
 "hypercylinder"
 "minkowski"
 ⋮
 "minkowski"
 "deSitter"
 "deSitter"
 "minkowski"
 "minkowski"
 "deSitter"
 "minkowski"
 "torus"
 "torus"

create a bunch of files, here with the same data just for demonstration

In [5]:
dir = tempdir()
for i in 1:10   
    Arrow.write(joinpath(tempdir(), "testdata$(i).arrow"), data)
end


In [6]:
JLD2.jldopen(joinpath(dir, "testdata.jld2"), "w") do file
    for i in 1:10
        for k in keys(data)
            file["chunk$(i)/$(k)"] = data[k]
        end
    end
end

Create a dataset from the thing. The dataset uses lazy loading to fetch data on demand, and caches some of it to allow for a compromise between memory usage and speed. 

In [7]:
dset = QG.DataLoader.Dataset(
    dir, 
    mode = "arrow",
    cache_size = 5
)

QuantumGrav.DataLoader.Dataset("/var/folders/hb/ydjt274n5xq9fcm68pjb03j80000gn/T", ["testdata.arrow", "testdata1.arrow", "testdata10.arrow", "testdata2.arrow", "testdata3.arrow", "testdata4.arrow", "testdata5.arrow", "testdata6.arrow", "testdata7.arrow", "testdata8.arrow", "testdata9.arrow", "testdata_1.arrow", "testdata_2.arrow", "testdata_3.arrow"], Dict(56 => (2, 20), 35 => (1, 35), 425 => (12, 29), 429 => (12, 33), 60 => (2, 24), 220 => (7, 4), 308 => (9, 20), 67 => (2, 31), 215 => (6, 35), 73 => (3, 1)…), 36, Dict{Int64, Any}(), 5, "arrow")

In [8]:
dsetjld = QG.DataLoader.Dataset(
    dir, 
    mode = "jld2",
    cache_size = 5
)

QuantumGrav.DataLoader.Dataset("/var/folders/hb/ydjt274n5xq9fcm68pjb03j80000gn/T", "testdata.jld2", Dict(1144 => (9, 120), 1175 => (10, 23), 719 => (6, 79), 1028 => (9, 4), 699 => (6, 59), 831 => (7, 63), 1074 => (9, 50), 319 => (3, 63), 687 => (6, 47), 1199 => (10, 47)…), 128, Dict{Int64, Any}(), 5, "jld2")

Use the created dataset with a Flux dataloader (itself based on `MLUtils.jl`). We use shuffle and confirm that the data is reordered in the first batch. This can now be used to write a training loop

In [9]:
[x.manifold for x in dset[1:32]] 

32-element Vector{String}:
 "hypercylinder"
 "hypercylinder"
 "minkowski"
 "hypercylinder"
 "antiDeSitter"
 "antiDeSitter"
 "minkowski"
 "torus"
 "hypercylinder"
 "minkowski"
 ⋮
 "hypercylinder"
 "deSitter"
 "hypercylinder"
 "minkowski"
 "hypercylinder"
 "antiDeSitter"
 "antiDeSitter"
 "deSitter"
 "deSitter"

In [10]:
shuffle_loader = Flux.DataLoader(
    dset,
    batchsize = 32,
    shuffle = true,
)

16-element DataLoader(::QuantumGrav.DataLoader.Dataset, shuffle=true, batchsize=32)
  with first element:
  32-element Vector{NamedTuple}

In [11]:
d = [first(shuffle_loader)[i].manifold for i in 1:32]

32-element Vector{String}:
 "antiDeSitter"
 "hypercylinder"
 "antiDeSitter"
 "deSitter"
 "minkowski"
 "deSitter"
 "hypercylinder"
 "hypercylinder"
 "antiDeSitter"
 "minkowski"
 ⋮
 "antiDeSitter"
 "deSitter"
 "hypercylinder"
 "deSitter"
 "deSitter"
 "minkowski"
 "hypercylinder"
 "hypercylinder"
 "minkowski"

Data is shuffled. Yay! We can do the same thing without shuffling, and should get the data in the order it is in the dataset

In [50]:
deterministic_loader = Flux.DataLoader(
    dset,
    batchsize = 32,
    shuffle = false,
)

16-element DataLoader(::QuantumGrav.DataLoader.Dataset, batchsize=32)
  with first element:
  32-element Vector{@NamedTuple{past_relations::SubArray{SubArray{Int8, 1, Arrow.Primitive{Int8, Vector{Int8}}, Tuple{UnitRange{Int64}}, true}, 1, Arrow.List{SubArray{Int8, 1, Arrow.Primitive{Int8, Vector{Int8}}, Tuple{UnitRange{Int64}}, true}, Int32, Arrow.Primitive{Int8, Vector{Int8}}}, Tuple{UnitRange{Int64}}, true}, n::Float32, chains_4::Float32, chains_3::Float32, chains_10::Float32, link_matrix::SubArray{Float32, 1, Arrow.Primitive{Float32, Vector{Float32}}, Tuple{UnitRange{Int64}}, true}, chain_dimension_4::Float32, coords::SubArray{SubArray{Float32, 1, Arrow.Primitive{Float32, Vector{Float32}}, Tuple{UnitRange{Int64}}, true}, 1, Arrow.List{SubArray{Float32, 1, Arrow.Primitive{Float32, Vector{Float32}}, Tuple{UnitRange{Int64}}, true}, Int32, Arrow.Primitive{Float32, Vector{Float32}}}, Tuple{UnitRange{Int64}}, true}, relation_dimension::Float32, relation_count::Float32, cardinality_abundan

In [51]:
d = [first(deterministic_loader)[i].manifold for i in 1:32]

32-element Vector{String}:
 "hypercylinder"
 "hypercylinder"
 "minkowski"
 "hypercylinder"
 "antiDeSitter"
 "antiDeSitter"
 "minkowski"
 "torus"
 "hypercylinder"
 "minkowski"
 ⋮
 "hypercylinder"
 "deSitter"
 "hypercylinder"
 "minkowski"
 "hypercylinder"
 "antiDeSitter"
 "antiDeSitter"
 "deSitter"
 "deSitter"

It's ordered now. Yay!

We can use some features from `MLUtils.jl` to split the data into train, test and validation loaders for example. note that the splits need to b in (0,1) (exclusive intervals!) and that the `splitob` function return an additional one for the last index that should be empty if the split fractions sum to 1, otherwise the last one contains the leftovers

In [12]:
train_loader, valid_loader, test_loader = Flux.DataLoader.(Flux.splitobs(dset, at=(0.5, 0.3, 0.2))[1:3], 
    batchsize = 16,
    shuffle = true,
)

(DataLoader(::MLUtils.ObsView{QuantumGrav.DataLoader.Dataset, UnitRange{Int64}}, shuffle=true, batchsize=16), DataLoader(::MLUtils.ObsView{QuantumGrav.DataLoader.Dataset, UnitRange{Int64}}, shuffle=true, batchsize=16), DataLoader(::MLUtils.ObsView{QuantumGrav.DataLoader.Dataset, UnitRange{Int64}}, shuffle=true, batchsize=16))

In [13]:
train_loader

16-element DataLoader(::MLUtils.ObsView{QuantumGrav.DataLoader.Dataset, UnitRange{Int64}}, shuffle=true, batchsize=16)
  with first element:
  16-element Vector{@NamedTuple{past_relations::SubArray{SubArray{Int8, 1, Arrow.Primitive{Int8, Vector{Int8}}, Tuple{UnitRange{Int64}}, true}, 1, Arrow.List{SubArray{Int8, 1, Arrow.Primitive{Int8, Vector{Int8}}, Tuple{UnitRange{Int64}}, true}, Int32, Arrow.Primitive{Int8, Vector{Int8}}}, Tuple{UnitRange{Int64}}, true}, n::Float32, chains_4::Float32, chains_3::Float32, chains_10::Float32, link_matrix::SubArray{Float32, 1, Arrow.Primitive{Float32, Vector{Float32}}, Tuple{UnitRange{Int64}}, true}, chain_dimension_4::Float32, coords::SubArray{SubArray{Float32, 1, Arrow.Primitive{Float32, Vector{Float32}}, Tuple{UnitRange{Int64}}, true}, 1, Arrow.List{SubArray{Float32, 1, Arrow.Primitive{Float32, Vector{Float32}}, Tuple{UnitRange{Int64}}, true}, Int32, Arrow.Primitive{Float32, Vector{Float32}}}, Tuple{UnitRange{Int64}}, true}, relation_dimension::Floa

In [14]:
valid_loader

10-element DataLoader(::MLUtils.ObsView{QuantumGrav.DataLoader.Dataset, UnitRange{Int64}}, shuffle=true, batchsize=16)
  with first element:
  16-element Vector{NamedTuple}

In [15]:
test_loader

7-element DataLoader(::MLUtils.ObsView{QuantumGrav.DataLoader.Dataset, UnitRange{Int64}}, shuffle=true, batchsize=16)
  with first element:
  16-element Vector{@NamedTuple{past_relations::SubArray{SubArray{Int8, 1, Arrow.Primitive{Int8, Vector{Int8}}, Tuple{UnitRange{Int64}}, true}, 1, Arrow.List{SubArray{Int8, 1, Arrow.Primitive{Int8, Vector{Int8}}, Tuple{UnitRange{Int64}}, true}, Int32, Arrow.Primitive{Int8, Vector{Int8}}}, Tuple{UnitRange{Int64}}, true}, relation_count::Float32, relation_dimension::Float32, coords::SubArray{SubArray{Float32, 1, Arrow.Primitive{Float32, Vector{Float32}}, Tuple{UnitRange{Int64}}, true}, 1, Arrow.List{SubArray{Float32, 1, Arrow.Primitive{Float32, Vector{Float32}}, Tuple{UnitRange{Int64}}, true}, Int32, Arrow.Primitive{Float32, Vector{Float32}}}, Tuple{UnitRange{Int64}}, true}, chain_dimension_3::Float32, cardinality_abundances::SubArray{Float32, 1, Arrow.Primitive{Float32, Vector{Float32}}, Tuple{UnitRange{Int64}}, true}, linkMatrix::SubArray{Float32, 

data loaders should cover the full dataset

In [18]:
(length(train_loader) + length(valid_loader) + length(test_loader)) * 16

528

In [21]:
(length(train_loader) + length(valid_loader) + length(test_loader)) * 16 >= length(dset) # >= because the last batch may be smaller than the batch size

true

For k-fold cross validation, we can use the `kfolds` function

In [22]:
for (x_train, x_val) in Flux.kfolds(dset, 5)
    println("Training set size: ", length(x_train))
    println("Validation set size: ", length(x_val))
end

Training set size: 403
Validation set size: 101
Training set size: 403
Validation set size: 101
Training set size: 403
Validation set size: 101
Training set size: 403
Validation set size: 101
Training set size: 404
Validation set size: 100


By default the folds are created using static splits. Use `shuffleobs` to randomly assign observations to the folds.

In [23]:
for (x_train, x_val) in Flux.kfolds(Flux.shuffleobs(dset), 5)
    println("Training set size: ", length(x_train))
    println("Validation set size: ", length(x_val))
end

Training set size: 403
Validation set size: 101
Training set size: 403
Validation set size: 101
Training set size: 403
Validation set size: 101
Training set size: 403
Validation set size: 101
Training set size: 404
Validation set size: 100


There is a lot more than this. Go to the `MLUtils.jl` documentation to learn more. 