In [1]:
%install-location $cwd/swift-install
%install '.package(path: "../../..")' Epochs

Installing packages:
	.package(path: "../../..")
		Epochs
With SwiftPM flags: []
Working in: /tmp/tmpuu5gmkrr/swift-install
/home/sgugger/swift/usr/bin/swift-build: /home/sgugger/anaconda3/lib/libcurl.so.4: no version information available (required by /home/sgugger/swift/usr/lib/swift/linux/libFoundationNetworking.so)
[1/2] Compiling Epochs Batches.swift
[2/4] Compiling Epochs Collatable.swift
[3/4] Compiling Epochs Backend.swift
[4/5] Merging module Epochs
[5/8] Wrapping AST for Epochs for debugging
[6/8] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
[7/9] Merging module jupyterInstalledPackages
[8/8] Linking libjupyterInstalledPackages.so
Initializing Swift...
Installation complete!


In [2]:
import TensorFlow
import Epochs

In [7]:
// Basic usage
// Start from some raw items (filenames, for example).
let rawItems = 0..<512
// Lazily map an expensive function over them (opening the images, for example).
let dataSet = rawItems.lazy.map { (_: Int) -> Tensor<Float> in
    Tensor<Float>(randomNormal: [224, 224, 3])
}
// Build a `Batches` on top of that collection...
let batches = Batches(on: dataSet, batchSize: 64, makeBatch: defaultMakeBatch)
// ...and walk through it, printing the shape of every batch.
batches.forEach { print($0.shape) }

[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]


In [8]:
// Enabling shuffle
// `shuffled()` returns an `Array`, so calling it directly on the lazy
// `dataSet` would compute every element up front. Shuffling the indices and
// mapping them lazily keeps the per-item work deferred until the element is
// actually accessed.
let batches = Batches(
    on: dataSet.indices.shuffled().lazy.map { dataSet[$0] },
    batchSize: 64,
    makeBatch: defaultMakeBatch
)

In [21]:
// Use with padding
// Build an array of elements with differing lengths (texts, for example).
var dataSet: [Tensor<Int32>] = (0..<512).map { (_: Int) -> Tensor<Int32> in
    // Each sample gets a random length between 1 and 200.
    let length = Int.random(in: 1...200)
    return Tensor<Int32>(
        randomUniform: [length],
        lowerBound: Tensor<Int32>(0),
        upperBound: Tensor<Int32>(100)
    )
}

// We need to pad those tensors to make them all the same length.
// We could do this in one lazy transform applied beforehand and pad everything
// to the same length, but it's not memory-efficient: some batches might need less
// padding. So we need to add the padding after having selected the samples we
// are trying to batch.
/// Pads every tensor in `tensors` with trailing zeros so that they all share
/// the length of the longest one.
///
/// - Parameter tensors: The tensors to pad. Only `shape[0]` is read, so they
///   are presumably expected to be one-dimensional.
/// - Returns: One padded tensor per input, in the same order; an empty array
///   when `tensors` is empty.
public func padTensors<Tensors: Collection>(tensors: Tensors) -> [Tensor<Int32>]
where Tensors.Element == Tensor<Int32> {
    // An empty collection has no maximum length: return early instead of
    // force-unwrapping `nil`.
    guard let maxLength = tensors.map({ $0.shape[0] }).max() else { return [] }
    return tensors.map { (tensor: Tensor<Int32>) -> Tensor<Int32> in
        // The longest tensor gets a zero-length padding.
        let padding = Tensor<Int32>(zeros: [maxLength - tensor.shape[0]])
        return Tensor<Int32>(concatenating: [tensor, padding])
    }
}

/// Builds a single batch out of `samples`: the samples are first passed
/// through `padTensors`, and the result is collated into one tensor.
func makeBatchWithPadding<BatchSamples: Collection>(samples: BatchSamples) -> Tensor<Int32>
where BatchSamples.Element == Tensor<Int32> {
    let padded = padTensors(tensors: samples)
    return Tensor<Int32>(collating: padded)
}

// Batch the variable-length samples, padding inside each batch.
let batches = Batches(
    on: dataSet,
    batchSize: 64,
    makeBatch: makeBatchWithPadding
)
batches.forEach { batch in print(batch.shape) }

[64, 199]
[64, 199]
[64, 200]
[64, 198]
[64, 200]
[64, 200]
[64, 199]
[64, 199]


In [27]:
// Use with a sampler
// Another memory-efficient option for the previous example is to put samples
// of roughly the same length in the same batch.
let sortedDataset = dataSet.sortedInBatches() { lhs, rhs in
    lhs.shape[0] > rhs.shape[0]
}

let batches = Batches(
    on: sortedDataset,
    batchSize: 64,
    makeBatch: makeBatchWithPadding
)
batches.forEach { print($0.shape) }

[64, 200]
[64, 176]
[64, 155]
[64, 127]
[64, 104]
[64, 78]
[64, 54]
[64, 24]


In [26]:
// With a `batchSize`, a bit of shuffling is retained:
let sortedDataset = dataSet
    .shuffled()
    .sortedInBatches(batchSize: 256) { lhs, rhs in lhs.shape[0] > rhs.shape[0] }

let batches = Batches(
    on: sortedDataset,
    batchSize: 64,
    makeBatch: makeBatchWithPadding
)
for paddedBatch in batches {
    print(paddedBatch.shape)
}

[64, 200]
[64, 176]
[64, 155]
[64, 127]
[64, 104]
[64, 78]
[64, 54]
[64, 24]


In [38]:
// Sometimes the shuffle has to be applied by the dataset itself, as in
// language modeling. Here is a basic version of such a dataset, to check that
// the API allows it.

/// The concatenation of all the texts in `numbers`, viewed as a collection of
/// consecutive, non-overlapping sequences of `sequenceLength` elements.
struct DataSet: RandomAccessCollection {
    typealias Index = Int
    typealias Element = Tensor<Int32>

    /// The texts, as given at initialization.
    let numbers: [[Int]]
    /// Number of elements in each sequence returned by the subscript.
    let sequenceLength: Int
    // The texts all concatenated together
    private var stream: [Int]

    var startIndex: Int { return 0 }
    // Integer division: a trailing partial sequence is not exposed.
    var endIndex: Int { return stream.count / sequenceLength }
    func index(after i: Int) -> Int { i + 1 }

    /// Creates a dataset over the concatenation of `numbers`.
    ///
    /// - Parameters:
    ///   - numbers: The texts to concatenate.
    ///   - sequenceLength: Length of each element; must be positive.
    init(numbers: [[Int]], sequenceLength: Int) {
        // `endIndex` divides by `sequenceLength`, so reject invalid values here
        // with a clear message rather than trapping later.
        precondition(sequenceLength > 0, "sequenceLength must be positive")
        self.numbers = numbers
        self.sequenceLength = sequenceLength
        stream = Array(numbers.joined())
    }

    /// Rebuilds the stream from the texts taken in a new random order.
    ///
    /// Texts are shuffled as whole units: the content of each text stays
    /// contiguous in the stream.
    mutating func shuffle() {
        stream = Array(numbers.shuffled().joined())
    }

    /// The `index`-th sequence of `sequenceLength` consecutive stream elements.
    subscript(index: Int) -> Tensor<Int32> {
        let start = index * sequenceLength
        let window = stream[start..<start + sequenceLength]
        return Tensor<Int32>(window.map { Int32($0) })
    }
}

In [39]:
// Build such a DataSet from a few "texts" of different lengths.
let numbers: [[Int]] = [
    [1, 2, 3, 4, 5],
    [6, 7, 8],
    [9, 10, 11, 12, 13, 14, 15],
    [16, 17, 18],
]
let dataset = DataSet(numbers: numbers, sequenceLength: 3)

In [44]:
/// A batcher template that delegates shuffling to the dataset itself.
public struct ShufflingTemplate: BatcherTemplate {
    public typealias SourceDataSet = DataSet

    /// Calls `shuffle()` on the dataset and returns the indices untouched.
    public func shuffleIndices(on source: inout DataSet, indices order: [Int]) -> [Int] {
        source.shuffle()
        return order
    }
}

In [45]:
// Now let's see what this gives us:
var batcher = Batcher(
    with: ShufflingTemplate(),
    on: dataset,
    batchSize: 3
)
batcher.reorder()
batcher.forEach { print($0) }

[[1, 2, 3],
 [4, 5, 6],
 [7, 8, 9]]
[[10, 11, 12],
 [13, 14, 15],
 [16, 17, 18]]


In [46]:
// Same batcher, this time asking for a shuffle before iterating.
batcher.reorder(shuffled: true)
batcher.forEach { shuffledBatch in print(shuffledBatch) }

[[ 9, 10, 11],
 [12, 13, 14],
 [15,  6,  7]]
[[ 8, 16, 17],
 [18,  1,  2],
 [ 3,  4,  5]]
