In [1]:
%install-location $cwd/swift-install
%install '.package(path: "../../..")' Epochs

Installing packages:
	.package(path: "../../..")
		Epochs
With SwiftPM flags: []
Working in: /tmp/tmpf4au_rlp/swift-install
[1/2] Compiling Epochs Batches.swift
[2/3] Compiling Epochs Collatable.swift
[3/4] Merging module Epochs
[4/7] Wrapping AST for Epochs for debugging
[5/7] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
[6/8] Merging module jupyterInstalledPackages
[7/7] Linking libjupyterInstalledPackages.so
Initializing Swift...
Installation complete!


In [2]:
import TensorFlow
import Epochs

In [3]:
// Basic usage
// Start from some cheap raw items (file names, for instance).
let rawItems = 0..<512
// Map an expensive function over them lazily (opening the images, for instance),
// so that a sample is only computed when it is accessed.
let dataSet = rawItems.lazy.map { _ in
    Tensor<Float>(randomNormal: [224, 224, 3])
}
// Group the samples into `Batches` of 64, collating each group with `\.collated`.
let batches = Batches(of: 64, from: dataSet, \.collated)
// Walk over the batches and show the shape of each one.
batches.forEach { print($0.shape) }

[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]


In [4]:
// Enabling shuffle
// `shuffled()` returns an `Array`, so calling it on the lazily mapped `dataSet`
// would compute every one of the 512 samples up front. Shuffle the cheap raw
// items instead and keep the expensive transform lazy.
let shuffledDataSet = rawItems.shuffled().lazy.map { _ in
    Tensor<Float>(randomNormal: [224, 224, 3])
}
let batches = Batches(of: 64, from: shuffledDataSet, \.collated)

In [5]:
// Use with padding
// Build an array of 512 samples of varying lengths (texts, for instance): each
// one is a rank-1 tensor holding between 1 and 200 random integers.
var dataSet: [Tensor<Int32>] = (0..<512).map { _ in
    Tensor<Int32>(
        randomUniform: [Int.random(in: 1...200)],
        lowerBound: Tensor<Int32>(0),
        upperBound: Tensor<Int32>(100))
}

// We need to pad those tensors to make them all the same length.
// We could do this in one lazy transform applied beforehand and pad everything
// to the same length, but it's not memory-efficient: some batches might need less
// padding. So we need to add the padding after having selected the samples we
// are trying to batch.
/// Pads every tensor in `tensors` with trailing zeros so that they all share the
/// length (size of the first dimension) of the longest one.
///
/// - Parameter tensors: The samples to pad. They are indexed with `shape[0]`, so
///   each one must have at least one dimension.
/// - Returns: The padded tensors, in the same order as `tensors`; an empty array
///   when `tensors` is empty.
public func padTensors<Tensors: Collection>(tensors: Tensors) -> [Tensor<Int32>]
where Tensors.Element == Tensor<Int32> {
    // An empty collection has no maximum length: there is nothing to pad.
    guard let maxLength = tensors.map({ $0.shape[0] }).max() else { return [] }
    return tensors.map { (tensor: Tensor<Int32>) -> Tensor<Int32> in
        let padding = maxLength - tensor.shape[0]
        // Tensors that already have the maximum length are returned untouched.
        guard padding > 0 else { return tensor }
        let zeros = Tensor<Int32>(zeros: [padding])
        return Tensor<Int32>(concatenating: [tensor, zeros])
    }
}

/// Builds one batch from `samples`: the samples are first padded with
/// `padTensors(tensors:)`, then collated into a single `Tensor<Int32>`.
func makeBatchWithPadding<BatchSamples: Collection>(samples: BatchSamples) -> Tensor<Int32>
where BatchSamples.Element == Tensor<Int32> {
    let paddedSamples = padTensors(tensors: samples)
    return Tensor<Int32>(collating: paddedSamples)
}

// `Batches` is built with the same initializer as in the base example; the
// padding function takes the place of `\.collated`.
let batches = Batches(of: 64, from: dataSet, makeBatchWithPadding)
for batch in batches {
    print(batch.shape)
}

: 

In [None]:
// Use with a sampler
// In our previous example, another way to be memory-efficient is to batch
// samples of roughly the same length, so that each batch needs little padding.
// The comparison puts longer samples first.
let sortedDataset = dataSet.sortedInBatches { $0.shape[0] > $1.shape[0] }

// Same `Batches` initializer as in the base example, with the padding function
// used to assemble each batch.
let batches = Batches(of: 64, from: sortedDataset, makeBatchWithPadding)
for batch in batches {
    print(batch.shape)
}

In [None]:
// Shuffling first and then passing a `batchSize` to `sortedInBatches` keeps a
// bit of shuffle in the result:
let sortedDataset = dataSet.shuffled().sortedInBatches(batchSize: 256) { $0.shape[0] > $1.shape[0] }

// Same `Batches` initializer as in the base example, with the padding function
// used to assemble each batch.
let batches = Batches(of: 64, from: sortedDataset, makeBatchWithPadding)
for batch in batches {
    print(batch.shape)
}

In [None]:
/// A dataset built from a collection of texts (arrays of integers) that keeps
/// all the texts concatenated into one contiguous stream.
struct LanguageModelDataset<Texts: RandomAccessCollection> where Texts.Element == [Int] {
    /// The underlying collection of texts.
    ///
    /// The concatenated stream is computed once in `init`; it is not refreshed
    /// when this property is reassigned.
    public var texts: Texts
    /// The length of the samples returned when indexing.
    private let sequenceLength: Int
    /// The texts all concatenated together, in the order of `texts`.
    private var stream: [Int]

    /// Creates a dataset over `texts`.
    ///
    /// - Parameters:
    ///   - texts: The texts to concatenate.
    ///   - sequenceLength: The length of each sample; must be positive.
    init(texts: Texts, sequenceLength: Int) {
        precondition(sequenceLength > 0, "sequenceLength must be positive")
        self.texts = texts
        self.sequenceLength = sequenceLength
        // `joined()` flattens the texts in one pass, whereas `reduce([], +)`
        // allocates a new array per text and is quadratic in the total length.
        stream = Array(texts.joined())
    }
}

In [None]:
// Makes the dataset a collection of fixed-length samples cut from `stream`.
extension LanguageModelDataset: RandomAccessCollection {
    public typealias Index = Int
    public typealias Element = Tensor<Int32>

    /// The position of the first sample.
    public var startIndex: Int { 0 }

    /// One past the last sample: the number of whole `sequenceLength`-sized
    /// chunks in `stream` (leftover elements at the end are not exposed).
    public var endIndex: Int {
        let wholeChunks = stream.count / sequenceLength
        return wholeChunks
    }

    /// The position immediately after `i`.
    public func index(after i: Int) -> Int {
        return i + 1
    }

    /// The `index`-th chunk of `sequenceLength` consecutive stream elements,
    /// converted to `Int32` and wrapped in a tensor.
    public subscript(index: Int) -> Tensor<Int32> {
        let lowerBound = index * sequenceLength
        let upperBound = lowerBound + sequenceLength
        let chunk = stream[lowerBound..<upperBound]
        return Tensor<Int32>(chunk.map { Int32($0) })
    }
}

In [None]:
// Let's create such a dataset from four short "texts" of integers.
let numbers: [[Int]] = [
    [1, 2, 3, 4, 5],
    [6, 7, 8],
    [9, 10, 11, 12, 13, 14, 15],
    [16, 17, 18],
]
var dataset = LanguageModelDataset(texts: numbers, sequenceLength: 3)

In [None]:
// Now let's look at what it gives us, three samples per batch:
let batches = Batches(of: 3, from: dataset, \.collated)
batches.forEach { print($0) }

In [None]:
// Same thing, but with the texts shuffled before they are concatenated.
let shuffledTexts = numbers.shuffled()
var dataset = LanguageModelDataset(texts: shuffledTexts, sequenceLength: 3)
let batches = Batches(of: 3, from: dataset, \.collated)
batches.forEach { print($0) }