In [1]:
%install-location $cwd/swift-install
%install '.package(path: "../../..")' Epochs

Installing packages:
	.package(path: "../../..")
		Epochs
With SwiftPM flags: []
Working in: /tmp/tmppy1or1xb/swift-install
/home/sgugger/swift/usr/bin/swift-build: /home/sgugger/anaconda3/lib/libcurl.so.4: no version information available (required by /home/sgugger/swift/usr/lib/swift/linux/libFoundationNetworking.so)
[1/2] Compiling Epochs BatchesGenerator.swift
[2/3] Merging module Epochs
[3/6] Wrapping AST for Epochs for debugging
[4/6] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
[5/7] Merging module jupyterInstalledPackages
[6/6] Linking libjupyterInstalledPackages.so
Initializing Swift...
Installation complete!


In [2]:
import TensorFlow
import Epochs

In [3]:
// 512 integer ids standing in for the raw items.
let rawItems = 0..<512
// Lazily turn every id into a random image-shaped tensor. Each multiple of 64
// is printed, so the output reveals when elements are actually computed.
let dataset = rawItems.lazy.map { (x: Int) -> Tensor<Float> in
  if x.isMultiple(of: 64) {
    print(x)
  }
  return Tensor<Float>(randomNormal: [224, 224, 3])
}

In [4]:
// `shuffled()` returns an `Array`, so every element of the lazy collection is
// computed right here: the indices printed below come from this call.
let dataset1 = dataset.shuffled()

0
64
128
192
256
320
384
448


In [5]:
// Shuffle through `ReindexedCollection` instead. This cell prints nothing, so
// the lazy map has not been evaluated at this point (presumably only the
// indices are permuted; confirm against the Epochs sources).
let dataset2 = ReindexedCollection(dataset).innerShuffled()

In [6]:
// Group the shuffled lazy collection into collated batches of 64 samples.
let batches = Batches(of: 64, from: dataset2, \.collated)
// Iterate over the batches and print the shape of each one; the debug prints
// of the lazy map show up during this iteration.
batches.forEach { print($0.shape) }

64
[64, 224, 224, 3]
[64, 224, 224, 3]
256
[64, 224, 224, 3]
192
128
[64, 224, 224, 3]
[64, 224, 224, 3]
448
320
0
[64, 224, 224, 3]
[64, 224, 224, 3]
384
[64, 224, 224, 3]


In [7]:
// Basic usage
// Start from some raw items (file names, for instance)...
let rawItems = 0..<512
// ...and lazily map a heavy-compute function over them (opening the images, for instance).
let dataSet = rawItems.lazy.map { _ in Tensor<Float>(randomNormal: [224, 224, 3]) }
// Define `Batches` of 64 collated samples on top of the lazy collection.
let batches = Batches(of: 64, from: dataSet) { $0.collated }
// Iterate over the batches, printing the shape of each one.
for currentBatch in batches {
    print(currentBatch.shape)
}

[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]


In [8]:
// Enabling shuffle, the naive way:
let batches = Batches(of: 64, from: dataSet.shuffled(), \.collated)
// This should absolutely not be done this way: `shuffled()` traverses the whole
// collection to build an `Array`, as the printed type of `batches` shows.
print(type(of: batches))

Batches<Array<Tensor<Float>>, Tensor<Float>>


In [9]:
// Go back to the raw collection instead: shuffle the raw items first, then
// apply the lazy map on top of the shuffled array.
let dataSet = rawItems.shuffled().lazy.map { _ in
  Tensor<Float>(randomNormal: [224, 224, 3])
}
let batches = Batches(of: 64, from: dataSet) { $0.collated }
// The printed type shows that the samples are still a lazy map.
print(type(of: batches))

Batches<LazyMapSequence<Array<Int>, Tensor<Float>>, Tensor<Float>>


In [10]:
// `ReindexedCollection` does that for us: wrap the lazy collection in it
// before handing it to `Batches`.
let dataSet = rawItems.lazy.map { _ in
  Tensor<Float>(randomNormal: [224, 224, 3])
}
let reindexed = ReindexedCollection(dataSet)
let batches = Batches(of: 64, from: reindexed, \.collated)
print(type(of: batches))

Batches<ReindexedCollection<LazyMapSequence<Range<Int>, Tensor<Float>>>, Tensor<Float>>


In [11]:
// Use with padding
// Build an array of 512 tensors of various lengths (1 to 200 elements each),
// as tokenized texts would be.
var dataSet: [Tensor<Int32>] = (0..<512).map { _ in
    Tensor<Int32>(
        randomUniform: [Int.random(in: 1...200)],
        lowerBound: Tensor<Int32>(0),
        upperBound: Tensor<Int32>(100)
    )
}

// Those tensors must be padded to a common length before they can be batched.
// Padding everything to the same length in one lazy transform applied
// beforehand would work, but it is not memory-efficient: some batches might
// need less padding. So the padding is added after the samples of a batch
// have been selected.
let batches = Batches(of: 64, from: dataSet) { $0.paddedAndCollated(with: 0) }
batches.forEach { print($0.shape) }

[64, 195]
[64, 200]
[64, 193]
[64, 197]
[64, 196]
[64, 199]
[64, 198]
[64, 197]


In [12]:
// Use with a sampler
// Another way to be memory efficient in the previous example is to batch
// together samples of roughly the same length: sort them by decreasing length.
let sortedDataset = dataSet.sorted { lhs, rhs in lhs.shape[0] > rhs.shape[0] }

let batches = Batches(of: 64, from: sortedDataset) { $0.paddedAndCollated(with: 0) }
for paddedBatch in batches {
    print(paddedBatch.shape)
}

[64, 200]
[64, 174]
[64, 153]
[64, 125]
[64, 96]
[64, 67]
[64, 45]
[64, 23]


In [13]:
// Sorting in batches of 256 shuffled samples, rather than over the whole
// dataset, keeps a bit of shuffle.
// This can all be applied on a lazy collection without breaking the laziness,
// as long as the sort function does not access the dataset.
var sortedDataset = ReindexedCollection(dataSet)
    .innerShuffled()
    .sortedInBatches(of: 256) { lhs, rhs in
        dataSet[lhs].shape[0] > dataSet[rhs].shape[0]
    }

let batches = Batches(of: 64, from: sortedDataset) { $0.paddedAndCollated(with: 0) }
batches.forEach { print($0.shape) }

[64, 200]
[64, 153]
[64, 96]
[64, 44]
[64, 200]
[64, 153]
[64, 96]
[64, 46]


In [14]:
/// A dataset for language modeling that treats a collection of tokenized
/// texts as one continuous stream of tokens.
///
/// - Note: The token stream is built once, in `init`. Mutating `texts`
///   afterwards does not update it.
struct LanguageModelDataset<Texts: RandomAccessCollection> where Texts.Element == [Int] {
    /// The underlying collection of texts
    public var texts: Texts
    /// The length of the samples returned when indexing
    private let sequenceLength: Int
    /// The texts all concatenated together, in order
    private var stream: [Int]

    /// Creates a dataset over `texts`.
    ///
    /// - Parameters:
    ///   - texts: The tokenized texts, each one an array of token ids.
    ///   - sequenceLength: The number of tokens per sample. Must be positive,
    ///     since the number of samples is obtained by dividing by it.
    init(texts: Texts, sequenceLength: Int) {
        precondition(sequenceLength > 0, "sequenceLength must be positive")
        self.texts = texts
        self.sequenceLength = sequenceLength
        // `joined()` flattens the texts in a single pass, whereas
        // `reduce([], +)` copies the accumulated array again for every text.
        stream = Array(texts.joined())
    }
}

In [15]:
extension LanguageModelDataset: RandomAccessCollection {
    public typealias Index = Int
    public typealias Element = Tensor<Int32>

    /// The position of the first sample.
    public var startIndex: Int { 0 }
    /// The number of complete samples of `sequenceLength` tokens in the
    /// stream; the integer division leaves out a trailing incomplete chunk.
    public var endIndex: Int { stream.count / sequenceLength }
    /// Returns the position immediately after `i`.
    public func index(after i: Int) -> Int { return i + 1 }

    /// The sample at `index`: `sequenceLength` consecutive tokens of the
    /// stream, converted to `Int32`.
    public subscript(index: Int) -> Tensor<Int32> {
        let lowerBound = index * sequenceLength
        let upperBound = lowerBound + sequenceLength
        let tokens: [Int32] = stream[lowerBound..<upperBound].map { Int32($0) }
        return Tensor<Int32>(tokens)
    }
}

In [16]:
// Let's create such a dataset: four "texts" of token ids (18 tokens overall),
// to be read as samples of 3 tokens each.
let numbers: [[Int]] = [[1,2,3,4,5], [6,7,8], [9,10,11,12,13,14,15], [16,17,18]]
var dataset = LanguageModelDataset(texts: numbers, sequenceLength: 3)

In [17]:
// Now let's look at what it gives us, three samples per batch: the texts are
// read as one continuous stream, regardless of where each text ends.
let batches = Batches(of: 3, from: dataset) { $0.collated }
for tokenBatch in batches {
    print(tokenBatch)
}

[[1, 2, 3],
 [4, 5, 6],
 [7, 8, 9]]
[[10, 11, 12],
 [13, 14, 15],
 [16, 17, 18]]


In [18]:
// Shuffling the texts before building the dataset changes the order of the
// texts in the stream; the tokens inside each text keep their order.
var dataset = LanguageModelDataset(texts: numbers.shuffled(), sequenceLength: 3)
let batches = Batches(of: 3, from: dataset, \.collated)
batches.forEach { print($0) }

[[16, 17, 18],
 [ 1,  2,  3],
 [ 4,  5,  9]]
[[10, 11, 12],
 [13, 14, 15],
 [ 6,  7,  8]]


## `BatchesGenerator`

In [20]:
// Base use

// A training set and a validation set, each one a heavy-compute function
// lazily mapped on a range of raw items (for instance, opening the images)
let trainingSet = (0..<512).lazy.map { _ in Tensor<Float>(randomNormal: [224, 224, 3]) }
let validationSet = (0..<256).lazy.map { _ in Tensor<Float>(randomNormal: [224, 224, 3]) }

In [25]:
// A `BatchesGenerator` defined on the training and validation sets, with
// batches of 64 samples built by collating:
let batchesGenerator = BatchesGenerator(
    of: 64, 
    from: ReindexedCollection(trainingSet), 
    and: ReindexedCollection(validationSet), 
    with: LazyBatchesMaker(makeBatch: \.collated)
)

In [26]:
// Ask the generator for the batches of the next epoch, then print the shape
// of every training batch followed by every validation batch.
let (trainingBatches, validationBatches) = batchesGenerator.nextEpoch()
trainingBatches.forEach { print($0.shape) }
validationBatches.forEach { print($0.shape) }

[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
