In [1]:
%install-location $cwd/swift-install
%install '.package(path: "../../..")' Epochs

Installing packages:
	.package(path: "../../..")
		Epochs
With SwiftPM flags: []
Working in: /tmp/tmpm604spky/swift-install
/home/sgugger/swift/usr/bin/swift-build: /home/sgugger/anaconda3/lib/libcurl.so.4: no version information available (required by /home/sgugger/swift/usr/lib/swift/linux/libFoundationNetworking.so)
[1/2] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
[2/3] Merging module jupyterInstalledPackages
Initializing Swift...
Installation complete!


In [2]:
import TensorFlow
import Epochs

In [3]:
// Raw sample indices, plus one flag per sample recording whether the lazy
// transform below has been evaluated for it.
let rawItems = 0..<512
var accessed = [Bool](repeating: false, count: rawItems.count)
// Lazily mapped dataset: producing element `index` sets `accessed[index]`, so
// later cells can observe which samples were actually computed.
let dataset = rawItems.lazy.map { (index: Int) -> Tensor<Float> in
  accessed[index] = true
  return Tensor<Float>(randomNormal: [224, 224, 3])
}

In [4]:
// Shuffling the lazy dataset directly, then checking whether every sample has
// been computed.
let dataset1 = dataset.shuffled()
accessed.allSatisfy { $0 }

true


In [5]:
// Reset the flags, shuffle through `ReindexedCollection` instead, then check
// that no sample has been computed.
accessed = [Bool](repeating: false, count: rawItems.count)
let dataset2 = ReindexedCollection(dataset).innerShuffled()
!accessed.contains(true)

true


In [6]:
// `Batches` of 64 collated samples defined on `dataset2`:
let batches = Batches(of: 64, from: dataset2, \.collated)
// Iterate over the batches and print the shape of each one:
for batch in batches {
    print(batch.shape)
}

[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]


In [7]:
// Base use
// Some raw items (for instance, filenames)
let rawItems = 0..<512
// A heavy-compute function lazily mapped on the raw items (for instance, opening the images)
let dataSet = rawItems.lazy.map { _ in Tensor<Float>(randomNormal: [224, 224, 3]) }
// `Batches` of 64 collated samples defined on the lazy dataset:
let batches = Batches(of: 64, from: dataSet, \.collated)
// Iterate over the batches and print the shape of each one:
for batch in batches {
    print(batch.shape)
}

[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]


In [8]:
// Enabling shuffle
let batches = Batches(of: 64, from: dataSet.shuffled(), \.collated)
// This should absolutely not be done this way: `shuffled()` traverses the
// collection, and the type printed below shows the batches are built on an `Array`.
print(type(of: batches))

Batches<Array<Tensor<Float>>, Tensor<Float>>


In [9]:
// To stay lazy we have to go back to the raw collection: shuffle it first,
// and only then apply the lazy map.
let dataSet = rawItems.shuffled().lazy.map { (index: Int) -> Tensor<Float> in
  accessed[index] = true
  return Tensor<Float>(randomNormal: [224, 224, 3])
}

accessed = [Bool](repeating: false, count: rawItems.count)
let batches = Batches(of: 64, from: dataSet, \.collated)
for (step, _) in batches.enumerated() {
  if step == 0 {
    // After the first batch: have the first 64 raw items all been accessed?
    print(accessed[0..<64].allSatisfy { $0 })
  }
  // Number of samples computed so far.
  print(accessed.filter { $0 }.count)
}

false
64
128
192
256
320
384
448
512


In [10]:
// `ReindexedCollection` does that for us
let dataSet = rawItems.lazy.map { (index: Int) -> Tensor<Float> in
  accessed[index] = true
  return Tensor<Float>(randomNormal: [224, 224, 3])
}

accessed = [Bool](repeating: false, count: rawItems.count)
let shuffledSamples = ReindexedCollection(dataSet).innerShuffled()
let batches = Batches(of: 64, from: shuffledSamples, \.collated)
for (step, _) in batches.enumerated() {
  if step == 0 {
    // After the first batch: have the first 64 raw items all been accessed?
    print(accessed[0..<64].allSatisfy { $0 })
  }
  // Number of samples computed so far.
  print(accessed.filter { $0 }.count)
}

false
64
128
192
256
320
384
448
512


In [20]:
// Scratch: an `Int32` tensor created with shape `[0]`.
let x = Tensor<Int32>(repeating: 0, shape: [0])

In [31]:
// Scratch: a random `Int32` drawn from the half-open range `Int32.min..<Int32.max`.
let x = Int32.random(in: Int32.min..<Int32.max)

In [25]:
// Scratch: a `TensorFlowSeed` is written as a tuple with a `graph` and an `op` component.
let seed: TensorFlowSeed = (graph: 42, op: 33)

In [22]:
// Scratch: a seeded random tensor. `TensorFlowSeed` is a `(graph:, op:)` tuple, so it
// has no initializer and must be written as a tuple literal.
// NOTE(review): the `op` component is an arbitrary choice, adjust if a specific seed was intended.
let x = Tensor<Float>(randomNormal: [4, 5], seed: (graph: 43, op: 0))

: 

In [19]:
// Use with padding
// Let's create an array of things of various lengths (for instance texts)
var dataSet: [Tensor<Int32>] = []
for _ in 0..<512 {
    dataSet.append(Tensor<Int32>(
        randomUniform: [Int.random(in: 1...200)],
        lowerBound: Tensor<Int32>(0),
        upperBound: Tensor<Int32>(100)
    ))
}

// We need to pad those tensors to make them all the same length.
// We could do this in one lazy transform applied beforehand and pad everything
// to the same length, but it's not memory-efficient: some batches might need less
// padding. So we need to add the padding after having selected the samples we
// are trying to batch.
let batches = Batches(of: 64, from: dataSet) { $0.paddedAndCollated(with: 0) }
for (i, b) in batches.enumerated() {
    print(b.shape)
    // Batch `i` is built from samples `i * 64 ..< (i + 1) * 64`: it should be padded
    // to the length of the longest of those samples, and no further.
    let lengths = dataSet[(i * 64)..<((i + 1) * 64)].map { Int($0.shape[0]) }
    let expectedLength = lengths.max() ?? 0
    precondition(
        b.shape[1] == expectedLength,
        "Batch \(i) is padded to \(b.shape[1]) instead of \(expectedLength)")
}

[64, 190]
[64, 200]
[64, 199]
[64, 196]
[64, 197]
[64, 199]
[64, 198]
[64, 200]


In [12]:
// Use with a sampler
// In our previous example, another way to be memory-efficient is to batch
// together samples of roughly the same length: sort them by decreasing length.
let sortedDataset = dataSet.sorted { lhs, rhs in lhs.shape[0] > rhs.shape[0] }

let batches = Batches(of: 64, from: sortedDataset) { samples in
    samples.paddedAndCollated(with: 0)
}
for paddedBatch in batches {
    print(paddedBatch.shape)
}

[64, 200]
[64, 173]
[64, 146]
[64, 119]
[64, 97]
[64, 71]
[64, 48]
[64, 23]


In [13]:
// When using a `batchSize` we get a bit of shuffle:
// This can all be applied on a lazy collection without breaking the laziness,
// as long as the sort function does not access the dataset
var sortedDataset = ReindexedCollection(dataSet)
    .innerShuffled()
    .sortedInBatches(of: 256) { lhs, rhs in dataSet[lhs].shape[0] > dataSet[rhs].shape[0] }

let batches = Batches(of: 64, from: sortedDataset) { samples in
    samples.paddedAndCollated(with: 0)
}
for paddedBatch in batches {
    print(paddedBatch.shape)
}

[64, 200]
[64, 140]
[64, 96]
[64, 47]
[64, 200]
[64, 149]
[64, 97]
[64, 50]


In [14]:
/// A dataset built from a collection of texts (each an array of `Int` tokens),
/// which keeps all the texts concatenated in a single stream of tokens.
struct LanguageModelDataset<Texts: RandomAccessCollection> where Texts.Element == [Int] {
    /// The underlying collection of texts.
    public var texts: Texts
    /// The length of the samples returned when indexing.
    private let sequenceLength: Int
    /// The texts all concatenated together.
    private var stream: [Int]

    /// Creates a dataset over `texts`.
    ///
    /// - Parameters:
    ///   - texts: The texts to concatenate, in order.
    ///   - sequenceLength: The length of the samples returned when indexing.
    /// - Precondition: `sequenceLength > 0`.
    init(texts: Texts, sequenceLength: Int) {
        // A null length would later divide by zero when counting the samples.
        precondition(sequenceLength > 0, "sequenceLength must be positive")
        self.texts = texts
        self.sequenceLength = sequenceLength
        // `joined()` concatenates in one linear pass, where `reduce([], +)`
        // builds a new array at every step.
        stream = Array(texts.joined())
    }
}

In [15]:
extension LanguageModelDataset: RandomAccessCollection {
    public typealias Index = Int
    public typealias Element = Tensor<Int32>

    /// The position of the first sample.
    public var startIndex: Int { 0 }
    /// The number of complete samples of `sequenceLength` tokens held by the
    /// stream (integer division: leftover tokens are not counted).
    public var endIndex: Int { stream.count / sequenceLength }
    /// Returns the position immediately after `i`.
    public func index(after i: Int) -> Int { return i + 1 }

    /// The sample at `index`: the `sequenceLength` tokens of the stream
    /// starting at `index * sequenceLength`, converted to `Int32`.
    public subscript(index: Int) -> Tensor<Int32> {
        let lowerBound = index * sequenceLength
        let upperBound = lowerBound + sequenceLength
        let tokens = stream[lowerBound..<upperBound]
        return Tensor<Int32>(tokens.map { token in Int32(token) })
    }
}

In [16]:
// Let's create such a dataset: four texts holding the numbers 1 to 18 in order,
// read in samples of length 3
let numbers: [[Int]] = [[1,2,3,4,5], [6,7,8], [9,10,11,12,13,14,15], [16,17,18]]
var dataset = LanguageModelDataset(texts: numbers, sequenceLength: 3)

In [17]:
// Now let's check what it gives us: the texts hold 1...18 in order, so batch `i`
// (3 samples of 3 tokens) should contain the 9 consecutive numbers starting at `1 + 9 * i`.
let batches = Batches(of: 3, from: dataset, \.collated)
for (i, b) in batches.enumerated() {
  let expected = Tensor<Int32>(rangeFrom: Int32(1 + i * 9), to: Int32(1 + (i + 1) * 9), stride: 1)
  precondition(b == expected.reshaped(to: [3, 3]), "Unexpected content in batch \(i)")
}

In [18]:
// Scratch check: the values from 10 up to (but excluding) 19, displayed as a 3x3 matrix
let x = Tensor<Int32>(rangeFrom: 10, to: 19, stride: 1)
x.reshaped(to: [3, 3])

[[10, 11, 12],
 [13, 14, 15],
 [16, 17, 18]]


In [32]:
// Same dataset with the texts in a random order; the scalars of all the batches
// are gathered back, in order, into one flat array of `Int`.
var dataset = LanguageModelDataset(texts: numbers.shuffled(), sequenceLength: 3)
let batches = Batches(of: 3, from: dataset, \.collated)
var stream: [Int] = []
for batch in batches {
  let tokens = batch.scalars.map { scalar in Int(scalar) }
  stream.append(contentsOf: tokens)
}

In [41]:
/// Returns `true` when `x` appears in `y` as a contiguous run of elements, in
/// the same order.
///
/// - Parameters:
///   - x: The values to look for. An empty `x` is found in any `y`.
///   - y: The values to search in.
/// - Returns: Whether some slice of `y` is equal to `x`.
func isSubset(_ x: [Int], from y: [Int]) -> Bool {
  guard !x.isEmpty else { return true }
  guard x.count <= y.count else { return false }
  // Try every start offset that leaves room for `x`: this never reads past the
  // end of `y` and does not stop at the first occurrence of `x[0]`.
  return (0...(y.count - x.count)).contains { start in
    y[start..<(start + x.count)].elementsEqual(x)
  }
}

In [42]:
// Check with `isSubset` that every original text can be found in the stream gathered from the batches
numbers.allSatisfy{ isSubset($0, from: stream) }

true


## `BatchesGenerator`

In [22]:
// Base use

// Heavy-compute functions lazily mapped on raw items (for instance, opening the images):
// 512 items for the training set and 256 for the validation set
let trainingSet = (0..<512).lazy.map { _ in Tensor<Float>(randomNormal: [224, 224, 3]) }
let validationSet = (0..<256).lazy.map { _ in Tensor<Float>(randomNormal: [224, 224, 3]) }

In [23]:
// A `BatchesGenerator` defined on the training and validation sets, with a batch size of 64:
let batchesGenerator = BatchesGenerator(
    of: 64, 
    from: ReindexedCollection(trainingSet), 
    and: ReindexedCollection(validationSet), 
    with: LazyBatchesMaker(makeBatch: \.collated)
)

In [24]:
// Ask the generator for the batches of an epoch, then print the shape of every
// training batch followed by every validation batch
let (trainingBatches, validationBatches) = batchesGenerator.nextEpoch()
for b in trainingBatches { print(b.shape) }
for b in validationBatches { print(b.shape) }

[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]


In [25]:
/// An endless source of training and validation batches, handed out one epoch
/// at a time.
///
/// - Note: a change of `batchSize` in the middle of an epoch is only taken
///   into account at the next call to `nextEpoch()`.
public struct BatchesGenerator1<Samples: Collection, Batch> {
  /// The samples used for training.
  public let training: Samples
  /// The samples used for validation.
  public let validation: Samples
  /// The number of samples per batch.
  public var batchSize: Int
  /// Builds the `Batches` of one epoch from a batch size, some samples and a
  /// flag that `nextEpoch()` sets to `true` for training and `false` for
  /// validation.
  private let makeBatches: (Int, Samples, Bool) -> Batches<Samples, Batch>

  /// Creates an instance able to generate `Batches` of `batchSize` from the
  /// `training` and `validation` samples, using `makeBatches`.
  public init(
    of batchSize: Int,
    from training: Samples,
    and validation: Samples,
    _ makeBatches: @escaping (Int, Samples, Bool) -> Batches<Samples, Batch>
  ) {
    self.training = training
    self.validation = validation
    self.batchSize = batchSize
    self.makeBatches = makeBatches
  }

  /// Returns new `Batches` for training and validation, both made by
  /// `makeBatches` with the current `batchSize`. The flag passed for training
  /// is `true`, which is meant to request a reshuffle of the training data.
  public func nextEpoch() -> (
    training: Batches<Samples, Batch>,
    validation: Batches<Samples, Batch>
  ) {
    let trainingBatches = makeBatches(batchSize, training, true)
    let validationBatches = makeBatches(batchSize, validation, false)
    return (training: trainingBatches, validation: validationBatches)
  }
}

In [26]:
/// Makes `Batches` of `batchSize` collated samples, drawn from
/// `samples.innerShuffled()` when `isTrain` is `true` and from `samples` as
/// they are otherwise.
func baseMakeBatches<Samples: RandomAccessCollection>(
  of batchSize: Int, from samples: ReindexedCollection<Samples>, isTrain: Bool
) -> Batches<ReindexedCollection<Samples>, Samples.Element> where Samples.Element: Collatable {
  guard isTrain else {
    return Batches(of: batchSize, from: samples, \.collated)
  }
  return Batches(of: batchSize, from: samples.innerShuffled(), \.collated)
}

In [27]:
// Base use

// Heavy-compute functions lazily mapped on raw items (for instance, opening the images):
// 512 items for the training set and 256 for the validation set
let trainingSet = (0..<512).lazy.map { _ in Tensor<Float>(randomNormal: [224, 224, 3]) }
let validationSet = (0..<256).lazy.map { _ in Tensor<Float>(randomNormal: [224, 224, 3]) }

In [28]:
// A `BatchesGenerator1` defined on the training and validation sets, with a batch
// size of 64 and `baseMakeBatches` to build the batches:
let batchesGenerator = BatchesGenerator1(
    of: 64, 
    from: ReindexedCollection(trainingSet), 
    and: ReindexedCollection(validationSet), 
    baseMakeBatches
)

In [29]:
// Ask the generator for the batches of an epoch, then print the shape of every
// training batch followed by every validation batch
let (trainingBatches, validationBatches) = batchesGenerator.nextEpoch()
for b in trainingBatches { print(b.shape) }
for b in validationBatches { print(b.shape) }

[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
