In [1]:
%install-location $cwd/swift-install
%install '.package(path: "../../..")' Epochs
%install '.package(url: "https://github.com/paiv/swift-pcg-random.git", .upToNextMajor(from: "1.0.0"))' PcgRandom

Installing packages:
	.package(path: "../../..")
		Epochs
	.package(url: "https://github.com/paiv/swift-pcg-random.git", .upToNextMajor(from: "1.0.0"))
		PcgRandom
With SwiftPM flags: []
Working in: /tmp/tmpfdsl947o/swift-install
/home/sgugger/swift/usr/bin/swift-build: /home/sgugger/anaconda3/lib/libcurl.so.4: no version information available (required by /home/sgugger/swift/usr/lib/swift/linux/libFoundationNetworking.so)
[1/3] Merging module PcgRandom
[2/5] Compiling Epochs NonuniformTrainingEpochs.swift
[3/5] Compiling Epochs Collatable.swift
[4/6] Merging module Epochs
[5/7] Wrapping AST for Epochs for debugging
[6/8] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
[7/9] Merging module jupyterInstalledPackages
[8/8] Linking libjupyterInstalledPackages.so
Initializing Swift...
Installation complete!


In [2]:
import TensorFlow
import Epochs

In [13]:
UInt32(time(nil))

1587497019


In [8]:
/// Reorders `samples` with a caller-supplied size predicate, then splits the
/// reordered view into batches.
///
/// - Parameters:
///   - samples: the collection the batches are drawn from.
///   - batchSize: the batch size forwarded to `inBatches(of:)`.
///   - areInAscendingSizeOrder: the ordering predicate used to sort the
///     indices of `samples`; it is called on pairs of elements.
/// - Returns: the result of `inBatches(of:)` applied to the selection of
///   `samples` in sorted-index order.
public func nonuniformInferenceBatches1<Samples: Collection>(
  samples: Samples,
  batchSize: Int,
  areInAscendingSizeOrder: @escaping (Samples.Element, Samples.Element) -> Bool
) -> Slices<LazilySelected<Samples, [Samples.Index]>> {
  // Sort the indices rather than the elements, so `samples` itself is never
  // copied or rearranged.
  var ordering = Array(samples.indices)
  ordering.sort { lhs, rhs in
    areInAscendingSizeOrder(samples[lhs], samples[rhs])
  }
  let reordered = samples.selecting(ordering)
  return reordered.inBatches(of: batchSize)
}

In [9]:
// 512 rank-1 `Tensor<Int32>` samples. Each sample's length is drawn from
// `1...200`, and its values come from `randomUniform` with bounds 0 and 100.
let nonuniformDataset: [Tensor<Int32>] = (0..<512).map { _ in
  Tensor<Int32>(
    randomUniform: [Int.random(in: 1...200)],
    lowerBound: Tensor<Int32>(0),
    upperBound: Tensor<Int32>(100)
  )
}

In [14]:
let batches = nonuniformInferenceBatches1(
      samples: nonuniformDataset, batchSize: 64) { $0.shape[0] < $1.shape[0] }

In [16]:
for batch in batches { print(batch.paddedAndCollated(with: 0).shape) }

[64, 200]
[64, 172]
[64, 146]
[64, 126]
[64, 104]
[64, 73]
[64, 44]
[64, 21]


In [7]:
let x: [Int] = (0..<100).map { _ in Int.random(in:0..<100) }

In [10]:
let sampleOrder = Array(x.indices).sorted { x[$0] > x[$1] }

In [12]:
print(type(of:x.selecting(sampleOrder).inBatches(of: 10)))

Slices<LazilySelected<Array<Int>, Array<Int>>>


In [4]:
/// A collection of samples together with a batch size and a size-sorted
/// ordering of the samples' indices.
public struct NonuniformInferenceBatches<Samples: Collection> {
  /// The collection the samples are drawn from.
  private let samples: Samples

  /// The number of samples in a batch.
  let batchSize: Int

  /// Indices of `samples`, sorted once at initialization with the
  /// caller-supplied predicate.
  private var sampleOrder: [Samples.Index]

  /// Creates an instance over `samples` with the given `batchSize`, sorting
  /// the sample indices with `areInAscendingSizeOrder`.
  ///
  /// - Parameters:
  ///   - samples: the collection the samples are drawn from.
  ///   - batchSize: the number of samples in a batch.
  ///   - areInAscendingSizeOrder: a predicate that returns `true` iff the size
  ///     of the first parameter is less than that of the second.
  public init(
    samples: Samples,
    batchSize: Int,
    areInAscendingSizeOrder:
      @escaping (Samples.Element, Samples.Element) -> Bool
  ) {
    // Build the ordering from the `samples` parameter before touching `self`.
    var ordering = Array(samples.indices)
    ordering.sort { lhs, rhs in
      areInAscendingSizeOrder(samples[lhs], samples[rhs])
    }

    self.samples = samples
    self.batchSize = batchSize
    self.sampleOrder = ordering
  }
}

In [3]:
/// A type that builds one batch out of a collection of samples.
protocol BatchMaker {
  /// The type returned by `makeBatch(_:)`.
  associatedtype Batch
  /// The element type of the collections accepted by `makeBatch(_:)`.
  /// NOTE(review): despite the plural name, this is the type of a single
  /// sample, not of a collection of samples.
  associatedtype Samples
  /// Returns a `Batch` built from the given collection, whose elements must be
  /// of type `Samples`.
  func makeBatch<C: Collection>(_: C) -> Batch where C.Element == Samples
}

In [4]:
/// A sequence of epochs. Each epoch pairs the next element of a
/// training-epoch iterator with a fixed collection of validation batches, and
/// both are lazily mapped through a `BatchMaker`.
class Epochs<
  TrainingEpochs: Sequence & IteratorProtocol,
  ValidationBatches: Collection,
  Maker: BatchMaker
>: Sequence, IteratorProtocol
where
  TrainingEpochs.Element: Collection,
  TrainingEpochs.Element.Element: Collection,
  TrainingEpochs.Element.Element.Element == Maker.Samples,
  ValidationBatches.Element: Collection,
  ValidationBatches.Element.Element == Maker.Samples
{
  /// The type of each epoch: a tuple of lazily-made training batches and
  /// lazily-made validation batches.
  public typealias Element =
    (LazyMapSequence<TrainingEpochs.Element, Maker.Batch>,
     LazyMapSequence<ValidationBatches, Maker.Batch>)

  /// The iterator producing one collection of training sample batches per
  /// epoch; it is advanced by `next()`.
  private var training: TrainingEpochs
  /// The validation sample batches, reused unchanged for every epoch.
  private let validation: ValidationBatches
  /// Turns each collection of samples into a `Maker.Batch`.
  private let batchMaker: Maker

  /// Creates an instance from a training-epoch iterator, a collection of
  /// validation sample batches and the maker used to build batches.
  public init(training: TrainingEpochs, validation: ValidationBatches,
              batchMaker: Maker) {
    self.batchMaker = batchMaker
    self.validation = validation
    self.training = training
  }

  /// Returns the next epoch in sequence, or `nil` once the training iterator
  /// is exhausted.
  public func next() -> Element? {
    guard let trainingSamples = training.next() else { return nil }
    return (
      trainingSamples.lazy.map(batchMaker.makeBatch),
      validation.lazy.map(batchMaker.makeBatch)
    )
  }
}

In [15]:
// NOTE(review): `Collection` is a standard-library protocol, not a generic
// type taking two parameters, so this alias presumably does not compile —
// confirm the intent or remove this scratch line.
typealias tstType = Collection<Int, Float>

: 

In [11]:
/// A `BatchMaker` whose batch type equals its sample type. A batch is whatever
/// the `collated` property of the given sample collection returns.
public struct CollationBatchMaker<Batch: Collatable>: BatchMaker {
  public typealias Samples = Batch

  /// Returns `samples.collated`.
  public func makeBatch<C: Collection>(_ samples: C) -> Batch where C.Element == Batch {
    samples.collated
  }
}

In [6]:
let trainingSet = (0..<512).lazy.map { (x: Int) -> Tensor<Float> in
  return Tensor<Float>(randomNormal: [224, 224, 3])
}

In [7]:
let validationSet = (0..<256).lazy.map { (x: Int) -> Tensor<Float> in
  return Tensor<Float>(randomNormal: [224, 224, 3])
}

In [8]:
var pcg = Pcg64Random(seed: 42)

In [9]:
let trainingEpochs = TrainingEpochs(samples: trainingSet, batchSize: 64, entropy: pcg)

In [12]:
let epochs = Epochs(
  training: TrainingEpochs(samples: trainingSet, batchSize: 64, entropy: pcg),
  validation: validationSet.inBatches(of: 64),
  batchMaker: CollationBatchMaker()
)

In [18]:
for trainingBatches in trainingEpochs.prefix(1) {
  for batchSamples in trainingBatches {
    let batch = batchSamples.collated
    let (images, labels) = (batch.first, batch.second)
    let (loss, gradients) = valueWithGradient(at: model) { model -> Tensor<Float> in
      let logits = model(images)
      return softmaxCrossEntropy(logits: logits, labels: labels)
     }
    optimizer.update(&model, along: gradients)
  }
  
  for batch in validationBatches {
    print(batch.shape)
  }
}

: 

In [8]:
print(type(of: trainingEpochs))

TrainingEpochs<LazyMapSequence<Range<Int>, Tensor<Float>>, Pcg64Random>


In [10]:
let epochs = Epochs(
  training: TrainingEpochs(samples: trainingSet, batchSize: 64, entropy: pcg),
  validation: validationSet.inBatches(of: 64)) { $0.collated }

: 

In [3]:
let cuts = [0, 5, 8, 15, 24, 30]
let texts = (0..<5).map { Array(cuts[$0]..<cuts[$0+1]) }
texts

▿ 5 elements
  ▿ 0 : 5 elements
    - 0 : 0
    - 1 : 1
    - 2 : 2
    - 3 : 3
    - 4 : 4
  ▿ 1 : 3 elements
    - 0 : 5
    - 1 : 6
    - 2 : 7
  ▿ 2 : 7 elements
    - 0 : 8
    - 1 : 9
    - 2 : 10
    - 3 : 11
    - 4 : 12
    - 5 : 13
    - 6 : 14
  ▿ 3 : 9 elements
    - 0 : 15
    - 1 : 16
    - 2 : 17
    - 3 : 18
    - 4 : 19
    - 5 : 20
    - 6 : 21
    - 7 : 22
    - 8 : 23
  ▿ 4 : 6 elements
    - 0 : 24
    - 1 : 25
    - 2 : 26
    - 3 : 27
    - 4 : 28
    - 5 : 29


In [4]:
/// Returns indices of `base` arranged so that, when the result is cut into
/// consecutive groups of `batchSize`, row `r` of successive groups walks
/// through `base` contiguously.
///
/// With `batchCount = count / batchSize`, output position `i` refers to the
/// element of `base` at offset `batchCount * (i % batchSize) + i / batchSize`.
///
/// Only the first `batchCount * batchSize` positions are produced. Trailing
/// samples that cannot fill a whole batch are left out, so the result never
/// contains a duplicated index.
///
/// - Parameters:
///   - base: the collection whose indices are returned.
///   - batchSize: the number of rows per batch; must be positive.
///   - collectionCount: the number of leading elements of `base` to consider;
///     defaults to `base.count`.
/// - Returns: `batchCount * batchSize` indices into `base`.
func preBatchTranspose<C: Collection>(
  _ base: C, for batchSize: Int, collectionCount: Int? = nil
) -> [C.Index] {
  precondition(batchSize > 0, "batchSize must be positive")
  let count = collectionCount ?? base.count
  let batchCount = count / batchSize
  // The formula below is a permutation only over whole batches. Iterating
  // over all `count` positions would repeat some indices and skip others
  // whenever `count` is not a multiple of `batchSize`.
  let usedCount = batchCount * batchSize
  return (0..<usedCount).map { (i: Int) -> C.Index in
    let offset = batchCount * (i % batchSize) + i / batchSize
    return base.index(base.startIndex, offsetBy: offset)
  }
}

In [13]:
let sequenceLength = 3
let batchSize = 2

let sequences = texts.selecting(Array(0..<5)).joined().inBatches(of: sequenceLength)
let indices = preBatchTranspose(sequences, for: batchSize)
let batches = sequences.selecting(indices).inBatches(of: batchSize)

In [15]:
print(type(of: sequences))

Slices<FlattenSequence<LazilySelected<Array<Array<Int>>, Array<Int>>>>


In [8]:
for b in batches { 
  print(Tensor<Int32>(b.map { Tensor<Int32>($0.map { Int32($0) })}))
}

[[ 0,  1,  2],
 [15, 16, 17]]
[[ 3,  4,  5],
 [18, 19, 20]]
[[ 6,  7,  8],
 [21, 22, 23]]
[[ 9, 10, 11],
 [24, 25, 26]]
[[12, 13, 14],
 [27, 28, 29]]


In [12]:
typealias Samples = [[Int]]
typealias Sentences = Slices<FlattenSequence<LazilySelected<Samples, Array<Samples.Index>.SubSequence>>>

In [3]:
/// A reference-type flag; being a class, every holder of an instance sees the
/// same `accessed` value.
class Tracker {
  /// Starts out `false`; nothing in this type ever sets it.
  var accessed = false
}

In [5]:
import PcgRandom

var pcg = Pcg64Random(seed: 42)

In [6]:
let rawItems: [Tracker] = Array(0..<512).map{ _ in Tracker() }
let dataset = rawItems.lazy.map { (x: Tracker) -> Tensor<Float> in
  x.accessed = true
  return Tensor<Float>(randomNormal: [224, 224, 3])
}

In [7]:
let batches = dataset[..<500].inBatches(of: 64).lazy.map(\.collated)

In [8]:
rawItems.allSatisfy{ !$0.accessed }

true


In [7]:
let _ = rawItems.map { $0.accessed = false }
var test = true
for (i,batch) in batches.enumerated() {
  test = test && (batch.shape == TensorShape([64, 224, 224, 3]))
  let limit = (i + 1) * 64
  test = test && rawItems[..<limit].allSatisfy(\.accessed)
  test = test && rawItems[limit...].allSatisfy{ !$0.accessed }
}
test

true


In [14]:
var trainingEpochs = UniformTrainingEpochs(samples: dataset, batchSize: 64, entropy: pcg)

In [17]:
let a = dataset[0]

In [18]:
var observedSampleOrder: [ObjectIdentifier]?

true


In [29]:
print(type(of: dataset))

LazyMapSequence<Array<Tracker>, Tensor<Float>>


In [28]:
let rawItems: [Tracker] = Array(0..<512).map{ _ in Tracker() }
let dataset = rawItems.lazy.map { (x: Tracker) -> Tensor<Float> in
  x.accessed = true
  return Tensor<Float>(randomNormal: [224, 224, 3])
}
var trainingEpochs = UniformTrainingEpochs(samples: dataset, batchSize: 64, entropy: pcg)

var accessed = Array(0..<512)
for batches in trainingEpochs.prefix(20) {
  var newAccessed: [Int] = []
  for batch in batches {
    let collatedBatch = batch.collated
    newAccessed += Array(0..<512).filter { rawItems[$0].accessed }    
  }
  print(accessed != newAccessed)
}

true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true


In [29]:
for abatch in dataset {
    print(batch.shape)
}

[64, 32, 32, 3]
[64, 32, 32, 3]
[64, 32, 32, 3]
[64, 32, 32, 3]
[64, 32, 32, 3]
[64, 32, 32, 3]
[64, 32, 32, 3]
[64, 32, 32, 3]


In [21]:
for batch in batches {
    print(batch.shape)
}

[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]


In [4]:
let dataset1 = dataset.shuffled()
accessed.reduce(true) { $0 && $1 }

true


In [5]:
accessed = rawItems.map { _ in false }
let dataset2 = ReindexedCollection(dataset).innerShuffled()
accessed.reduce(true) { $0 && !$1 }

true


In [6]:
// A `Batches` defined on this:
let batches = Batches(of: 64, from: dataset2, \.collated)
// Iteration over it:
for batch in batches {
    print(batch.shape)
}

[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]


In [7]:
// Base use
// Some raw items (for instance filenames)
let rawItems = 0..<512
// A heavy-compute function lazily mapped on it (for instance, opening the images)
let dataSet = rawItems.lazy.map { _ in Tensor<Float>(randomNormal: [224, 224, 3]) }
// A `Batches` defined on this:
let batches = Batches(of: 64, from: dataSet, \.collated)
// Iteration over it:
for batch in batches {
    print(batch.shape)
}

[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]


In [8]:
// Enabling shuffle
let batches = Batches(of: 64, from: dataSet.shuffled(), \.collated)
// This should absolutely not be done this way because it traverses the collection:
print(type(of: batches))

Batches<Array<Tensor<Float>>, Tensor<Float>>


In [9]:
// We need to actually go back to raw collection:
let dataSet = rawItems.shuffled().lazy.map { (x: Int) -> Tensor<Float> in
  accessed[x] = true
  return Tensor<Float>(randomNormal: [224, 224, 3])
}

accessed = rawItems.map { _ in false }
let batches = Batches(of: 64, from: dataSet, \.collated)
for (i, batch) in batches.enumerated() {
  if i == 0 {
      print(accessed[0..<64].reduce(true) { $0 && $1 })
  }
  print(accessed.filter() { $0 == true }.count)
}

false
64
128
192
256
320
384
448
512


In [10]:
// `ReindexedCollection` does that for us
let dataSet = rawItems.lazy.map { (x: Int) -> Tensor<Float> in
  accessed[x] = true
  return Tensor<Float>(randomNormal: [224, 224, 3])
}

accessed = rawItems.map { _ in false }
let batches = Batches(of: 64, from: ReindexedCollection(dataSet).innerShuffled(), \.collated)
for (i, batch) in batches.enumerated() {
  if i == 0 {
      print(accessed[0..<64].reduce(true) { $0 && $1 })
  }
  print(accessed.filter() { $0 == true }.count)
}

false
64
128
192
256
320
384
448
512


In [19]:
// Use with padding
// Let's create an array of things of various lengths (for instance texts)
var dataSet: [Tensor<Int32>] = []
for _ in 0..<512 {
    dataSet.append(Tensor<Int32>(
        randomUniform: [Int.random(in: 1...200)], 
        lowerBound: Tensor<Int32>(0), 
        upperBound: Tensor<Int32>(100)
    ))
}

// We need to pad those tensors to make them all the same length.
// We could do this in one lazy transform applied beforehand and pad everything
// to the same length, but it's not memory-efficient: some batches might need less
// padding. So we need to add the padding after having selected the samples we
// are trying to batch.
let batches = Batches(of: 64, from: dataSet) { $0.paddedAndCollated(with: 0) }
for (i, b) in batches.enumerated() {
    print(b.shape)
    let shapes = dataSet[(i * 64)..<((i + 1) * 64)].map { Int($0.shape[0]) }
    let expectedShape = shapes.reduce(0) { max($0, $1) }
}

[64, 190]
[64, 200]
[64, 199]
[64, 196]
[64, 197]
[64, 199]
[64, 198]
[64, 200]


In [12]:
// Use with a sampler
// In our previous example, another way to be memory efficient is to batch
// samples of roughly the same lengths.
let sortedDataset = dataSet.sorted { $0.shape[0] > $1.shape[0] }

let batches = Batches(of: 64, from: sortedDataset) { $0.paddedAndCollated(with: 0) }
for b in batches {
    print(b.shape)
}

[64, 200]
[64, 173]
[64, 146]
[64, 119]
[64, 97]
[64, 71]
[64, 48]
[64, 23]


In [13]:
// When using a `batchSize` we get a bit of shuffle:
// This can all be applied to a lazy collection without breaking the laziness, as long as the sort function does not access the dataset
var sortedDataset = ReindexedCollection(dataSet).innerShuffled().sortedInBatches(of: 256) { dataSet[$0].shape[0] > dataSet[$1].shape[0] }

let batches = Batches(of: 64, from: sortedDataset) { $0.paddedAndCollated(with: 0) }
for b in batches {
    print(b.shape)
}

[64, 200]
[64, 140]
[64, 96]
[64, 47]
[64, 200]
[64, 149]
[64, 97]
[64, 50]


In [14]:
/// A dataset built from a collection of texts, each text being an array of
/// `Int`. It stores the texts, a sequence length, and the concatenation of
/// all texts.
struct LanguageModelDataset<Texts: RandomAccessCollection> where Texts.Element == [Int] {
    /// The underlying collection of texts.
    /// NOTE(review): `stream` is computed once in `init`; assigning a new
    /// value here afterwards does not rebuild it.
    public var texts: Texts
    /// The length of the samples returned when indexing.
    private let sequenceLength: Int
    /// The texts all concatenated together, in iteration order.
    private var stream: [Int]

    /// Creates a dataset over `texts` and precomputes their concatenation.
    ///
    /// - Parameters:
    ///   - texts: the texts to concatenate.
    ///   - sequenceLength: the sample length stored for later use.
    init(texts: Texts, sequenceLength: Int) {
        self.texts = texts
        self.sequenceLength = sequenceLength
        // `joined()` flattens in a single pass. `reduce([], +)` allocates a new
        // array holding everything accumulated so far at every step.
        stream = Array(texts.joined())
    }
}

In [15]:
extension LanguageModelDataset: RandomAccessCollection {
    public typealias Index = Int
    public typealias Element = Tensor<Int32>

    /// The position of the first sample.
    public var startIndex: Int { 0 }

    /// The number of whole `sequenceLength`-sized chunks in `stream`; any
    /// trailing partial chunk is not addressable.
    public var endIndex: Int { stream.count / sequenceLength }

    public func index(after i: Int) -> Int { i + 1 }

    /// Returns the `index`-th chunk of `sequenceLength` consecutive values
    /// from `stream`, converted to `Int32`.
    public subscript(index: Int) -> Tensor<Int32> {
        let lowerBound = index * sequenceLength
        let upperBound = lowerBound + sequenceLength
        return Tensor<Int32>(stream[lowerBound..<upperBound].map { Int32($0) })
    }
}

In [16]:
//Let's create such a DataSet
let numbers: [[Int]] = [[1,2,3,4,5], [6,7,8], [9,10,11,12,13,14,15], [16,17,18]]
var dataset = LanguageModelDataset(texts: numbers, sequenceLength: 3)

In [17]:
//Now let's look at what it gives us:
let batches = Batches(of: 3, from: dataset, \.collated)
for (i, b) in batches.enumerated() {
  let expected = Tensor<Int32>(rangeFrom: Int32(1 + i * 9), to: Int32(1 + (i + 1) * 9), stride: 1)
  //let y = x.reshaped(to: [3, 3])
}

In [18]:
let x = Tensor<Int32>(rangeFrom: 10, to: 19, stride: 1)
x.reshaped(to: [3, 3])

[[10, 11, 12],
 [13, 14, 15],
 [16, 17, 18]]


In [32]:
var dataset = LanguageModelDataset(texts: numbers.shuffled(), sequenceLength: 3)
let batches = Batches(of: 3, from: dataset, \.collated)
var stream: [Int] = []
for batch in batches {
  stream += batch.scalars.map { Int($0) }
}

In [41]:
/// Returns `true` iff `x` appears in `y` as a contiguous run of elements.
///
/// - Parameters:
///   - x: the run to look for; an empty run is considered contained in any `y`.
///   - y: the array searched.
/// - Returns: whether some slice of `y` of length `x.count` equals `x`.
func isSubset(_ x: [Int], from y: [Int]) -> Bool {
  // Nothing to look for: trivially contained. This also avoids reading `x[0]`
  // on an empty array.
  guard !x.isEmpty else { return true }
  // A run longer than `y` cannot fit. This also keeps the range below valid.
  guard x.count <= y.count else { return false }
  // Try every start position that leaves room for all of `x`. Checking only
  // the first occurrence of `x[0]` would miss later matches.
  return (0...(y.count - x.count)).contains { start in
    y[start..<(start + x.count)].elementsEqual(x)
  }
}

In [42]:
numbers.allSatisfy{ isSubset($0, from: stream) }

true


## `BatchesGenerator`

In [22]:
// Base use

// A heavy-compute function lazily mapped on it (for instance, opening the images)
let trainingSet = (0..<512).lazy.map { _ in Tensor<Float>(randomNormal: [224, 224, 3]) }
let validationSet = (0..<256).lazy.map { _ in Tensor<Float>(randomNormal: [224, 224, 3]) }

In [23]:
// A `BatchesGenerator` defined on these:
let batchesGenerator = BatchesGenerator(
    of: 64, 
    from: ReindexedCollection(trainingSet), 
    and: ReindexedCollection(validationSet), 
    with: LazyBatchesMaker(makeBatch: \.collated)
)

In [24]:
let (trainingBatches, validationBatches) = batchesGenerator.nextEpoch()
for b in trainingBatches { print(b.shape) }
for b in validationBatches { print(b.shape) }

[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]


In [25]:
/// A generator of training and validation `Batches` that can be asked for a
/// new epoch any number of times.
///
/// - Note: `batchSize` is read each time `nextEpoch()` is called, so a change
///   made during one epoch only takes effect at the next.
public struct BatchesGenerator1<Samples: Collection, Batch> {
  /// Training dataset.
  public let training: Samples
  /// Validation dataset.
  public let validation: Samples
  /// The batch size passed to `makeBatches` on every call to `nextEpoch()`.
  public var batchSize: Int
  /// Builds `Batches` from a batch size, a sample collection and a flag that
  /// is `true` for the training samples and `false` for the validation ones.
  private let makeBatches: (Int, Samples, Bool) -> Batches<Samples, Batch>

  /// Creates an instance that generates `Batches` of `batchSize` from the
  /// `training` and `validation` samples, using `makeBatches`.
  ///
  /// - Parameters:
  ///   - batchSize: the initial batch size.
  ///   - training: the training samples.
  ///   - validation: the validation samples.
  ///   - makeBatches: called as `(batchSize, samples, isTraining)` to build
  ///     each set of batches.
  public init(
    of batchSize: Int,
    from training: Samples,
    and validation: Samples,
    _ makeBatches: @escaping (Int, Samples, Bool) -> Batches<Samples, Batch>
  ) {
    self.training = training
    self.validation = validation
    self.batchSize = batchSize
    self.makeBatches = makeBatches
  }

  /// Returns new `Batches` for training and validation.
  ///
  /// `makeBatches` is called first for the training samples with `true`, then
  /// for the validation samples with `false`. Any reshuffling of the training
  /// data is up to `makeBatches`.
  public func nextEpoch() -> (
    training: Batches<Samples, Batch>,
    validation: Batches<Samples, Batch>
  ) {
    let trainingBatches = makeBatches(batchSize, training, true)
    let validationBatches = makeBatches(batchSize, validation, false)
    return (training: trainingBatches, validation: validationBatches)
  }
}

In [26]:
/// Builds collated `Batches` of `batchSize` from `samples`.
///
/// - Parameters:
///   - batchSize: the batch size passed to `Batches`.
///   - samples: the reindexed samples to batch.
///   - isTrain: when `true`, batches are drawn from `samples.innerShuffled()`
///     instead of `samples` itself.
/// - Returns: `Batches` that use the `collated` key path to make each batch.
func baseMakeBatches<Samples: RandomAccessCollection>(
  of batchSize: Int, from samples: ReindexedCollection<Samples>, isTrain: Bool
) -> Batches<ReindexedCollection<Samples>, Samples.Element> where Samples.Element: Collatable {
  let source: ReindexedCollection<Samples>
  if isTrain {
    source = samples.innerShuffled()
  } else {
    source = samples
  }
  return Batches(of: batchSize, from: source, \.collated)
}

In [27]:
// Base use

// A heavy-compute function lazily mapped on it (for instance, opening the images)
let trainingSet = (0..<512).lazy.map { _ in Tensor<Float>(randomNormal: [224, 224, 3]) }
let validationSet = (0..<256).lazy.map { _ in Tensor<Float>(randomNormal: [224, 224, 3]) }

In [28]:
// A `BatchesGenerator1` defined on these:
let batchesGenerator = BatchesGenerator1(
    of: 64, 
    from: ReindexedCollection(trainingSet), 
    and: ReindexedCollection(validationSet), 
    baseMakeBatches
)

In [29]:
let (trainingBatches, validationBatches) = batchesGenerator.nextEpoch()
for b in trainingBatches { print(b.shape) }
for b in validationBatches { print(b.shape) }

[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
[64, 224, 224, 3]
