## Notes

Porting `class Runner` to Swift is a WIP.

* `TrainerCallback` is a class, not a protocol, because `Trainer` needs to store a `[TrainerCallback]` array.

Todos:
* Improve naming; many names currently come directly from Python.

In [None]:
import TensorFlow

In [None]:
// Example loss function.
// TODO: This should be moved into the TensorFlow library/APIs.
/// Computes a scalar softmax cross-entropy loss.
///
/// The derivative is not synthesized; it is supplied by the function named in the
/// `@differentiable(vjp:)` attribute below.
///
/// - Parameters:
///   - features: Forwarded to `Raw.softmaxCrossEntropyWithLogits` as `features`.
///   - labels: Forwarded to `Raw.softmaxCrossEntropyWithLogits` as `labels`.
/// - Returns: The mean of the `loss` component produced by
///   `Raw.softmaxCrossEntropyWithLogits`.
@differentiable(vjp: _vjpSoftmaxCrossEntropy)
func softmaxCrossEntropy<Scalar: TensorFlowFloatingPoint>(
    features: Tensor<Scalar>, labels: Tensor<Scalar>
) -> Tensor<Scalar> {
    let opResult = Raw.softmaxCrossEntropyWithLogits(features: features, labels: labels)
    return opResult.loss.mean()
}

/// Value-and-pullback function backing `softmaxCrossEntropy`.
///
/// - Parameters:
///   - features: Forwarded to `Raw.softmaxCrossEntropyWithLogits` as `features`.
///   - labels: Forwarded to `Raw.softmaxCrossEntropyWithLogits` as `labels`.
/// - Returns: The mean loss, together with a pullback. Given an upstream value `v`,
///   the pullback yields `(v / batchSize) * grad` for `features` and a scalar zero
///   tensor for `labels`.
@usableFromInline
func _vjpSoftmaxCrossEntropy<Scalar: TensorFlowFloatingPoint>(
    features: Tensor<Scalar>, labels: Tensor<Scalar>
) -> (Tensor<Scalar>, (Tensor<Scalar>) -> (Tensor<Scalar>, Tensor<Scalar>)) {
    let (rawLoss, backprop) = Raw.softmaxCrossEntropyWithLogits(
        features: features, labels: labels)
    // Size of the leading dimension of `features`, converted to a tensor so it can
    // divide the upstream value.
    let leadingSize = Tensor<Scalar>(features.shapeTensor[0])
    let meanLoss = rawLoss.mean()
    let pullback: (Tensor<Scalar>) -> (Tensor<Scalar>, Tensor<Scalar>) = { upstream in
        let featuresGradient = (upstream / leadingSize) * backprop
        return (featuresGradient, Tensor<Scalar>(0))
    }
    return (meanLoss, pullback)
}

In [None]:
// Example type for use with `Dataset`.
// TODO: The usage of this should be re-evaluated.
/// A `TensorGroup` bundling a data tensor with its labels tensor.
public struct Example<
    DataScalar: TensorFlowFloatingPoint,
    LabelScalar: TensorFlowFloatingPoint
>: TensorGroup {
    /// The data tensor.
    public var data: Tensor<DataScalar>
    /// The labels tensor.
    public var labels: Tensor<LabelScalar>
}

In [None]:
/// A training loop.
///
/// Makes a single pass over `dataset`. For every batch it evaluates the loss and its
/// gradient, prints the loss, and has `optimizer` update the variables found at
/// `variablesKeyPath`.
///
/// - Parameters:
///   - model: The model to train; modified in place.
///   - variablesKeyPath: Key path from the model to its differentiable variables.
///   - dataset: The batches to iterate over.
///   - optimizer: The optimizer that applies each update.
///   - loss: The loss function, invoked as `loss(predictions, labels)`.
public func train<M, O: Optimizer, S>(
    _ model: inout M,
    at variablesKeyPath: WritableKeyPath<M, M.AllDifferentiableVariables>,
    on dataset: Dataset<Example<S, S>>,
    using optimizer: inout O,
    loss: @escaping @differentiable (Tensor<S>, Tensor<S>) -> Tensor<S>
) where O.Model == M, O.Scalar == S,
        M.Input == Tensor<S>, M.Output == Tensor<S>
{
    let trainingContext = Context(learningPhase: .training)
    for example in dataset {
        let inputs = example.data
        let targets = example.labels
        // The gradient is taken with respect to both the model and the labels;
        // only the model's part is used.
        let (lossValue, (modelGradient, _)) = model.valueWithGradient(at: targets) { (candidate, expected) -> Tensor<S> in
            let predictions = candidate.applied(to: inputs, in: trainingContext)
            return loss(predictions, expected)
        }
        print(lossValue)
        optimizer.update(&model[keyPath: variablesKeyPath], along: modelGradient)
    }
}

In [None]:
// Example usage.
// A single dense layer (784 inputs, 10 outputs) paired with an SGD optimizer.
var model = Dense<Float>(inputSize: 784, outputSize: 10)
var optimizer = SGD<Dense<Float>, Float>(learningRate: 0.1)

// Randomly generated tensors stand in for a real dataset.
let data = Tensor<Float>(randomNormal: [10, 10, 784])
let labels = Tensor<Float>(randomNormal: [10, 10])
let dataset = Dataset<Example<Float, Float>>(
    elements: Example<Float, Float>(data: data, labels: labels)
)

train(
    &model,
    at: \Dense<Float>.allDifferentiableVariables,
    on: dataset,
    using: &optimizer,
    loss: softmaxCrossEntropy
)

## Add Callbacks

The code below adds callbacks and defines a new training loop.

In [None]:
/// The verdict a callback hands back to steer the training loop.
public enum CallbackResult {
    /// Carry on with the training step as normal.
    case proceed
    /// Abandon the remainder of the current training step and go straight to the next one.
    case skip
    /// End training.
    case stop
}


/// Base class for training-loop hooks.
///
/// Every hook returns `.proceed` unless a subclass overrides it, so subclasses only
/// need to override the hooks they care about.
open class TrainingCallbacks<M, O: Optimizer, S>
    where O.Model == M, O.Scalar == S,
          M.Input == Tensor<S>, M.Output == Tensor<S> {

    /// Hook that `trainWithCallbacks` invokes once, before it starts iterating over the dataset.
    ///
    /// - Parameters:
    ///   - model: The model about to be trained; may be modified in place.
    ///   - optimizer: The optimizer about to be used; may be modified in place.
    /// - Returns: `.proceed` in this base implementation.
    open func beforeTrain(model: inout M, optimizer: inout O) -> CallbackResult {
        return CallbackResult.proceed
    }

    // TODO: Figure out what to pass here!
    /// Hook that `trainWithCallbacks` invokes at the start of each batch iteration.
    ///
    /// - Returns: `.proceed` in this base implementation.
    open func beforeBatch() -> CallbackResult {
        return CallbackResult.proceed
    }

    /// Hook that `trainWithCallbacks` invokes once a batch's loss has been computed.
    ///
    /// - Parameter loss: The loss for the batch; may be modified in place.
    /// - Returns: `.proceed` in this base implementation.
    open func afterBatch(loss: inout Tensor<S>) -> CallbackResult {
        return CallbackResult.proceed
    }

}

In [None]:
/// A callback that records the learning rate and the scalar loss after every batch.
///
/// The recorded values are readable through `losses` and `lrs`; only the recorder
/// itself can append to them.
class Recorder<M, O: Optimizer, S>: TrainingCallbacks<M, O, S>
    where O.Model == M, O.Scalar == S,
          M.Input == Tensor<S>, M.Output == Tensor<S> {
    /// The optimizer handed to `beforeTrain`.
    // NOTE(review): if `O` is a value type this is a copy taken when `beforeTrain`
    // ran, so later learning-rate changes would not be observed — confirm that the
    // optimizers used here have reference semantics.
    private var optimizer: O? = nil
    /// The scalarized loss of each batch, in the order the batches were seen.
    private(set) var losses: [S] = []
    /// The optimizer's learning rate at the end of each batch, parallel to `losses`.
    private(set) var lrs: [O.Scalar] = []

    /// Remembers the optimizer so its learning rate can be sampled after each batch.
    override func beforeTrain(model: inout M, optimizer: inout O) -> CallbackResult {
        self.optimizer = optimizer
        return .proceed
    }

    /// Appends the current learning rate and the scalarized `loss` to the records.
    ///
    /// - Precondition: `beforeTrain` has been called, so an optimizer is available.
    override func afterBatch(loss: inout Tensor<S>) -> CallbackResult {
        guard let optimizer = optimizer else {
            // Still a programmer error, as before, but now with an explanation.
            preconditionFailure("Recorder.afterBatch(loss:) called before beforeTrain(model:optimizer:)")
        }
        lrs.append(optimizer.learningRate)
        losses.append(loss.scalarized())
        return .proceed
    }
}

In [None]:
/// Simple SGD optimizer with a modifiable learning rate.
///
/// Each update moves every `Tensor<Scalar>` variable of the model against its
/// gradient: `variable -= learningRate * gradient`.
public class SettableSGD<Model: Layer>: Optimizer
    where Model.AllDifferentiableVariables == Model.CotangentVector {
    /// The learning rate. Assigning a negative value fails a precondition.
    public var learningRate: Float {
        willSet(newLearningRate) {
            precondition(newLearningRate >= 0, "Learning rate must be non-negative")
        }
    }

    /// Creates an optimizer.
    ///
    /// - Parameter learningRate: The initial learning rate; must be non-negative.
    ///   Defaults to `0.01`.
    public init(learningRate: Float = 0.01) {
        precondition(learningRate >= 0, "Learning rate must be non-negative")
        self.learningRate = learningRate
    }

    /// Applies one gradient-descent step to `model`, in place.
    ///
    /// - Parameters:
    ///   - model: The differentiable variables to update.
    ///   - direction: The gradient of the loss with respect to those variables.
    public func update(_ model: inout Model.AllDifferentiableVariables,
                       along direction: Model.CotangentVector) {
        for kp in model.recursivelyAllWritableKeyPaths(to: Tensor<Scalar>.self) {
            // Descend: step *against* the gradient. Adding it, as this code used to,
            // performs gradient ascent and drives the loss up.
            model[keyPath: kp] -= learningRate * direction[keyPath: kp]
        }
    }
}


In [None]:
// Create an optimizer with the initializer's default learning rate (0.01).
let foo = SettableSGD<Dense<Float>>()

In [None]:
// Read the current learning rate.
foo.learningRate

In [None]:
// The learning rate can be reassigned after construction; the new value must be non-negative.
foo.learningRate = 0.2

In [None]:
// Read the learning rate again, after the assignment above.
foo.learningRate

In [None]:
/// A non-generalized learning rate scheduler.
///
/// Before each batch the step counter is advanced and the optimizer's learning rate
/// is set to the schedule's value for that step. The counter starts at 0 and is
/// incremented first, so the first batch is scheduled with step 1.
class LearningRateScheduler<M, O: SettableSGD<M>>: TrainingCallbacks<M, O, Float>
    where O.Model == M,
          M.Input == Tensor<Float>, M.Output == Tensor<Float> {

    // A learning rate schedule from step to float.
    typealias ScheduleFunc = (Int) -> Float

    /// The optimizer whose learning rate is driven; set in `beforeTrain`.
    private var target: O?
    /// Maps a step number to the learning rate to use for that step.
    private let schedule: ScheduleFunc
    /// Number of batches started so far.
    private var stepCount = 0

    /// Creates a scheduler.
    ///
    /// - Parameter scheduler: Function from step number to learning rate.
    init(scheduler: @escaping ScheduleFunc) {
        schedule = scheduler
    }

    /// Captures the optimizer so later batches can adjust its learning rate.
    override func beforeTrain(model: inout M, optimizer: inout O) -> CallbackResult {
        target = optimizer
        return .proceed
    }

    /// Advances the step counter and installs the scheduled learning rate.
    ///
    /// Traps if no optimizer has been captured by `beforeTrain` yet.
    override func beforeBatch() -> CallbackResult {
        stepCount += 1
        let scheduledRate = schedule(stepCount)
        target!.learningRate = scheduledRate
        return .proceed
    }

}

In [None]:
/// A callback that fans each hook out to a list of callbacks, in order.
///
/// For every hook, the children are invoked one after another. The first child that
/// returns anything other than `.proceed` ends the fan-out, and its result is
/// returned; later children are not invoked. If every child returns `.proceed`,
/// so does this callback.
class SequentialCallbacks<M, O: Optimizer, S>: TrainingCallbacks<M, O, S>
    where O.Model == M, O.Scalar == S,
          M.Input == Tensor<S>, M.Output == Tensor<S> {

    /// The wrapped callbacks, in invocation order.
    private let children: [TrainingCallbacks<M, O, S>]

    /// Wraps the given array of callbacks.
    init(_ callbacks: [TrainingCallbacks<M, O, S>]) {
        children = callbacks
    }

    /// Wraps the callbacks given as a variadic list.
    convenience init(_ callbacks: TrainingCallbacks<M, O, S>...) {
        self.init(callbacks)
    }

    /// Forwards `beforeTrain` to each child until one returns `.skip` or `.stop`.
    override func beforeTrain(model: inout M, optimizer: inout O) -> CallbackResult {
        for child in children {
            let outcome = child.beforeTrain(model: &model, optimizer: &optimizer)
            guard case .proceed = outcome else { return outcome }
        }
        return .proceed
    }

    // TODO: Figure out what to pass here!
    /// Forwards `beforeBatch` to each child until one returns `.skip` or `.stop`.
    override func beforeBatch() -> CallbackResult {
        for child in children {
            let outcome = child.beforeBatch()
            guard case .proceed = outcome else { return outcome }
        }
        return .proceed
    }

    /// Forwards `afterBatch` to each child until one returns `.skip` or `.stop`.
    override func afterBatch(loss: inout Tensor<S>) -> CallbackResult {
        for child in children {
            let outcome = child.afterBatch(loss: &loss)
            guard case .proceed = outcome else { return outcome }
        }
        return .proceed
    }
}

In [None]:
/// A training loop, now improved with callbacks!
///
/// Makes a single pass over `dataset`, consulting `callbacks` at three points and
/// honoring the `CallbackResult` each hook returns:
///
/// * `beforeTrain` — `.stop` returns without training. `.skip` is treated like
///   `.proceed`, because no training step exists yet to skip.
/// * `beforeBatch` — `.skip` moves on to the next batch without computing anything;
///   `.stop` ends training.
/// * `afterBatch` — `.skip` moves on to the next batch without printing the loss or
///   applying the optimizer update; `.stop` ends training, likewise without the update.
///
/// When every hook returns `.proceed`, each batch's loss is printed and the optimizer
/// updates the variables at `variablesKeyPath`.
///
/// - Parameters:
///   - model: The model to train; modified in place.
///   - variablesKeyPath: Key path from the model to its differentiable variables.
///   - dataset: The batches to iterate over.
///   - optimizer: The optimizer that applies each update.
///   - loss: The loss function, invoked as `loss(predictions, labels)`.
///   - callbacks: The hooks to consult during training.
public func trainWithCallbacks<M, O: Optimizer, S>(
    _ model: inout M,
    at variablesKeyPath: WritableKeyPath<M, M.AllDifferentiableVariables>,
    on dataset: Dataset<Example<S, S>>,
    using optimizer: inout O,
    loss: @escaping @differentiable (Tensor<S>, Tensor<S>) -> Tensor<S>,
    callbacks: TrainingCallbacks<M, O, S>
) where O.Model == M, O.Scalar == S,
        M.Input == Tensor<S>, M.Output == Tensor<S>
{
    let context = Context(learningPhase: .training)
    switch callbacks.beforeTrain(model: &model, optimizer: &optimizer) {
    case .stop:
        return
    case .proceed, .skip:
        break
    }
    for batch in dataset {
        // In the switches below, `break` leaves the switch and `continue` advances the loop.
        switch callbacks.beforeBatch() {  // TODO: pass in batch!
        case .proceed:
            break
        case .skip:
            continue
        case .stop:
            return
        }
        let (x, y) = (batch.data, batch.labels)
        // Named `batchLoss` so it does not share a name with the `loss` function
        // parameter that the closure below calls.
        var (batchLoss, (𝛁model, _)) = model.valueWithGradient(at: y) { (model, y) -> Tensor<S> in
            let preds = model.applied(to: x, in: context)
            return loss(preds, y)
        }
        switch callbacks.afterBatch(loss: &batchLoss) {
        case .proceed:
            break
        case .skip:
            continue
        case .stop:
            return
        }
        print(batchLoss)
        optimizer.update(&model[keyPath: variablesKeyPath], along: 𝛁model)
    }
}