In [None]:
%install '.package(path: "$cwd/FastaiNotebook_04_callbacks")' FastaiNotebook_04_callbacks

Installing packages:
	.package(path: "/home/ubuntu/fastai_docs/dev_swift/FastaiNotebook_04_callbacks")
		FastaiNotebook_04_callbacks
With SwiftPM flags: []
Working in: /tmp/tmptubdev2g
Fetching https://github.com/mxcl/Path.swift
Fetching https://github.com/JustHTTP/Just
Completed resolution in 3.56s
Cloning https://github.com/JustHTTP/Just
Resolving https://github.com/JustHTTP/Just at 0.7.1
Cloning https://github.com/mxcl/Path.swift
Resolving https://github.com/mxcl/Path.swift at 0.16.2
Compile Swift Module 'Path' (9 sources)
Compile Swift Module 'Just' (1 sources)
Compile Swift Module 'FastaiNotebook_04_callbacks' (6 sources)
Compile Swift Module 'jupyterInstalledPackages' (1 sources)
Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so
Initializing Swift...
Loading library...
Installation complete!


## Load data

In [None]:
import FastaiNotebook_04_callbacks
%include "EnableIPythonDisplay.swift"
IPythonDisplay.shell.enable_matplotlib("inline")

('inline', 'module://ipykernel.pylab.backend_inline')


In [None]:
// export
import Path
import TensorFlow

In [None]:
// Data bunch shared by every learner created in this notebook.
// NOTE(review): `flat: true` presumably yields flattened image vectors (matching
// the 784-wide model input used here) — confirm against `mnistDataBunch`.
let data = mnistDataBunch(flat: true)

In [None]:
// Problem dimensions. `m`, `nHid` and `c` are passed to `BasicModel` as
// `nIn`, `nHid` and `nOut` respectively by `modelInit`.
// NOTE(review): `n` presumably is the number of training examples — confirm.
let n = 60000
let m = 784
let nHid = 50
let c = 10

In [None]:
/// Builds a fresh `BasicModel` sized by the notebook-level constants
/// (`m` inputs, `nHid` hidden units, `c` outputs).
func modelInit() -> BasicModel {
    let model = BasicModel(nIn: m, nHid: nHid, nOut: c)
    return model
}

In [None]:
// TODO: When TF-421 is fixed, switch back to the normal `softmaxCrossEntropy`.

/// Mean softmax cross-entropy between `features` (logits) and `labels`.
///
/// The derivative is supplied by `_vjpSoftmaxCrossEntropy` rather than being
/// derived from this body.
///
/// - Parameters:
///   - features: Logits forwarded to `Raw.softmaxCrossEntropyWithLogits`.
///   - labels: Labels forwarded to `Raw.softmaxCrossEntropyWithLogits`.
/// - Returns: The mean of the `loss` component returned by the raw op.
@differentiable(vjp: _vjpSoftmaxCrossEntropy)
func softmaxCrossEntropy1<Scalar: TensorFlowFloatingPoint>(
    _ features: Tensor<Scalar>, _ labels: Tensor<Scalar>
) -> Tensor<Scalar> {
    let rawResult = Raw.softmaxCrossEntropyWithLogits(features: features, labels: labels)
    let lossValues = rawResult.loss
    return lossValues.mean()
}

/// Value-and-pullback for `softmaxCrossEntropy1`.
///
/// - Returns: The mean loss, plus a pullback that scales the gradient returned by
///   the raw op by `v / batchSize` (where `batchSize` is the first entry of
///   `features.shapeTensor`) and reports a scalar zero for `labels`.
@usableFromInline
func _vjpSoftmaxCrossEntropy<Scalar: TensorFlowFloatingPoint>(
    features: Tensor<Scalar>, labels: Tensor<Scalar>
) -> (Tensor<Scalar>, (Tensor<Scalar>) -> (Tensor<Scalar>, Tensor<Scalar>)) {
    let (lossValues, logitsGrad) = Raw.softmaxCrossEntropyWithLogits(
        features: features, labels: labels)
    let batchSize = Tensor<Scalar>(features.shapeTensor[0])
    let pullback: (Tensor<Scalar>) -> (Tensor<Scalar>, Tensor<Scalar>) = { upstream in
        // The forward pass takes a mean, so the upstream gradient is divided
        // by the batch size before scaling the per-example gradient.
        let scaled = (upstream / batchSize) * logitsGrad
        return (scaled, Tensor<Scalar>(0))
    }
    return (lossValues.mean(), pullback)
}

## Stateful optimizer

In [None]:
//export
/// Base class for delegates that maintain per-parameter statistics for a
/// `StatefulOptimizer`.
///
/// The class is `open`, so every customisation point is `open` as well and a
/// public initializer is provided; otherwise code outside the defining module
/// could neither construct a subclass nor override anything but `name`.
open class StatDelegate<Scalar: TensorFlowFloatingPoint> {
    /// Creates a delegate. Declared explicitly so subclasses in other modules can
    /// call `super.init()`.
    public init() {}

    /// Key under which the optimizer stores this delegate's state.
    /// The base implementation returns the empty string.
    open var name: String { return "" }

    /// Hyperparameter defaults merged into the optimizer's config.
    /// The base implementation contributes nothing.
    open var defaultConfig: [String: Scalar] { return [:] }

    /// Updates the statistics for one parameter tensor.
    ///
    /// The base implementation does nothing.
    ///
    /// - Parameters:
    ///   - state: Per-tensor statistics keyed by delegate name; modified in place.
    ///   - param: Current value of the parameter tensor.
    ///   - direction: Gradient for `param`.
    ///   - config: Shared hyperparameters; may be modified in place.
    open func update(
        state: inout [String: Tensor<Scalar>],
        for param: Tensor<Scalar>,
        along direction: Tensor<Scalar>,
        config: inout [String: Scalar]
    ) { }
}

//export
/// Base class for delegates that apply one stage of a parameter update inside a
/// `StatefulOptimizer`.
///
/// The class is `open`, so its customisation points are `open` too and a public
/// initializer is provided; without them the type could not be subclassed or
/// customised from outside the defining module.
open class StepDelegate<Scalar: TensorFlowFloatingPoint> {
    /// Creates a delegate. Declared explicitly so subclasses in other modules can
    /// call `super.init()`.
    public init() {}

    /// Hyperparameter defaults merged into the optimizer's config.
    /// The base implementation contributes nothing.
    open var defaultConfig: [String: Scalar] { return [:] }

    /// Applies this step to one parameter tensor.
    ///
    /// The base implementation does nothing.
    ///
    /// - Parameters:
    ///   - param: Parameter tensor; modified in place.
    ///   - direction: Gradient for `param`; may be modified in place so later
    ///     step delegates see the adjusted value.
    ///   - state: Per-tensor statistics keyed by stat-delegate name.
    ///   - config: Shared hyperparameters; may be modified in place.
    open func update(
        param: inout Tensor<Scalar>,
        along direction: inout Tensor<Scalar>,
        state: [String: Tensor<Scalar>],
        config: inout [String: Scalar]
    ) { }
}

In [None]:
//export
/// An optimizer assembled from delegates: stat delegates maintain per-parameter
/// statistics and step delegates use them to update the parameters.
class StatefulOptimizer<Model: Layer,
                        Scalar: TensorFlowFloatingPoint>: Optimizer
    where Model.AllDifferentiableVariables == Model.CotangentVector {
    /// Hyperparameters shared by all delegates, keyed by name.
    var config: [String: Scalar]
    /// Shortcut for `config["learningRate"]`; reading traps if the key is absent.
    var learningRate: Scalar {
        get { return config["learningRate"]! }
        set { config["learningRate"] = newValue }
    }
    /// One model-shaped container of statistics per stat-delegate name.
    var states: [String: Model.AllDifferentiableVariables]
    var statDelegates: [StatDelegate<Scalar>]
    var stepDelegates: [StepDelegate<Scalar>]

    /// Creates the optimizer.
    ///
    /// Defaults from the step delegates are merged first, then those from the
    /// stat delegates, and finally `config`; on a key clash the later value wins.
    /// Each stat delegate's state starts at `.zero`.
    init(
        stepDelegates: [StepDelegate<Scalar>],
        statDelegates: [StatDelegate<Scalar>],
        config: [String: Scalar]
    ) {
        var mergedConfig: [String: Scalar] = [:]
        var initialStates: [String: Model.AllDifferentiableVariables] = [:]
        for delegate in stepDelegates {
            mergedConfig.merge(delegate.defaultConfig) { _, incoming in incoming }
        }
        for delegate in statDelegates {
            mergedConfig.merge(delegate.defaultConfig) { _, incoming in incoming }
            initialStates[delegate.name] = Model.AllDifferentiableVariables.zero
        }
        mergedConfig.merge(config) { _, incoming in incoming }
        self.config = mergedConfig
        self.states = initialStates
        self.stepDelegates = stepDelegates
        self.statDelegates = statDelegates
    }

    /// Runs every stat delegate and then every step delegate on each
    /// `Tensor<Scalar>` reachable in `model`.
    ///
    /// - Parameters:
    ///   - model: Differentiable variables to update in place.
    ///   - direction: Gradients, laid out like `model`.
    func update(
        _ model: inout Model.AllDifferentiableVariables,
        along direction: Model.CotangentVector
    ) {
        let tensorPaths = model.recursivelyAllWritableKeyPaths(to: Tensor<Scalar>.self)
        for path in tensorPaths {
            var gradient = direction[keyPath: path]
            // Slice out this tensor's entry from every tracked statistic.
            var tensorState = states.mapValues { $0[keyPath: path] }
            for delegate in statDelegates {
                delegate.update(
                    state: &tensorState,
                    for: model[keyPath: path],
                    along: gradient,
                    config: &config
                )
            }
            // Persist the refreshed statistics before the parameter is stepped.
            for key in states.keys {
                states[key]![keyPath: path] = tensorState[key]!
            }
            for delegate in stepDelegates {
                delegate.update(
                    param: &model[keyPath: path],
                    along: &gradient,
                    state: tensorState,
                    config: &config
                )
            }
        }
    }
}

In [None]:
//export
/// Step delegate that subtracts `learningRate * direction` from the parameter.
class SGDStep<Scalar: TensorFlowFloatingPoint>: StepDelegate<Scalar> {
    /// Reads `config["learningRate"]` (traps if missing); `state` is unused.
    override func update(
        param: inout Tensor<Scalar>,
        along direction: inout Tensor<Scalar>,
        state: [String: Tensor<Scalar>],
        config: inout [String: Scalar]
    ) {
        let learningRate: Scalar = config["learningRate"]!
        let delta = learningRate * direction
        param -= delta
    }
}

In [None]:
//export
/// Step delegate that shrinks the parameter by the factor
/// `1 - learningRate * weightDecay`; the gradient is not touched.
class WeightDecay<Scalar: TensorFlowFloatingPoint>: StepDelegate<Scalar> {
    /// Contributes `weightDecay = 0.0` as a default.
    override var defaultConfig: [String: Scalar] {
        return ["weightDecay": 0.0]
    }

    /// Reads `learningRate` and `weightDecay` from `config` (traps if either is missing).
    override func update(
        param: inout Tensor<Scalar>,
        along direction: inout Tensor<Scalar>,
        state: [String: Tensor<Scalar>],
        config: inout [String: Scalar]
    ) {
        let learningRate = config["learningRate"]!
        let decay = config["weightDecay"]!
        let shrink: Scalar = 1 - learningRate * decay
        param *= shrink
    }
}

In [None]:
//export
/// Step delegate that adds `weightDecay * param` to the gradient; the parameter
/// itself is not modified here.
class L2Regularization<Scalar: TensorFlowFloatingPoint>: StepDelegate<Scalar> {
    /// Contributes `weightDecay = 0.0` as a default.
    override var defaultConfig: [String: Scalar] {
        return ["weightDecay": 0.0]
    }

    /// Reads `config["weightDecay"]` (traps if missing) and updates `direction` in place.
    override func update(
        param: inout Tensor<Scalar>,
        along direction: inout Tensor<Scalar>,
        state: [String: Tensor<Scalar>],
        config: inout [String: Scalar]
    ) {
        let decay = config["weightDecay"]!
        let penalty = decay * param
        direction += penalty
    }
}

In [None]:
//export
/// Stat delegate keeping a running average of the gradient under the
/// `"averageGrad"` state key.
class AverageGrad<Scalar: TensorFlowFloatingPoint>: StatDelegate<Scalar> {
    /// When `true`, new gradients are weighted by `1 - momentum`; otherwise by 1.
    let dampened: Bool

    init(dampened: Bool = false) {
        self.dampened = dampened
    }

    override var name: String { return "averageGrad" }

    /// Contributes `momentum = 0.9` as a default.
    override var defaultConfig: [String: Scalar] {
        return ["momentum": 0.9]
    }

    /// Decays the stored average by `momentum`, records the gradient weight in
    /// `config["momentumDampening"]`, then adds the weighted gradient.
    /// Traps if `state["averageGrad"]` or `config["momentum"]` is missing.
    override func update(
        state: inout [String: Tensor<Scalar>],
        for param: Tensor<Scalar>,
        along direction: Tensor<Scalar>,
        config: inout [String: Scalar]
    ) {
        let momentum = config["momentum"]!
        state["averageGrad"]! *= momentum
        let dampening: Scalar = dampened ? 1.0 - momentum : 1.0
        config["momentumDampening"] = dampening
        state["averageGrad"]! += dampening * direction
    }
}

In [None]:
// Plain SGD: a single `SGDStep`, no statistics tracked, learning rate 0.01.
let opt = StatefulOptimizer<BasicModel, Float>(stepDelegates: [SGDStep()], statDelegates: [], 
                                               config: ["learningRate":0.01])

In [None]:
// Learner wiring together the data, the custom cross-entropy loss, the optimizer
// `opt`, and a model produced by `modelInit`.
let learner = Learner(data: data, lossFunction: softmaxCrossEntropy1, optimizer: opt, initializingWith: modelInit)

In [None]:
// Install a `TrainEvalDelegate` and an `AvgMetric` delegate reporting `accuracy`.
learner.delegates = [Learner.TrainEvalDelegate(), Learner.AvgMetric(metrics: [accuracy])]

In [None]:
// Train for 2 epochs; the cell output lists one metrics row per epoch.
learner.fit(2)

Epoch 0: [0.46199676, 0.8835]
Epoch 1: [0.35245648, 0.9038]


In [None]:
//export
/// Step delegate that subtracts `learningRate * state["averageGrad"]` from the
/// parameter. Something must populate the `"averageGrad"` state key
/// (in this notebook, `AverageGrad`).
class MomentumStep<Scalar: TensorFlowFloatingPoint>: StepDelegate<Scalar> {
    /// Traps if `config["learningRate"]` or `state["averageGrad"]` is missing;
    /// `direction` is not used.
    override func update(
        param: inout Tensor<Scalar>,
        along direction: inout Tensor<Scalar>,
        state: [String: Tensor<Scalar>],
        config: inout [String: Scalar]
    ) {
        let learningRate = config["learningRate"]!
        let averagedGrad = state["averageGrad"]!
        param -= learningRate * averagedGrad
    }
}

In [None]:
// SGD with momentum: `AverageGrad` tracks the running gradient average and
// `MomentumStep` steps along it; learning rate 0.01.
let opt = StatefulOptimizer<BasicModel, Float>(stepDelegates: [MomentumStep()], statDelegates: [AverageGrad()], 
                                               config: ["learningRate":0.01])

In [None]:
// Fresh learner (new model from `modelInit`) driven by the momentum optimizer `opt`.
let learner = Learner(data: data, lossFunction: softmaxCrossEntropy1, optimizer: opt, initializingWith: modelInit)

In [None]:
// Install a `TrainEvalDelegate` and an `AvgMetric` delegate reporting `accuracy`.
learner.delegates = [Learner.TrainEvalDelegate(), Learner.AvgMetric(metrics: [accuracy])]

In [None]:
// Train for 2 epochs; the cell output lists one metrics row per epoch.
learner.fit(2)

Epoch 0: [0.24837255, 0.9279]
Epoch 1: [0.17791083, 0.9486]


In [None]:
//export
/// Stat delegate keeping a running average of the squared gradient under the
/// `"averageSquaredGrad"` state key.
class AverageSquaredGrad<Scalar: TensorFlowFloatingPoint>: StatDelegate<Scalar> {
    /// When `true`, new squared gradients are weighted by `1 - squareMomentum`;
    /// otherwise by 1.
    let dampened: Bool

    init(dampened: Bool = false) {
        self.dampened = dampened
    }

    override var name: String { return "averageSquaredGrad" }

    /// Contributes `squareMomentum = 0.99` as a default.
    override var defaultConfig: [String: Scalar] {
        return ["squareMomentum": 0.99]
    }

    /// Decays the stored average by `squareMomentum`, records the weight in
    /// `config["squareMomentumDampening"]`, then adds the weighted squared gradient.
    /// Traps if `state["averageSquaredGrad"]` or `config["squareMomentum"]` is missing.
    override func update(
        state: inout [String: Tensor<Scalar>],
        for param: Tensor<Scalar>,
        along direction: Tensor<Scalar>,
        config: inout [String: Scalar]
    ) {
        let squareMomentum = config["squareMomentum"]!
        state["averageSquaredGrad"]! *= squareMomentum
        let dampening: Scalar = dampened ? 1.0 - squareMomentum : 1.0
        config["squareMomentumDampening"] = dampening
        state["averageSquaredGrad"]! += dampening * direction.squared()
    }
}

In [None]:
//export
/// Stat delegate that adds 1 to the `"step"` state entry on every update.
class StepCount<Scalar: TensorFlowFloatingPoint>: StatDelegate<Scalar> {
    override var name: String { return "step" }

    /// Traps if `state["step"]` is missing; `param`, `direction` and `config`
    /// are not used.
    override func update(
        state: inout [String: Tensor<Scalar>],
        for param: Tensor<Scalar>,
        along direction: Tensor<Scalar>,
        config: inout [String: Scalar]
    ) {
        let previous = state["step"]!
        state["step"] = previous + 1.0
    }
}

In [None]:
//export
/// Computes `dampening * (1 - momentum^step) / (1 - momentum)`.
///
/// - Parameters:
///   - momentum: Decay factor of the running average.
///   - dampening: Weight applied to each new sample in the running average.
///   - step: Number of updates so far, as a tensor.
/// - Returns: The correction term used by `AdamStep` and `LambStep` as a divisor.
func debias<Scalar: TensorFlowFloatingPoint>(
    momentum: Scalar,
    dampening: Scalar,
    step: Tensor<Scalar> 
) -> Tensor<Scalar> {
    let decayed = pow(momentum, step)
    let numerator = dampening * (1 - decayed)
    return numerator / (1 - momentum)
}

In [None]:
//export
/// Adam-style step delegate: steps along the debiased running gradient average,
/// divided by the debiased root-mean-square gradient plus `epsilon`.
///
/// Reads the `"averageGrad"`, `"averageSquaredGrad"` and `"step"` state keys and
/// the `momentum`, `momentumDampening`, `squareMomentum`,
/// `squareMomentumDampening`, `learningRate` and `epsilon` config keys; a
/// missing key traps.
class AdamStep<Scalar: TensorFlowFloatingPoint>: StepDelegate<Scalar> {
    /// Contributes `epsilon = 1e-5` as a default.
    override var defaultConfig: [String: Scalar] {
        return ["epsilon": 1e-5]
    }

    override func update(
        param: inout Tensor<Scalar>,
        along direction: inout Tensor<Scalar>,
        state: [String: Tensor<Scalar>],
        config: inout [String: Scalar]
    ) {
        // Fold the first-moment correction into the learning rate.
        let gradCorrection = debias(
            momentum: config["momentum"]!,
            dampening: config["momentumDampening"]!,
            step: state["step"]!
        )
        let debiasedLearningRate = config["learningRate"]! / gradCorrection

        let squareCorrection = debias(
            momentum: config["squareMomentum"]!,
            dampening: config["squareMomentumDampening"]!,
            step: state["step"]!
        )
        let debiasedRMSGrad = sqrt(state["averageSquaredGrad"]! / squareCorrection)
            + config["epsilon"]!

        param -= debiasedLearningRate * state["averageGrad"]! / debiasedRMSGrad
    }
}

In [None]:
// Adam: `AdamStep` consumes the running gradient average, the running squared
// gradient average and the step count; learning rate 0.01.
let opt = StatefulOptimizer<BasicModel, Float>(
    stepDelegates: [AdamStep()], 
    statDelegates: [AverageGrad(), AverageSquaredGrad(), StepCount()], 
    config: ["learningRate":0.01])

In [None]:
// Fresh learner (new model from `modelInit`) driven by the Adam optimizer `opt`.
let learner = Learner(data: data, lossFunction: softmaxCrossEntropy1, optimizer: opt, initializingWith: modelInit)

In [None]:
// Install a `TrainEvalDelegate` and an `AvgMetric` delegate reporting `accuracy`.
learner.delegates = [Learner.TrainEvalDelegate(), Learner.AvgMetric(metrics: [accuracy])]

In [None]:
// Train for 2 epochs; the cell output lists one metrics row per epoch.
learner.fit(2)

Epoch 0: [0.18092719, 0.9477]
Epoch 1: [0.14641517, 0.9578]


In [None]:
/// LAMB-style step delegate: an Adam-like step with weight decay, rescaled by
/// the ratio of the parameter's RMS to the step's RMS (capped at 10).
///
/// Reads the `"averageGrad"`, `"averageSquaredGrad"` and `"step"` state keys and
/// the `momentum`, `momentumDampening`, `squareMomentum`,
/// `squareMomentumDampening`, `epsilon`, `weightDecay` and `learningRate` config
/// keys; a missing key traps.
class LambStep<Scalar: TensorFlowFloatingPoint>: StepDelegate<Scalar> {
    /// Contributes `epsilon = 1e-6` and `weightDecay = 0.0` as defaults.
    override var defaultConfig: [String: Scalar] {
        return ["epsilon": 1e-6, "weightDecay": 0.0]
    }
    override func update(
        param: inout Tensor<Scalar>,
        along direction: inout Tensor<Scalar>,
        state: [String: Tensor<Scalar>],
        config: inout [String: Scalar]
    ) {
        let debiasedAverageGrad = state["averageGrad"]! / debias(
            momentum: config["momentum"]!,
            dampening: config["momentumDampening"]!,
            step: state["step"]!
        )
        // Epsilon is added AFTER the square root, as in `AdamStep`. Adding it under
        // the root would turn the default 1e-6 into an effective 1e-3 stabiliser.
        let debiasedRMSGrad = sqrt(state["averageSquaredGrad"]! / debias(
            momentum: config["squareMomentum"]!,
            dampening: config["squareMomentumDampening"]!,
            step: state["step"]!
        )) + config["epsilon"]!
        let step = debiasedAverageGrad / debiasedRMSGrad + config["weightDecay"]! * param
        // Trust ratio: RMS of the parameter over RMS of the proposed step.
        let r1 = sqrt((param * param).mean())
        let r2 = sqrt((step * step).mean())
        let factor = min(r1 / r2, Scalar(10.0))
        param -= config["learningRate"]! * factor * step
    }
}

## Export

In [None]:
// Run the notebook-to-script conversion on this notebook, located in the current
// working directory.
// NOTE(review): presumably only cells marked `//export` are emitted — confirm
// against `notebookToScript`.
notebookToScript(fname: (Path.cwd / "09_optimizer.ipynb").string)