In [None]:
%install-location $cwd/swift-install
%install '.package(path: "$cwd/FastaiNotebook_02a_why_sqrt5")' FastaiNotebook_02a_why_sqrt5

Installing packages:
	.package(path: "/home/ubuntu/git/fastai_docs/dev_swift/FastaiNotebook_02a_why_sqrt5")
		FastaiNotebook_02a_why_sqrt5
With SwiftPM flags: []
Working in: /tmp/tmpn9q2vnh2/swift-install
/home/ubuntu/swift/usr/bin/swift-build: /home/ubuntu/anaconda3/lib/libuuid.so.1: no version information available (required by /home/ubuntu/swift/usr/lib/swift/linux/libFoundation.so)
/home/ubuntu/swift/usr/bin/swift-build: /home/ubuntu/anaconda3/lib/libcurl.so.4: no version information available (required by /home/ubuntu/swift/usr/lib/swift/linux/libFoundation.so)
/home/ubuntu/swift/usr/bin/swiftc: /home/ubuntu/anaconda3/lib/libuuid.so.1: no version information available (required by /home/ubuntu/swift/usr/bin/swiftc)
Compile Swift Module 'jupyterInstalledPackages' (1 sources)
/home/ubuntu/swift/usr/bin/swiftc: /home/ubuntu/anaconda3/lib/libuuid.so.1: no version information available (required by /home/ubuntu/swift/usr/bin/swiftc)

/home/ubuntu/swift/usr/bin/swift: /home/ubuntu/anaco

In [None]:
//export
import Path
import TensorFlow

In [None]:
import FastaiNotebook_02a_why_sqrt5

In [None]:
// export
/// Shorthand for a tensor of 32-bit integers (used for labels in this notebook).
public typealias TI = Tensor<Int32>

In [None]:
let y = TI(repeating: 0, shape: [1024])

### Data

In [None]:
// Load the MNIST training and validation tensors (with `flat: true`).
// Declared `var` because the inputs are reassigned when they are normalized below.
var (xTrain,yTrain,xValid,yValid) = loadMNIST(path: Path.home/".fastai"/"data"/"mnist_tst", flat: true)

In [None]:
// Mean and standard deviation taken over the whole training input tensor.
let trainMean = xTrain.mean()
let trainStd  = xTrain.standardDeviation()

In [None]:
// Normalize both sets with the *training* statistics, so the validation
// inputs go through exactly the same transformation as the training inputs.
xTrain = normalize(xTrain, mean: trainMean, std: trainStd)
xValid = normalize(xValid, mean: trainMean, std: trainStd)

In [None]:
// n, m: sizes of the first two dimensions of the training inputs.
let (n,m) = (xTrain.shape[0],xTrain.shape[1])
// c: largest label value plus one; used below as the model's output size.
let c = yTrain.max().scalarized()+1
print(n,m,c)

In [None]:
let nHid = 50

In [None]:
/// A two-layer fully connected network: a dense layer with `relu`
/// activation followed by a dense output layer.
public struct MyModel: Layer {
    /// Hidden layer (`nIn` -> `nHid`, `relu` activation).
    public var layer1: FADense<Float>
    /// Output layer (`nHid` -> `nOut`).
    public var layer2: FADense<Float>

    /// Builds the two dense layers.
    ///
    /// - Parameters:
    ///   - nIn: Input size of the first layer.
    ///   - nHid: Output size of the first layer and input size of the second.
    ///   - nOut: Output size of the second layer.
    public init(nIn: Int, nHid: Int, nOut: Int) {
        self.layer1 = FADense(inputSize: nIn, outputSize: nHid, activation: relu)
        self.layer2 = FADense(inputSize: nHid, outputSize: nOut)
    }

    /// Runs `input` through `layer1` and then `layer2`.
    @differentiable
    public func call(_ input: TF) -> TF {
        let hidden = layer1(input)
        return layer2(hidden)
    }
}

In [None]:
var model = MyModel(nIn: m, nHid: nHid, nOut: Int(c))

In [None]:
let pred = model(xTrain)

### Cross entropy loss

In [None]:
/// Log of the softmax of `activations`, computed directly from the definition:
/// exponentiate, divide by the sum over the last axis, then take the log.
func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar>
    where Scalar: TensorFlowFloatingPoint {
    let numerators = exp(activations)
    // `alongAxes:` keeps the reduced axis, so the division broadcasts.
    let denominators = numerators.sum(alongAxes: -1)
    return log(numerators / denominators)
}

In [None]:
let smPred = logSoftmax(pred)

In [None]:
yTrain[0..<3]

In [None]:
(smPred[0][5],smPred[1][0],smPred[2][4])

There is no fancy indexing yet, so we have to use `gatherNd` to pick the entries we want out of our log-softmax predictions.

In [None]:
/// Negative log-likelihood loss.
///
/// For every row `i` of `input`, picks the entry at column `target[i]` and
/// returns minus the mean of the picked entries.
///
/// - Parameters:
///   - input: Per-row scores, indexed as `[row, class]`. In this notebook it is
///     always called with the output of `logSoftmax`.
///   - target: One class index per row of `input`.
/// - Returns: `-mean(input[i, target[i]])` over all rows.
func nll<Scalar>(_ input: Tensor<Scalar>, _ target :TI) -> Tensor<Scalar> 
    where Scalar:TensorFlowFloatingPoint{
        // Row indices 0..<rowCount are derived from `input` itself rather than a
        // fixed dataset size, so the loss works for any number of rows
        // (mini-batches, validation set, ...).
        let rowCount = Int32(input.shape[0])
        let idx: TI = Raw.range(start: Tensor(0), limit: Tensor(rowCount), delta: Tensor(1))
        // Pair each row index with its target class: shape [rowCount, 2].
        let indices = Raw.concat(concatDim: Tensor(1), [idx.expandingShape(at: 1), target.expandingShape(at: 1)])
        // One picked entry per (row, class) pair.
        let losses = Raw.gatherNd(params: input, indices: indices)
        return -losses.mean()
    }

In [None]:
nll(smPred, yTrain)

In [None]:
time(repeating: 100){ let _ = nll(smPred, yTrain) }

Simplify `logSoftmax` using the identity $\log(a/b) = \log a - \log b$:

In [None]:
/// Log-softmax written as `x - log(sum(exp(x)))`, with the sum taken over the
/// last axis.
func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar>
    where Scalar: TensorFlowFloatingPoint {
    let sumOfExps = exp(activations).sum(alongAxes: -1)
    let logNormalizer = log(sumOfExps)
    return activations - logNormalizer
}

In [None]:
let smPred = logSoftmax(pred)

In [None]:
nll(smPred, yTrain)

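Use the LogSumExp trick: subtracting the maximum $m$ before exponentiating avoids overflow, since $\log\sum_j e^{x_j} = m + \log\sum_j e^{x_j - m}$.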

In [None]:
smPred.max(alongAxes: -1).shape

In [None]:
/// `log(sum(exp(x)))` over the last axis, computed by first subtracting the
/// maximum along that axis and adding it back after the log.
func logSumExp<Scalar>(_ x: Tensor<Scalar>) -> Tensor<Scalar>
    where Scalar: TensorFlowFloatingPoint {
    let peak = x.max(alongAxes: -1)
    let shifted = x - peak
    let sumOfExps = exp(shifted).sum(alongAxes: -1)
    return peak + log(sumOfExps)
}

In [None]:
/// Log-softmax expressed through `logSumExp`: `x - logSumExp(x)`.
func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar>
    where Scalar: TensorFlowFloatingPoint {
    let logNormalizer = logSumExp(activations)
    return activations - logNormalizer
}

In [None]:
let smPred = logSoftmax(pred)

In [None]:
nll(smPred, yTrain)

In S4TF, the NLL loss is combined with log-softmax in `softmaxCrossEntropy`:

In [None]:
let loss = softmaxCrossEntropy(logits: pred, labels: yTrain)
loss

In [None]:
time(repeating: 100){ let _ = nll(logSoftmax(pred), yTrain)}

In [None]:
time(repeating: 100){ let _ = softmaxCrossEntropy(logits: pred, labels: yTrain)}

## Basic training loop

Basically the training loop repeats over the following steps:
- get the output of the model on a batch of inputs
- compare the output to the labels we have and compute a loss
- calculate the gradients of the loss with respect to every parameter of the model
- update said parameters with those gradients to make them a little bit better

In [None]:
// export
/// Fraction of rows whose highest-scoring index (along axis 1) equals the target.
///
/// - Parameters:
///   - output: Scores with the candidate classes laid out along axis 1.
///   - target: Expected index for each row of `output`.
/// - Returns: Mean of the element-wise matches, as a `TF`.
public func accuracy(_ output: TF, _ target: TI) -> TF {
    let predicted = output.argmax(squeezingAxis: 1)
    let isCorrect = predicted .== target
    // Convert the boolean matches to floats so they can be averaged.
    return TF(isCorrect).mean()
}

In [None]:
print(accuracy(pred, yTrain))

In [None]:
let bs=64                         // batch size
let xb = xTrain[0..<bs]          // a mini-batch from x
let preds = model(xb) //predictions
print(preds[0], preds.shape)

In [None]:
let yb = yTrain[0..<bs]
let loss = softmaxCrossEntropy(logits: preds, labels: yb)

In [None]:
print(accuracy(preds, yb))

In [None]:
let lr:Float = 0.5   // learning rate
let epochs = 1      // how many epochs to train for

Then we can compute the loss and its gradients with respect to the model's parameters:

In [None]:
let (loss, grads) = model.valueWithGradient { model -> TF in
    let preds = model(xb)
    return softmaxCrossEntropy(logits: preds, labels: yb)
}

Loop by hand

In [None]:
// Plain SGD written out by hand: one pass over the training set per epoch.
for _ in 1...epochs{
    // Stride over batch start offsets up to `n` and clamp the end index, so the
    // final batch is included even when it holds fewer than `bs` rows.
    for startIdx in stride(from: 0, to: n, by: bs){
        let endIdx = min(startIdx + bs, n)
        let xb = xTrain[startIdx..<endIdx]
        let yb = yTrain[startIdx..<endIdx]
        // Gradient of the batch loss with respect to the model; the loss value
        // itself is not needed here.
        let (_, grads) = model.valueWithGradient { model -> TF in
            let preds = model(xb)
            return softmaxCrossEntropy(logits: preds, labels: yb)
        }
        // Gradient-descent step on every parameter, spelled out one by one.
        model.layer1.weight -= lr * grads.layer1.weight
        model.layer1.bias   -= lr * grads.layer1.bias
        model.layer2.weight -= lr * grads.layer2.weight
        model.layer2.bias   -= lr * grads.layer2.bias
    }
}

In [None]:
let preds = model(xValid)
accuracy(preds, yValid)

86% in one epoch, not too bad!

Naming all the parameters is a bit boring. We can use `AllDifferentiableVariables` objects to access them all.

In [None]:
// Same SGD loop, but stepping every parameter generically through key paths.
for _ in 1...epochs{
    // Stride over batch start offsets up to `n` and clamp the end index, so the
    // final batch is included even when it holds fewer than `bs` rows.
    for startIdx in stride(from: 0, to: n, by: bs){
        let endIdx = min(startIdx + bs, n)
        let xb = xTrain[startIdx..<endIdx]
        let yb = yTrain[startIdx..<endIdx]
        let (_, grads) = model.valueWithGradient { model -> TF in
            let preds = model(xb)
            return softmaxCrossEntropy(logits: preds, labels: yb)
        }
        // `allDifferentiableVariables` hands back a value copy, so update the
        // copy and then assign it back; otherwise the model would never change.
        var parameters = model.allDifferentiableVariables
        for kp in parameters.keyPaths{ 
            parameters[keyPath: kp] -= lr * grads[keyPath: kp]
        }
        model.allDifferentiableVariables = parameters
    }
}

Then we can use an S4TF optimizer to do the step for us.

In [None]:
let optimizer = SGD(for: model, learningRate: lr)

In [None]:
// Same loop again, with the parameter step delegated to the optimizer.
for _ in 1...epochs{
    // Stride over batch start offsets up to `n` and clamp the end index, so the
    // final batch is included even when it holds fewer than `bs` rows.
    for startIdx in stride(from: 0, to: n, by: bs){
        let endIdx = min(startIdx + bs, n)
        let xb = xTrain[startIdx..<endIdx]
        let yb = yTrain[startIdx..<endIdx]
        let (_, grads) = model.valueWithGradient { model -> TF in
            let preds = model(xb)
            return softmaxCrossEntropy(logits: preds, labels: yb)
        }
        optimizer.update(&model.allDifferentiableVariables, along: grads)
    }
}

## Dataset

We can create a Swift `Dataset` from our tensors. It will automatically batch things for us.

In [None]:
// export
/// A pair of inputs and labels that can be stored as one `Dataset` element.
public struct DataBatch<Inputs: Differentiable & TensorGroup, Labels: TensorGroup>: TensorGroup {
    /// Inputs of the batch.
    public var xb: Inputs
    /// Labels of the batch.
    public var yb: Labels

    /// Creates a batch from the given inputs and labels.
    public init(xb: Inputs, yb: Labels) {
        self.xb = xb
        self.yb = yb
    }
}

In [None]:
let train_ds:Dataset<DataBatch> = Dataset(elements:DataBatch(xb:xTrain, yb:yTrain)).batched(bs)

In [None]:
// Training loop driven by the batched `Dataset` instead of manual slicing.
for _ in 1...epochs {
    for batch in train_ds {
        // Only the gradients are used; the loss value is discarded.
        let (_, gradients) = model.valueWithGradient { net -> TF in
            let logits = net(batch.xb)
            return softmaxCrossEntropy(logits: logits, labels: batch.yb)
        }
        optimizer.update(&model.allDifferentiableVariables, along: gradients)
    }
}

This `Dataset` can also do the shuffle for us:

In [None]:
// Same loop, iterating over a shuffled view of the dataset.
for _ in 1...epochs {
    let shuffledBatches = train_ds.shuffled(sampleCount: 60000, randomSeed: 42)
    for batch in shuffledBatches {
        // Only the gradients are used; the loss value is discarded.
        let (_, gradients) = model.valueWithGradient { net -> TF in
            let logits = net(batch.xb)
            return softmaxCrossEntropy(logits: logits, labels: batch.yb)
        }
        optimizer.update(&model.allDifferentiableVariables, along: gradients)
    }
}

### Training loop

In [None]:
/// Runs one pass over `ds`, updating `model` with `opt` after every batch.
///
/// - Parameters:
///   - model: The model to train; updated in place.
///   - ds: Dataset of `DataBatch` elements providing inputs (`xb`) and labels (`yb`).
///   - opt: Optimizer used to apply the gradients; passed `inout`.
///   - lossFunc: Differentiable loss taking the model output and the labels.
public func train<Opt: Optimizer, Label:TensorGroup>(
    _ model: inout Opt.Model,
    on ds: Dataset<DataBatch<Opt.Model.Input, Label>>,
    using opt: inout Opt,
    lossFunc: @escaping @differentiable (Opt.Model.Output, @nondiff Label) -> Tensor<Opt.Scalar>
) where Opt.Model: Layer,
        Opt.Model.Input: TensorGroup,
        Opt.Model.CotangentVector == Opt.Model.AllDifferentiableVariables,
        Opt.Scalar: TensorFlowFloatingPoint
{
    for batch in ds {
        // Differentiate the batch loss with respect to the model; the loss
        // value itself is not used.
        let (_, gradients) = model.valueWithGradient { net -> Tensor<Opt.Scalar> in
            let output = net(batch.xb)
            return lossFunc(output, batch.yb)
        }
        opt.update(&model.allDifferentiableVariables, along: gradients)
    }
}

In [None]:
var model = MyModel(nIn: m, nHid: nHid, nOut: Int(c))
var optimizer = SGD(for: model, learningRate: lr)

In [None]:
train(&model, on: train_ds, using: &optimizer, lossFunc: softmaxCrossEntropy)

In [None]:
let preds = model(xValid)
accuracy(preds, yValid)

### Export

In [None]:
notebookToScript(fname: Path.cwd / "03_minibatch_training.ipynb")