In [1]:
%install-location $cwd/swift-install
%install '.package(url: "https://github.com/mxcl/Path.swift", from: "0.16.1")' Path
%install '.package(path: "~/git/SwiftData")' Batcher

Installing packages:
	.package(url: "https://github.com/mxcl/Path.swift", from: "0.16.1")
		Path
	.package(path: "~/git/SwiftData")
		Batcher
With SwiftPM flags: []
Working in: /tmp/tmprjnwpybx/swift-install
/home/sgugger/swift/usr/bin/swift: /home/sgugger/anaconda3/lib/libuuid.so.1: no version information available (required by /home/sgugger/swift/usr/bin/swift)
/home/sgugger/swift/usr/bin/swift: /home/sgugger/anaconda3/lib/libuuid.so.1: no version information available (required by /home/sgugger/swift/usr/bin/swift)
/home/sgugger/swift/usr/bin/swift: /home/sgugger/anaconda3/lib/libuuid.so.1: no version information available (required by /home/sgugger/swift/usr/bin/swift)
/home/sgugger/swift/usr/bin/swiftc: /home/sgugger/anaconda3/lib/libuuid.so.1: no version information available (required by /home/sgugger/swift/usr/bin/swiftc)
/home/sgugger/swift/usr/bin/swiftc: /home/sgugger/anaconda3/lib/libuuid.so.1: no version information available (required by /home/sgugger/swift/usr/bin/swiftc

In [2]:
import Path
import TensorFlow
import Batcher
import Foundation

## Understanding the language model dataset

The task of a language model is to guess the next word in a stream of text. Given a list of tokenized and numericalized texts, we usually concatenate them all into one big stream, split that stream into the desired number of chunks (`batchSize` chunks of contiguous text), then read through those chunks `sequenceLength` tokens at a time.

Let's look at an example:

In [3]:
let items = [[0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22]]

In [4]:
let dataset = LanguageModelDataset(openItem: { $0 }, batchSize: 4, sequenceLength: 3, items: items)

Here our stream is the sequence of integers from 0 to 22. With a batchsize of 4, we split it in four chunks which are:
```
0,1,2,3,4
5,6,7,8,9
10,11,12,13,14
15,16,17,18,19
```
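The last three elements of the stream (20, 21 and 22) are left out of the chunks because we don't have a round multiple of 4 (20 only shows up as the very last target; the rest are thrown away).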

Then if read with a sequenceLength of 3, the first batch has for input
```
0,1,2
5,6,7
10,11,12
15,16,17
```
and for target the next words:
```
1,2,3
6,7,8
11,12,13
16,17,18
```

Let's put our dataset in a batcher to check it does all of this for us:

In [5]:
let batcher = Batcher(on: dataset, batchSize: 4)

In [6]:
for x in batcher { print(x) }

TensorPair<Int32, Int32>(input: [[ 0,  1,  2],
 [ 5,  6,  7],
 [10, 11, 12],
 [15, 16, 17]], target: [[ 1,  2,  3],
 [ 6,  7,  8],
 [11, 12, 13],
 [16, 17, 18]])
TensorPair<Int32, Int32>(input: [[ 3,  4],
 [ 8,  9],
 [13, 14],
 [18, 19]], target: [[ 4,  5],
 [ 9, 10],
 [14, 15],
 [19, 20]])


The first batch is as expected, and the second one has only a sequence length of 2 because our big chunks of text have a length of 5 here.

Behind the scenes, `LanguageModelDataset` implements a new collection which has the proper length and subscript, so that it returns the input/target pairs of text (and not the raw texts of varying lengths).

With the shuffle enabled, the texts are shuffled before being concatenated to form the stream. We just need to use `languageModelSample` as a `sampleIndices` function.

In [7]:
let batcher = Batcher(on: dataset, batchSize: 4, shuffle: true, sampleIndices: languageModelSample)

In [8]:
for x in batcher { print(x) }

TensorPair<Int32, Int32>(input: [[19, 20,  0],
 [ 3,  4, 11],
 [14, 15, 16],
 [ 5,  6,  7]], target: [[20,  0,  1],
 [ 4, 11, 12],
 [15, 16, 17],
 [ 6,  7,  8]])
TensorPair<Int32, Int32>(input: [[ 1,  2],
 [12, 13],
 [17, 18],
 [ 8,  9]], target: [[ 2,  3],
 [13, 14],
 [18,  5],
 [ 9, 10]])


## On the IMDB dataset

I'm not dealing with tokenization here, so I'm using fastai v2 to tokenize the IMDB dataset for me. To get this, install [fastai v2](https://github.com/fastai/fastai2) then run the following script from anywhere:
```
from fastai2.text.all import *
path = untar_data(URLs.IMDB)
tokenize_folder(path)
```

It will create the following folder with one file per tokenized text.

In [9]:
let dataPath = Path.home/".fastai"/"data"
let path = dataPath/"imdb_tok"

In [10]:
let fnames = collectFiles(under: path/"train", recurse: true, filtering: ["txt"])

In [11]:
fnames.count

25000


In each text file, the tokens are separated by spaces:

In [12]:
/// Loads a tokenized text file and returns its tokens.
///
/// - Parameters:
///   - fname: Location of the file to read; it is decoded as UTF-8.
///   - separator: Character that delimits tokens in the file (a space by default).
/// - Returns: The tokens in file order. Empty pieces produced by consecutive
///   separators are dropped, as `split(separator:)` does by default.
/// - Note: The read uses `try!`, so a missing or undecodable file stops the program.
func readTokenizedText(_ fname: Path, separator: Character = " ") -> [String] {
    let fileURL = URL(fileURLWithPath: fname.string)
    let contents = try! String(contentsOf: fileURL, encoding: .utf8)
    // `split` yields `Substring`s; convert them to standalone `String`s.
    let pieces = contents.split(separator: separator)
    return pieces.map(String.init)
}

To create a vocabulary where we throw away the rare words, we need to determine how many times each of them appears. The following function counts that for us and also returns the length of each text (which will save time later).

In [13]:
/// Reads every file once and gathers token statistics.
///
/// - Parameter fnames: The tokenized text files to scan, each read with
///   `readTokenizedText` and its default separator.
/// - Returns: A pair made of the number of tokens in each file (in the same
///   order as `fnames`) and the number of occurrences of each token across
///   all the files.
func countTokens(_ fnames: [Path]) -> ([Int], [String:Int]) {
    var occurrences: [String: Int] = [:]
    var textLengths: [Int] = []
    for file in fnames {
        let fileTokens = readTokenizedText(file)
        textLengths.append(fileTokens.count)
        // The `default:` subscript starts unseen tokens at zero.
        for token in fileTokens {
            occurrences[token, default: 0] += 1
        }
    }
    return (textLengths, occurrences)
}

In [14]:
let (lengths, counts) = countTokens(fnames)

Then the following function will create a vocabulary containing the most frequent words, up to `maxCount` entries, and with a minimum frequency of `minFreq` (NB: a language model can barely learn anything about words rarely present in the dataset).

In [15]:
/// Builds the vocabulary mappings from token occurrence counts.
///
/// Ids 0 and 1 are always reserved for the special tokens `"xxunk"` and
/// `"xxpad"`. The remaining tokens are ranked by decreasing count and given
/// the ids 2, 3, ... in that order.
///
/// - Parameters:
///   - counts: Number of occurrences of each token.
///   - minFreq: Tokens occurring fewer times than this are left out.
///   - maxCount: Upper bound on the vocabulary size, the two special tokens
///     included (they are always present, even when `maxCount` is below 2).
/// - Returns: `itos`, mapping an id to its token, and `stoi`, the reverse mapping.
func makeVocab(_ counts: [String:Int], minFreq: Int = 2, maxCount: Int = 60000)
-> (itos: [Int:String], stoi: [String:Int]) {
    let specials = ["xxunk", "xxpad"]
    var itos: [Int:String] = [:]
    var stoi: [String:Int] = [:]
    for (id, token) in specials.enumerated() {
        itos[id] = token
        stoi[token] = id
    }

    // Dictionary iteration order changes from one run to the next, so sorting
    // on the count alone would hand tokens with equal counts different ids on
    // every run. Ties are broken on the token itself to keep the ids stable.
    let ranked = counts
        .filter { !specials.contains($0.key) && $0.value >= minFreq }
        .sorted { lhs, rhs in
            lhs.value != rhs.value ? lhs.value > rhs.value : lhs.key < rhs.key
        }

    // Clamp at zero: `prefix` traps on a negative length.
    let room = max(0, maxCount - specials.count)
    for (rank, entry) in ranked.prefix(room).enumerated() {
        let id = rank + specials.count
        itos[id] = entry.key
        stoi[entry.key] = id
    }
    return (itos: itos, stoi: stoi)
}

In [16]:
let vocab = makeVocab(counts)

With our `vocab`, we can then numericalize each tokenized text, i.e. convert an array of strings to an array of integers.

In [17]:
/// Converts tokens to their integer ids.
///
/// - Parameters:
///   - tokens: The tokens to convert.
///   - stoi: Mapping from a token to its id.
/// - Returns: One id per token, in the same order; tokens missing from `stoi`
///   are given the id 0.
func numericalize(_ tokens: [String], with stoi: [String:Int]) -> [Int] {
    // Id handed to any token that `stoi` does not know about.
    let unknownIndex = 0
    return tokens.map { token in stoi[token, default: unknownIndex] }
}

In [18]:
let tst = readTokenizedText(fnames[0])
tst.count

374


In [19]:
lengths[0]

374


### Language model batcher

We can create a `LanguageModelDataset` from all our filenames, providing a function that can read them. Since it will need all the lengths of every sample to work, we can provide the array of lengths of each text to speed up the init (if we don't, it will make a pass over the dataset to compute them).

In [20]:
// Language-model dataset over the tokenized files in `fnames`.
// `openItem` turns one file into token ids: it reads the tokens with
// `readTokenizedText` and maps them through `vocab.stoi`.
// `lengths` comes from `countTokens`, so the per-text lengths are passed in
// rather than left for the initializer to work out.
let dataset = LanguageModelDataset(
    openItem: { numericalize(readTokenizedText($0), with: vocab.stoi) }, 
    batchSize: 64, 
    sequenceLength: 72, 
    items: fnames, 
    lengths: lengths
)

And we can batch our samples:

In [21]:
let batcher = Batcher(on: dataset, batchSize: 64, numWorkers: 4, shuffle: true, sampleIndices: languageModelSample)

In [22]:
// Grab the first batch: the predicate accepts everything, so `first` returns
// the first element produced. The `!` traps if the batcher yields nothing.
let b = batcher.first {_ in true}!

In [23]:
print(b.input.shape,b.target.shape)

[64, 72] [64, 72]


### Text classification

For classification, we can use the same `basicDataset` as before, the only difference is that since the texts are of different lengths, we will need to pad them to the same size before collating them, using `padSamples`.

In [24]:
let labelToInt: [String: Int] = ["neg":0, "pos":1]

In [25]:
// Classification dataset built from the same files.
// `toInput` reads a file, converts its tokens to ids with `vocab.stoi` and
// wraps them, converted to `Int32`, in a tensor.
// `toTarget` looks up the name of the file's parent directory in `labelToInt`;
// the `!` traps if that name is not one of its keys.
let dataset = basicDataset(
    from: fnames, 
    toInput: { Tensor<Int32>(numericalize(readTokenizedText($0), with: vocab.stoi).map { Int32($0) }) }, 
    toTarget: { Tensor<Int32>(Int32(labelToInt[$0.parent.basename()]!)) }
)

In [26]:
let batcher = Batcher(on: dataset, batchSize: 64, numWorkers: 4, shuffle: true, padSamples: padInputs())

In [27]:
// Grab the first batch: the predicate accepts everything, so `first` returns
// the first element produced. The `!` traps if the batcher yields nothing.
let b = batcher.first {_ in true}!

In [28]:
print(b.input.shape,b.target.shape)

[64, 1002] [64]
