In [1]:
%install-location $cwd/swift-install
%install '.package(url: "https://github.com/mxcl/Path.swift", from: "0.16.1")' Path
//%install '.package(url: "https://github.com/sgugger/SwiftData.git", .branch("master"))' SwiftData
%install '.package(path: "~/git/SwiftData")' SwiftData2

Installing packages:
	.package(url: "https://github.com/mxcl/Path.swift", from: "0.16.1")
		Path
	.package(path: "~/git/SwiftData")
		SwiftData2
With SwiftPM flags: []
Working in: /tmp/tmp4czj86zv/swift-install
/home/sgugger/swift/usr/bin/swift: /home/sgugger/anaconda3/lib/libuuid.so.1: no version information available (required by /home/sgugger/swift/usr/bin/swift)
/home/sgugger/swift/usr/bin/swift: /home/sgugger/anaconda3/lib/libuuid.so.1: no version information available (required by /home/sgugger/swift/usr/bin/swift)
/home/sgugger/swift/usr/bin/swift: /home/sgugger/anaconda3/lib/libuuid.so.1: no version information available (required by /home/sgugger/swift/usr/bin/swift)
/home/sgugger/swift/usr/bin/swiftc: /home/sgugger/anaconda3/lib/libuuid.so.1: no version information available (required by /home/sgugger/swift/usr/bin/swiftc)
/home/sgugger/swift/usr/bin/swiftc: /home/sgugger/anaconda3/lib/libuuid.so.1: no version information available (required by /home/sgugger/swift/usr/bin/swi

In [2]:
import Path
import TensorFlow
import SwiftData2
import Foundation

## Understanding the language model dataset

In [3]:
// Toy corpus: five integer sequences of different lengths, holding the values 0 through 22.
let items = [[0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22]]

In [4]:
// Wrap the toy items in a LanguageModelDataset. `openItem` is the identity closure,
// presumably because each item is already the sequence of ids itself (nothing to load).
let dataset = LanguageModelDataset(openItem: { $0 }, batchSize: 4, sequenceLength: 3, items: items)

In [5]:
// Batch the dataset with the same batch size (4) that was given to the dataset itself;
// `collateTensors` is the function used to assemble the samples of a batch.
let batcher = Batcher(on: dataset, batchSize: 4, numWorkers: 2,
                      collateSamples: collateTensors)

In [6]:
// Print every batch. Each batch is a pair; in the output, the second element holds the
// same values as the first one shifted by one position.
for x in batcher { print(x) }

([[ 0,  1,  2],
 [ 5,  6,  7],
 [10, 11, 12],
 [15, 16, 17]], [[ 1,  2,  3],
 [ 6,  7,  8],
 [11, 12, 13],
 [16, 17, 18]])
([[ 3,  4],
 [ 8,  9],
 [13, 14],
 [18, 19]], [[ 4,  5],
 [ 9, 10],
 [14, 15],
 [19, 20]])


In [7]:
// TODO: handle shuffling the items

## IMDB dataset

In [8]:
// Location of the IMDB data under the user's home directory
// (assumed to contain already-tokenized text files — TODO confirm how they were produced).
let dataPath = Path.home/".fastai"/"data"
let path = dataPath/"imdb_tok"

In [9]:
// Collect the files under the train folder, recursing into subfolders and filtering with ["txt"].
let fnames = collectFiles(under: path/"train", recurse: true, filtering: ["txt"])

In [10]:
// Number of files that were collected.
fnames.count

25000


In [11]:
/// Loads a tokenized text file and returns its tokens.
///
/// The file is decoded as UTF-8 and cut at every occurrence of `separator`;
/// empty pieces (e.g. between two consecutive separators) are not returned.
///
/// - Parameters:
///   - fname: Location of the file to read.
///   - separator: Character that delimits tokens in the file (a single space by default).
/// - Returns: The tokens, in file order.
/// - Note: Traps if the file cannot be read or is not valid UTF-8 (`try!`).
func readTokenizedText(_ fname: Path, separator: Character = " ") -> [String] {
    let fileURL = URL(fileURLWithPath: fname.string)
    let contents = try! String(contentsOf: fileURL, encoding: .utf8)
    let pieces = contents.split(separator: separator)
    // Convert the Substring slices to standalone Strings.
    return pieces.map(String.init)
}

In [12]:
/// Reads every file once and gathers corpus statistics.
///
/// - Parameter fnames: Files to read with `readTokenizedText` (default separator).
/// - Returns: A pair made of the number of tokens of each file (same order as `fnames`)
///   and the number of occurrences of each distinct token over all the files.
func countTokens(_ fnames: [Path]) -> ([Int], [String:Int]) {
    var frequencies: [String:Int] = [:]
    let sizes = fnames.map { (file: Path) -> Int in
        let words = readTokenizedText(file)
        // Tally the tokens of this file into the running totals.
        for word in words {
            frequencies[word, default: 0] += 1
        }
        return words.count
    }
    return (sizes, frequencies)
}

In [13]:
// `lengths`: token count of each file, in the order of `fnames`.
// `counts`: number of occurrences of each token over all the files.
let (lengths, counts) = countTokens(fnames)

In [14]:
/// Builds the vocabulary lookup tables from token frequencies.
///
/// Indices 0 and 1 are always assigned to the special tokens "xxunk" and "xxpad".
/// The remaining tokens are ranked by decreasing count and numbered from 2.
/// Tokens with the same count are ordered alphabetically, so the result is identical
/// from one run to the next (dictionary iteration order is not).
///
/// - Parameters:
///   - counts: Number of occurrences of each token. Entries for "xxunk" and "xxpad"
///     are ignored, since those tokens already have reserved indices.
///   - minFreq: Tokens occurring fewer than `minFreq` times are left out.
///   - maxCount: Upper bound on the vocabulary size, the two special tokens included.
///     The special tokens are kept even when `maxCount` is smaller than 2.
/// - Returns: `itos`, mapping each index to its token, and `stoi`, the reverse mapping.
func makeVocab(_ counts: [String:Int], minFreq: Int = 2, maxCount: Int = 60000) -> (itos: [Int:String], stoi: [String:Int]) {
    let candidates = counts.filter { entry in
        entry.key != "xxunk" && entry.key != "xxpad" && entry.value >= minFreq
    }
    // Most frequent first; break ties on the token itself to get a reproducible order.
    let ranked = candidates.sorted { lhs, rhs in
        if lhs.value != rhs.value { return lhs.value > rhs.value }
        return lhs.key < rhs.key
    }
    var itos: [Int:String] = [0:"xxunk", 1:"xxpad"]
    var stoi: [String:Int] = ["xxunk":0, "xxpad":1]
    // Two slots are already taken by the special tokens.
    let room = max(0, maxCount - 2)
    for (offset, entry) in ranked.prefix(room).enumerated() {
        let index = offset + 2
        itos[index] = entry.key
        stoi[entry.key] = index
    }
    return (itos: itos, stoi: stoi)
}

In [15]:
// Build the vocabulary with the default settings of makeVocab (minFreq: 2, maxCount: 60000).
let vocab = makeVocab(counts)

In [16]:
/// Converts tokens to their integer ids.
///
/// - Parameters:
///   - tokens: Tokens to convert.
///   - stoi: Mapping from token to id.
/// - Returns: One id per token, in the same order. A token that is missing from `stoi`
///   is mapped to 0.
func numericalize(_ tokens: [String], with stoi: [String:Int]) -> [Int] {
    var ids: [Int] = []
    ids.reserveCapacity(tokens.count)
    for token in tokens {
        // Unknown tokens fall back to id 0.
        ids.append(stoi[token, default: 0])
    }
    return ids
}

In [17]:
// Sanity check: tokenize the first file and display how many tokens it contains.
let tst = readTokenizedText(fnames[0])
tst.count

374


In [18]:
// Token count recorded by countTokens for the first file.
lengths[0]

374


### Language model

In [19]:
// Language-model dataset over the IMDB files. `openItem` reads a file, tokenizes it and
// converts the tokens to ids with the vocabulary. The precomputed `lengths` are passed in,
// presumably so the dataset does not have to open every file to know its size — TODO confirm.
let dataset = LanguageModelDataset(openItem: { numericalize(readTokenizedText($0), with: vocab.stoi) }, 
                                   batchSize: 64, sequenceLength: 72, items: fnames, lengths: lengths)

In [20]:
// Batch the language-model dataset, with the same batch size (64) that was given to the dataset.
let batcher = Batcher(on: dataset, batchSize: 64, numWorkers: 4, collateSamples: collateTensors)

In [21]:
// Take the first batch: the predicate accepts any element, so `first` returns the first one.
// The result is force-unwrapped, so this traps if the batcher yields no batch at all.
let b = batcher.first {_ in true}!

In [22]:
// Shapes of the two elements of the batch.
print(b.0.shape,b.1.shape)

[64, 72] [64, 72]


### Text classification

In [23]:
// Show one file path; the name of its parent directory is what the label is read from further down.
print(fnames[0])

/home/sgugger/.fastai/data/imdb_tok/train/pos/10544_8.txt


In [24]:
// Integer class id associated with each label name.
let labelToInt: [String: Int] = ["neg":0, "pos":1]

In [25]:
// Classification dataset built from the same files:
// - input: the tokens of the file, converted to ids with the vocabulary, as an Int32 tensor;
// - target: the class id looked up in `labelToInt` from the basename of the file's parent directory.
// NOTE(review): the force unwrap traps for any file whose parent directory name is not a key
// of `labelToInt`.
let dataset = basicDataset(
    from: fnames, 
    toInput: { Tensor<Int32>(numericalize(readTokenizedText($0), with: vocab.stoi).map { Int32($0) }) }, 
    toTarget: { Tensor<Int32>(Int32(labelToInt[$0.parent.basename()]!)) }
)

In [26]:
// Batches of 64 samples with shuffling enabled. `padInputs()` is passed as the padding step,
// presumably to bring the inputs of a batch to a common length before `collateTensors`
// assembles them — TODO confirm against the SwiftData API.
let batcher = Batcher(on: dataset, batchSize: 64, numWorkers: 4, shuffle: true,
                      padSamples: padInputs(), collateSamples: collateTensors)

In [27]:
// Take the first batch: the predicate accepts any element, so `first` returns the first one.
// The result is force-unwrapped, so this traps if the batcher yields no batch at all.
let b = batcher.first {_ in true}!

In [28]:
// Shapes of the input and target elements of the batch.
print(b.0.shape,b.1.shape)

[64, 1391] [64]
