In [1]:
%install-location $cwd/swift-install
%install '.package(url: "https://github.com/mxcl/Path.swift", from: "0.16.1")' Path
//%install '.package(url: "https://github.com/sgugger/SwiftData.git", .branch("master"))' SwiftData
%install '.package(path: "~/git/SwiftData")' SwiftData

Installing packages:
	.package(url: "https://github.com/mxcl/Path.swift", from: "0.16.1")
		Path
	.package(path: "~/git/SwiftData")
		SwiftData
With SwiftPM flags: []
Working in: /tmp/tmpaumxofm4/swift-install
/home/sgugger/swift/usr/bin/swift: /home/sgugger/anaconda3/lib/libuuid.so.1: no version information available (required by /home/sgugger/swift/usr/bin/swift)
/home/sgugger/swift/usr/bin/swift: /home/sgugger/anaconda3/lib/libuuid.so.1: no version information available (required by /home/sgugger/swift/usr/bin/swift)
/home/sgugger/swift/usr/bin/swift: /home/sgugger/anaconda3/lib/libuuid.so.1: no version information available (required by /home/sgugger/swift/usr/bin/swift)
/home/sgugger/swift/usr/bin/swiftc: /home/sgugger/anaconda3/lib/libuuid.so.1: no version information available (required by /home/sgugger/swift/usr/bin/swiftc)
/home/sgugger/swift/usr/bin/swiftc: /home/sgugger/anaconda3/lib/libuuid.so.1: no version information available (required by /home/sgugger/swift/usr/bin/swiftc)

In [2]:
import Path
import TensorFlow
import SwiftData
import Foundation

## Understanding the language model template

In [6]:
// Toy corpus: five integer "documents" of lengths 5, 6, 8, 2 and 2 whose values run
// consecutively from 0 to 22, so each token can be followed by eye through the batches.
let dataset = [[0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22]]

In [7]:
// Number of tokens in each toy document, in the same order as `dataset`.
let lengths = dataset.map{ $0.count }

In [8]:
// `openItem` is the identity closure because the toy items are already arrays of token ids.
// NOTE(review): `batchSize` is passed here and again to `Batcher` in the next cell —
// presumably the two values must agree; confirm against the SwiftData package.
let template = LanguageModelTemplate(openItem: { $0 }, batchSize: 4, sequenceLength: 3, lengths: lengths)

In [10]:
// Batcher over the toy corpus; no `shuffle` argument is passed in this cell.
let batcher = Batcher(from: template, on: dataset, batchSize: 4, numWorkers: 4)

In [11]:
// Print every batch. In the recorded output each batch is a pair whose second tensor is the
// first one shifted forward by one token, and the last batch has 2 columns instead of 3.
for x in batcher { print(x) }

([[ 0,  1,  2],
 [ 5,  6,  7],
 [10, 11, 12],
 [15, 16, 17]], [[ 1,  2,  3],
 [ 6,  7,  8],
 [11, 12, 13],
 [16, 17, 18]])
([[ 3,  4],
 [ 8,  9],
 [13, 14],
 [18, 19]], [[ 4,  5],
 [ 9, 10],
 [14, 15],
 [19, 20]])


In [14]:
// Same toy setup, now with `shuffle: true`.
// Re-declaring `batcher` relies on notebook/REPL semantics; it would not compile in a single file.
let batcher = Batcher(from: template, on: dataset, batchSize: 4, numWorkers: 4, shuffle: true)

In [15]:
// Print the shuffled batches. In the recorded output rows now span documents in a different
// order (e.g. `10, 19, 20`), while the second tensor is still the first shifted by one token.
for x in batcher { print(x) }

([[ 5,  6,  7],
 [10, 19, 20],
 [ 2,  3,  4],
 [11, 12, 13]], [[ 6,  7,  8],
 [19, 20,  0],
 [ 3,  4, 21],
 [12, 13, 14]])
([[ 8,  9],
 [ 0,  1],
 [21, 22],
 [14, 15]], [[ 9, 10],
 [ 1,  2],
 [22, 11],
 [15, 16]])


## IMDB dataset

In [16]:
// Data root under the user's home directory: ~/.fastai/data
let dataPath = Path.home/".fastai"/"data"
// IMDB folder read by the cells below (presumably the pre-tokenized reviews — confirm).
let path = dataPath/"imdb_tok"

In [17]:
// Gather the training files under `path/"train"`.
// NOTE(review): presumably `recurse: true` descends into subfolders and `filtering: ["txt"]`
// keeps only `.txt` files — confirm against `collectFiles` in SwiftData.
let fnames = collectFiles(under: path/"train", recurse: true, filtering: ["txt"])

In [18]:
// Number of files collected (25000 in the recorded run).
fnames.count

25000


In [19]:
/// Loads the file at `fname` as UTF-8 text and cuts it into tokens.
///
/// - Parameters:
///   - fname: Location of the text file to read.
///   - separator: Character on which the text is split; a single space by default.
///     Other whitespace (for example newlines) is *not* treated as a separator.
/// - Returns: The non-empty pieces between separators, in file order.
/// - Note: The read uses `try!`, so a missing or non-UTF-8 file aborts the process.
func readTokenizedText(_ fname: Path, separator: Character = " ") -> [String] {
    let fileURL = URL(fileURLWithPath: fname.string)
    let contents = try! String(contentsOf: fileURL, encoding: .utf8)
    // `split` yields `Substring`s that share storage with `contents`; copy them into
    // standalone `String`s before returning.
    let pieces = contents.split(separator: separator)
    return pieces.map(String.init)
}

In [25]:
/// Tokenizes every file once and gathers two statistics.
///
/// - Parameter fnames: Files to read with `readTokenizedText` (default separator).
/// - Returns: A pair `(lengths, counts)`:
///   - `lengths[i]` is the number of tokens in `fnames[i]`;
///   - `counts[token]` is the number of occurrences of `token` across all files.
func countTokens(_ fnames: [Path]) -> ([Int], [String:Int]) {
    var frequency: [String:Int] = [:]
    let tokensPerFile = fnames.map { file -> Int in
        let words = readTokenizedText(file)
        // Accumulate corpus-wide frequencies while the tokens are at hand.
        words.forEach { frequency[$0, default: 0] += 1 }
        return words.count
    }
    return (tokensPerFile, frequency)
}

In [26]:
// One pass over all training files: per-file token counts and corpus-wide token frequencies.
// Re-declares `lengths` from the toy example (allowed by notebook/REPL semantics).
let (lengths, counts) = countTokens(fnames)

In [27]:
/// Builds index↔token lookup tables from token frequencies.
///
/// Ids 0 and 1 are always reserved for the special tokens `"xxunk"` and `"xxpad"`;
/// any entries for those two tokens in `counts` are ignored. The remaining tokens are
/// ranked by descending frequency and assigned ids 2, 3, … in that order.
///
/// Tokens with equal frequency are ordered alphabetically. `Dictionary` iteration order
/// is not stable across processes, so without this tie-break the ids of tied tokens —
/// and which of them survive the `maxCount` cut — would differ from run to run.
///
/// - Parameters:
///   - counts: Number of occurrences of each token.
///   - minFreq: Tokens occurring fewer than `minFreq` times are dropped.
///   - maxCount: Upper bound on the vocabulary size, special tokens included.
///     The two special tokens are kept even if `maxCount` is smaller than 2.
/// - Returns: `itos` mapping id → token and `stoi` mapping token → id.
func makeVocab(_ counts: [String:Int], minFreq: Int = 2, maxCount: Int = 60000) -> (itos: [Int:String], stoi: [String:Int]) {
    let specials = ["xxunk", "xxpad"]

    // Dropping rare tokens before sorting is equivalent to stopping at the first rare
    // token of a descending sort, and sorts fewer elements.
    let ranked = counts
        .filter { !specials.contains($0.key) && $0.value >= minFreq }
        .sorted { lhs, rhs in
            lhs.value != rhs.value ? lhs.value > rhs.value : lhs.key < rhs.key
        }

    var itos: [Int:String] = [:]
    var stoi: [String:Int] = [:]
    for (id, token) in specials.enumerated() {
        itos[id] = token
        stoi[token] = id
    }

    // Slots left once the special tokens are accounted for; clamped so that a tiny
    // `maxCount` never produces a negative prefix length.
    let room = max(0, maxCount - specials.count)
    for (offset, entry) in ranked.prefix(room).enumerated() {
        let id = offset + specials.count
        itos[id] = entry.key
        stoi[entry.key] = id
    }
    return (itos: itos, stoi: stoi)
}

In [28]:
// Build the vocabulary with the default `minFreq` (2) and `maxCount` (60000).
let vocab = makeVocab(counts)

In [29]:
/// Converts tokens to their integer ids.
///
/// - Parameters:
///   - tokens: Tokens to convert, in order.
///   - stoi: Token → id lookup table.
/// - Returns: One id per input token. A token missing from `stoi` becomes 0, the id
///   that `makeVocab` reserves for `"xxunk"`.
func numericalize(_ tokens: [String], with stoi: [String:Int]) -> [Int] {
    var ids: [Int] = []
    ids.reserveCapacity(tokens.count)
    for token in tokens {
        ids.append(stoi[token, default: 0])
    }
    return ids
}

In [30]:
// Sanity check: tokenize the first file directly and display its token count
// (374 in the recorded run).
let tst = readTokenizedText(fnames[0])
tst.count

374


In [31]:
// Length that `countTokens` recorded for the same first file; the recorded output (374)
// matches the direct count in the previous cell.
lengths[0]

374


### Language model

In [33]:
// Language-model template over the IMDB files: each item is a file path that `openItem`
// reads and converts to token ids. `lengths` comes from `countTokens(fnames)`, so its
// entries are in the same order as `fnames`.
let template = LanguageModelTemplate(openItem: { numericalize(readTokenizedText($0), with: vocab.stoi) }, 
                                            batchSize: 64, sequenceLength: 72, lengths: lengths)

In [34]:
// Shuffled batcher over the IMDB file paths, with the same `batchSize` (64) as the template.
let batcher = Batcher(from: template, on: fnames, batchSize: 64, numWorkers: 4, shuffle: true)

In [35]:
// Take the first batch: `first` with an always-true predicate returns the first element
// the batcher yields. The force-unwrap traps if the batcher yields nothing.
let b = batcher.first {_ in true}!

In [36]:
// Shapes of the two tensors in the batch; the recorded run prints [64, 72] for both,
// i.e. batchSize x sequenceLength.
print(b.0.shape,b.1.shape)

[64, 72] [64, 72]


### Text classification

In [40]:
// Show one file path: in the recorded output the parent folder is `pos`, which is what
// the label function below keys on.
print(fnames[0])

/home/sgugger/.fastai/data/imdb_tok/train/pos/10544_8.txt


In [41]:
// Class name → integer label.
let labelToInt: [String: Int] = ["neg":0, "pos":1]

In [43]:
// Classification template: `openItem` turns a file into token ids, and `labelFunc` maps the
// file's parent folder name to an integer via `labelToInt`.
// The force-unwrap traps on any file whose parent folder is neither "neg" nor "pos".
let template = TextClassificationTemplate(
    openItem: { numericalize(readTokenizedText($0), with: vocab.stoi) },
    labelFunc: { labelToInt[$0.parent.basename()]! } )

In [44]:
// Shuffled batcher of 64 labelled texts per batch.
let batcher = Batcher(from: template, on: fnames, batchSize: 64, numWorkers: 4, shuffle: true)

In [45]:
// Take the first batch (always-true predicate); the force-unwrap traps if there is none.
let b = batcher.first {_ in true}!

In [46]:
// Recorded run: inputs are [64, 767] and labels are [64].
// NOTE(review): 767 is presumably the longest text in this batch, with shorter texts padded — confirm.
print(b.0.shape,b.1.shape)

[64, 767] [64]
