This repository has been archived by the owner on Nov 3, 2020. It is now read-only.
/
TRECWithTwoFilterWidths.kt
123 lines (92 loc) · 4.86 KB
/
TRECWithTwoFilterWidths.kt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
package com.komputation.cpu.demos.trec
import com.komputation.cpu.network.network
import com.komputation.demos.trec.NLP
import com.komputation.demos.trec.TRECData
import com.komputation.initialization.uniformInitialization
import com.komputation.instructions.continuation.activation.Activation
import com.komputation.instructions.continuation.activation.relu
import com.komputation.instructions.continuation.convolution.convolution
import com.komputation.instructions.continuation.dense.dense
import com.komputation.instructions.continuation.dropout.dropout
import com.komputation.instructions.continuation.stack.stack
import com.komputation.instructions.entry.lookup
import com.komputation.instructions.loss.crossEntropyLoss
import com.komputation.optimization.historical.nesterov
import java.io.File
import java.util.*
/**
 * Entry point. Expects exactly two command-line arguments:
 * the path to the GloVe word-embedding file and the embedding dimensionality.
 */
fun main(args: Array<String>) {
    // require(...) throws IllegalArgumentException with this message when the arity is wrong,
    // replacing the original hand-rolled check that threw a bare Exception.
    require(args.size == 2) { "Please specify the path to the Glove word embeddings and the number of dimensions." }

    val embeddingFilePath = args[0]
    val dimensions = args[1].toInt()

    TrecWithTwoFilterWidths().run(embeddingFilePath, dimensions)
}
class TrecWithTwoFilterWidths {

    /**
     * Trains a CNN sentence classifier on the TREC question-classification data set,
     * stacking two parallel convolution branches with different filter widths.
     *
     * @param embeddingFilePath path to the GloVe word-embedding text file
     * @param embeddingDimension dimensionality of the GloVe vectors in that file
     */
    fun run(embeddingFilePath: String, embeddingDimension: Int) {

        // Fixed seed so weight initialization and dropout masks are reproducible.
        val random = Random(1)
        val initialization = uniformInitialization(random, -0.1f, 0.1f)

        val optimization = nesterov(0.008f, 0.95f)

        val batchSize = 16
        val numberIterations = 7

        val numberFilters = 45
        // One convolution branch per width; the convolution calls below read from this
        // array so the widths are declared in exactly one place.
        val filterWidths = intArrayOf(2, 3)
        val maximumFilterWidth = filterWidths.max()!!
        val filterHeight = embeddingDimension

        val keepProbability = 0.67f

        // TREC data is bundled as a classpath resource.
        val trecDirectory = File(javaClass.classLoader.getResource("trec").toURI())
        val trainingFile = File(trecDirectory, "training.data")
        val testFile = File(trecDirectory, "test.data")

        val (trainingCategories, trainingDocuments) = TRECData.readExamples(trainingFile)
        val (testCategories, testDocuments) = TRECData.readExamples(testFile)

        // Keep only tokens from the training vocabulary that have a pre-trained embedding.
        val vocabulary = NLP.generateVocabulary(trainingDocuments)
        val embeddingFile = File(embeddingFilePath)
        val embeddingMap = NLP.embedVocabulary(vocabulary, embeddingFile)
        val embeddableVocabulary = embeddingMap.keys.sorted()

        val trainingDocumentsWithFilteredTokens = NLP.filterTokens(trainingDocuments, embeddableVocabulary)
        val maximumDocumentLength = trainingDocumentsWithFilteredTokens.maxBy { document -> document.size }!!.size

        val testDocumentsWithFilteredTokens = NLP.filterTokens(testDocuments, embeddableVocabulary)

        // NOTE(review): presumably drops documents shorter than the widest filter,
        // which could not be convolved — confirm against NLP.filterDocuments.
        val embeddableTrainingIndices = NLP.filterDocuments(trainingDocumentsWithFilteredTokens, maximumFilterWidth)
        val embeddableTestIndices = NLP.filterDocuments(testDocumentsWithFilteredTokens, maximumFilterWidth)

        val embeddableTrainingDocuments = trainingDocumentsWithFilteredTokens.slice(embeddableTrainingIndices)
        val embeddableTestDocuments = testDocumentsWithFilteredTokens.slice(embeddableTestIndices)

        val trainingRepresentations = NLP.vectorizeDocuments(embeddableTrainingDocuments, embeddableVocabulary)
        val testRepresentations = NLP.vectorizeDocuments(embeddableTestDocuments, embeddableVocabulary)

        val embeddableTrainingCategories = trainingCategories.slice(embeddableTrainingIndices)
        val embeddableTestCategories = testCategories.slice(embeddableTestIndices)

        // Category indices come from the full training set so train and test targets agree.
        val indexedCategories = NLP.indexCategories(trainingCategories.toSet())
        val numberCategories = indexedCategories.size

        val trainingTargets = NLP.createTargets(embeddableTrainingCategories, indexedCategories)
        val testTargets = NLP.createTargets(embeddableTestCategories, indexedCategories)

        // Embedding rows are ordered to match the sorted vocabulary used for vectorization;
        // every key in embeddableVocabulary comes from embeddingMap, so !! cannot fail here.
        val embeddings = embeddableVocabulary
            .map { token -> embeddingMap[token]!! }
            .toTypedArray()

        val sentenceClassifier = network(
            batchSize,
            lookup(embeddings, maximumDocumentLength, embeddingDimension, optimization),
            stack(
                // Widths taken from filterWidths instead of re-hard-coding 2 and 3,
                // so the declaration above stays the single source of truth.
                convolution(numberFilters, filterWidths[0], filterHeight, initialization, optimization),
                convolution(numberFilters, filterWidths[1], filterHeight, initialization, optimization)
            ),
            relu(),
            dropout(random, keepProbability),
            dense(numberCategories, Activation.Softmax, initialization, optimization)
        )

        val test = sentenceClassifier
            .test(
                testRepresentations,
                testTargets,
                batchSize,
                numberCategories,
                1)

        // After each training iteration, evaluate on the test set and print the result.
        sentenceClassifier.training(
            trainingRepresentations,
            trainingTargets,
            numberIterations,
            crossEntropyLoss()) { _: Int, _: Float ->
            println(test.run())
        }
            .run()
    }
}