-
Notifications
You must be signed in to change notification settings - Fork 4
/
nlp.clj
291 lines (243 loc) · 8.37 KB
/
nlp.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
(ns scicloj.ml.smile.nlp
  (:require [clojure.set]
            [clojure.string :as str]
            [pppmap.core :as ppp]
            [tech.v3.dataset :as ds])
  (:import [smile.classification DiscreteNaiveBayes DiscreteNaiveBayes$Model]
           [smile.nlp.dictionary EnglishStopWords]
           smile.nlp.normalizer.SimpleNormalizer
           smile.nlp.stemmer.PorterStemmer
           [smile.nlp.tokenizer SimpleTokenizer BreakIteratorSentenceSplitter]
           smile.util.SparseArray))
(defn resolve-stopwords
  "Resolves `stopwords-option` into a seq of stop words.
  A keyword (e.g. :default, :google, :comprehensive, :mysql) selects one
  of Smile's built-in English stop-word dictionaries; any other value is
  returned unchanged and assumed to already be a seq of stop words."
  [stopwords-option]
  (if-not (keyword? stopwords-option)
    stopwords-option
    (-> stopwords-option
        name
        str/upper-case
        EnglishStopWords/valueOf
        .iterator
        iterator-seq)))
(defn word-process
  "Lower-cases `word` and normalizes it with `normalizer`; when `stemmer`
  is non-nil the normalized word is additionally stemmed.
  Returns the processed word."
  [stemmer ^SimpleNormalizer normalizer ^String word]
  (let [normalized (.normalize normalizer (str/lower-case word))]
    (if (some? stemmer)
      (.stem stemmer normalized)
      normalized)))
(defn default-tokenize
  "Tokenizes `text` into a seq of processed tokens.
  The text is normalized, split into sentences, each sentence is split
  into tokens, and every token is lower-cased, normalized and
  (optionally) stemmed via `word-process`.
  Options:
  `:stemmer` - :porter (default) to apply the Porter stemmer, or :none
  to disable stemming."
  [text options]
  (let [normalizer (SimpleNormalizer/getInstance)
        tokenizer (SimpleTokenizer.)
        sentence-splitter (BreakIteratorSentenceSplitter.)
        stemmer (case (get options :stemmer :porter)
                  :none nil
                  :porter (PorterStemmer.))]
    (->> text
         (.normalize normalizer)
         (.split sentence-splitter)
         ;; mapcat replaces the previous (map seq) / flatten / (remove nil?)
         ;; chain: empty token arrays simply contribute nothing.
         (mapcat #(seq (.split tokenizer %)))
         (map #(word-process stemmer normalizer %)))))
(defn default-text->bow
  "Converts text to token counts (a map token -> count).
  Takes options:
  `:stopwords` - either a keyword naming a default Smile dictionary
  (:default :google :comprehensive :mysql) or a seq of stop words.
  `:stemmer` - either :none or :porter (default) for selecting the
  Porter stemmer."
  [text options]
  (let [normalizer (SimpleNormalizer/getInstance)
        ;; Process stop words with the SAME stemmer configuration used for
        ;; tokenization, so they match the produced tokens. (Previously a
        ;; Porter stemmer was applied to stop words unconditionally, which
        ;; broke stop-word removal when :stemmer was :none.)
        stemmer (case (get options :stemmer :porter)
                  :none nil
                  :porter (PorterStemmer.))
        stopwords (resolve-stopwords (:stopwords options))
        processed-stop-words (map #(word-process stemmer normalizer %) stopwords)
        freqs (frequencies (default-tokenize text options))]
    (apply dissoc freqs processed-stop-words)))
(defn- remove-punctuation
  "Keeps only the letters, digits and spaces of `sentence` and returns
  them joined back into a string."
  [sentence]
  (let [keep-char? (fn [c]
                     (or (Character/isLetter c)
                         (Character/isSpace c)
                         (Character/isDigit c)))]
    (apply str (filter keep-char? sentence))))
(defn count-vectorize
  "Converts the text column `text-col` of dataset `ds` into a
  bag-of-words representation, stored in column `bow-col` as a
  frequency-count map (token -> count). Returns the updated dataset.
  Options:
  `:text->bow-fn` - function of [text options] performing the
  conversion; defaults to `default-text->bow`."
  ([ds text-col bow-col {:keys [text->bow-fn]
                         :or {text->bow-fn default-text->bow}
                         :as options}]
   (ds/add-or-update-column
    ds
    (ds/new-column
     bow-col
     (ppp/ppmap-with-progress
      "text->bow"
      1000
      #(text->bow-fn % options)
      (get ds text-col)))))
  ([ds text-col bow-col]
   (count-vectorize ds text-col bow-col {:text->bow-fn default-text->bow})))
(defn ->vocabulary-top-n
  "Takes the top-n most frequent tokens across all `bows` as the vocabulary."
  ;; Fix: the docstring was previously placed AFTER the argument vector,
  ;; where it is just an evaluated-and-discarded expression, so the var
  ;; carried no documentation. It must precede the argument vector.
  [bows n]
  (let [vocabulary
        (->>
         (apply merge-with + bows)  ;; sum counts over all documents
         (sort-by second)
         reverse                    ;; highest counts first
         (take n)
         keys)]
    vocabulary))
(defn create-vocab-all
  "Uses every token appearing in any of the bag-of-words maps in `bow`
  as the vocabulary."
  [bow]
  (-> (apply merge bow)
      keys))
(defn bow->sparse-and-vocab
  "Converts a bag-of-words column `bow-col` into a sparse data column
  `indices-col`, creating the vocabulary from the data.
  The exact transformation to the sparse representation is given by
  `bow->sparse-fn`; the vocabulary is built by `:create-vocab-fn`
  (default: `create-vocab-all`).
  Returns {:ds updated-dataset
           :vocab {:vocab … :vocab->index-map … :index->vocab-map …}}."
  [ds bow-col indices-col bow->sparse-fn {:keys [create-vocab-fn]
                                          :or {create-vocab-fn create-vocab-all}}]
  (let [vocab-list (create-vocab-fn (get ds bow-col))
        vocab->index (zipmap vocab-list (range))
        vocabulary {:vocab vocab-list
                    :vocab->index-map vocab->index
                    :index->vocab-map (clojure.set/map-invert vocab->index)}
        sparse-column (ds/new-column
                       indices-col
                       (ppp/ppmap-with-progress
                        "bow->sparse"
                        1000
                        #(bow->sparse-fn % vocab->index)
                        (get ds bow-col)))]
    {:ds (ds/add-or-update-column ds sparse-column)
     :vocab vocabulary}))
(defn bow->sparse
  "Converts a bag-of-words column `bow-col` into a sparse data column
  `indices-col`, reusing an existing `vocabulary` (a map holding a
  :vocab->index-map, as produced by `bow->sparse-and-vocab`).
  Returns {:ds updated-dataset :vocab vocabulary}."
  [ds bow-col indices-col bow->sparse-fn vocabulary]
  (let [vocab->index (:vocab->index-map vocabulary)
        sparse-column (ds/new-column
                       indices-col
                       (ppp/ppmap-with-progress
                        "bow->sparse"
                        1000
                        #(bow->sparse-fn % vocab->index)
                        (get ds bow-col)))]
    {:ds (ds/add-or-update-column ds sparse-column)
     :vocab vocabulary}))
(defn bow->something-sparse
  "Converts a bag-of-words column `bow-col` to a sparse data column
  `indices-col` and returns the updated dataset.
  The exact transformation to the sparse representation is given by
  `bow->sparse-fn`."
  [ds bow-col indices-col bow->sparse-fn options]
  ;; Fix: the previous destructuring bound :vocabulary, but
  ;; bow->sparse-and-vocab returns the vocabulary under :vocab, so that
  ;; binding was always nil (and unused). Only the dataset is needed.
  (:ds (bow->sparse-and-vocab ds bow-col indices-col bow->sparse-fn options)))
(defn tf-map
  "Calculates document frequencies: a map from token to the number of
  bags-of-words in `bows` that contain the token at least once."
  [bows]
  ;; Idiomatic reduce replaces the previous explicit loop/recur.
  (reduce
   (fn [acc bow]
     ;; Count each token once per document, regardless of its in-document count.
     (merge-with + acc (zipmap (keys bow) (repeat 1))))
   {}
   bows))
(defn idf
  "Inverse document frequency of `term`:
  log10(number of documents / number of documents containing `term`).
  `tf-map` is the document-frequency map produced by `tf-map`; `term`
  must be present in it (a missing term leads to division by nil)."
  [tf-map term bows]
  (let [n-docs (count bows)
        n-docs-with-term (get tf-map term)]
    (Math/log10 (/ n-docs n-docs-with-term))))
(defn tf
  "Relative term frequency of `term` within a single bag-of-words `bow`:
  the term's count divided by the total token count of `bow`
  (0 is used when the term is absent)."
  [term bow]
  (let [term-count (get bow term 0)
        total-count (reduce + (vals bow))]
    (/ term-count total-count)))
(defn tfidf
  "tf-idf score of `term` in document `bow` relative to the corpus
  `bows`; `tf-map` is the document-frequency map produced by `tf-map`."
  [tf-map term bow bows]
  (let [term-frequency (tf term bow)
        inverse-doc-frequency (idf tf-map term bows)]
    (* term-frequency inverse-doc-frequency)))
(defn bow->tfidf
  "Calculates the tf-idf score from bags-of-words (token-frequency maps)
  in column `bow-column` and stores them in a new column `tfidf-column`
  as maps of token -> tfidf-score. Returns the updated dataset."
  [ds bow-column tfidf-column]
  (let [bows (get ds bow-column)
        df-map (tf-map bows)
        bow->scores (fn [bow]
                      (let [terms (keys bow)]
                        (zipmap terms
                                (map #(tfidf df-map % bow bows) terms))))]
    (ds/add-or-update-column
     ds
     (ds/new-column tfidf-column
                    (ppp/ppmap-with-progress "tfidf" 1000 bow->scores bows)))))
(defn freqs->SparseArray
  "Builds a Smile SparseArray from the token-frequency map `freq-map`.
  Only tokens present in `vocab->index-map` are appended, at the index
  the vocabulary assigns them, with their frequency as the value."
  [freq-map vocab->index-map]
  (let [sparse-array (SparseArray.)]
    (doseq [[token freq] freq-map]
      (when (contains? vocab->index-map token)
        (.append sparse-array ^int (vocab->index-map token) ^double freq)))
    sparse-array))
(defn bow->sparse-indices
  "Converts the token-frequencies to the sparse vectors
  needed by Maxent"
  [bow vocab->index-map]
  ;; Intersection trick: merging vocab->index-map (token -> index) with
  ;; bow (token -> count) calls the merge fn only for tokens present in
  ;; BOTH maps, wrapping them as [index count] vectors. Tokens found in
  ;; only one of the maps stay scalars and are dropped by `filter vector?`,
  ;; so the result is the vocabulary indices of the tokens in `bow`,
  ;; as an int array.
  ;; NOTE(review): assumes bow values (counts) are never vectors themselves,
  ;; and that the caller does not rely on a particular index ordering
  ;; (hash-map value order is unspecified) — confirm against Maxent usage.
  (->>
   (merge-with
    (fn [index count]
      [index count])
    vocab->index-map
    bow)
   vals
   (filter vector?)
   (map first)
   (into-array Integer/TYPE)))
;; Rich-comment block: REPL scratch experiments kept for reference.
;; Nothing inside is evaluated at load time.
;; NOTE(review): `ssp` on the (.split ssp ...) line is undefined (only
;; ssp-1 / ssp-2 are def'd above it) — harmless here, but would fail if
;; evaluated as-is.
(comment
  (defn- remove-punctuation [sentence]
    (->>
     sentence
     (filter #(or (Character/isLetter %)
                  (Character/isSpace %)
                  (Character/isDigit %)))
     (apply str)))
  (def tokenizer (SimpleTokenizer.))
  (def normalizer (SimpleNormalizer/getInstance))
  (def stemmer (PorterStemmer.))
  (def text "this is a test")
  (def sentence-spliter (BreakIteratorSentenceSplitter.))
  (.split sentence-spliter "this is my world. and this is anoher text hello.")
  (.split tokenizer "hello world")
  (default-tokenize "hello My world. this is a tests" {})
  (import java.text.BreakIterator)
  (import smile.nlp.tokenizer.SimpleSentenceSplitter)
  (def i (BreakIterator/getSentenceInstance))
  (.setText i "Today is friday. I am carsten.")
  (.first i)
  (.next i)
  (def ssp-1 (SimpleSentenceSplitter/getInstance))
  (def ssp-2 (BreakIteratorSentenceSplitter.))
  (.split ssp "Today is friday. I am carsten.")
  (default-tokenize "Today is friday. I am carsten." {})
  )