-
Notifications
You must be signed in to change notification settings - Fork 4
/
metamorph.clj
136 lines (111 loc) · 4.39 KB
/
metamorph.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
(ns scicloj.ml.smile.metamorph
(:require
[scicloj.ml.smile.nlp :as nlp]
[tech.v3.dataset :as ds]
[pppmap.core :as ppp]
))
(defn count-vectorize
  "Returns a metamorph step (a function of the context map) that replaces the
  dataset under `:metamorph/data` with the result of calling
  `nlp/count-vectorize` on it.

  `text-col` is the column holding the text, `bow-col` the column that receives
  the map of token frequencies. `options` is handed unchanged to
  `nlp/count-vectorize`; it defaults to `{}` when omitted.

  metamorph                            | .
  -------------------------------------|---------
  Behaviour in mode :fit               | normal
  Behaviour in mode :transform         | normal
  Reads keys from ctx                  | none
  Writes keys to ctx                   | none
  "
  ([text-col bow-col]
   ;; No options given: delegate with an empty options map.
   (count-vectorize text-col bow-col {}))
  ([text-col bow-col options]
   (fn [ctx]
     (let [dataset    (:metamorph/data ctx)
           vectorized (nlp/count-vectorize dataset text-col bow-col options)]
       (assoc ctx :metamorph/data vectorized)))))
(defn bow->something-sparse
  "Converts a bag-of-words column `bow-col` to a sparse data column `indices-col`.
  The exact transformation to the sparse representation is given by `bow->sparse-fn`.

  Returns a metamorph step (a function of the context map):

  * In mode :fit it calls `nlp/bow->sparse-and-vocab` with `options`, stores the
    returned `:ds` under `:metamorph/data` and the returned `:vocab` under
    `::bow->sparse-vocabulary`.
  * In mode :transform it calls `nlp/bow->sparse` with the vocabulary found
    under `::bow->sparse-vocabulary` in the context and stores the returned
    `:ds` under `:metamorph/data`. `options` is not used in this mode.

  Any other mode makes the `case` throw, as there is no default clause.

  metamorph                            | .
  -------------------------------------|---------
  Behaviour in mode :fit               | normal
  Behaviour in mode :transform         | normal
  Reads keys from ctx                  | :scicloj.ml.smile.metamorph/bow->sparse-vocabulary (mode :transform)
  Writes keys to ctx                   | :scicloj.ml.smile.metamorph/bow->sparse-vocabulary (mode :fit)
  "
  [bow-col indices-col bow->sparse-fn options]
  (fn [{:metamorph/keys [mode data] :as ctx}]
    (case mode
      :fit
      (let [{:keys [ds vocab]} (nlp/bow->sparse-and-vocab data
                                                          bow-col indices-col
                                                          bow->sparse-fn
                                                          options)]
        ;; Keep the fitted vocabulary in the context so :transform can reuse it.
        (assoc ctx
               :metamorph/data ds
               ::bow->sparse-vocabulary vocab))

      :transform
      (let [fitted-vocab (::bow->sparse-vocabulary ctx)
            ;; Only the dataset is needed here; the vocabulary stays as fitted.
            {:keys [ds]} (nlp/bow->sparse data
                                          bow-col indices-col
                                          bow->sparse-fn
                                          fitted-vocab)]
        (assoc ctx :metamorph/data ds)))))
(defn bow->sparse-array
  "Converts a bag-of-words column `bow-col` to sparse indices column `indices-col`,
  as needed by the Maxent model.

  Delegates to `bow->something-sparse` with `nlp/bow->sparse-indices` as the
  conversion function. `options` is passed through unchanged and defaults to
  `{}` when omitted (presumably this is where the vocabulary size is
  configured - confirm against `nlp/bow->sparse-and-vocab`).

  metamorph                            | .
  -------------------------------------|---------
  Behaviour in mode :fit               | normal
  Behaviour in mode :transform         | normal
  Reads keys from ctx                  | :scicloj.ml.smile.metamorph/bow->sparse-vocabulary (mode :transform)
  Writes keys to ctx                   | :scicloj.ml.smile.metamorph/bow->sparse-vocabulary (mode :fit)
  "
  ([bow-col indices-col]
   ;; No options given: delegate with an empty options map.
   (bow->sparse-array bow-col indices-col {}))
  ([bow-col indices-col options]
   (bow->something-sparse bow-col indices-col nlp/bow->sparse-indices options)))
(defn bow->SparseArray
  "Converts a bag-of-words column `bow-col` to sparse indices column `indices-col`,
  as needed by the discrete naive bayes model.

  Delegates to `bow->something-sparse` with `nlp/freqs->SparseArray` as the
  conversion function. `options` is passed through unchanged and defaults to
  `{}` when omitted (presumably this is where the vocabulary size is
  configured - confirm against `nlp/bow->sparse-and-vocab`).

  metamorph                            | .
  -------------------------------------|---------
  Behaviour in mode :fit               | normal
  Behaviour in mode :transform         | normal
  Reads keys from ctx                  | :scicloj.ml.smile.metamorph/bow->sparse-vocabulary (mode :transform)
  Writes keys to ctx                   | :scicloj.ml.smile.metamorph/bow->sparse-vocabulary (mode :fit)
  "
  ([bow-col indices-col]
   ;; No options given: delegate with an empty options map.
   (bow->SparseArray bow-col indices-col {}))
  ([bow-col indices-col options]
   (bow->something-sparse bow-col indices-col nlp/freqs->SparseArray options)))
(defn bow->tfidf
  "Returns a metamorph step (a function of the context map) that replaces the
  dataset under `:metamorph/data` with the result of calling `nlp/bow->tfidf`
  on it.

  `bow-column` is the column holding the bag-of-words (token frequency maps);
  `tfidf-column` is the new column that receives the maps of
  token->tfidf-score.

  metamorph                            | .
  -------------------------------------|---------
  Behaviour in mode :fit               | normal
  Behaviour in mode :transform         | normal
  Reads keys from ctx                  | none
  Writes keys to ctx                   | none
  "
  [bow-column tfidf-column]
  (fn [ctx]
    ;; Compute the scored dataset first, then put it back into the context.
    (->> (nlp/bow->tfidf (:metamorph/data ctx) bow-column tfidf-column)
         (assoc ctx :metamorph/data))))