forked from blevesearch/bleve
/
stop_words_nl.go
143 lines (133 loc) · 5.09 KB
/
stop_words_nl.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
package nl
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_nl"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var DutchStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Dutch stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This is a ranked list (commonest to rarest) of stopwords derived from
| a large sample of Dutch text.
| Dutch stop words frequently exhibit homonym clashes. These are indicated
| clearly below.
de | the
en | and
van | of, from
ik | I, the ego
te | (1) chez, at etc, (2) to, (3) too
dat | that, which
die | that, those, who, which
in | in, inside
een | a, an, one
hij | he
het | the, it
niet | not, nothing, naught
zijn | (1) to be, being, (2) his, one's, its
is | is
was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river
op | on, upon, at, in, up, used up
aan | on, upon, to (as dative)
met | with, by
als | like, such as, when
voor | (1) before, in front of, (2) furrow
had | had, past tense all persons sing. of 'hebben' (have)
er | there
maar | but, only
om | round, about, for etc
hem | him
dan | then
zou | should/would, past tense all persons sing. of 'zullen'
of | or, whether, if
wat | what, something, anything
mijn | possessive and noun 'mine'
men | people, 'one'
dit | this
zo | so, thus, in this way
door | through by
over | over, across
ze | she, her, they, them
zich | oneself
bij | (1) a bee, (2) by, near, at
ook | also, too
tot | till, until
je | you
mij | me
uit | out of, from
der | Old Dutch form of 'van der' still found in surnames
daar | (1) there, (2) because
haar | (1) her, their, them, (2) hair
naar | (1) unpleasant, unwell etc, (2) towards, (3) as
heb | present first person sing. of 'to have'
hoe | how, why
heeft | present third person sing. of 'to have'
hebben | 'to have' and various parts thereof
deze | this
u | you
want | (1) for, (2) mitten, (3) rigging
nog | yet, still
zal | 'shall', first and third person sing. of verb 'zullen' (will)
me | me
zij | she, they
nu | now
ge | 'thou', still used in Belgium and south Netherlands
geen | none
omdat | because
iets | something, somewhat
worden | to become, grow, get
toch | yet, still
al | all, every, each
waren | (1) 'were' (2) to wander, (3) wares, (3)
veel | much, many
meer | (1) more, (2) lake
doen | to do, to make
toen | then, when
moet | noun 'spot/mote' and present form of 'to must'
ben | (1) am, (2) 'are' in interrogative second person singular of 'to be'
zonder | without
kan | noun 'can' and present form of 'to be able'
hun | their, them
dus | so, consequently
alles | all, everything, anything
onder | under, beneath
ja | yes, of course
eens | once, one day
hier | here
wie | who
werd | imperfect third person sing. of 'become'
altijd | always
doch | yet, but etc
wordt | present third person sing. of 'become'
wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans
kunnen | to be able
ons | us/our
zelf | self
tegen | against, towards, at
na | after, near
reeds | already
wil | (1) present tense of 'want', (2) 'will', noun, (3) fender
kon | could; past tense of 'to be able'
niets | nothing
uw | your
iemand | somebody
geweest | been; past participle of 'be'
andere | other
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(DutchStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}