/
01-basic.t
307 lines (277 loc) · 12.2 KB
/
01-basic.t
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
use v6.c;
use Test;
# much testing copied from roast charsets.t
# https://github.com/perl6/roast/blob/master/S05-mass/charsets.t
# latin-chars are characters from first two Unicode code blocks
# which are "Basic Latin" and "Latin-1 Supplement"
my $latin-chars = [~] chr(0)..chr(0xFF);
my @upper-r = 'A' .. 'Z';
my @lower-r = 'a' .. 'z';
my @digit-r = '0' .. '9';
# note - the characters appear in "sort" order in each string
my %charset-str =
upper => $([~] @upper-r),
lower => $([~] @lower-r),
digit => $([~] @digit-r),
punct => q<!"#%&'()*,-./:;?@[\]_{}>,
xdigit => (@digit-r, 'A' .. 'F', 'a' .. 'f').flat.join,
hexdig => (@digit-r, 'A' .. 'F').flat.join,
alpha => (@upper-r, '_', @lower-r).flat.join,
alpha_x => (@upper-r, @lower-r).flat.join,
alnum => (@digit-r, @upper-r, '_', @lower-r).flat.join,
alnum_x => (@digit-r, @upper-r, @lower-r).flat.join,
blank => "\t ",
space => "\t\n\x[0B]\x[0C]\r ",
cntrl => ( chr(0) .. chr(0x1F), chr(0x7F) ).flat.join
;
%charset-str.append:
( graph => (
@digit-r, @upper-r, @lower-r,
%charset-str< punct >.comb
).flat.sort.join
),
( print => (
@digit-r, @upper-r, @lower-r,
%charset-str< punct >.comb,
%charset-str< space >.comb,
).flat.sort.join
),
( vchar => (chr(0x21) .. chr(0x7E)).join ),
;
plan 56;
{
use US-ASCII;
# should be able to loop with interpolation but ...
# grammar g { token t { <[2]> } }; say so "2" ~~ /<g::t>/; my $rname = "t"; say so "2" ~~ /<g::($rname)>/
# see also https://docs.perl6.org/language/packages#index-entry-%3A%3A%28%29
is $latin-chars.comb(/<US-ASCII::alpha>/).join, %charset-str< alpha >,
'alpha with "_" correct US-ASCII char subset';
is $latin-chars.comb(/<US-ASCII::alpha_x>/).join, %charset-str< alpha_x >,
'alpha without "_" correct US-ASCII char subset';
is $latin-chars.comb(/<US-ASCII::upper>/).join, %charset-str< upper >,
'upper correct US-ASCII char subset';
is $latin-chars.comb(/<US-ASCII::lower>/).join, %charset-str< lower >,
'lower correct US-ASCII char subset';
is $latin-chars.comb(/<US-ASCII::digit>/).join, %charset-str< digit >,
'digit correct US-ASCII char subset';
is $latin-chars.comb(/<US-ASCII::xdigit>/).join, %charset-str< xdigit >,
'xdigit correct US-ASCII char subset';
is $latin-chars.comb(/<US-ASCII::hexdig>/).join, %charset-str< hexdig >,
'hexdig correct US-ASCII char subset';
is $latin-chars.comb(/<US-ASCII::alnum>/).join, %charset-str< alnum >,
'alnum with "_" correct US-ASCII char subset';
is $latin-chars.comb(/<US-ASCII::alnum_x>/).join, %charset-str< alnum_x >,
'alnum without "_" correct US-ASCII char subset';
is $latin-chars.comb(/<US-ASCII::punct>/).join, %charset-str< punct >,
'punct chars since unicode 6.1';
is $latin-chars.comb(/<US-ASCII::graph>/).join, %charset-str< graph >,
'graph correct US-ASCII char subset';
is $latin-chars.comb(/<US-ASCII::blank>/).join, %charset-str< blank >,
'blank correct US-ASCII char subset';
is $latin-chars.comb(/<US-ASCII::space>/).join, %charset-str< space >,
'space correct US-ASCII char subset';
ok "\c[CR]\c[LF]" ~~ /<US-ASCII::space>/, 'US-ASCII space matches CRLF';
is $latin-chars.comb(/<US-ASCII::print>/).join, %charset-str< print >,
'print correct US-ASCII char subset';
is $latin-chars.comb(/<US-ASCII::cntrl>/).join, %charset-str< cntrl >,
'cntrl correct US-ASCII char subset';
is $latin-chars.comb(/<US-ASCII::vchar>/).join, %charset-str< vchar >,
'vchar correct US-ASCII char subset';
grammar ascii-by-count does US-ASCII-UC {
token alpha-c { ^ <-ALPHA>*
[ <ALPHA> <-ALPHA>* ] ** { %charset-str< alpha >.chars }
$ }
token alpha_x-c { ^ <-ALPHAx>*
[ <ALPHAx> <-ALPHAx>* ] ** { %charset-str< alpha_x >.chars }
$ }
token upper-c { ^ <-UPPER>*
[ <UPPER> <-UPPER>* ] ** { %charset-str< upper >.chars }
$ }
token lower-c { ^ <-LOWER>*
[ <LOWER> <-LOWER>* ] ** { %charset-str< lower >.chars }
$ }
token digit-c { ^ <-DIGIT>*
[ <DIGIT> <-DIGIT>* ] ** { %charset-str< digit >.chars }
$ }
token xdigit-c { ^ <-XDIGIT>*
[ <XDIGIT> <-XDIGIT>* ] ** { %charset-str< xdigit >.chars }
$ }
token hexdig-c { ^ <-HEXDIG>*
[ <HEXDIG> <-HEXDIG>* ] ** { %charset-str< hexdig >.chars }
$ }
token alnum-c { ^ <-ALNUM>*
[ <ALNUM> <-ALNUM>* ] ** { %charset-str< alnum >.chars }
$ }
token alnum_x-c { ^ <-ALNUMx>*
[ <ALNUMx> <-ALNUMx>* ] ** { %charset-str< alnum_x >.chars }
$ }
token punct-c { ^ <-PUNCT>*
[ <PUNCT> <-PUNCT>* ] ** { %charset-str< punct >.chars }
$ }
token graph-c { ^ <-GRAPH>*
[ <GRAPH> <-GRAPH>* ] ** { %charset-str< graph >.chars }
$ }
token blank-c { ^ <-BLANK>*
[ <BLANK> <-BLANK>* ] ** { %charset-str< blank >.chars }
$ }
token space-c { ^ <-SPACE>*
[ <SPACE> <-SPACE>* ] ** { %charset-str< space >.chars }
$ }
token print-c { ^ <-PRINT>*
[ <PRINT> <-PRINT>* ] ** { %charset-str< print >.chars }
$ }
token cntrl-c { ^ <-CNTRL>*
[ <CNTRL> <-CNTRL>* ] ** { %charset-str< cntrl >.chars }
$ }
token vchar-c { ^ <-VCHAR>*
[ <VCHAR> <-VCHAR>* ] ** { %charset-str< vchar >.chars }
$ }
}
subtest {
ok $latin-chars ~~ /<ascii-by-count::alpha-c>/,
'ALPHA subset has right size';
ok %charset-str< alpha > ~~ /<ascii-by-count::alpha-c>/,
'ALPHA subset has right elements';
}, 'ALPHA char class';
subtest {
ok $latin-chars ~~ /<ascii-by-count::alpha_x-c>/,
'ALPHAx subset has right size';
ok %charset-str< alpha_x > ~~ /<ascii-by-count::alpha_x-c>/,
'ALPHAx subset has right elements';
}, 'ALPHAx char class';
subtest {
ok $latin-chars ~~ /<ascii-by-count::upper-c>/,
'UPPER subset has right size';
ok %charset-str< upper > ~~ /<ascii-by-count::upper-c>/,
'UPPER subset has right elements';
}, 'UPPER char class';
subtest {
ok $latin-chars ~~ /<ascii-by-count::lower-c>/,
'LOWER subset has right size';
ok %charset-str< lower > ~~ /<ascii-by-count::lower-c>/,
'LOWER subset has right elements';
}, 'LOWER char class';
subtest {
ok $latin-chars ~~ /<ascii-by-count::digit-c>/,
'DIGIT subset has right size';
ok %charset-str< digit > ~~ /<ascii-by-count::digit-c>/,
'DIGIT subset has right elements';
}, 'DIGIT char class';
subtest {
ok $latin-chars ~~ /<ascii-by-count::xdigit-c>/,
'XDIGIT subset has right size';
ok %charset-str< xdigit > ~~ /<ascii-by-count::xdigit-c>/,
'XDIGIT subset has right elements';
}, 'XDIGIT char class';
subtest {
ok $latin-chars ~~ /<ascii-by-count::hexdig-c>/,
'HEXDIG subset has right size';
ok %charset-str< hexdig > ~~ /<ascii-by-count::hexdig-c>/,
'HEXDIG subset has right elements';
}, 'HEXDIG char class';
subtest {
ok $latin-chars ~~ /<ascii-by-count::alnum-c>/,
'ALNUM subset has right size';
ok %charset-str< alnum > ~~ /<ascii-by-count::alnum-c>/,
'ALNUM subset has right elements';
}, 'ALNUM char class';
subtest {
ok $latin-chars ~~ /<ascii-by-count::alnum_x-c>/,
'ALNUMx subset has right size';
ok %charset-str< alnum_x > ~~ /<ascii-by-count::alnum_x-c>/,
'ALNUMx subset has right elements';
}, 'ALNUMx char class';
subtest {
ok $latin-chars ~~ /<ascii-by-count::punct-c>/,
'PUNCT subset has right size';
ok %charset-str< punct > ~~ /<ascii-by-count::punct-c>/,
'PUNCT subset has right elements';
}, 'PUNCT char class';
subtest {
ok $latin-chars ~~ /<ascii-by-count::graph-c>/,
'GRAPH subset has right size';
ok %charset-str< graph > ~~ /<ascii-by-count::graph-c>/,
'GRAPH subset has right elements';
}, 'GRAPH char class';
subtest {
ok $latin-chars ~~ /<ascii-by-count::blank-c>/,
'BLANK subset has right size';
ok %charset-str< blank > ~~ /<ascii-by-count::blank-c>/,
'BLANK subset has right elements';
}, 'BLANK char class';
subtest {
ok $latin-chars ~~ /<ascii-by-count::space-c>/,
'SPACE subset has right size';
ok %charset-str< space > ~~ /<ascii-by-count::space-c>/,
'SPACE subset has right elements';
}, 'SPACE char class';
subtest {
ok $latin-chars ~~ /<ascii-by-count::print-c>/,
'PRINT subset has right size';
ok %charset-str< print > ~~ /<ascii-by-count::print-c>/,
'PRINT subset has right elements';
}, 'PRINT char class';
subtest {
ok $latin-chars ~~ /<ascii-by-count::cntrl-c>/,
'CNTRL subset has right size';
ok %charset-str< cntrl > ~~ /<ascii-by-count::cntrl-c>/,
'CNTRL subset has right elements';
}, 'CNTRL char class';
subtest {
ok $latin-chars ~~ /<ascii-by-count::vchar-c>/,
'VCHAR subset has right size';
ok %charset-str< vchar > ~~ /<ascii-by-count::vchar-c>/,
'CNTRL subset has right elements';
}, 'CNTRL char class';
ok "\c[CR]\c[LF]" ~~ /<ascii-by-count::SPACE>/,
'US-ASCII SPACE from role matches CRLF';
throws-like { 'a' ~~ /<ALPHA>/ }, X::Method::NotFound,
'only export UC on request';
}
{
use US-ASCII :UC;
is $latin-chars.comb(/<ALPHA>/).join, %charset-str< alpha >,
'alpha correct US-ASCII char subset';
is $latin-chars.comb(/<UPPER>/).join, %charset-str< upper >,
'upper correct US-ASCII char subset';
is $latin-chars.comb(/<LOWER>/).join, %charset-str< lower >,
'lower correct US-ASCII char subset';
is $latin-chars.comb(/<DIGIT>/).join, %charset-str< digit >,
'digit correct US-ASCII char subset';
is $latin-chars.comb(/<XDIGIT>/).join, %charset-str< xdigit >,
'xdigit correct US-ASCII char subset';
is $latin-chars.comb(/<HEXDIG>/).join, %charset-str< hexdig >,
'hexdig correct US-ASCII char subset';
is $latin-chars.comb(/<ALNUM>/).join, %charset-str< alnum >,
'alnum correct US-ASCII char subset';
is $latin-chars.comb(/<PUNCT>/).join, %charset-str< punct >,
'punct chars since unicode 6.1';
is $latin-chars.comb(/<GRAPH>/).join, %charset-str< graph >,
'graph correct US-ASCII char subset';
is $latin-chars.comb(/<BLANK>/).join, %charset-str< blank >,
'blank correct US-ASCII char subset';
is $latin-chars.comb(/<SPACE>/).join, %charset-str< space >,
'space correct US-ASCII char subset';
ok "\c[CR]\c[LF]" ~~ /<SPACE>/, 'US-ASCII SPACE matches CRLF';
is $latin-chars.comb(/<PRINT>/).join, %charset-str< print >,
'print correct US-ASCII char subset';
is $latin-chars.comb(/<CNTRL>/).join, %charset-str< cntrl >,
'cntrl correct US-ASCII char subset';
is $latin-chars.comb(/<VCHAR>/).join, %charset-str< vchar >,
'vchar correct US-ASCII char subset';
}
{
use US-ASCIIx :POSIX;
is $latin-chars.comb(/<US-ASCIIx::alpha>/).join, %charset-str< alpha_x >,
'alpha correct US-ASCIIx char subset';
is $latin-chars.comb(/<US-ASCIIx::alnum>/).join, %charset-str< alnum_x >,
'alnum correct US-ASCIIx char subset';
is $latin-chars.comb(/<US-ASCIIx::cntrl>/).join, %charset-str< cntrl >,
'cntrl correct US-ASCII char subset';
is $latin-chars.comb(/<ALPHA>/).join, %charset-str< alpha_x >,
'alpha correct US-ASCIIx char subset';
is $latin-chars.comb(/<ALNUM>/).join, %charset-str< alnum_x >,
'alnum correct US-ASCIIx char subset';
is $latin-chars.comb(/<CNTRL>/).join, %charset-str< cntrl >,
'cntrl correct US-ASCII char subset';
}