-
Notifications
You must be signed in to change notification settings - Fork 2.7k
/
PTBLexer.flex
1407 lines (1338 loc) · 73.3 KB
/
PTBLexer.flex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package edu.stanford.nlp.process;
// Stanford English Tokenizer -- a deterministic, fast, high-quality tokenizer.
// Copyright (c) 2002-2017 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// For more information, bug reports, fixes, contact:
// Christopher Manning
// Dept of Computer Science, Gates 2A
// Stanford CA 94305-9020
// USA
// java-nlp-support@lists.stanford.edu
// http://nlp.stanford.edu/software/
import java.io.Reader;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
/** Provides a tokenizer or lexer that does a pretty good job at
* deterministically tokenizing English according to Penn Treebank conventions.
* The class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> from the specification file
* {@code PTBLexer.flex}. As well as copying what is in the Treebank,
* it now contains many extensions to deal with modern text and encoding
* issues, such as recognizing URLs and common Unicode characters, and a
* variety of options for doing or suppressing certain normalizations.
* Although they shouldn't really be there, it also interprets certain of the
* characters between U+0080 and U+009F as Windows CP1252 characters, since many
* LDC corpora actually mix CP1252 content into supposedly utf-8 text.
* <p>
* <i>Fine points:</i> Output normalized tokens should not contain spaces,
* providing the normalizeSpace option is true. The space will be turned
* into a non-breaking space (U+00A0). Otherwise, they can appear in
* a couple of token classes (phone numbers, fractions).
* The original
* PTB tokenization (messy) standard also escapes certain other characters,
* such as * and /, and normalizes things like " to `` or ''. By default,
* this tokenizer does most of these things. However, you can turn them
* off by using the ptb3Escaping=false option, or, parts of it on or off,
* or unicode character alternatives on with different options. Or you can turn
* everything on for strict Penn Treebank 3 tokenization. You can also build an
* invertible tokenizer, with which you can still access the original
* character sequence and the non-token whitespace around it in a CoreLabel.
* And you can ask for newlines to be tokenized.
* <p>
* <i>Character entities:</i> For legacy reasons, this file will parse and interpret
* some simple SGML/XML/HTML tags and character entities. For modern formats
* like XML, you are better off doing XML parsing, and then running the
* tokenizer on text elements. But, we and others frequently work with simple
* SGML text corpora that are not XML (like LDC text collections). In practice,
* they only include very simple markup and a few simple entities, and the
* minimal character entity
* support in this file is enough to handle them. So we leave this functionality
* in, even though it could conceivably mess with a correct XML file if the
* output of decoding had things that look like character entities. In general,
* handled symbols are changed to ASCII/Unicode forms, but handled accented
* letters are just left as character entities in words.
* <p>
* <i>Character support:</i> PTBLexer works for a broad range of common Unicode
* characters. It recognizes all characters that are classed as letter (alphabetic)
* or digit in Unicode.
* It also matches all defined characters in the Unicode range U+0000-U+07FF
* excluding most control characters except the ones very standardly found in
* plain text documents. Finally, a fair range of other characters, such as many
* symbols commonly found in English Unicode text and emoji are also recognized.
* <p>
* <i>Implementation note:</i> The scanner is caseless, but note, if adding
* or changing regexps, that caseless does not extend inside character
* classes. From the manual: "The %caseless option does not change the
* matched text and does not effect character classes. So [a] still only
* matches the character a and not A, too." Note that some character
* classes deliberately don't have both cases, so the scanner's
* operation isn't completely case-independent, though it mostly is.
* <p>
* <i>Implementation note:</i> This Java class is automatically generated
* from PTBLexer.flex using jflex. DO NOT EDIT THE JAVA SOURCE. This file
* has now been updated for JFlex 1.6.1+.
*
* @author Tim Grow
* @author Christopher Manning
* @author Jenny Finkel
*/
%%
%class PTBLexer
%unicode
%function next
%type Object
%char
%caseless
%state YyTokenizePerLine YyNotTokenizePerLine
%{
/**
* Constructs a new PTBLexer. You specify the type of result tokens with a
* LexedTokenFactory, and can specify the treatment of tokens by boolean
* options given in a comma separated String
* (e.g., "invertible,normalizeParentheses=true").
* If the String is {@code null} or empty, you get the traditional
* PTB3 normalization behaviour (i.e., you effectively get ptb3Escaping=true,
* as reflected in the field defaults below). If you
* want no normalization, then you should pass in the String
* "ptb3Escaping=false". See the documentation in the {@link PTBTokenizer}
* class for full discussion of all the available options.
*
* @param r The Reader to tokenize text from
* @param tf The LexedTokenFactory that will be invoked to convert
* each substring extracted by the lexer into some kind of Object
* (such as a Word or CoreLabel).
* @param options Options to the tokenizer (see {@link PTBTokenizer})
* @throws IllegalArgumentException if an option key or an untokenizable
* option value is unrecognized, or if invertible is requested without
* a CoreLabelTokenFactory
*/
public PTBLexer(Reader r, LexedTokenFactory<?> tf, String options) {
this(r);
this.tokenFactory = tf;
if (options == null) {
options = "";
}
// Parse the comma-separated option string into key=value pairs.
Properties prop = StringUtils.stringToProperties(options);
Set<Map.Entry<Object,Object>> props = prop.entrySet();
for (Map.Entry<Object,Object> item : props) {
String key = (String) item.getKey();
String value = (String) item.getValue();
// A bare key with no "=value" parses to an empty value and hence val == false is NOT
// what happens: Boolean.valueOf("") is false, so bare flags must be written key=true
// unless stringToProperties supplies "true" — NOTE(review): confirm against StringUtils.
boolean val = Boolean.valueOf(value);
if ("".equals(key)) {
// allow an empty item
} else if ("invertible".equals(key)) {
invertible = val;
} else if ("tokenizeNLs".equals(key)) {
tokenizeNLs = val;
} else if ("tokenizePerLine".equals(key)) {
tokenizePerLine = val;
} else if ("ptb3Escaping".equals(key)) {
// Umbrella option: flips the whole family of PTB3 normalizations on or off at once.
normalizeSpace = val;
normalizeAmpersandEntity = val;
// normalizeCurrency = val; // [cdm 2018]: We no longer do this as a default ptb3escaping
normalizeFractions = val;
normalizeParentheses = val;
normalizeOtherBrackets = val;
latexQuotes = val;
unicodeQuotes = val;
asciiQuotes = val;
ptb3Ellipsis = val;
unicodeEllipsis = val;
ptb3Dashes = val;
} else if ("americanize".equals(key)) {
americanize = val;
} else if ("normalizeSpace".equals(key)) {
normalizeSpace = val;
} else if ("normalizeAmpersandEntity".equals(key)) {
normalizeAmpersandEntity = val;
} else if ("normalizeCurrency".equals(key)) {
normalizeCurrency = val;
} else if ("normalizeFractions".equals(key)) {
normalizeFractions = val;
} else if ("normalizeParentheses".equals(key)) {
normalizeParentheses = val;
} else if ("normalizeOtherBrackets".equals(key)) {
normalizeOtherBrackets = val;
} else if ("latexQuotes".equals(key)) {
latexQuotes = val;
} else if ("unicodeQuotes".equals(key)) {
unicodeQuotes = val;
if (val) {
latexQuotes = false; // need to override default
}
} else if ("asciiQuotes".equals(key)) {
asciiQuotes = val;
if (val) {
latexQuotes = false; // need to override default
unicodeQuotes = false;
}
} else if ("splitAssimilations".equals(key)) {
splitAssimilations = val;
} else if ("splitHyphenated".equals(key)) {
splitHyphenated = val;
} else if ("ptb3Ellipsis".equals(key)) {
ptb3Ellipsis = val;
} else if ("unicodeEllipsis".equals(key)) {
unicodeEllipsis = val;
} else if ("ptb3Dashes".equals(key)) {
ptb3Dashes = val;
} else if ("escapeForwardSlashAsterisk".equals(key)) {
escapeForwardSlashAsterisk = val;
} else if ("untokenizable".equals(key)) {
// This option takes an enumerated String value, not a boolean.
switch (value) {
case "noneDelete":
untokenizable = UntokenizableOptions.NONE_DELETE;
break;
case "firstDelete":
untokenizable = UntokenizableOptions.FIRST_DELETE;
break;
case "allDelete":
untokenizable = UntokenizableOptions.ALL_DELETE;
break;
case "noneKeep":
untokenizable = UntokenizableOptions.NONE_KEEP;
break;
case "firstKeep":
untokenizable = UntokenizableOptions.FIRST_KEEP;
break;
case "allKeep":
untokenizable = UntokenizableOptions.ALL_KEEP;
break;
default:
throw new IllegalArgumentException("PTBLexer: Invalid option value in constructor: " + key + ": " + value);
}
} else if ("strictTreebank3".equals(key)) {
strictTreebank3 = val;
} else {
throw new IllegalArgumentException("PTBLexer: Invalid options key in constructor: " + key);
}
}
// Invertible tokenization records before/after text on tokens, so it needs CoreLabels.
if (invertible) {
if ( ! (tf instanceof CoreLabelTokenFactory)) {
throw new IllegalArgumentException("PTBLexer: the invertible option requires a CoreLabelTokenFactory");
}
prevWord = (CoreLabel) tf.makeToken("", 0, 0);
prevWordAfter = new StringBuilder();
}
// Select the lexer start state; per-line mode keeps tokens from spanning newlines.
if (tokenizePerLine) {
yybegin(YyTokenizePerLine);
} else {
yybegin(YyNotTokenizePerLine);
}
}
/** Turn on to find out how things were tokenized. */
private static final boolean DEBUG = false;
/** A logger for this class */
private static final Redwood.RedwoodChannels logger = Redwood.channels(PTBLexer.class);
// Factory used to build each emitted token object (Word, CoreLabel, ...).
private LexedTokenFactory<?> tokenFactory;
// The previously emitted token; consulted by the hyphenated-number heuristic and
// used for invertible before/after bookkeeping.
private CoreLabel prevWord;
// Accumulates the non-token text seen since prevWord (only allocated in invertible mode).
private StringBuilder prevWordAfter;
// Whether an untokenizable character has already been reported (for the FIRST_* options).
private boolean seenUntokenizableCharacter; // = false;
// What to do with characters the lexer cannot tokenize: delete or keep them,
// and warn about none, only the first, or all of them.
private enum UntokenizableOptions { NONE_DELETE, FIRST_DELETE, ALL_DELETE, NONE_KEEP, FIRST_KEEP, ALL_KEEP }
private UntokenizableOptions untokenizable = UntokenizableOptions.FIRST_DELETE;
/* Flags begin with historical ptb3Escaping behavior. */
private boolean invertible;
private boolean tokenizeNLs;
private boolean tokenizePerLine;
private boolean americanize = false;
private boolean normalizeSpace = true;
private boolean normalizeAmpersandEntity = true;
private boolean normalizeCurrency = false; // only $ and # in Penn Treebank 3 data, but we now allow other currency
private boolean normalizeFractions = true;
private boolean normalizeParentheses = true;
private boolean normalizeOtherBrackets = true;
// Exactly one of the three quote styles is consulted; latexQuotes takes priority
// (see handleQuotes), so turning on unicodeQuotes/asciiQuotes clears latexQuotes.
private boolean latexQuotes = true;
private boolean unicodeQuotes;
private boolean asciiQuotes;
private boolean ptb3Ellipsis = true;
private boolean unicodeEllipsis;
private boolean ptb3Dashes = true;
private boolean escapeForwardSlashAsterisk = false; // this is true in Penn Treebank 3 but we don't do it now
private boolean strictTreebank3 = false;
private boolean splitAssimilations = true;
private boolean splitHyphenated = false;
/* Bracket characters and forward slash and asterisk:
*
* Original Treebank 3 WSJ
* Uses -LRB- -RRB- as the representation for ( ) and -LCB- -RCB- as the representation for { }.
* There are no occurrences of [ ], though there is some mention of -LSB- -RSB- in early documents.
* There are no occurrences of < >.
* All brackets are tagged -LRB- -RRB- [This stays constant.]
* Forward slash and asterisk are escaped by a preceding \ (as \/ and \*)
*
* Treebank 3 Brown corpus
* Has -LRB- -RRB-
* Has a few instances of unescaped [ ] in compounds (e.g., the token "A[fj]").
* Neither forward slash or asterisk appears.
*
* Ontonotes (r4)
* Uses -LRB- -RRB- -LCB- -RCB- -LSB- -RSB-.
* Has a very few uses of < and > in longer tokens, which are not escaped.
* Slash is not escaped. Asterisk is not escaped.
*
* LDC2012T13-eng_web_tbk (Google web treebank)
* Has -LRB- -RRB-
* Has { and } used unescaped, treated as brackets.
* Has < and > used unescaped, sometimes treated as brackets. Sometimes << and >> are treated as brackets!
* Has [ and ] used unescaped, treated as brackets.
* Slash is not escaped. Asterisk is not escaped.
*
* Reasonable conclusions for now:
* - Never escape < >
* - Still by default escape [ ] { } but it can be turned off. Use -LSB- -RSB- -LCB- -RCB-.
* Move to not escaping slash and asterisk, and delete escaping in PennTreeReader.
*/
/* Penn Treebank escaped representations for brackets and the PTB em-dash. */
public static final String openparen = "-LRB-";
public static final String closeparen = "-RRB-";
public static final String openbrace = "-LCB-";
public static final String closebrace = "-RCB-";
public static final String ptbmdash = "--";
/* Normalized ellipsis forms, selected by the ptb3Ellipsis / unicodeEllipsis options. */
public static final String ptb3EllipsisStr = "...";
public static final String unicodeEllipsisStr = "\u2026";
/* This pattern now also include newlines, since we sometimes allow them in SGML tokens.... */
private static final Pattern SINGLE_SPACE_PATTERN = Pattern.compile("[ \r\n]");
private static final Pattern LEFT_PAREN_PATTERN = Pattern.compile("\\(");
private static final Pattern RIGHT_PAREN_PATTERN = Pattern.compile("\\)");
/* -- upto (2017) -- */
/**
 * For a matched hyphenated token, when splitHyphenated is on: keep only the text
 * up to the first hyphen and push the remainder (hyphen included) back onto the
 * input so it is re-lexed as separate tokens.
 *
 * @param in The matched token text
 */
private void breakByHyphens(String in) {
  if (splitHyphenated) {
    int firstHyphen = in.indexOf('-');
    // Guard added: if the text has no hyphen, indexOf returns -1 and the unguarded
    // yypushback(in.length() + 1) would exceed the matched length (a JFlex Error).
    // Also require firstHyphen > 0 so we never push back the entire match, which
    // would make no progress and loop.
    if (firstHyphen > 0) {
      yypushback(in.length() - firstHyphen);
    }
  }
}
/**
 * If an apparent negative number is generated from a hyphenated word, tokenize the hyphen.
 * E.g., after "mid" or after a token starting with a digit ("2010"), a match like
 * "-30s" or "-2012" has everything after the leading hyphen pushed back, so the
 * hyphen becomes its own token rather than a minus sign.
 *
 * @param in The matched text, expected to begin with '-'
 */
private void handleHyphenatedNumber(String in) {
  // Strip dashes from hyphenated words: only fires when there is a previous token
  // and this match has a single leading hyphen (not "--").
  if (prevWord != null && in.length() >= 2 && in.charAt(0) == '-' && in.charAt(1) != '-') {
    String lastWord = prevWord.originalText();
    switch (lastWord) {
      case "mid":
      case "late":
      case "early":
        // Prefix words that commonly precede a hyphenated number ("mid-1990s").
        yypushback(in.length() - 1);
        // break was missing: control previously fell through into default. That was
        // harmless only because these words don't start with an ASCII digit, but the
        // fall-through was clearly unintended.
        break;
      default:
        // Last word starts with an ASCII digit ('0'..'9', i.e. 48..57) and had no
        // trailing non-token text recorded: treat as number-hyphen-number and split.
        if (lastWord.length() > 0 &&
            lastWord.charAt(0) <= 57 && lastWord.charAt(0) >= 48 &&
            prevWordAfter != null && prevWordAfter.length() == 0) { // last word is a number as well
          yypushback(in.length() - 1);
        }
        break;
    }
  }
}
/**
 * Strips characters that should not survive inside a normalized number token:
 * the soft hyphen U+00AD (inserted only for line breaking) and the thousands
 * separators U+066C, U+2009 (thin space), and U+202F (narrow no-break space).
 * If nothing needs stripping, the input is returned unchanged (and untrimmed);
 * otherwise the cleaned string is trimmed.
 *
 * @param in The matched number text
 * @return The number text with separator characters removed
 */
private static String removeFromNumber(String in) {
  // Shortcut for if we split on hyphens: a lone "-" passes straight through.
  if ("-".equals(in)) {
    return in;
  }
  StringBuilder cleaned = null; // lazily created only when a character is dropped
  final int n = in.length();
  for (int idx = 0; idx < n; idx++) {
    final char c = in.charAt(idx);
    final boolean drop = (c == '\u00AD' || c == '\u066C' || c == '\u2009' || c == '\u202F');
    if (drop) {
      if (cleaned == null) {
        cleaned = new StringBuilder(n);
        cleaned.append(in, 0, idx); // copy the untouched prefix once
      }
    } else if (cleaned != null) {
      cleaned.append(c);
    }
  }
  return (cleaned == null) ? in : cleaned.toString().trim();
}
/*
* This class has now been extended to cover the main Windows CP1252 characters,
* at either their correct Unicode codepoints, or in their invalid
* positions as 8 bit chars inside the iso-8859 control region.
*
* ellipsis 85 0133 2026 8230 COMPLICATED!! Also a newline character for IBM 390; we let ellipsis win
* dagger 86 2020
* double dagger 87 2021
* single quote curly starting 91 0145 2018 8216
* single quote curly ending 92 0146 2019 8217
* double quote curly starting 93 0147 201C 8220
* double quote curly ending 94 0148 201D 8221
* bullet 95
* en dash 96 0150 2013 8211
* em dash 97 0151 2014 8212
*/
private static final Pattern singleQuote = Pattern.compile("'|'");
// If they typed `` they probably meant it, but if it's '' or mixed, we use our heuristics.
private static final Pattern doubleQuote = Pattern.compile("\"|''|'`|`'|"");
// 82,84,91,92,93,94 aren't valid unicode points, but sometimes they show
// up from cp1252 and need to be translated
private static final Pattern leftSingleQuote = Pattern.compile("[\u0082\u008B\u0091\u2018\u201A\u201B\u2039]");
private static final Pattern rightSingleQuote = Pattern.compile("[\u0092\u009B\u00B4\u2019\u203A]");
private static final Pattern leftDoubleQuote = Pattern.compile("[\u0084\u0093\u201C\u201E\u00AB]|[\u0091\u2018]'");
private static final Pattern rightDoubleQuote = Pattern.compile("[\u0094\u201D\u00BB]|[\u0092\u2019]'");
private static String latexQuotes(String in, boolean probablyLeft) {
// System.err.println("Handling quote on " + in + " probablyLeft=" + probablyLeft);
String s1 = in;
if (probablyLeft) {
s1 = singleQuote.matcher(s1).replaceAll("`");
s1 = doubleQuote.matcher(s1).replaceAll("``");
} else {
s1 = singleQuote.matcher(s1).replaceAll("'");
s1 = doubleQuote.matcher(s1).replaceAll("''");
}
s1 = leftSingleQuote.matcher(s1).replaceAll("`");
s1 = rightSingleQuote.matcher(s1).replaceAll("'");
s1 = leftDoubleQuote.matcher(s1).replaceAll("``");
s1 = rightDoubleQuote.matcher(s1).replaceAll("''");
// System.err.println(" Mapped to " + s1);
return s1;
}
// U+00B4 should be acute accent, but stuff happens
private static final Pattern asciiSingleQuote = Pattern.compile("'|[\u0082\u008B\u0091\u00B4\u2018\u0092\u2019\u009B\u201A\u201B\u2039\u203A']");
private static final Pattern asciiDoubleQuote = Pattern.compile(""|[\u0084\u0093\u201C\u0094\u201D\u201E\u00AB\u00BB\"]");
private static String asciiQuotes(String in) {
String s1 = in;
s1 = asciiSingleQuote.matcher(s1).replaceAll("'");
s1 = asciiDoubleQuote.matcher(s1).replaceAll("\"");
return s1;
}
/* cp1252 bytes in the 0x80-0x9F control region that stand for directional quotes. */
private static final Pattern unicodeLeftSingleQuote = Pattern.compile("\u0091");
private static final Pattern unicodeRightSingleQuote = Pattern.compile("\u0092");
private static final Pattern unicodeLeftDoubleQuote = Pattern.compile("\u0093");
private static final Pattern unicodeRightDoubleQuote = Pattern.compile("\u0094");
private static final Pattern leftDuck = Pattern.compile("\u008B");
private static final Pattern rightDuck = Pattern.compile("\u009B");

/**
 * Normalize quotes in a token to Unicode curly quotes, resolving ambiguous
 * straight quotes by the probablyLeft hint and translating cp1252 bytes to
 * their proper Unicode codepoints.
 *
 * @param in The token text
 * @param probablyLeft Whether ambiguous quotes should be treated as opening quotes
 * @return The token with quotes normalized
 */
private static String unicodeQuotes(String in, boolean probablyLeft) {
  // Pick the directional targets for ambiguous ASCII quotes up front.
  final String singleTarget = probablyLeft ? "\u2018" : "\u2019";
  final String doubleTarget = probablyLeft ? "\u201c" : "\u201d";
  String result = singleQuote.matcher(in).replaceAll(singleTarget);
  result = doubleQuote.matcher(result).replaceAll(doubleTarget);
  // cp1252 control-region bytes are unambiguous, so they ignore the hint.
  result = unicodeLeftSingleQuote.matcher(result).replaceAll("\u2018");
  result = unicodeRightSingleQuote.matcher(result).replaceAll("\u2019");
  result = unicodeLeftDoubleQuote.matcher(result).replaceAll("\u201c");
  result = unicodeRightDoubleQuote.matcher(result).replaceAll("\u201d");
  result = leftDuck.matcher(result).replaceAll("\u2039");
  result = rightDuck.matcher(result).replaceAll("\u203A");
  return result;
}
/**
 * Normalize quotes in a token according to whichever quote-style option is
 * enabled; latexQuotes takes priority, then unicodeQuotes, then asciiQuotes.
 * With all three off, the token is returned untouched.
 *
 * @param tok The token text
 * @param probablyLeft Whether ambiguous quotes should be treated as opening quotes
 * @return The (possibly) quote-normalized token
 */
private String handleQuotes(String tok, boolean probablyLeft) {
  if (latexQuotes) {
    return latexQuotes(tok, probablyLeft);
  }
  if (unicodeQuotes) {
    return unicodeQuotes(tok, probablyLeft);
  }
  return asciiQuotes ? asciiQuotes(tok) : tok;
}
/**
 * Build a token for a matched ellipsis, normalizing its text to "..." or
 * U+2026 according to the ptb3Ellipsis / unicodeEllipsis options (ptb3 wins),
 * while preserving the original match as the token's original text.
 *
 * @param tok The matched ellipsis text
 * @return The token object produced by getNext
 */
private Object handleEllipsis(final String tok) {
  final String normalized;
  if (ptb3Ellipsis) {
    normalized = ptb3EllipsisStr;
  } else if (unicodeEllipsis) {
    normalized = unicodeEllipsisStr;
  } else {
    normalized = tok; // no normalization requested
  }
  return getNext(normalized, tok);
}
/**
 * Returns the index of the first plain space (U+0020) or non-breaking space
 * (U+00A0) in the given text, or -1 if neither occurs.
 *
 * @param txt The text to scan
 * @return The index of the first space character, or -1
 */
private int indexOfSpace(String txt) {
  final int len = txt.length();
  int i = 0;
  while (i < len) {
    final char c = txt.charAt(i);
    if (c == ' ' || c == '\u00A0') {
      return i;
    }
    i++;
  }
  return -1;
}
/** Make the next token from the current match, using the matched text
 * unchanged as both the token text and its original text. */
private Object getNext() {
final String txt = yytext();
return getNext(txt, txt);
}
/** Make the next token.
* @param txt What the token should be
* @param originalText The original String that got transformed into txt
* @return The token object built by the tokenFactory (a CoreLabel in invertible mode)
*/
private Object getNext(String txt, String originalText) {
if (invertible) {
// Invertible bookkeeping: the text accumulated since the previous token becomes
// this token's "before" annotation AND the previous token's "after" annotation,
// then the buffer is reset. Order matters: read and clear before making the token.
String str = prevWordAfter.toString();
prevWordAfter.setLength(0);
CoreLabel word = (CoreLabel) tokenFactory.makeToken(txt, yychar, yylength());
word.set(CoreAnnotations.OriginalTextAnnotation.class, originalText);
word.set(CoreAnnotations.BeforeAnnotation.class, str);
prevWord.set(CoreAnnotations.AfterAnnotation.class, str);
prevWord = word;
return word;
} else {
Object word = tokenFactory.makeToken(txt, yychar, yylength());
// Track the previous token when possible; the hyphenated-number heuristic needs it.
if (word instanceof CoreLabel) {
prevWord = (CoreLabel) word;
}
return word;
}
}
/** Push back any trailing whitespace that JFlex wrongly included in the match.
 * Strips spaces, tabs, the \n..\r control range, and U+0085 (NEL) from the end
 * of the current match, one character at a time. */
private void fixJFlex4SpaceAfterTokenBug() {
// try to work around an apparent jflex bug where it
// gets a space at the token end by getting
// wrong the length of the trailing context.
while (yylength() > 0) {
char last = yycharat(yylength()-1);
// '\n' <= last <= '\r' covers LF, VT, FF, CR.
if (last == ' ' || last == '\t' || (last >= '\n' && last <= '\r' || last == '\u0085')) {
yypushback(1);
} else {
break;
}
}
}
/** Build the token for a matched acronym ending in a period (e.g., "U.S.").
 * Decides whether the emitted token keeps its final period, and always pushes
 * the period back so it can also serve as sentence-final punctuation.
 * Note: yytext() reflects the shortened match AFTER yypushback(1), so the
 * order of yypushback and yytext calls in each branch is significant. */
private Object processAcronym() {
fixJFlex4SpaceAfterTokenBug();
String s;
if (yylength() == 2) { // "I.", etc.
yypushback(1); // return a period next time;
s = yytext(); // return the word without the final period
} else if (strictTreebank3 && ! "U.S.".equals(yytext())) {
// strict PTB3: only "U.S." keeps its period (reduplicated below)
yypushback(1); // return a period for next time
s = yytext(); // return the word without the final period
} else {
s = yytext(); // return the word WITH the final period
yypushback(1); // (reduplication:) also return a period for next time
}
return getNext(s, yytext());
}
/** Build the token for an abbreviation match, after stripping any trailing
 * whitespace that JFlex wrongly included in the match. */
private Object processAbbrev3() {
fixJFlex4SpaceAfterTokenBug();
return getNext();
}
/** Build the token for an abbreviation ending in a period. In strictTreebank3
 * mode only "U.S." keeps its final period in the token text; the period is
 * always pushed back so it is re-lexed as a separate token next time.
 * Note: yytext() reflects the shortened match AFTER yypushback(1), so the
 * order of the two statements in each branch is significant. */
private Object processAbbrev1() {
String s;
if (strictTreebank3 && ! "U.S.".equals(yytext())) {
yypushback(1); // return a period for next time
s = yytext();
} else {
s = yytext();
yypushback(1); // return a period for next time
}
return getNext(s, yytext());
}
%}
/* Todo: Really SGML shouldn't be here at all, it's kind of legacy.
But we continue to tokenize some simple standard forms of concrete
SGML syntax, since it tends to give robustness. */
/* ---
( +([A-Za-z][A-Za-z0-9:.-]*( *= *['\"][^\r\n'\"]*['\"])?|['\"][^\r\n'\"]*['\"]| *\/))*
SGML = <([!?][A-Za-z-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:.-]*([ ]+([A-Za-z][A-Za-z0-9:.-]*([ ]*=[ ]*['\"][^\r\n'\"]*['\"])?|['\"][^\r\n'\"]*['\"]|[ ]*\/))*[ ]*)>
( +[A-Za-z][A-Za-z0-9:.-]*)*
FOO = ([ ]+[A-Za-z][A-Za-z0-9:.-]*)*
SGML = <([!?][A-Za-z-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:.-]* *)>
SGML = \<([!\?][A-Za-z\-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:\.\-]*([ ]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ ]*=[ ]*['\"][^\r\n'\"]*['\"]|['\"][^\r\n'\"]*['\"]|[ ]*\/))*[ ]*)\>
--- */
// <STORYID cat=w pri=u>
// SGML1 allows attribute value match over newline; SGML2 does not.
// SGML1 is used in the YyNotTokenizePerLine lexical state, SGML2 in YyTokenizePerLine
// (see the corresponding rules after %%).
SGML1 = \<([!\?][A-Za-z\-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:\.\-]*([ ]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ ]*=[ ]*('[^']*'|\"[^\"]*\"|[A-Za-z][A-Za-z0-9_:\.\-]*)))*[ ]*\/?|\/[A-Za-z][A-Za-z0-9_:\.\-]*)[ ]*\>
SGML2 = \<([!\?][A-Za-z\-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:\.\-]*([ ]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ ]*=[ ]*('[^'\r\n]*'|\"[^\"\r\n]*\"|[A-Za-z][A-Za-z0-9_:\.\-]*)))*[ ]*\/?|\/[A-Za-z][A-Za-z0-9_:\.\-]*)[ ]*\>
/* SPMDASH: SGML escapes (&MD; &mdash; &ndash;) and dash characters for em/en dashes. */
SPMDASH = &(MD|mdash|ndash);|[\u0096\u0097\u2013\u2014\u2015]
/* SPAMP matches a literal HTML/SGML ampersand entity in the *input text*.
   It must be the five-character entity "&amp;", not a bare "&": the {SPAMP} rule
   below normalizes it via LexerUtils.normalizeAmp(), which would be pointless for
   a bare ampersand. (The bare "&" here was an HTML-unescaping artifact.) */
SPAMP = &amp;
SPPUNC = &(HT|TL|UR|LR|QC|QL|QR|odq|cdq|#[0-9]+);
SPLET = &[aeiouAEIOU](acute|grave|uml);
/* \u3000 is ideographic space */
SPACE = [ \t\u00A0\u2000-\u200A\u3000]
SPACES = {SPACE}+
/* \u0085 is NEL (next line); \u2028/\u2029 are Unicode line/paragraph separators;
   \u000B/\u000C are vertical tab and form feed. */
NEWLINE = \r|\r?\n|\u2028|\u2029|\u000B|\u000C|\u0085
SPACENL = ({SPACE}|{NEWLINE})
SPACENLS = {SPACENL}+
/* These next ones are useful to get a fixed length trailing context. */
SPACENL_ONE_CHAR = [ \t\u00A0\u2000-\u200A\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
NOT_SPACENL_ONE_CHAR = [^ \t\u00A0\u2000-\u200A\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
/* Likely sentence ends: whitespace followed by more whitespace, a capital, or a tag. */
SENTEND1 = {SPACENL}({SPACENL}|[:uppercase:]|{SGML1})
SENTEND2 = {SPACE}({SPACE}|[:uppercase:]|{SGML2})
/* \u07C0-\u07C9 are NKo digits, not covered by [:digit:] here. */
DIGIT = [:digit:]|[\u07C0-\u07C9]
DATE = {DIGIT}{1,2}[\-\/]{DIGIT}{1,2}[\-\/]{DIGIT}{2,4}
/* Note that NUM also includes times like 12:55. One can start with a . or , but not a : */
NUM = {DIGIT}*([.,\u066B\u066C]{DIGIT}+)+|{DIGIT}+([.:,\u00AD\u066B\u066C\u2009\u202F]{DIGIT}+)*
/* Now don't allow bracketed negative numbers! They have too many uses (e.g.,
years or times in parentheses), and having them in tokens messes up
treebank parsing.
NUMBER = [\-+]?{NUM}|\({NUM}\) */
NUMBER = [\-+]?{NUM}
/* Superscript/subscript numbers, with optional superscript/subscript sign. */
SUBSUPNUM = [\u207A\u207B\u208A\u208B]?([\u2070\u00B9\u00B2\u00B3\u2074-\u2079]+|[\u2080-\u2089]+)
/* Constrain fraction to only match likely fractions. Full one allows hyphen, space, or non-breaking space between integer and fraction part, but strictTreebank3 allows only hyphen. */
FRAC = ({DIGIT}{1,4}[- \u00A0])?{DIGIT}{1,4}(\\?\/|\u2044){DIGIT}{1,4}
/* Precomposed vulgar fraction characters (1/4, 1/2, 3/4, 1/3 ... 7/8). */
FRAC2 = [\u00BC\u00BD\u00BE\u2153-\u215E]
DOLSIGN = ([A-Z]*\$|#)
/* These are cent and pound; currency, yen; CP1252 euro, ECU, new shekel, euro; rupee ... Lira */
DOLSIGN2 = [\u00A2\u00A3\u00A4\u00A5\u0080\u20A0\u20AA\u20AC\u20B9\u060B\u0E3F\u20A4\uFFE0\uFFE1\uFFE5\uFFE6\u20BD\u20A9]
/* not used DOLLAR {DOLSIGN}[ \t]*{NUMBER} */
/* |\( ?{NUMBER} ?\)) # is for pound signs */
/* For some reason U+0237-U+024F (dotless j) isn't in [:letter:]. Recent additions? */
/* The explicit ranges add combining marks, soft hyphen, and various script-specific
   letters/marks (Hebrew, Arabic, Indic, Thai, Lao, ...) missing from [:letter:]. */
LETTER = ([:letter:]|{SPLET}|[\u00AD\u0237-\u024F\u02C2-\u02C5\u02D2-\u02DF\u02E5-\u02FF\u0300-\u036F\u0370-\u037D\u0384\u0385\u03CF\u03F6\u03FC-\u03FF\u0483-\u0487\u04CF\u04F6-\u04FF\u0510-\u0525\u055A-\u055F\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u0615-\u061A\u063B-\u063F\u064B-\u065E\u0670\u06D6-\u06EF\u06FA-\u06FF\u070F\u0711\u0730-\u074F\u0750-\u077F\u07A6-\u07B1\u07CA-\u07F5\u07FA\u0900-\u0903\u093C\u093E-\u094E\u0951-\u0955\u0962-\u0963\u0981-\u0983\u09BC-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3\u0A01-\u0A03\u0A3C\u0A3E-\u0A4F\u0A81-\u0A83\u0ABC-\u0ACF\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0C01-\u0C03\u0C3E-\u0C56\u0D3E-\u0D44\u0D46-\u0D48\u0E30-\u0E3A\u0E47-\u0E4E\u0EB1-\u0EBC\u0EC8-\u0ECD])
/* Allow in the zero-width (non-)joiner characters. */
WORD = {LETTER}({LETTER}|{DIGIT})*([.!?\u200c\u200d]{LETTER}({LETTER}|{DIGIT})*)*
FILENAME_EXT = 3gp|avi|bat|bmp|bz2|c|class|cgi|cpp|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|ps|py|sql|tar|txt|wav|x|xml|zip|wm[va]
FILENAME = [\p{Alpha}\p{Digit}]+([-._/][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}
/* Curse of intelligent tokenization, here we come. To model what LDC does, we separate out some \p{Digit}+\p{Alpha}+ tokens as 2 words */
/* Go with just the top 20 currencies. */
SEP_CURRENCY = (USD|EUR|JPY|GBP|AUD|CAD|CHF|CNY|SEK|NZD|MXN|SGD|HKD|NOK|KRW|TRY|RUB|INR|BRL|ZAR)
/* Can't include s for seconds as too many iPhone 6s, 1990s, etc. */
SEP_UNITS = (lbs?|ltr|mins?|[kcm][gml]|[MGTP]([B]|[H][z])|fps|bpm|[MG][b][p][s])
SEP_OTHER = ([ap]m|hrs?|words?|m(on)?ths?|y(ea)?rs?|pts?)
/* If there is a longer alphabetic match, another longer pattern will match so don't need to filter that. */
SEP_SUFFIX = ({SEP_CURRENCY}|{SEP_UNITS}|{SEP_OTHER})
/* THING: The $ was for things like New$;
WAS: only keep hyphens with short one side like co-ed
But treebank just allows hyphenated things as words!
THING allows d'Avignon or NUMBER before HYPHEN and the same things after it. Only first number can be negative. */
THING = ([dDoOlL]{APOSETCETERA}[\p{Alpha}\p{Digit}])?([\p{Alpha}\p{Digit}]+|{NUMBER})({HYPHEN}([dDoOlL]{APOSETCETERA}[\p{Alpha}\p{Digit}])?([\p{Alpha}\p{Digit}]+|{NUM}))*
THINGA = [A-Z]+(([+&]|{SPAMP})[A-Z]+)+
THING3 = [\p{Alpha}\p{Digit}]+(-[\p{Alpha}]+){0,2}(\\?\/[\p{Alpha}\p{Digit}]+(-[\p{Alpha}]+){0,2}){1,2}
/* The final alternative is the HTML/SGML escape &apos; (see the comment on this line);
   it had been double-unescaped to a bare (redundant) "'" in the damaged version. */
APOS = ['\u0092\u2019´]|&apos; /* ASCII straight quote, single right curly quote in CP1252 (wrong) or Unicode or reversed quote or HTML SGML escape */
/* Includes extra ones that may appear inside a word, rightly or wrongly */
APOSETCETERA = {APOS}|[`\u0091\u2018\u201B]
/* HTHING recognizes hyphenated words, including ones with various kinds of numbers in them.
It's not quite clear what this recognizes that THING doesn't. Delete this one?!? */
HTHING = [\p{Alpha}\p{Digit}][\p{Alpha}\p{Digit}.,\u00AD]*(-([\p{Alpha}\p{Digit}\u00AD]+(\.[:digit:]+)?|{ACRO2}\.))+
/* from the CLEAR (biomedical?) treebank documentation */
/* we're going to split on most hypens except a few */
/* From Supplementary Guidelines for ETTB 2.0 (Justin Mott, Colin Warner, Ann Bies; Ann Taylor) */
/*
Hyphenated words that are allowed to be kept together match these patterns.
Note that this list is case-insensitive and non-exhaustive.
a- adeno- agro- ante- anti- aorto- arch- ambi- -able -ahol -aholic -ation axio- be- bi- bio- broncho-
co- counter- cross- centi- -centric circum- cis- colo- contra- cortico- cran- crypto- -cracy -crat cyber-
de- deca- demi- dis- -dom e- eco- electro- ennea- -esque -ette ex- extra- -er -ery ferro- -ful -fest -fold
gastro- -gate -gon giga- hepta- hemi- hypo- hexa- -hood
in- inter- intra- -ian -ible -ing -isation -ise -ising -ism -ist - itis -ization -ize -izing ideo- idio- infra- iso-
-less -logist -logy -ly judeo- macro- mega- micro- mini- mono- musculo- mm-hm mm-mm -most multi- medi- milli-
neo- neuro- nitro- non- novem- octa- octo- o-kay -o-torium ortho- over-
paleo- pan- para- pelvi- penta- peri- pheno- phospho- pica- pneumo- poly- post- pre- preter- pro- pseudo-
quasi- quadri- quinque- -rama re- recto- salpingo- sero- semi- sept- soci- sub- super- supra- sur-
tele- tera- tetra- tri- u- uber- uh-huh uh-oh ultra- un- uni- vice- veno- ventriculo- -wise x-
*/
HTHINGEXCEPTIONPREFIXED = (e|a|u|x|agro|ante|anti|arch|be|bi|bio|co|counter|cross|cyber|de|eco|ex|extra|inter|intra|macro|mega|micro|mini|multi|neo|non|over|pan|para|peri|post|pre|pro|pseudo|quasi|re|semi|sub|super|tri|ultra|un|uni|vice)(-([\p{Alpha}\p{Digit}\u00AD]+|{ACRO2}\.))+
HTHINGEXCEPTIONSUFFIXED = ([\p{Alpha}\p{Digit}][\p{Alpha}\p{Digit}.,\u00AD]*)(-)(esque|ette|fest|fold|gate|itis|less|most|o-torium|rama|wise)(s|es|d|ed)?
HTHINGEXCEPTIONWHOLE = (mm-hm|mm-mm|o-kay|uh-huh|uh-oh)(s|es|d|ed)?
/* things like 'll and 'm */
/* i.e., covers 's 'm 'd 're 've 'll (either case for the single letters) */
REDAUX = {APOS}([msdMSD]|re|ve|ll)
/* For things that will have n't on the end. They can't end in 'n' */
/* \u00AD is soft hyphen */
SWORD = [\p{Alpha}\u00AD]*[A-MO-Za-mo-z](\u00AD)*
SREDAUX = n{APOSETCETERA}t
/* Tokens you want but already okay: C'mon 'n' '[2-9]0s '[eE]m 'till?
[Yy]'all 'Cause Shi'ite B'Gosh o'clock. Here now only need apostrophe
final words. */
/* Note that Jflex doesn't support {2,} form. Only {2,k}. */
/* [yY]' is for Y'know, y'all and I for I. So exclude from one letter first */
/* Rest are for French borrowings. n allows n'ts in "don'ts" */
/* Arguably, c'mon should be split to "c'm" + "on", but not yet. */
APOWORD = {APOS}n{APOS}?|[lLdDjJ]{APOS}|Dunkin{APOS}|somethin{APOS}|ol{APOS}|{APOS}em|diff{APOSETCETERA}rent|[A-HJ-XZn]{APOSETCETERA}[:letter:]{2}[:letter:]*|{APOS}[2-9]0s|{APOS}till?|[:letter:][:letter:]*[aeiouyAEIOUY]{APOSETCETERA}[aeiouA-Z][:letter:]*|{APOS}cause|cont'd\.?|nor'easter|c'mon|e'er|s'mores|ev'ry|li'l|nat'l|O{APOSETCETERA}o
APOWORD2 = y{APOS}
/* Some Wired URLs end in + or = so omit that too. Some quoting with '[' and ']' so disallow. */
FULLURL = (ftp|svn|svn\+ssh|http|https|mailto):\/\/[^ \t\n\f\r<>|`\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}]+[^ \t\n\f\r<>|.!?,;:&`\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-]
LIKELYURL = ((www\.([^ \t\n\f\r`<>|.!?,\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}]+\.)+[a-zA-Z]{2,4})|(([^ \t\n\f\r`<>|.!?,:\/$\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}]+\.)+(com|net|org|edu)))(\/[^ \t\n\f\r`<>|]+[^ \t\n\f\r`<>|.!?,;:&\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-])?
/* &lt;,< should match &gt;,>, but that's too complicated */
/* EMAIL = (&lt;|<)?[a-zA-Z0-9][^ \t\n\f\r\"<>|()\u00A0{}]*@([^ \t\n\f\r\"<>|(){}.\u00A0]+\.)*([^ \t\n\f\r\"<>|(){}\[\].,;:\u00A0]+)(&gt;|>)? */
/* Optionally angle-bracketed (raw or HTML-escaped &lt;/&gt;) address, optional mailto:
   prefix. (The duplicate "(<|<)" alternatives were an HTML double-unescaping artifact
   of "(&lt;|<)" / "(&gt;|>)".) */
EMAIL = (&lt;|<)?(mailto:)?[a-zA-Z0-9._%+-]+@[A-Za-z0-9][A-Za-z0-9.-]*[A-Za-z0-9](&gt;|>)?
/* Technically, names should be capped at 15 characters and can be any non-zero string of ASCII letters, numbers
and underscores. However, if you length limit then you get into weirdness with what happens to the rest of the
characters, and allowing ones starting with numbers disables using @ for "at" before numeric quantities, so we
just special case in a couple of people like that. */
/* \uFF20/\uFF03 are the fullwidth forms of @ and #. */
TWITTER_NAME = [@\uFF20]([A-Za-z_][a-zA-Z_0-9]*|50cent)
TWITTER_HASHTAG = [#\uFF03]{LETTER}({LETTER}|{DIGIT}|_)*({LETTER}|{DIGIT})
TWITTER = {TWITTER_NAME}|{TWITTER_HASHTAG}
/* ISO 8601 combined date and time, e.g. 2021-03-05T12:45:30Z.
   (Fixed: the minutes field read "[x0-9]{2}", a stray 'x' that would also accept
   garbage like "12:xx:30"; all six fields are plain digits.) */
ISO8601DATETIME = [0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z?
DEGREES = °[CF]
/* --- This block becomes ABBREV1 and is usually followed by lower case words. --- */
/* Abbreviations - originally induced from 1987 WSJ by hand; since variously expanded */
ABMONTH = Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec
/* "May." isn't an abbreviation. "Jun." and "Jul." barely occur, but don't seem dangerous */
ABDAYS = Mon|Tue|Tues|Wed|Thu|Thurs|Fri
/* Sat. and Sun. barely occur and can easily lead to errors, so we omit them */
/* In caseless, |a\.m|p\.m handled as ACRO, and this is better as can often
be followed by capitalized. */
/* Ma. or Me. isn't included as too many errors, and most sources use Mass. etc. */
/* Fed. is tricky. Usually sentence end, but not before "Governor" or "Natl. Mtg. Assn." */
/* Make some states case sensitive, since they're also reasonably common words */
/* Only allow La since you also get LA for Los Angeles. */
/* Bracketed single letters like [A]z force case sensitivity for that alternative. */
ABSTATE = Ala|Ariz|[A]z|[A]rk|Calif|Colo|Conn|Ct|Dak|[D]el|Fla|Ga|[I]ll|Ind|Kans?|Ky|[L][a]|[M]ass|Md|Mich|Minn|[M]iss|Mo|Mont|Neb|Nev|Okla|[O]re|[P]a|Penn|Tenn|[T]ex|Va|Vt|[W]ash|Wisc?|Wyo
/* Bhd is Malaysian companies! Rt. is Hungarian? */
/* Special case: Change the class of Pty when followed by Ltd to not sentence break (in main code below)... */
ABCOMP = Inc|Cos?|Corp|Pp?t[ye]s?|Ltd|Plc|Rt|Bancorp|Bhd|Assn|Univ|Intl|Sys
/* Don't include fl. oz. since Oz turns up too much in caseless tokenizer. ft now allows upper after it for "Fort" use. */
ABNUM = tel|est|ext|sq
/* p used to be in ABNUM list, but it can't be any more, since the lexer
is now caseless. We don't want to have it recognized for P. Both
p. and P. are now under ABBREV4. ABLIST also went away as no-op [a-e] */
ABPTIT = Jr|Sr|Bros|(Ed|Ph)\.D|Esq
/* ss?p and aff are for bio taxonomy; also gen and cf but appear elsewhere as ABBREV4 already; fl for flourished */
ABTAXONOMY = (s(ub)?)?spp?|aff|[f][l]
/* ABBREV1 abbreviations are normally followed by lower case words.
* If they're followed by an uppercase one, we assume there is also a
* sentence boundary.
* Notes: many misspell etc. ect.; kr. is some other currency
*/
ABBREV1 = ({ABMONTH}|{ABDAYS}|{ABSTATE}|{ABCOMP}|{ABNUM}|{ABPTIT}|{ABTAXONOMY}|etc|ect|al|seq|Bldg|Pls|wrt|orig|incl|t[b]?[s][p]|kr)\.
/* --- This block becomes ABBREV2 and is usually followed by upper case words. --- */
/* In the caseless world S.p.A. "Società Per Azioni (Italian: shared company)" is got as a regular acronym */
/* ACRO Is a bad case -- can go either way! */
ACRO = [A-Za-z](\.[A-Za-z])*|(Canada|Sino|Korean|EU|Japan|non)-U\.S|U\.S\.-(U\.K|U\.S\.S\.R)
ACRO2 = [A-Za-z](\.[A-Za-z])+|(Canada|Sino|Korean|EU|Japan|non)-U\.S|U\.S\.-(U\.K|U\.S\.S\.R)
/* ABTITLE is mainly person titles, but also Mt for mountains and Ft for Fort. St[ae] does Saint, Santa, suite, etc. */
ABTITLE = Mr|Mrs|Ms|Mx|[M]iss|Drs?|Profs?|Sens?|Reps?|Attys?|Lt|Col|Gen|Messrs|Govs?|Adm|Rev|Maj|Sgt|Cpl|Pvt|Capt|St[ae]?|Ave|Pres|Lieut|Rt|Hon|Brig|Co?mdr|Pfc|Spc|Supts?|Det|Mt|Ft|Adj|Adv|Asst|Assoc|Ens|Insp|Mlle|Mme|Msgr|Sfc
ABCOMP2 = Invt|Elec|Natl|M[ft]g|Dept|Blvd|Rd|Ave|[P][l]|viz
/* ABRREV2 abbreviations are normally followed by an upper case word.
* We assume they aren't used sentence finally. Ph is in there for Ph. D Sc for B.Sc.
*/
ABBREV4 = {ABTITLE}|vs|[v]|Wm|Jos|Cie|a\.k\.a|cf|TREAS|Ph|[S][c]|{ACRO}|{ABCOMP2}
ABBREV2 = {ABBREV4}\.
ACRONYM = ({ACRO})\.
/* Cie. is used by French companies sometimes before and sometimes at end as in English Co. But we treat as allowed to have Capital following without being sentence end. Cia. is used in Spanish/South American company abbreviations, which come before the company name, but we exclude that and lose, because in a caseless segmenter, it's too confusable with CIA. */
/* Added Wm. for William and Jos. for Joseph */
/* In tables: Mkt. for market Div. for division of company, Chg., Yr.: year */
/* --- ABBREV3 abbreviations are allowed only before numbers. ---
* Otherwise, they aren't recognized as abbreviations (unless they also
* appear in ABBREV1 or ABBREV2).
* est. is "estimated" -- common in some financial contexts. ext. is extension, ca. is circa.
* "Art(s)." is for "article(s)" -- common in legal context, Sec(t). for section(s)
*/
/* Maybe also "op." for "op. cit." but also get a photo op. Rs. for Rupees */
/* Pt for part needs to be case sensitive (vs. country code for Portugal). */
/* NOTE(review): "prop" appears twice in this alternation; the duplicate is harmless but redundant. */
ABBREV3 = (ca|figs?|prop|nos?|sect?s?|arts?|paras?|bldg|prop|pp|op|approx|[P][t]|rs|Apt|Rt)\.
/* Case for south/north before a few places. */
ABBREVSN = So\.|No\.
/* See also a couple of special cases for pty. in the code below. */
/* phone numbers. keep multi dots pattern separate, so not confused with decimal numbers. */
PHONE = (\([0-9]{2,3}\)[ \u00A0]?|(\+\+?)?([0-9]{2,4}[\- \u00A0])?[0-9]{2,4}[\- \u00A0])[0-9]{3,4}[\- \u00A0]?[0-9]{3,5}|((\+\+?)?[0-9]{2,4}\.)?[0-9]{2,4}\.[0-9]{3,4}\.[0-9]{3,5}
/* Fake duck feet appear sometimes in WSJ, and aren't likely to be SGML, less than, etc., so group. */
FAKEDUCKFEET = <<|>>
/* LESSTHAN/GREATERTHAN match either the raw character or its HTML escape.
   (The duplicate alternatives "<|<" and ">|>" were HTML double-unescaping
   artifacts of "&lt;" and "&gt;".) */
LESSTHAN = <|&lt;
GREATERTHAN = >|&gt;
/* ASCII hyphen-minus, Armenian hyphen, Unicode hyphen, non-breaking hyphen. */
COREHYPHEN = [-\u058A\u2010\u2011]
HYPHEN = {COREHYPHEN}|\-
HYPHENS = {COREHYPHEN}+
/* \u0085 NEL reused here as an ellipsis stand-in; \u2026 is horizontal ellipsis. */
LDOTS = \.\.\.+|[\u0085\u2026]
SPACEDLDOTS = \.[ \u00A0](\.[ \u00A0])+\.
ATS = @+
UNDS = _+
ASTS = \*+|(\\\*){1,3}
HASHES = #+
FNMARKS = {ATS}|{HASHES}|{UNDS}
/* Sentence-internal punctuation; \u3001 is the ideographic comma. */
INSENTP = [,;:\u3001]
QUOTES = {APOS}|[`\u2018-\u201F\u0082\u0084\u0091-\u0094\u2039\u203A\u00AB\u00BB]{1,2}
/* Double quote: raw '"', the HTML escape &quot;, or two successive single-quote-like
   characters. (The bare '"' second alternative — a double-unescaped &quot; — would
   have opened a JFlex quoted string and broken the definition.) */
DBLQUOT = \"|&quot;|[`'\u0091\u0092\u2018\u2019]'
/* Cap'n for captain, c'est for french */
/* Treebank-special tokens; the ampersand alternations accept both the HTML escape
   &amp; and a raw '&'. (The duplicate "(&|&)" groups were a double-unescaping
   artifact of "(&amp;|&)".) */
TBSPEC = -(RRB|LRB|RCB|LCB|RSB|LSB)-|C\.D\.s|pro-|anti-|S(&amp;|&)P-500|S(&amp;|&)Ls|Cap{APOS}n|c{APOS}est
/* Censored swear words written with - or * in place of vowels. */
SWEARING = f[-*][-c*]k(in[g']?|e[dr])?|(bull|dip)?sh[-\*]t(ty|e|box)?|c[-*]nts?|p[-*]ss(e[sd]|ing)?|c[-*]ck|b[-*]tch|t[-*]ts|tw[-*]ts?|cr[-*]p|d[-*]cks?|b[-*][-*s]t[-*]rds?|pr[-*]ck|d[-*]mn|bl[-*]{2,2}dy
/* Two-digit year abbreviations like '97. */
TBSPEC2 = {APOS}[0-9][0-9]
BANGWORDS = (E|Yahoo|Jeopardy)\!
BANGMAGAZINES = OK\!
/* Smileys (based on Chris Potts' sentiment tutorial, but much more restricted set - e.g., no "8)", "do:" or "):", too ambiguous) and simple Asian smileys */
SMILEY = [<>]?[:;=][\-o\*']?[\(\)DPdpO\\{@\|\[\]]
/* Asian-style faces: "^.^", ">.<", "x.x", "^_^", "(^_^)", "(^-^)", shrug, etc.
   (Fixed: the first alternative's second eye read "\.\[\^x=~<>]" — the stray
   backslash made "[" a literal bracket instead of opening the character class,
   so faces like "^.^" could never match.) */
ASIANSMILEY = [\^x=~<>]\.[\^x=~<>]|[\-\^x=~<>']_[\-\^x=~<>']|\([\-\^x=~<>'][_.]?[\-\^x=~<>']\)|\([\^x=~<>']-[\^x=~<>'`]\)|¯\\_\(ツ\)_\/¯
/* Slightly generous but generally reasonable emoji parsing */
/* These are human emoji that can have a zwj gender (as well as skin color) */
EMOJI_GENDERED = [\u26F9\u{01F3C3}-\u{01F3C4}\u{01F3CA}-\u{01F3CC}\u{01F466}-\u{01F469}\u{01F46E}-\u{01F46F}\u{01F471}\u{01F473}\u{01F477}\u{01F481}-\u{01F482}\u{01F486}-\u{01F487}\u{01F575}\u{01F645}-\u{01F647}\u{01F64B}\u{01F64D}-\u{01F64E}\u{01F6A3}\u{01F6B4}-\u{01F6B6}\u{01F926}\u{01F937}-\u{01F939}\u{01F93C}-\u{01F93E}\u{01F9D6}-\u{01F9DF}]
/* Emoji follower is variation selector (emoji/non-emoji rendering) or Fitzpatrick skin tone */
EMOJI_FOLLOW = [\uFE0E\uFE0F\u{01F3FB}-\u{01F3FF}]
/* Just things followed by the keycap surrounding char - note that if not separated by space beforehand, may be mistokenized */
EMOJI_KEYCAPS = [\u0023\u002A\u0030-\u0039]\uFE0F?\u20E3
/* Two geographic characters as a flag or GB regions as flags */
EMOJI_FLAG = [\u{01F1E6}-\u{01F1FF}]{2,2}|\u{01F3F4}\u{0E0067}\u{0E0062}[\u{0E0061}-\u{0E007A}]+\u{0E007F}
/* Rainbow flag etc. */
EMOJI_MISC = [\u{01F3F3}\u{01F441}][\uFE0E\uFE0F]?\u200D[\u{01F308}\u{01F5E8}][\uFE0E\uFE0F]?|{EMOJI_KEYCAPS}
/* Things that have an emoji presentation form */
EMOJI_PRESENTATION = [\u00A9\u00AE\u203C\u2049\u2122\u2139\u2194-\u2199\u21A9-\u21AA\u231A-\u231B\u2328\u23CF\u23E9-\u23F3\u23F8-\u23FA\u24C2\u25AA-\u25AB\u25B6\u25C0\u25FB-\u27BF\u2934-\u2935\u2B05-\u2B07\u2B1B-\u2B1C\u2B50\u2B55\u3030\u303D\u3297\u3299\u{01F000}-\u{01F9FF}]
/* Human modifier is something that appears after a zero-width joiner (zwj) U+200D */
HUMAN_MODIFIER = [\u2640\u2642\u2695-\u2696\u2708\u2764\u{01F33E}\u{01F373}\u{01F393}\u{01F3A4}\u{01F3A8}\u{01F3EB}\u{01F3ED}\u{01F468}-\u{01F469}\u{01F48B}\u{01F4BB}-\u{01F4BC}\u{01F527}\u{01F52C}\u{01F680}\u{01F692}][\uFE0E\uFE0F]?
/* flag | emoji optionally with follower | precomposed gendered/family consisting of human followed by one or more of zero width joiner then another human/profession | Misc */
EMOJI = {EMOJI_FLAG}|{EMOJI_PRESENTATION}{EMOJI_FOLLOW}?|{EMOJI_GENDERED}{EMOJI_FOLLOW}?(\u200D([\u{01F466}-\u{01F469}]{EMOJI_FOLLOW}?|{HUMAN_MODIFIER})){1,3}|{EMOJI_MISC}
/* U+2200-U+2BFF has a lot of the various mathematical, etc. symbol ranges */
MISCSYMBOL = [+%&~\^|\\¦\u00A7¨\u00A9\u00AC\u00AE¯\u00B0-\u00B3\u00B4-\u00BA\u00D7\u00F7\u0387\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0600-\u0603\u0606-\u060A\u060C\u0614\u061B\u061E\u066A\u066D\u0703-\u070D\u07F6\u07F7\u07F8\u0964\u0965\u0E4F\u1FBD\u2016\u2017\u2020-\u2025\u2030-\u2038\u203B\u203C\u2043\u203E-\u2042\u2044\u207A-\u207F\u208A-\u208E\u2100-\u214F\u2190-\u21FF\u2200-\u2BFF\u3001-\u3006\u3008-\u3020\u30FB\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65\uFF65]
/* \uFF65 is Halfwidth katakana middle dot; \u30FB is Katakana middle dot */
/* Math and other symbols that stand alone: °²× ∀ */
PROG_LANGS = c[+][+]|(c|f)#
ASSIMILATIONS3 = cannot|'twas|dunno
/* "nno" is a remnant after pushing back from dunno in ASSIMILATIONS3 */
ASSIMILATIONS2 = {APOS}tis|gonna|gotta|lemme|gimme|wanna|nno
/* CP1252: dagger, double dagger, per mille, bullet, small tilde, trademark */
CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
/* CP1252 letters */
/* 83 = f with hook --> U+0192; 8a = S with Caron --> U+0160; 9c = ligature oe --> U+0153; */
/* CP1252LETTER = [\u0083\u008A\u009C] */
%%
/* Keep programming-language names with symbol characters as single tokens (c++, c#, f#). */
{PROG_LANGS} { String tok = yytext();
  if (DEBUG) { logger.info("Used {PROG_LANGS} to recognize " + tok); }
  return getNext(tok, tok);
}
/* Assimilations split by pushing back the last 3 chars, e.g. "cannot" -> "can" + "not". */
{ASSIMILATIONS3} { if (splitAssimilations) {
  yypushback(3);
  }
  String tok = yytext(); // shortened match if we pushed back
  if (DEBUG) { logger.info("Used {ASSIMILATIONS3} to recognize " + tok +
  "; splitAssimilations=" + splitAssimilations); }
  return getNext(tok, tok);
}
/* As above but push back 2 chars (e.g. "gonna" -> "gon" + "na"); lookahead keeps
   these from firing inside a longer alphabetic word. */
{ASSIMILATIONS2}/[^\p{Alpha}]
  { if (splitAssimilations) {
  yypushback(2);
  }
  String tok = yytext();
  if (DEBUG) { logger.info("Used {ASSIMILATIONS2} to recognize " + tok + " as " + tok +
  "; splitAssimilations=" + splitAssimilations); }
  return getNext(tok, tok);
}
/* SGML/XML-ish tags kept as single tokens; spaces inside the tag are converted to
   non-breaking spaces when normalizeSpace is set, so the token stays one "word". */
<YyNotTokenizePerLine>{SGML1}
  { final String origTxt = yytext();
  String txt = origTxt;
  if (normalizeSpace) {
  txt = SINGLE_SPACE_PATTERN.matcher(txt).replaceAll("\u00A0"); // change to non-breaking space
  }
  if (DEBUG) { logger.info("Used {SGML1} to recognize " + origTxt + " as " + txt); }
  return getNext(txt, origTxt);
}
/* Per-line variant: SGML2 cannot span newlines, so a simple char replace suffices. */
<YyTokenizePerLine>{SGML2}
  { final String origTxt = yytext();
  String txt = origTxt;
  if (normalizeSpace) {
  txt = txt.replace(' ', '\u00A0'); // change space to non-breaking space
  }
  if (DEBUG) { logger.info("Used {SGML2} to recognize " + origTxt + " as " + txt); }
  return getNext(txt, origTxt);
}
/* Em/en dashes: normalized to the PTB escape (ptbmdash) when ptb3Dashes is set. */
{SPMDASH} { if (ptb3Dashes) {
  return getNext(ptbmdash, yytext()); }
  else {
  return getNext();
  }
}
/* HTML ampersand entity: normalized to a plain "&" when normalizeAmpersandEntity is set. */
{SPAMP} { final String origTxt = yytext();
  String tok;
  if (normalizeAmpersandEntity) {
  tok = LexerUtils.normalizeAmp(origTxt);
  } else {
  tok = origTxt;
  }
  if (DEBUG) { logger.info("Used {SPAMP} to recognize " + origTxt + " as " + tok); }
  return getNext(tok, origTxt);
}
/* Other SGML punctuation escapes pass through unchanged. */
{SPPUNC} { return getNext(); }
/* Word followed by a reduced auxiliary ('s, 'll, ...): the trailing context leaves the
   auxiliary in the input to be matched as its own token next. */
{WORD}/{REDAUX} { final String origTxt = yytext();
  String tok = LexerUtils.removeSoftHyphens(origTxt);
  if (americanize) {
  tok = Americanize.americanize(tok);
  }
  if (DEBUG) { logger.info("Used {WORD} to recognize " + origTxt + " as " + tok); }
  return getNext(tok, origTxt);
}
/* Word that will have n't split off (SWORD can't end in 'n'). */
{SWORD}/{SREDAUX} { final String origTxt = yytext();
  String tok = LexerUtils.removeSoftHyphens(origTxt);
  if (DEBUG) { logger.info("Used {SWORD} to recognize " + origTxt + " as " + tok); }
  return getNext(tok, origTxt);
}
/* Split digits from a following unit/currency suffix (LDC-style), e.g. "5" + "km". */
{DIGIT}+/{SEP_SUFFIX} { String txt = yytext();
  if (DEBUG) { logger.info("Used {DIGIT}/{SEP_SUFFIX} to recognize " + txt); }
  return getNext(txt, txt);
}
/* Plain word: strip soft hyphens and optionally Americanize the spelling. */
{WORD} { final String origTxt = yytext();
  String tok = LexerUtils.removeSoftHyphens(origTxt);
  if (americanize) {
  tok = Americanize.americanize(tok);
  }
  if (DEBUG) { logger.info("Used {WORD} (2) to recognize " + origTxt + " as " + tok); }
  return getNext(tok, origTxt);
}
/* Words containing apostrophes (o'clock, 'em, cont'd, ...): normalize quote chars. */
{APOWORD} { String tok = yytext();
  String norm = handleQuotes(tok, false);
  if (DEBUG) { logger.info("Used {APOWORD} to recognize " + tok + " as " + norm +
  "; probablyLeft=" + false); }
  return getNext(norm, tok);
}
/* y' (as in y'know) only when a letter follows. */
{APOWORD2}/[:letter:] { String txt = yytext();
  if (DEBUG) { logger.info("Used {APOWORD2} to recognize " + txt); }
  return getNext(txt, txt);
}
/* URL with an explicit scheme; optionally escape '/' and '*' for PTB compatibility. */
{FULLURL} { String txt = yytext();
  String norm = txt;
  if (escapeForwardSlashAsterisk) {
  norm = LexerUtils.escapeChar(norm, '/');
  norm = LexerUtils.escapeChar(norm, '*');
  }
  if (DEBUG) { logger.info("Used {FULLURL} to recognize " + txt + " as " + norm); }
  return getNext(norm, txt);
}
/* Scheme-less URL (www.foo.bar, foo.com/...); lookahead blocks a following letter. */
{LIKELYURL}/[^\p{Alpha}] { String txt = yytext();
  String norm = txt;
  if (escapeForwardSlashAsterisk) {
  norm = LexerUtils.escapeChar(norm, '/');
  norm = LexerUtils.escapeChar(norm, '*');
  }
  if (DEBUG) { logger.info("Used {LIKELYURL} to recognize " + txt + " as " + norm); }
  return getNext(norm, txt);
}
{EMAIL} { String tok = yytext();
  if (DEBUG) { logger.info("Used {EMAIL} to recognize " + tok); }
  return getNext(tok, tok);
}
/* @-mentions and #hashtags pass through unchanged. */
{TWITTER} { return getNext(); }
/* Reduced auxiliaries ('s, 'm, 'd, 're, 've, 'll) split off as their own token;
   quote characters are normalized. */
{REDAUX}/[^\p{Alpha}] { String tok = yytext();
  String norm = handleQuotes(tok, false);
  if (DEBUG) { logger.info("Used {REDAUX} to recognize " + tok + " as " + norm +
  "; probablyLeft=" + false); }
  return getNext(norm, tok);
}
/* n't as its own token, with quote normalization. */
{SREDAUX}/[^\p{Alpha}] { String tok = yytext();
  String norm = handleQuotes(tok, false);
  if (DEBUG) { logger.info("Used {SREDAUX} to recognize " + tok + " as " + norm +
  "; probablyLeft=" + false); }
  return getNext(norm, tok);
}
/* Slash/hyphen dates like 3/15/2021; '/' optionally escaped for PTB compatibility. */
{DATE} { String origTxt = yytext();
  String txt;
  if (escapeForwardSlashAsterisk) {
  txt = LexerUtils.escapeChar(origTxt, '/');
  } else {
  txt = origTxt;
  }
  if (DEBUG) { logger.info("Used {DATE} to recognize " + origTxt + " as " + txt); }
  return getNext(txt, origTxt);
}
/* Malaysian currency */
RM/{NUM} { String txt = yytext();
  return getNext(txt, txt);
}
/* handleHyphenatedNumber may push back characters, changing what yytext() returns below. */
{NUMBER} { handleHyphenatedNumber(yytext());
  if (DEBUG) { logger.info("Used {NUMBER} to recognize " + yytext() + " as " + removeFromNumber(yytext())); }
  return getNext(removeFromNumber(yytext()), yytext()); }
/* Superscript/subscript numbers pass through unchanged. */
{SUBSUPNUM} { return getNext(); }
{FRAC} { String txt = yytext();
// if we are in strictTreebank3 mode, we need to reject everything after a space or non-breaking space...
if (strictTreebank3) {
int spaceIndex = indexOfSpace(txt);
if (spaceIndex >= 0) {
yypushback(txt.length() - spaceIndex);
return getNext();
}
}