src/edu/stanford/nlp/process/PTBLexer.flex

package edu.stanford.nlp.process;

// Stanford English Tokenizer -- a deterministic, fast, high-quality tokenizer.
// Copyright (c) 2002-2021 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 2A
//    Stanford CA 94305-9020
//    USA
//    java-nlp-support@lists.stanford.edu
//    https://nlp.stanford.edu/software/


import java.io.Reader;
import java.text.Normalizer;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;


/** Provides a tokenizer or lexer that does a pretty good job at
 *  deterministically tokenizing English according to Penn Treebank conventions.
 *  The class is a scanner generated by
 *  <a href="http://www.jflex.de/">JFlex</a> from the specification file
 *  {@code PTBLexer.flex}.  As well as copying what is in the Treebank,
 *  it now contains many extensions to deal with modern text and encoding
 *  issues, such as recognizing URLs and common Unicode characters, and a
 *  variety of options for doing or suppressing certain normalizations.
 *  Although they shouldn't really be there, it also interprets certain of the
 *  characters between U+0080 and U+009F as Windows CP1252 characters, since many
 *  LDC corpora actually mix CP1252 content into supposedly utf-8 text.
 *  <p>
 *  <i>Fine points:</i> Output normalized tokens should not contain spaces,
 *  provided the normalizeSpace option is true. Any space inside a token will then
 *  be turned into a non-breaking space (U+00A0). Otherwise, spaces can appear in
 *  a couple of token classes (phone numbers, fractions).
 *  The original PTB tokenization (messy) standard also escapes certain other characters,
 *  such as * and /, and normalizes things like " to `` or ''.  This tokenizer
 *  can do all of these things, but, by default, it now leaves most things as they are.
 *  You can switch these behaviors as a group with the ptb3Escaping={false|true} option, or choose
 *  Unicode character alternatives with the individual options. Or you can turn
 *  everything on for strict Penn Treebank 3 tokenization. You can also build an
 *  invertible tokenizer, with which you can still access the original
 *  character sequence and the non-token whitespace around it in a CoreLabel.
 *  And you can ask for newlines to be tokenized.
 *  <p>
 *  <i>Character entities:</i> For legacy reasons, this file will parse and interpret
 *  some simple SGML/XML/HTML tags and character entities.  For modern formats
 *  like XML, you are better off doing XML parsing, and then running the
 *  tokenizer on text elements.  But, we and others frequently work with simple
 *  SGML text corpora that are not XML (like LDC text collections).  In practice,
 *  they only include very simple markup and a few simple entities, and the
 *  minimal character entity
 *  support in this file is enough to handle them. So we leave this functionality
 *  in, even though it could conceivably mess with a correct XML file if the
 *  output of decoding had things that look like character entities.  In general,
 *  handled symbols are changed to ASCII/Unicode forms, but handled accented
 *  letters are just left as character entities in words.
 *  <p>
 *  <i>Character support:</i> PTBLexer works for a broad range of common Unicode
 *  characters. It recognizes all characters that are classed as letter (alphabetic)
 *  or digit in Unicode.
 *  It also matches all defined characters in the Unicode range U+0000-U+07FF
 *  excluding most control characters except the ones very standardly found in
 *  plain text documents. Finally, a fair range of other characters, such as many
 *  symbols commonly found in English Unicode text and emoji are also recognized.
 *  <p>
 *  <i>Implementation note:</i> The scanner is caseless, but note, if adding
 *  or changing regexps, that caseless does not extend inside character
 *  classes.  From the manual: "The %caseless option does not change the
 *  matched text and does not effect character classes. So [a] still only
 *  matches the character a and not A, too."  Note that some character
 *  classes deliberately don't have both cases, so the scanner's
 *  operation isn't completely case-independent, though it mostly is.
 *  <p>
 *  <i>Implementation note:</i> This Java class is automatically generated
 *  from PTBLexer.flex using jflex.  DO NOT EDIT THE JAVA SOURCE.  This file
 *  has now been updated for JFlex 1.6.1+.
 *
 *  @author Tim Grow
 *  @author Christopher Manning
 *  @author Jenny Finkel
 */

%%

%class PTBLexer
%unicode
%function next
%type Object
%char
%caseless
%state YyTokenizePerLine YyNotTokenizePerLine

%{

  /**
   * Constructs a new PTBLexer.  You specify the type of result tokens with a
   * LexedTokenFactory, and can specify the treatment of tokens by
   * options given in a comma separated String
   * (e.g., "invertible,normalizeParentheses=true").
   * If the String is {@code null} or empty, you get UD
   * normalization behaviour (i.e., you get ud=true).  If you
   * want no normalization, then you should pass in the String
   * "ptb3Escaping=false".  See the documentation in the {@link PTBTokenizer}
   * class for full discussion of all the available options.
   * <p>
   * Boolean-valued options are parsed with {@link Boolean#parseBoolean(String)},
   * so any value other than (case-insensitive) "true" counts as false.
   * The style options ("quotes", "ellipses", "dashes") are trimmed and
   * upper-cased before being looked up as enum constants.
   *
   * @param r The Reader to tokenize text from
   * @param tf The LexedTokenFactory that will be invoked to convert
   *    each substring extracted by the lexer into some kind of Object
   *    (such as a Word or CoreLabel).
   * @param options Options to the tokenizer (see {@link PTBTokenizer})
   * @throws IllegalArgumentException If an option key is unknown, if the value of
   *    "quotes", "ellipses", "dashes" or "untokenizable" is not a recognized one,
   *    or if the lexer ends up invertible but {@code tf} is not a
   *    CoreLabelTokenFactory
   */
  public PTBLexer(Reader r, LexedTokenFactory<?> tf, String options) {
    this(r);
    this.tokenFactory = tf;
    if (options == null) {
      options = "";
    }
    Properties prop = StringUtils.stringToProperties(options);
    Set<Map.Entry<Object,Object>> props = prop.entrySet();
    for (Map.Entry<Object,Object> item : props) {
      String key = (String) item.getKey();
      String value = (String) item.getValue();
      boolean val = Boolean.parseBoolean(value);
      if ("".equals(key)) {
        // allow an empty item
      } else if ("invertible".equals(key)) {
        invertible = val;
      } else if ("tokenizeNLs".equals(key)) {
        tokenizeNLs = val;
      } else if ("tokenizePerLine".equals(key)) {
        tokenizePerLine = val;
      } else if ("ptb3Escaping".equals(key)) {
        // Composite option: switches the whole group of PTB3-style normalizations together
        normalizeSpace = val;
        normalizeAmpersandEntity = val;
        // normalizeCurrency = val; // [cdm 2018]: We no longer do this as a default ptb3escaping
        normalizeFractions = val;
        normalizeParentheses = val;
        normalizeOtherBrackets = val;
        quoteStyle = val ? LexerUtils.QuotesEnum.LATEX : LexerUtils.QuotesEnum.ORIGINAL;
        ellipsisStyle = val ? LexerUtils.EllipsesEnum.PTB3 : LexerUtils.EllipsesEnum.ORIGINAL;
        dashesStyle = val ? LexerUtils.DashesEnum.PTB3 : LexerUtils.DashesEnum.ORIGINAL;
        splitHyphenated = ! val;
        splitForwardSlash = ! val;
      } else if ("ud".equals(key)) {
        // Composite option: note that bracket normalization is the inverse of val here
        normalizeSpace = val;
        normalizeAmpersandEntity = val;
        normalizeFractions = val;
        normalizeParentheses = ! val;
        normalizeOtherBrackets = ! val;
        quoteStyle = val ? LexerUtils.QuotesEnum.NOT_CP1252 : LexerUtils.QuotesEnum.ORIGINAL;
        ellipsisStyle = val ? LexerUtils.EllipsesEnum.NOT_CP1252 : LexerUtils.EllipsesEnum.ORIGINAL;
        dashesStyle = val ? LexerUtils.DashesEnum.NOT_CP1252: LexerUtils.DashesEnum.ORIGINAL;
        splitHyphenated=val;
        splitForwardSlash=val;
      } else if ("americanize".equals(key)) {
        americanize = val;
      } else if ("normalizeSpace".equals(key)) {
        normalizeSpace = val;
      } else if ("normalizeAmpersandEntity".equals(key)) {
        normalizeAmpersandEntity = val;
      } else if ("normalizeCurrency".equals(key)) {
        normalizeCurrency = val;
      } else if ("normalizeFractions".equals(key)) {
        normalizeFractions = val;
      } else if ("normalizeParentheses".equals(key)) {
        normalizeParentheses = val;
      } else if ("normalizeOtherBrackets".equals(key)) {
        normalizeOtherBrackets = val;
      } else if ("quotes".equals(key)) {
        try {
          quoteStyle = LexerUtils.QuotesEnum.valueOf(value.trim().toUpperCase(Locale.ROOT));
        } catch (IllegalArgumentException iae) {
          // Keep the original exception as the cause so the failing enum lookup stays visible
          throw new IllegalArgumentException ("Not a valid quotes style: " + value, iae);
        }
      } else if ("splitAssimilations".equals(key)) {
        splitAssimilations = val;
      } else if ("splitHyphenated".equals(key)) {
        splitHyphenated = val;
      } else if ("splitForwardSlash".equals(key)) {
        splitForwardSlash = val;
      } else if ("ellipses".equals(key)) {
        try {
          ellipsisStyle = LexerUtils.EllipsesEnum.valueOf(value.trim().toUpperCase(Locale.ROOT));
        } catch (IllegalArgumentException iae) {
          throw new IllegalArgumentException ("Not a valid ellipses style: " + value, iae);
        }
      } else if ("dashes".equals(key)) {
        try {
          dashesStyle = LexerUtils.DashesEnum.valueOf(value.trim().toUpperCase(Locale.ROOT));
        } catch (IllegalArgumentException iae) {
          throw new IllegalArgumentException ("Not a valid dashes style: " + value, iae);
        }
      } else if ("escapeForwardSlashAsterisk".equals(key)) {
        escapeForwardSlashAsterisk = val;
      } else if ("untokenizable".equals(key)) {
        switch (value) {
          case "noneDelete":
            untokenizable = UntokenizableOptions.NONE_DELETE;
            break;
          case "firstDelete":
            untokenizable = UntokenizableOptions.FIRST_DELETE;
            break;
          case "allDelete":
            untokenizable = UntokenizableOptions.ALL_DELETE;
            break;
          case "noneKeep":
            untokenizable = UntokenizableOptions.NONE_KEEP;
            break;
          case "firstKeep":
            untokenizable = UntokenizableOptions.FIRST_KEEP;
            break;
          case "allKeep":
            untokenizable = UntokenizableOptions.ALL_KEEP;
            break;
          default:
            throw new IllegalArgumentException("PTBLexer: Invalid option value in constructor: " + key + ": " + value);
        }
      } else if ("strictTreebank3".equals(key)) {
        // Composite option covering both strictness flags
        strictFraction = val;
        strictAcronym = val;
      } else if ("strictFraction".equals(key)) {
        strictFraction = val;
      } else if ("strictAcronym".equals(key)) {
        strictAcronym = val;
      } else {
        throw new IllegalArgumentException("PTBLexer: Invalid options key in constructor: " + key);
      }
    }
    if (invertible) {
      // Invertible mode stores before/after text on CoreLabels, so only that factory is usable
      if ( ! (tf instanceof CoreLabelTokenFactory)) {
        throw new IllegalArgumentException("PTBLexer: the invertible option requires a CoreLabelTokenFactory");
      }
      // Dummy initial token, so the first real token has a predecessor to attach "after" text to
      prevWord = (CoreLabel) tf.makeToken("", 0, 0);
      prevWordAfter = new StringBuilder();
    }
    // Select the lexical state matching the per-line setting
    if (tokenizePerLine) {
      yybegin(YyTokenizePerLine);
    } else {
      yybegin(YyNotTokenizePerLine);
    }
  }


  /** Turn on to find out how things were tokenized. */
  private static final boolean DEBUG = false;

  /** A logger for this class */
  private static final Redwood.RedwoodChannels logger = Redwood.channels(PTBLexer.class);

  /** Factory used to build every returned token; set once in the constructor. */
  private LexedTokenFactory<?> tokenFactory;
  /** The most recently produced token (only tracked if it is a CoreLabel). In invertible mode it
   *  starts out as an empty dummy token made in the constructor; otherwise it is null until a
   *  CoreLabel token has been produced. */
  private CoreLabel prevWord;
  /** Text seen since the previous token, which becomes the "before" text of the next token and the
   *  "after" text of the previous one. Only allocated in invertible mode; null otherwise.
   *  NOTE(review): it is presumably appended to by lexer rules outside this section - confirm. */
  private StringBuilder prevWordAfter;
  // NOTE(review): presumably records that an untokenizable character has already been reported,
  // for the FIRST_* settings below; it is not used in the code visible here - confirm in the rules.
  private boolean seenUntokenizableCharacter; // = false;
  /** Choices for the "untokenizable" option (noneDelete, firstDelete, allDelete, noneKeep, firstKeep, allKeep).
   *  NOTE(review): presumably NONE/FIRST/ALL says which occurrences get logged and DELETE/KEEP says
   *  whether the character is dropped or returned as a token - confirm against PTBTokenizer's docs. */
  private enum UntokenizableOptions { NONE_DELETE, FIRST_DELETE, ALL_DELETE, NONE_KEEP, FIRST_KEEP, ALL_KEEP }
  /** Current untokenizable-character policy; the default is FIRST_DELETE. */
  private UntokenizableOptions untokenizable = UntokenizableOptions.FIRST_DELETE;

  /* Tokenizer option flags. The initializers below are the values in effect when no options are given;
   * the constructor then overrides them from the options String.
   * Historically the flags started out with ptb3Escaping behavior, which is now obtained with the option
   * -tokenizerOptions ptb3Escaping.
   * Starting with CoreNLP 4.0, the flags start out as the UD tokenization default (the same values that
   * the "ud=true" option sets).
   * This is like "new LDC treebank" tokenization except that we do not escape parentheses except on
   * s-expression tree input/output.
   */
  // Invertible mode is on by default; the constructor then requires a CoreLabelTokenFactory.
  private boolean invertible = true;
  private boolean tokenizeNLs;
  // Selects the YyTokenizePerLine lexical state (rather than YyNotTokenizePerLine) in the constructor.
  private boolean tokenizePerLine;
  private boolean americanize = false;
  private boolean normalizeSpace = true;
  private boolean normalizeAmpersandEntity = true;
  private boolean normalizeCurrency = false; // only $ and # in Penn Treebank 3 data, but we now allow other currency
  private boolean normalizeFractions = true;
  private boolean normalizeParentheses = false;
  private boolean normalizeOtherBrackets = false;
  private LexerUtils.QuotesEnum quoteStyle = LexerUtils.QuotesEnum.NOT_CP1252;
  private LexerUtils.EllipsesEnum ellipsisStyle = LexerUtils.EllipsesEnum.NOT_CP1252;
  private LexerUtils.DashesEnum dashesStyle = LexerUtils.DashesEnum.NOT_CP1252;
  private boolean escapeForwardSlashAsterisk = false; // this is true in Penn Treebank 3 but we don't do it now
  // strictTreebank3 represents 2 separate modifications:
  //   stricter handling of acronyms
  //   stricter handling of fractions
  // Each can also be set on its own with the strictAcronym / strictFraction options.
  private boolean strictAcronym = false;
  private boolean strictFraction = false;
  private boolean splitAssimilations = true;
  // The two split flags are turned off by ptb3Escaping=true and turned on by ud=true.
  private boolean splitHyphenated = true; // = false; // This is for "new" Penn Treebank tokenization (Ontonotes, etc.)
  private boolean splitForwardSlash = true; // = false; // This is for "new" Penn Treebank tokenization (Ontonotes, etc.)

  /* Bracket characters and forward slash and asterisk:
   *
   * Original Treebank 3 WSJ
   * Uses -LRB- -RRB- as the representation for ( ) and -LCB- -RCB- as the representation for { }.
   * There are no occurrences of [ ], though there is some mention of -LSB- -RSB- in early documents.
   * There are no occurrences of < >.
   * All brackets are tagged -LRB- -RRB-  [This stays constant.]
   * Forward slash and asterisk are escaped by a preceding \ (as \/ and \*)
   *
   * Treebank 3 Brown corpus
   * Has -LRB- -RRB-
   * Has a few instances of unescaped [ ] in compounds (the token "A[fj]").
   * Neither forward slash nor asterisk appears.
   *
   * Ontonotes (r4)
   * Uses -LRB- -RRB- -LCB- -RCB- -LSB- -RSB-.
   * Has a very few uses of < and > in longer tokens, which are not escaped.
   * Slash is not escaped. Asterisk is not escaped.
   *
   * LDC2012T13-eng_web_tbk (Google web treebank)
   * Has -LRB- -RRB-
   * Has { and } used unescaped, treated as brackets.
   * Has < and > used unescaped, sometimes treated as brackets.  Sometimes << and >> are treated as brackets!
   * Has [ and ] used unescaped, treated as brackets.
   * Slash is not escaped. Asterisk is not escaped.
   *
   * Reasonable conclusions for now:
   * - Never escape < >
   * - Still by default escape [ ] { } but it can be turned off.  Use -LSB- -RSB- -LCB- -RCB-.
   * - Move to not escaping slash and asterisk, and delete escaping in PennTreeReader.
   *
   * NOTE(review): the "still by default escape [ ] { }" conclusion looks out of date: the
   * normalizeOtherBrackets field in this class is initialized to false, so escaping of these
   * brackets appears to be off unless requested (e.g., via ptb3Escaping=true). Confirm and reword.
   */

  /** Escaped form of "(" (see the -LRB- discussion above). */
  public static final String openparen = "-LRB-";
  /** Escaped form of ")" (see the -RRB- discussion above). */
  public static final String closeparen = "-RRB-";
  /** Escaped form of "{" (see the -LCB- discussion above). */
  public static final String openbrace = "-LCB-";
  /** Escaped form of "}" (see the -RCB- discussion above). */
  public static final String closebrace = "-RCB-";

  /* Precompiled patterns shared by the helper methods and lexer actions. */

  /* Matches one space, carriage return or newline character.
   * This pattern now also includes newlines, since we sometimes allow them in SGML tokens.... */
  private static final Pattern SINGLE_SPACE_PATTERN = Pattern.compile("[ \r\n]");
  /* ASCII hyphen-minus plus the range U+2010 to U+2012. */
  private static final Pattern HYPHENS = Pattern.compile("[-\u2010-\u2012]");
  private static final Pattern FORWARD_SLASH = Pattern.compile("/");
  /* Union of HYPHENS and FORWARD_SLASH, used when both kinds of splitting are enabled. */
  private static final Pattern HYPHENS_FORWARD_SLASH = Pattern.compile("[-\u2010-\u2012/]");
  /* Like HYPHENS, but the range extends to U+2015 so that the dash characters are covered too. */
  private static final Pattern HYPHENS_DASHES = Pattern.compile("[-\u2010-\u2015]");
  /* One or more digits; used with matches() to test whether a whole word is a number. */
  private static final Pattern NUMBER = Pattern.compile("\\d+");


  /**
   * Usual entry point for splitting up a hyphenated and/or slash-separated token.
   * Depending on the splitHyphenated and splitForwardSlash options, a separator pattern is chosen;
   * if that pattern is found in the token at a position after the first character, everything
   * from that position onwards is pushed back onto the input so that it is lexed again.
   * When neither option is set, or no separator is found past position 0, nothing is pushed back.
   *
   * @param in The text of the token just matched
   */
  private void breakByHyphensSlashes(String in) {
    final Pattern separator;
    if (splitHyphenated && splitForwardSlash) {
      separator = HYPHENS_FORWARD_SLASH;
    } else if (splitHyphenated) {
      separator = HYPHENS;
    } else if (splitForwardSlash) {
      separator = FORWARD_SLASH;
    } else {
      // no splitting requested at all
      return;
    }
    final int splitAt = StringUtils.indexOfRegex(separator, in);
    if (splitAt > 0) {
      // keep in[0, splitAt) as the current token; return the rest to the input buffer
      yypushback(in.length() - splitAt);
    }
  }

  /**
   * Handle hyphenated things with numbers in them.
   * If an apparent negative number is generated from inside a hyphenated word
   * (e.g., for "11-20", we first tokenize "11" and then appear to have generated "-20"),
   * then everything after the leading hyphen/dash is pushed back, so that the hyphen or dash
   * is tokenized separately.
   * <p>
   * Nothing happens unless there is a previous token with non-empty original text and
   * {@code in} starts with exactly one hyphen/dash character followed by something else.
   *
   * @param in The text of the token just matched
   */
  private void handleHyphenatedNumber(String in) {
    if (prevWord == null || in.length() < 2) {
      return;
    }
    // Need a single leading hyphen/dash: the first char matches, the second must not
    if ( ! HYPHENS_DASHES.matcher(in).lookingAt() || HYPHENS_DASHES.matcher(in.substring(1)).lookingAt()) {
      return;
    }
    final String previous = prevWord.originalText().toLowerCase(Locale.ROOT);
    if (previous.isEmpty()) {
      return;
    }
    final int afterHyphen = in.length() - 1;

    if (prevWordAfter != null && prevWordAfter.length() == 0) {
      // We're running under invertible = true and there was no space after previous thing, like for "TRS-80"
      yypushback(afterHyphen);
      return;
    }

    // Otherwise guess from the previous word whether the hyphen joins onto it
    final boolean hyphenJoinsPrevious;
    switch (previous) {
      case "early":
      case "mid":
      case "late":
      case "for":
      case "top":
      case "ak":
      case "b":
      case "c":
      case "dc":
      case "f":
      case "m":
        hyphenJoinsPrevious = true;
        break;
      default:
        // last word is a number as well; cases like scores "2-3":
        // only when there is no spacing info at all do we guess that there was no space
        hyphenJoinsPrevious = prevWordAfter == null && NUMBER.matcher(previous).matches();
        break;
    }
    if (hyphenJoinsPrevious) {
      yypushback(afterHyphen);
    }
  }

  /**
   * Remove soft hyphen characters and thousands separator characters from numbers.
   * The soft hyphen (U+00AD) is regarded as inserted only for line-breaking; U+066C, U+2009 and
   * U+202F are thousands separator characters that it seems safe to remove.
   * If none of these characters occurs, the argument itself is returned unchanged. If at least
   * one was removed, the result is additionally trimmed of leading and trailing whitespace.
   *
   * @param in The number token text
   * @return The text with the above characters removed
   */
  private static String removeFromNumber(String in) {
    if ("-".equals(in)) {
      // Shortcut for if we split on hyphens
      return in;
    }

    final int n = in.length();
    StringBuilder kept = null;  // allocated lazily, on the first removed character
    for (int pos = 0; pos < n; pos++) {
      final char c = in.charAt(pos);
      switch (c) {
        case '\u00AD':
        case '\u066C':
        case '\u2009':
        case '\u202F':
          if (kept == null) {
            // first removal: copy over everything seen so far
            kept = new StringBuilder(n);
            kept.append(in, 0, pos);
          }
          break;
        default:
          if (kept != null) {
            kept.append(c);
          }
          break;
      }
    }
    return (kept == null) ? in : kept.toString().trim();
  }


  /*
   * This class has now been extended to cover the main Windows CP1252 characters,
   * at either their correct Unicode codepoints, or in their invalid
   * positions as 8 bit chars inside the iso-8859 control region.
   *
   * ellipsis   85      0133    2026    8230   COMPLICATED!! Also a newline character for IBM 390; we let ellipsis win
   * dagger     86    2020
   * double dagger 87 2021
   * single quote curly starting        91      0145    2018    8216
   * single quote curly ending  92      0146    2019    8217
   * double quote curly starting        93      0147    201C    8220
   * double quote curly ending  94      0148    201D    8221
   * bullet     95
   * en dash    96      0150    2013    8211
   * em dash    97      0151    2014    8212
   */


  /**
   * Finds the first space or non-breaking space (U+00A0) in a String.
   *
   * @param txt The text to search
   * @return The index of the first such character, or -1 if there is none
   */
  private int indexOfSpace(String txt) {
    final int n = txt.length();
    int pos = 0;
    while (pos < n) {
      final char c = txt.charAt(pos);
      if (c == ' ' || c == '\u00A0') {
        return pos;
      }
      pos++;
    }
    return -1;
  }

  /** Make the next token from the currently matched text, which serves as both the token text and the original text. */
  private Object getNext() {
    final String matched = yytext();
    return getNext(matched, matched);
  }

  /** Make the next token.
   *  The token text is first put into Unicode NFC normal form.
   *  If the begin character offset exceeds what can be stored in 32 bits, it is
   *  entered as Integer.MAX_VALUE and an error is logged.
   *  <p>
   *  In invertible mode, the token is a CoreLabel that also records the original text and
   *  the text accumulated before it; that same accumulated text is recorded as the "after"
   *  text of the previous token, and the accumulator is then cleared.
   *
   *  @param txt What the token should be
   *  @param originalText The original String that got transformed into txt
   *  @return The token built by the token factory
   */
  private Object getNext(String txt, String originalText) {
    txt = Normalizer.normalize(txt, Normalizer.Form.NFC);
    // Clamp rather than throw (as Math.toIntExact would), so that very long inputs still tokenize
    final long offset = yychar;
    final int begin;
    if (offset > Integer.MAX_VALUE) {
      logger.err("PTBLexer: token begin offset " + offset + " does not fit in 32 bits; using Integer.MAX_VALUE");
      begin = Integer.MAX_VALUE;
    } else {
      begin = (int) offset;
    }
    if (invertible) {
      String str = prevWordAfter.toString();
      prevWordAfter.setLength(0);
      CoreLabel word = (CoreLabel) tokenFactory.makeToken(txt, begin, yylength());
      word.set(CoreAnnotations.OriginalTextAnnotation.class, originalText);
      word.set(CoreAnnotations.BeforeAnnotation.class, str);
      // The text between two tokens is both "after" the previous one and "before" this one
      prevWord.set(CoreAnnotations.AfterAnnotation.class, str);
      prevWord = word;
      return word;
    } else {
      Object word = tokenFactory.makeToken(txt, begin, yylength());
      // Remember the previous token only when its type allows later inspection of it
      if (word instanceof CoreLabel) {
        prevWord = (CoreLabel) word;
      }
      return word;
    }
  }

  /**
   * Try to work around an apparent jflex bug where it gets a space at the token end
   * by getting wrong the length of the trailing context.
   * Pushes back trailing characters one at a time for as long as the last matched character is
   * a space, a tab, a character in the range from newline to carriage return, or U+0085.
   */
  private void fixJFlex4SpaceAfterTokenBug() {
    for (int len = yylength(); len > 0; len = yylength()) {
      final char tail = yycharat(len - 1);
      final boolean newlineToCarriageReturn = tail >= '\n' && tail <= '\r';
      final boolean isTrailingSpace =
          tail == ' ' || tail == '\t' || newlineToCarriageReturn || tail == '\u0085';
      if ( ! isTrailingSpace) {
        return;
      }
      if (DEBUG) { logger.info("fixJFlex4SpaceAfterTokenBug still needed for " + yytext() + "!"); }
      yypushback(1);
    }
  }

  /**
   * Make a token for a matched acronym that ends in a period.
   * The final period is always pushed back, so that it is returned as its own token next time.
   * The token text loses that period for two-character matches ("I.", etc. are treated as
   * "I" + ".") and, under strictAcronym, for everything except "U.S."; otherwise the token
   * text keeps the period as well (reduplication). The original text never includes the period.
   */
  private Object processAcronym() {
    fixJFlex4SpaceAfterTokenBug();
    final String withPeriod = yytext();
    final boolean dropPeriod =
        yylength() == 2 || (strictAcronym && ! "U.S.".equals(withPeriod));
    yypushback(1); // return a period next time
    final String txt = yytext(); // the match without the final period
    final String s = dropPeriod ? txt : withPeriod;
    if (DEBUG) { logger.info("Used {ABBREV2} to recognize " + txt + " as " + s); }
    return getNext(s, txt);
  }

  /** Make a token from the matched abbreviation as is, after first giving back any trailing whitespace. */
  private Object processAbbrev3() {
    fixJFlex4SpaceAfterTokenBug();
    final String matched = yytext();
    if (DEBUG) { logger.info("Used {ABBREV3} to recognize " + matched); }
    return getNext(matched, matched);
  }

  /**
   * Assuming we're at an end of sentence (uppercase following), we usually put back a period to
   * become end-of-sentence. The final period is always pushed back; the token text keeps a copy
   * of it unless strictAcronym is set and the match is not "U.S.". The original text never
   * includes the period.
   */
  private Object processAbbrev1() {
    final String withPeriod = yytext();
    final boolean dropPeriod = strictAcronym && ! "U.S.".equals(withPeriod);
    yypushback(1); // return a period for next time
    final String txt = yytext(); // the match without the final period
    final String s = dropPeriod ? txt : withPeriod;
    if (DEBUG) { logger.info("Used {ABBREV1} to recognize " + txt + " as " + s); }
    return getNext(s, txt);
  }

%}


/* Todo: Really SGML shouldn't be here at all, it's kind of legacy. But we continue to tokenize
   some simple standard forms of concrete SGML syntax, since it tends to give robustness.          */
/* ---
( +([A-Za-z][A-Za-z0-9:.-]*( *= *['\"][^\r\n'\"]*['\"])?|['\"][^\r\n'\"]*['\"]| *\/))*
SGML = <([!?][A-Za-z-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:.-]*([ ]+([A-Za-z][A-Za-z0-9:.-]*([ ]*=[ ]*['\"][^\r\n'\"]*['\"])?|['\"][^\r\n'\"]*['\"]|[ ]*\/))*[ ]*)>
( +[A-Za-z][A-Za-z0-9:.-]*)*
FOO = ([ ]+[A-Za-z][A-Za-z0-9:.-]*)*
SGML = <([!?][A-Za-z-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:.-]* *)>
SGML = \<([!\?][A-Za-z\-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:\.\-]*([ ]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ ]*=[ ]*['\"][^\r\n'\"]*['\"]|['\"][^\r\n'\"]*['\"]|[ ]*\/))*[ ]*)\>
   --- */

// <STORYID cat=w pri=u>
// SGML1 allows attribute value match over newline; SGML2 does not.
SGML1 = \<([!\?][A-Za-z\-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:\.\-]*([ \r\n]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ \r\n]*=[ \r\n]*('[^']*'|\"[^\"]*\"|[A-Za-z_][A-Za-z0-9_:\.\-]*)))*[ \r\n]*\/?|\/[A-Za-z][A-Za-z0-9_:\.\-]*)[ \r\n]*\>
SGML2 = \<([!\?][A-Za-z\-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:\.\-]*([ ]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ ]*=[ ]*('[^'\r\n]*'|\"[^\"\r\n]*\"|[A-Za-z_][A-Za-z0-9_:\.\-]*)))*[ ]*\/?|\/[A-Za-z][A-Za-z0-9_:\.\-]*)[ ]*\>
/* Dash written as an SGML/HTML entity (&MD; &mdash; &ndash;) or as a literal character:
   U+0096/U+0097 (CP1252 en/em dash code points), U+2013 en dash, U+2014 em dash, U+2015 horizontal bar. */
SPMDASH = &(MD|mdash|ndash);|[\u0096\u0097\u2013\u2014\u2015]
/* The escaped ampersand entity. */
SPAMP = &amp;
/* A fixed list of two- and three-letter named entities, plus decimal numeric character references like &#123; */
SPPUNC = &(HT|TL|UR|LR|QC|QL|QR|odq|cdq|#[0-9]+);
/* Entity for an accented vowel, e.g. &eacute; &Ograve; &uuml; -- used as part of LETTER. */
SPLET = &[aeiouAEIOU](acute|grave|uml);

%include LexCommon.tokens

/* SPACE, SPACENL, etc are in LexCommon.tokens */
SPACENLS = {SPACENL}+
/* These next ones are useful to get a fixed length trailing context. */
SPACENL_ONE_CHAR = [ \t\u00A0\u2000-\u200A\u202F\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
NOT_SPACENL_ONE_CHAR = [^ \t\u00A0\u2000-\u200A\u202F\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
SENTEND1 = {SPACENL}({SPACENL}|[:uppercase:]|{SGML1})
SENTEND2 = {SPACE}({SPACE}|[:uppercase:]|{SGML2})
DIGIT = [:digit:]|[\u07C0-\u07C9]
DATE = {DIGIT}{1,2}[\-\u2012\/]{DIGIT}{1,2}[\-\u2012\/]{DIGIT}{2,4}|{DIGIT}{4}[\-\u2012\/]{DIGIT}{1,2}[\-\u2012\/]{DIGIT}{1,2}
/* Note that NUM also includes times like 12:55. One can start with a . or , but not a : */
NUM = {DIGIT}*([.,\u066B\u066C]{DIGIT}+)+|{DIGIT}+([.:,\u00AD\u066B\u066C\u2009\u202F]{DIGIT}+)*
/* Now don't allow bracketed negative numbers!  They have too many uses (e.g.,
   years or times in parentheses), and having them in tokens messes up
   treebank parsing.
   NUMBER = [\-+]?{NUM}|\({NUM}\) */
NUMBER = [\-\u2212+]?{NUM}
SUBSUPNUM = [\u207A\u207B\u208A\u208B]?([\u2070\u00B9\u00B2\u00B3\u2074-\u2079]+|[\u2080-\u2089]+)
/* Constrain fraction to only match likely fractions. Full one allows hyphen, space, or non-breaking space between integer and fraction part, but strictFraction allows only hyphen. */
FRAC = ({DIGIT}{1,4}[- \u00A0])?{DIGIT}{1,4}(\\?\/|\u2044){DIGIT}{1,4}
FRAC2 = [\u00BC\u00BD\u00BE\u2153-\u215E]
/* # is here for historical reasons -- old UK ASCII-equivalent used # for pound mark. Bit ugly now. */
DOLSIGN = ([A-Z]*\$|#)
/* Currency: These are cent, pound, currency, yen; CP1252 euro; ECU and many other currency symbols including Euro;
   Armenian dram, afghani, Bengali rupee, Thai baht; full-width dollar, cent, pound, yen, won */
DOLSIGN2 = [\u00A2-\u00A5\u0080\u20A0-\u20BF\u058F\u060B\u09F2\u09F3\u0AF1\u0BF9\u0E3F\u17DB\uFF04\uFFE0\uFFE1\uFFE5\uFFE6]
/* not used DOLLAR      {DOLSIGN}[ \t]*{NUMBER}  */
/* |\( ?{NUMBER} ?\))    # is for pound signs */
/* Curse of intelligent tokenization, here we come. To model what LDC does, we separate out some \p{Digit}+\p{Alpha}+ tokens as 2 words */
/* Go with just the top 20 currencies. */
SEP_CURRENCY = (USD|EUR|JPY|GBP|AUD|CAD|CHF|CNY|SEK|NZD|MXN|SGD|HKD|NOK|KRW|TRY|RUB|INR|BRL|ZAR)
/* Can't include s for seconds as too many iPhone 6s, 1990s, etc. */
SEP_UNITS = (lbs?|ltr|mins?|[kcm][gml]|[MGTP]([B]|[H][z])|fps|bpm|[MG][b][p][s])
SEP_OTHER = ([ap]m|hrs?|words?|m(on)?ths?|y(ea)?rs?|pts?)
/* If there is a longer alphabetic match, another longer pattern will match so don't need to filter that. */
SEP_SUFFIX = ({SEP_CURRENCY}|{SEP_UNITS}|{SEP_OTHER})
/* For some reason U+0237-U+024F (dotless j) isn't in [:letter:]. Recent additions? */
LETTER = ([:letter:]|{SPLET}|[\u00AD\u200C\u200D\u2060\u0237-\u024F\u02C2-\u02C5\u02D2-\u02DF\u02E5-\u02FF\u0300-\u036F\u0370-\u037D\u0384\u0385\u03CF\u03F6\u03FC-\u03FF\u0483-\u0487\u04CF\u04F6-\u04FF\u0510-\u0525\u055A-\u055F\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u0615-\u061A\u063B-\u063F\u064B-\u065E\u0670\u06D6-\u06EF\u06FA-\u06FF\u070F\u0711\u0730-\u074F\u0750-\u077F\u07A6-\u07B1\u07CA-\u07F5\u07FA\u0900-\u0903\u093C\u093E-\u094E\u0951-\u0955\u0962-\u0963\u0981-\u0983\u09BC-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3\u0A01-\u0A03\u0A3C\u0A3E-\u0A4F\u0A81-\u0A83\u0ABC-\u0ACF\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0C01-\u0C03\u0C3E-\u0C56\u0D3E-\u0D44\u0D46-\u0D48\u0E30-\u0E3A\u0E47-\u0E4E\u0EB1-\u0EBC\u0EC8-\u0ECD])
/* Allow in the zero-width (non-)joiner characters. Allow in Modifier non-spacing (= separated accent chars) */
WORD = {LETTER}({LETTER}|{DIGIT}|[\p{Mn}\p{Mc}])*([.!?]{LETTER}({LETTER}|{DIGIT}|[\p{Mn}\p{Mc}])*)*
/* THING: The $ was for things like New$;
   WAS: only keep hyphens with short one side like co-ed. But (old) treebank just allows hyphenated things as words!
   THING allows d'Avignon or NUMBER before HYPHEN and the same things after it. Only first number can be negative. */
THING = ([dDoOlL]{APOSETCETERA}[\p{Alpha}\p{Digit}])?([\p{Alpha}\p{Digit}]+|{NUMBER})({HYPHEN}([dDoOlL]{APOSETCETERA}[\p{Alpha}\p{Digit}])?([\p{Alpha}\p{Digit}]+|{NUM}))*
THINGA = [A-Z]+(([+&]|{SPAMP})[A-Z]+)+
THING3 = [\p{Alpha}\p{Digit}]+(-[\p{Alpha}]+){0,2}(\\?\/[\p{Alpha}\p{Digit}]+(-[\p{Alpha}]+){0,2}){1,2}
APOS = ['\u0092\u2019´]|&apos;  /* ASCII straight quote, single right curly quote in CP1252 (wrong) or Unicode or reversed quote or HTML SGML escape */
/* Includes extra ones that may appear inside a word, rightly or wrongly */
APOSETCETERA = {APOS}|[`\u0091\u2018\u201B]
/* HTHING recognizes hyphenated words, including ones with various kinds of numbers in them. And with underscores. */
HTHING = [\p{Alpha}\p{Digit}][\p{Alpha}\p{Digit}.,\u00AD\u200C\u200D\u2060]*([-_]([\p{Alpha}\p{Digit}\u00AD\u200C\u200D\u2060]+(\.[:digit:]+)?|{ACRO2}\.))+
/* from the CLEAR (biomedical?) treebank documentation */
/* we're going to split on most hyphens except a few */
/* From Supplementary Guidelines for ETTB 2.0 (Justin Mott, Colin Warner, Ann Bies; Ann Taylor) */
/*
Hyphenated words that are allowed to be kept together match these patterns.
Note that this list is case-insensitive and non-exhaustive.
a- adeno- agro- ante- anti- aorto- arch- ambi- -able -ahol -aholic -ation axio- be- bi- bio- broncho-
co- counter- cross- centi- -centric circum- cis- colo- contra- cortico- cran- crypto- -cracy -crat cyber-
de- deca- demi- dis- -dom e- eco- electro- ennea- -esque -ette ex- extra- -er -ery ferro- -ful -fest -fold
gastro- -gate -gon giga- hepta- hemi- hypo- hexa- -hood
in- inter- intra- -ian -ible -ing -isation -ise -ising -ism -ist -itis -ization -ize -izing ideo- idio- infra- iso-
-less -logist -logy -ly judeo- macro- mega- micro- mid- mini- mono- musculo- mm-hm mm-mm -most multi- medi- milli-
neo- neuro- nitro- non- novem- octa- octo- o-kay -o-torium ortho- over-
paleo- pan- para- pelvi- penta- peri- pheno- phospho- pica- pneumo- poly- post- pre- preter- pro- pseudo-
quasi- quadri- quinque- -rama re- recto- salpingo- sero- semi- sept- soci- sub- super- supra- sur-
tele- tera- tetra- tri- u- uber- uh-huh uh-oh ultra- un- uni- vice- veno- ventriculo- -wise x-
*/
HTHINGEXCEPTIONPREFIXED = (e|a|u|x|agro|ante|anti|arch|be|bi|bio|co|counter|cross|cyber|de|eco|ex|extra|inter|intra|macro|mega|micro|mid|mini|multi|neo|non|over|pan|para|peri|post|pre|pro|pseudo|quasi|re|semi|sub|super|tri|ultra|un|uni|vice)(-([\p{Alpha}\p{Digit}\u00AD]+|{ACRO2}\.))+
HTHINGEXCEPTIONSUFFIXED = ([\p{Alpha}\p{Digit}][\p{Alpha}\p{Digit}.,\u00AD]*)(-)(esque|ette|fest|fold|gate|itis|less|most|o-torium|rama|wise)(s|es|d|ed)?
HTHINGEXCEPTIONWHOLE = (mm-hm|mm-mm|o-kay|uh-huh|uh-oh)(s|es|d|ed)?

/* things like 'll and 'm */
REDAUX = {APOSETCETERA}([msdMSD]|re|ve|ll)
/* For things that will have n't on the end. They can't end in 'n' */
/* \u00AD is soft hyphen. \u2060 is word joiner */
SWORD = [\p{Alpha}\u00AD\u200C\u200D\u2060]*[A-MO-Za-mo-z][\u00AD\u200C\u200D\u2060]*
SREDAUX = n{APOSETCETERA}t
/* Tokens you want but already okay: C'mon 'n' '[2-9]0s '[eE]m 'till?
   [Yy]'all 'Cause Shi'ite B'Gosh o'clock.  Here now only need apostrophe
   final words. */
/* Note that Jflex doesn't support {2,} form.  Only {2,k}. */
/* [yY]' is for Y'know, y'all and I for I.  So exclude from one letter first */
/* Rest are for French borrowings.  n allows n'ts in "don'ts" */
/* Arguably, c'mon should be split to "c'm" + "on", but not yet. 'Twixt for betwixt */
APOWORD = {APOS}n{APOS}?|[lLdDjJ]{APOS}|Dunkin{APOS}|somethin{APOS}|ol{APOS}|{APOS}em|diff{APOSETCETERA}rent|[A-HJ-XZn]{APOSETCETERA}[:letter:]{2}[:letter:]*|{APOS}[1-9]0s|[1-9]0{APOS}s|{APOS}till?|[:letter:][:letter:]*[aeiouyAEIOUY]{APOSETCETERA}[aeioulA-Z][:letter:]*|{APOS}cause|cont'd\.?|nor'easter|c'mon|e'er|s'mores|ev'ry|li'l|nat'l|ass't|'twixt|O{APOSETCETERA}o
APOWORD2 = y{APOS}
/* Some Wired URLs end in + or = so omit that too. Some quoting with '[' and ']' so disallow. */
FULLURL = (ftp|svn|svn\+ssh|http|https|mailto):\/\/[^ \t\n\f\r<>|`\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}]+[^ \t\n\f\r<>|.!?¡¿,·;:&`\"\'\*\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-]
LIKELYURL = ((www\.([^ \t\n\f\r`<>|.!?,\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}]+\.)+[a-zA-Z]{2,4})|(([^ \t\n\f\r`<>|.!?,:\/$\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}]+\.)+(com|net|org|edu)))(\/[^ \t\n\f\r`<>|]+[^ \t\n\f\r`<>|.!?,;:&\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-])?
/* &lt;,< should match &gt;,>, but that's too complicated */
/* EMAIL = (&lt;|<)?[a-zA-Z0-9][^ \t\n\f\r\"<>|()\u00A0{}]*@([^ \t\n\f\r\"<>|(){}.\u00A0]+\.)*([^ \t\n\f\r\"<>|(){}\[\].,;:\u00A0]+)(&gt;|>)? */
EMAIL = (&lt;|<)?(mailto:)?[a-zA-Z0-9._%+-]+@[A-Za-z0-9][A-Za-z0-9.-]*[A-Za-z0-9](&gt;|>)?

/* Technically, names should be capped at 15 characters and can be any non-zero string of ASCII letters, numbers
    and underscores. However, if you length limit then you get into weirdness with what happens to the rest of the
    characters, and allowing ones starting with numbers disables using @ for "at" before numeric quantities, so we
    just special case in a couple of people like that. */
TWITTER_NAME = [@\uFF20]([A-Za-z_][a-zA-Z_0-9]*|50cent)
TWITTER_HASHTAG = [#\uFF03]{LETTER}({LETTER}|{DIGIT}|_)*({LETTER}|{DIGIT})
TWITTER = {TWITTER_NAME}|{TWITTER_HASHTAG}

ISO8601DATETIME = [0-9]{4}-[0-9]{2}-[0-9]{2}(T[0-9]{2}:[x0-9]{2}:[0-9]{2}Z?)?
DEGREES = °[CF]

/* --- This block becomes ABBREV1 and is usually followed by lower case words. --- */
/* Abbreviations - originally induced from 1987 WSJ by hand; since variously expanded */
ABMONTH = Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec
/* "May." isn't an abbreviation. "Jun." and "Jul." barely occur, but don't seem dangerous */
ABDAYS = Mon|Tue|Tues|Wed|Thu|Thurs|Fri
/* Sat. and Sun. barely occur and can easily lead to errors, so we omit them */
/* In caseless, |a\.m|p\.m handled as ACRO, and this is better as can often
   be followed by capitalized. */
/* Ma. or Me. isn't included as too many errors, and most sources use Mass. etc. */
/* Fed. is tricky.  Usually sentence end, but not before "Governor" or "Natl. Mtg. Assn." */
/* Make some states case sensitive, since they're also reasonably common words */
/* Only allow La since you also get LA for Los Angeles. */
ABSTATE = Ala|Ariz|[A]z|[A]rk|Calif|Colo|Conn|Ct|Dak|[D]el|Fla|Ga|[I]ll|Ind|Kans?|Ky|[L][a]|[M]ass|Md|Mich|Minn|[M]iss|Mo|Mont|Neb|Nev|Okla|[O]re|[P]a|Penn|Tenn|[T]ex|Va|Vt|[W]ash|Wisc?|Wyo
/* Bhd is Malaysian companies! Rt. is Hungarian? */
/* Special case: Change the class of Pty when followed by Ltd to not sentence break (in main code below)... */
ABCOMP = Inc|Cos?|Corp|Pp?t[ye]s?|Ltd|Plc|Rt|Bancorp|Bhd|Assn|Univ|Intl|Sys
/* Don't include fl. oz. since Oz turns up too much in caseless tokenizer. ft now allows upper after it for "Fort" use. */
ABNUM = tel|est|ext|sq
/* p used to be in ABNUM list, but it can't be any more, since the lexer
   is now caseless.  We don't want to have it recognized for P.  Both
   p. and P. are now under ABBREV2. ABLIST also went away as no-op [a-e].
   Dr. Sci. is a degree some places. */
ABPTIT = Jr|Sr|Bros|(Ed|Ph)\.D|[BDM]\.Sc|LL\.[BDM]|Esq|Sci
/* ss?p and aff are for bio taxonomy; also gen and cf but appear elsewhere as ABBREV2 already; fl for flourished. var for variety */
ABTAXONOMY = (s(ub)?)?spp?|aff|[f][l]|var
/* Notes: many misspell etc. ect.; kr. is some other currency. eg. for e.g. */
/*  Tech would be useful for Indian B. Tech. degrees, but "tech" is used too much as a word. Avg = average; pl. for plural */
/* Cir. for circuit court; lb for pounds. I'm adding "min." and "max." as only lower case (because "Max" is name, product modifier). cit. for op. cit. */
ABVARIA = etc|ect|al|seq|Bldg|Pls|wrt|orig|incl|t[b]?[s][p]|kr|eg|Avg|pl|Cir|lb|[m][i][n]|[m][a][x]|cit

/* ABBREV1 abbreviations are normally followed by lower case words.
 * If they're followed by an uppercase one, we assume there is also a sentence boundary.
 */
ABBREV1 = ({ABMONTH}|{ABDAYS}|{ABSTATE}|{ABCOMP}|{ABNUM}|{ABPTIT}|{ABTAXONOMY}|{ABVARIA})\.

/* --- This block becomes ABBREV2 and is usually followed by upper case words. --- */
/* In the caseless world S.p.A. "Società Per Azioni (Italian: shared company)" is got as a regular acronym */
/* ACRO Is a bad case -- can go either way! */
ACRO = [A-Za-z](\.[A-Za-z])*|(Canada|Sino|Korean|EU|Japan|non)-U\.S|U\.S\.-(U\.K|U\.S\.S\.R)
ACRO2 = [A-Za-z](\.[A-Za-z])+|(Canada|Sino|Korean|EU|Japan|non)-U\.S|U\.S\.-(U\.K|U\.S\.S\.R)
/* ABTITLE is mainly person titles, but also Mt for mountains and Ft for Fort. St[ae] does Saint, Santa, suite, etc. */
/* "Rt." occurs both in "Rt. Rev." (capitalized following) and in abbreviation at end of Hungarian company (lower follows). */
/* Added "Amb" for Ambassador. Don't have "Ambs" as occurs as family name. Fr. for Friar */
/* Smt. and Ven. before Indian names; Br for brother; Eng. for engineer (but is occasional Chinese name) */
ABTITLE = Mr|Mrs|Ms|Mx|[M]iss|Drs?|Profs?|Sens?|Reps?|Attys?|Lt|Col|Gen|Messrs|Govs?|Adm|Rev|Fr|Rt|Maj|Sgt|Cpl|Pvt|Capt|St[ae]?|Ave|Pres|Lieut|Rt|Hon|Brig|Co?mdr|Pfc|Spc|Supts?|Det|Mt|Ft|Adj|Adv|Asst|Assoc|Ens|Insp|Mlle|Mme|Msgr|Sfc|Amb|S[m][t]|Ven|Br|Eng
/* Exhs?. is used for law case exhibits. ass't = assistant, Govt = Government.
   Ph is in there for Ph. D  Sc for B.Sc. syn. for biology synonym; def. for defeated; Mk for Mark (like tank); Soc. for society */
/* Jos. is kind of dubious as also a name, place and family name. Maybe should delete but also common as abbreviated given name. */
ABCOMP2 = Invt|Elec|Natl|M[ft]g|Dept|Blvd|Rd|Ave|[P][l]|viz|Exhs?|ass't|Govt|[v]|Wm|Jos|Cie|cf|TREAS|P[h]|[S][c]|syn|def|Mk|Soc

/* ABBREV2 abbreviations are normally followed by an upper case word. We mainly hope they aren't used sentence finally.
 * But we still do recognize them as sentence final when after the period a variety of common function words occur.
 */
ABBREV2PRE = {ABTITLE}|{ACRO}|{ABCOMP2}
ABBREV2 = ({ABBREV2PRE})\.
/* ACRONYM = ({ACRO})\. */
/* Cie. is used by French companies sometimes before and sometimes at end as in English Co.  But we treat as allowed to have Capital following without being sentence end.  Cia. is used in Spanish/South American company abbreviations, which come before the company name, but we exclude that and lose, because in a caseless segmenter, it's too confusable with CIA. */
/* Added Wm. for William and Jos. for Joseph */
/* In tables: Mkt. for market Div. for division of company, Chg., Yr.: year */

/* ABBREV4 abbreviations are always treated as sentence-internal, no matter what follows them. */
ABBREV4 = vs\.|a\.k\.a\.

/* --- ABBREV3 abbreviations are allowed only before numbers. ---
 * Otherwise, they aren't recognized as abbreviations (unless they also appear in ABBREV1 or ABBREV2).
 * est. is "estimated" -- common in some financial contexts. ext. is extension, ca. is circa.
 * "Art(s)." is for "article(s)" -- common in legal context, Sec(t). for section(s). ch for chapters.
 * res for resolution (of Congress etc.)
 */
/* Maybe also "op." for "op. cit." but also get a photo op. Rs. for Rupees */
/* Pt for part needs to be case sensitive (vs. country code for Portugal). */
ABBREV3 = (ca|chs?|figs?|prop|nos?|nrs?|vols?|sect?s?|arts?|paras?|bldg|prop|pp|op|approx|p[t]|rs|Apt|Rt|Res)\.
/* Case for south/north before a few places. */
ABBREVSN = So\.|No\.

/* See also a couple of special cases for pty. and op./loc in the code below. */

HYPHEN = [-\u058A\u2010\u2011\u2012]
HYPHENS = {HYPHEN}+
SSN = [0-9]{3}{HYPHEN}[0-9]{2}{HYPHEN}[0-9]{4}
/* phone numbers. keep multi dots pattern separate, so not confused with decimal numbers. And for new treebank tokenization 346-8792. 1st digit can't be 0 or 1 in NANP. */
PHONE = (\([0-9]{2,3}\)[ \u00A0\u2007]?|(\+\+?)?([0-9]{1,4}[\- \u00A0\u2007\u2012])?[0-9]{2,4}[\- \u00A0\u2007\u2012/])[0-9]{3,4}[\- \u00A0\u2007\u2012]?[0-9]{3,5}|((\+\+?)?[0-9]{1,4}\.)?[0-9]{2,4}\.[0-9]{3,4}\.[0-9]{3,5}|[2-9][0-9]{2}[-\u2012][0-9]{4}
/* Fake duck feet appear sometimes in WSJ, and aren't likely to be SGML, less than, etc., so group. */
FAKEDUCKFEET = <<|>>
LESSTHAN = <|&lt;
GREATERTHAN = >|&gt;
LDOTS = \.\.\.+|[\u0085\u2026]
SPACEDLDOTS = \.[ \u00A0\u202F](\.[ \u00A0\u202F])+\.
ATS = @+
UNDS = _+
ASTS = \*+|(\\\*){1,3}
HASHES = #+
FNMARKS = {ATS}|{HASHES}|{UNDS}
/* U+3001 is Chinese dunhao comma; U+0F0D is Tibetan shad */
INSENTP = [,;:\u3001\u0F0D]
QUOTES = {APOS}|[`\u2018-\u201F\u0082\u0084\u0091-\u0094\u2039\u203A\u00AB\u00BB]{1,2}
DBLQUOT = \"|&quot;|[`'\u0091\u0092\u2018\u2019]'
/* Cap'n for captain, c'est for french */
TBSPEC = -(RRB|LRB|RCB|LCB|RSB|LSB)-|C\.D\.s|pro-|anti-|S(&|&amp;)P-500|S(&|&amp;)Ls|Cap{APOS}n|c{APOS}est
SWEARING = f[-*][-c*]k(in[g']?|e[dr])?|f[-*](in[g']?|e[dr])|(bull|dip)?s[h@][-\*#]t(ty|e|box|s)?|c[-*]nts?|p[-*]ss(e[sd]|ing)?|c[-*]ck|b[-*]tch|t[-*]ts|tw[-*]ts?|cr[-*]p|d[-*]cks?|b[-*][-*s]t[-*]rds?|pr[-*]ck|d[-*]mn|bl[-*]{2,2}dy
TBSPEC2 = {APOS}[0-9][0-9]
BANGWORDS = (E|Yahoo|Jeopardy)\!
BANGMAGAZINES = OK\!

/* Smileys (based on Chris Potts' sentiment tutorial, but much more restricted set - e.g., no "8)", "do:" or "):", too ambiguous) and simple Asian smileys */
SMILEY = [<>]?[:;=][\-o\*']?[\(\)DPdpO\\{@\|\[\]]
/* First disjunct is a face with a dot mouth such as ^.^ or >.< -- both eyes use the same character class.
   (It previously had an escaped "\[" before the second eye, so that class was read as literal text and the disjunct never matched such faces.) */
ASIANSMILEY = [\^x=~<>]\.[\^x=~<>]|[\-\^x=~<>']_[\-\^x=~<>']|\([\-\^x=~<>'][_.]?[\-\^x=~<>']\)|\([\^x=~<>']-[\^x=~<>'`]\)|¯\\_\(ツ\)_\/¯

/* Slightly generous but generally reasonably good emoji parsing */
/* These are emoji that can be followed by a zwj (U+200D) and then gender or similar things (as well as skin color). Mainly humans but certain others like bears, hearts */
EMOJI_GENDERED = [\u26F9\u2764\u{01F3C3}-\u{01F3C4}\u{01F3CA}-\u{01F3CC}\u{01F408}\u{01F415}\u{01F43B}\u{01F466}-\u{01F469}\u{01F46E}-\u{01F477}\u{01F481}-\u{01F482}\u{01F486}-\u{01F487}\u{01F575}\u{01F62E}\u{1F635}\u{01F636}\u{01F645}-\u{01F647}\u{01F64B}\u{01F64D}-\u{01F64E}\u{01F6A3}\u{01F6B4}-\u{01F6B6}\u{01F926}\u{01F934}-\u{01F93E}\u{01F9B8}-\u{01F9B9}\u{01F9CD}-\u{01F9DF}\u{01FAF1}-\u{01FAF2}]
/* Emoji follow is variation selector (emoji/non-emoji rendering) or Fitzpatrick skin tone */
EMOJI_FOLLOW = [\uFE0E\uFE0F\u{01F3FB}-\u{01F3FF}]
/* Just things followed by the keycap surrounding char - note that if not separated by space beforehand, may be mistokenized */
EMOJI_KEYCAPS = [\u0023\u002A\u0030-\u0039]\uFE0F?\u20E3
/* Flags (changed to use \U to avoid bug in IntelliJ JFlex plugin).
 * 1st disjunct: Two geographic characters as a flag
 * 2nd disjunct: Tag digits and small letters, currently used only for GB regions flags (Scotland, Wales, England)
 * 3rd disjunct: emoji tag sequence (ETS) support for certain additional flags: gay, transgender, pirate
 */
EMOJI_FLAG = [\U01F1E6-\U01F1FF]{2,2}|\U01F3F4[\u{E0030}-\u{E0039}\u{E0061}-\u{E007A}]+\U0E007F
/* Rainbow flag, transgender flag, etc. */
EMOJI_MISC = [\u{01F3F3}\u{01F3F4}\u{01F441}][\uFE0E\uFE0F]?\u200D[\u2620\u26A7\u{01F308}\u{01F5E8}][\uFE0E\uFE0F]?|{EMOJI_KEYCAPS}
/* Things that have an emoji presentation form. This is where the general single character emoji appear */
EMOJI_PRESENTATION = [\u00A9\u00AE\u203C\u2049\u2122\u2139\u2194-\u2199\u21A9-\u21AA\u231A-\u231B\u2328\u23CF\u23E9-\u23F3\u23F8-\u23FA\u24C2\u25AA-\u25AB\u25B6\u25C0\u25FB-\u27BF\u2934-\u2935\u2B05-\u2B07\u2B1B-\u2B1C\u2B50\u2B55\u3030\u303D\u3297\u3299\u{01F000}-\u{01FAFF}]
/* Emoji modifier is something that appears after a zero-width joiner (zwj) U+200D */
EMOJI_MODIFIER = [\u2640\u2642\u2695-\u2696\u2708\u2744\u2764\u2B1B\u{01F32B}\u{01F33E}\u{01F373}\u{01F37C}\u{01F384}\u{01F393}\u{01F3A4}\u{01F3A8}\u{01F3EB}\u{01F3ED}\u{01F466}-\u{01F469}\u{01F468}-\u{01F469}\u{01F48B}\u{01F4A8}\u{01F4AB}\u{01F4BB}-\u{01F4BC}\u{01F525}\u{01F527}\u{01F52C}\u{01F5E8}\u{01F680}\u{01F692}\u{01F91D}\u{01F9AF}\u{01F9B0}-\u{01F9B3}\u{01F9BA}-\u{01F9BD}\u{01F9D1}\u{01FA79}\u{01FAF2}]
/* flag | emoji optionally with follower | precomposed gendered/family consisting of human followed by one or more of zero width joiner then another human/profession | Misc */
EMOJI = {EMOJI_FLAG}|{EMOJI_PRESENTATION}{EMOJI_FOLLOW}?|{EMOJI_GENDERED}{EMOJI_FOLLOW}?(\u200D{EMOJI_MODIFIER}{EMOJI_FOLLOW}?){1,3}|{EMOJI_MISC}

/* U+2200-U+2BFF has a lot of the various mathematical, etc. symbol ranges */
/* \uFF65 is Halfwidth katakana middle dot; \u30FB is Katakana middle dot */
/* Math and other symbols that stand alone: °²× ∀; \u33A1 is m^2 in one char! */
/* Tibetan tsheg or tsek (U+0F0B) goes between syllables; words aren't space separated, so it may be a word or syllable marker; it indicates a possible line-break point. Treat as separate symbol. */
MISCSYMBOL = [+%&~\^|\\¦\u00A7¨\u00A9\u00AC\u00AE¯\u00B0-\u00B3\u00B4-\u00BA\u00D7\u00F7\u0387\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0600-\u0603\u0606-\u060A\u060C\u0614\u061B\u061E\u066A\u066D\u0703-\u070D\u07F6\u07F7\u07F8\u0964\u0965\u0E4F\u0F0B\u1FBD\u2016\u2017\u2020-\u2025\u2030-\u2038\u203B\u203C\u2043\u203E-\u2042\u2044\u2053\u207A-\u207F\u208A-\u208E\u2100-\u214F\u2190-\u21FF\u2200-\u2BFF\u3001-\u3006\u3008-\u3020\u30FB\u33A1\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65\uFF65]

PROG_LANGS = c[+][+]|(c|f)#
/* Assimilations3 leave 3 chars behind after division */
ASSIMILATIONS3 = cannot|'twas|dunno|['’]d['’]ve
/* "nno" is a remnant after pushing back from dunno in ASSIMILATIONS3 */
/* Include splitting some apostrophe-less negations, but not ones like "wont" that are also words. */
ASSIMILATIONS2 = {APOS}tis|gonna|gotta|lemme|gimme|wanna|nno|aint|dont|doesnt|didnt|theyre

/* CP1252: dagger, double dagger, per mille, bullet, small tilde, trademark */
CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]

/* CP1252 letters */
/* 83 = f with hook --> U+0192; 8a = S with Caron --> U+0160; 9c = ligature oe --> U+0153; */
/* CP1252LETTER = [\u0083\u008A\u009C] */

%%

{PROG_LANGS}      { final String langName = yytext();
                    // Language names that end in symbols (see the PROG_LANGS macro) stay one token.
                    if (DEBUG) {
                      logger.info("Used {PROG_LANGS} to recognize " + langName);
                    }
                    return getNext(langName, langName);
                  }
{ASSIMILATIONS3}  { final int tailLength = 3;
                    // When splitting, the last 3 characters go back to the input and are lexed as the next token.
                    if (splitAssimilations) {
                      yypushback(tailLength);
                    }
                    final String head = yytext();
                    if (DEBUG) {
                      logger.info("Used {ASSIMILATIONS3} to recognize " + head +
                                  "; splitAssimilations=" + splitAssimilations);
                    }
                    return getNext(head, head);
                  }
{ASSIMILATIONS2}/[^\p{Alpha}]
                  { final int tailLength = 2;
                    // When splitting, the last 2 characters go back to the input and are lexed as the next token.
                    if (splitAssimilations) {
                      yypushback(tailLength);
                    }
                    final String head = yytext();
                    if (DEBUG) {
                      logger.info("Used {ASSIMILATIONS2} to recognize " + head + " as " + head +
                                  "; splitAssimilations=" + splitAssimilations);
                    }
                    return getNext(head, head);
                  }
<YyNotTokenizePerLine>{SGML1}
                        { final String origTxt = yytext();
                          // With normalizeSpace, whatever SINGLE_SPACE_PATTERN matches becomes a non-breaking space.
                          final String txt = normalizeSpace
                              ? SINGLE_SPACE_PATTERN.matcher(origTxt).replaceAll("\u00A0")
                              : origTxt;
                          if (DEBUG) {
                            logger.info("Used {SGML1} to recognize " + origTxt + " as " + txt);
                          }
                          return getNext(txt, origTxt);
                        }
<YyTokenizePerLine>{SGML2}
                        { final String origTxt = yytext();
                          // With normalizeSpace, each plain space inside the tag becomes a non-breaking space.
                          final String txt = normalizeSpace
                              ? origTxt.replace(' ', '\u00A0')
                              : origTxt;
                          if (DEBUG) {
                            logger.info("Used {SGML2} to recognize " + origTxt + " as " + txt);
                          }
                          return getNext(txt, origTxt);
                        }
{SPMDASH}               { final String dashTxt = yytext();
                          // The token text is whatever handleDashes yields for the configured dashesStyle.
                          final String norm = LexerUtils.handleDashes(dashTxt, dashesStyle);
                          if (DEBUG) {
                            logger.info("Used {SPMDASH} to recognize " + dashTxt + " as " + norm);
                          }
                          return getNext(norm, dashTxt);
                        }
{SPAMP}                 { final String origTxt = yytext();
                          // Only rewrite the entity when normalizeAmpersandEntity is on.
                          final String tok = normalizeAmpersandEntity
                              ? LexerUtils.normalizeAmp(origTxt)
                              : origTxt;
                          if (DEBUG) {
                            logger.info("Used {SPAMP} to recognize " + origTxt + " as " + tok);
                          }
                          return getNext(tok, origTxt);
                        }
{SPPUNC}                { final String entity = yytext();
                          // The entity is passed through unchanged as one token.
                          if (DEBUG) {
                            logger.info("Used {SPPUNC} to recognize " + entity);
                          }
                          return getNext(entity, entity);
                        }
{WORD}/{REDAUX}         { final String origTxt = yytext();
                          // A word directly before a contracted auxiliary; the auxiliary itself is left for the next match.
                          final String noSoftHyphens = LexerUtils.removeSoftHyphens(origTxt);
                          final String tok = americanize
                              ? Americanize.americanize(noSoftHyphens)
                              : noSoftHyphens;
                          if (DEBUG) {
                            logger.info("Used {WORD} to recognize " + origTxt + " as " + tok);
                          }
                          return getNext(tok, origTxt);
                        }
{SWORD}/{SREDAUX}       { final String origTxt = yytext();
                          // The stem before a negative contraction; the contraction itself is left for the next match.
                          final String stem = LexerUtils.removeSoftHyphens(origTxt);
                          if (DEBUG) {
                            logger.info("Used {SWORD} to recognize " + origTxt + " as " + stem);
                          }
                          return getNext(stem, origTxt);
                        }
{DIGIT}+/{SEP_SUFFIX}   { final String digits = yytext();
                          // Only the digits are consumed here; the unit or currency suffix becomes its own token.
                          if (DEBUG) {
                            logger.info("Used {DIGIT}/{SEP_SUFFIX} to recognize " + digits);
                          }
                          return getNext(digits, digits);
                        }
{WORD}                  { final String origTxt = yytext();
                          // Soft hyphens are stripped first; americanize is then applied only if enabled.
                          final String noSoftHyphens = LexerUtils.removeSoftHyphens(origTxt);
                          final String tok = americanize
                              ? Americanize.americanize(noSoftHyphens)
                              : noSoftHyphens;
                          if (DEBUG) {
                            logger.info("Used {WORD} (2) to recognize " + origTxt + " as " + tok);
                          }
                          return getNext(tok, origTxt);
                        }
{APOWORD}               { final String origTxt = yytext();
                          // Quote characters are normalized per quoteStyle, passing false for the flag logged as probablyLeft.
                          final String norm = LexerUtils.handleQuotes(origTxt, false, quoteStyle);
                          if (DEBUG) {
                            logger.info("Used {APOWORD} to recognize " + origTxt + " as " + norm +
                                        "; probablyLeft=" + false);
                          }
                          return getNext(norm, origTxt);
                        }
{APOWORD2}/[:letter:]   { final String prefix = yytext();
                          // Matched only when a letter follows; the prefix is emitted unchanged.
                          if (DEBUG) {
                            logger.info("Used {APOWORD2} to recognize " + prefix);
                          }
                          return getNext(prefix, prefix);
                        }
{FULLURL}               { final String url = yytext();
                          // Optionally escape forward slashes first and asterisks second in the token text.
                          final String norm = escapeForwardSlashAsterisk
                              ? LexerUtils.escapeChar(LexerUtils.escapeChar(url, '/'), '*')
                              : url;
                          if (DEBUG) {
                            logger.info("Used {FULLURL} to recognize " + url + " as " + norm);
                          }
                          return getNext(norm, url);
                        }
{LIKELYURL}/[^\p{Alpha}]  { final String url = yytext();
                            // Optionally escape forward slashes first and asterisks second in the token text.
                            final String norm = escapeForwardSlashAsterisk
                                ? LexerUtils.escapeChar(LexerUtils.escapeChar(url, '/'), '*')
                                : url;
                            if (DEBUG) {
                              logger.info("Used {LIKELYURL} to recognize " + url + " as " + norm);
                            }
                            return getNext(norm, url);
                          }
{EMAIL}                 { final String address = yytext();
                          // The whole address, including any enclosing angle brackets that matched, is one token.
                          if (DEBUG) {
                            logger.info("Used {EMAIL} to recognize " + address);
                          }
                          return getNext(address, address);
                        }
{TWITTER}               { final String handleOrTag = yytext();
                          // A user name or hashtag is kept whole, including its leading marker character.
                          if (DEBUG) {
                            logger.info("Used {TWITTER} to recognize " + handleOrTag);
                          }
                          return getNext(handleOrTag, handleOrTag);
                        }
{REDAUX}/[^\p{Alpha}'’]   { final String origTxt = yytext();
                          // Quote characters are normalized per quoteStyle, passing false for the flag logged as probablyLeft.
                          final String norm = LexerUtils.handleQuotes(origTxt, false, quoteStyle);
                          if (DEBUG) {
                            logger.info("Used {REDAUX} to recognize " + origTxt + " as " + norm +
                                        "; probablyLeft=" + false);
                          }
                          return getNext(norm, origTxt);
                        }
{SREDAUX}/[^\p{Alpha}'’]  { final String origTxt = yytext();
                          // Quote characters are normalized per quoteStyle, passing false for the flag logged as probablyLeft.
                          final String norm = LexerUtils.handleQuotes(origTxt, false, quoteStyle);
                          if (DEBUG) {
                            logger.info("Used {SREDAUX} to recognize " + origTxt + " as " + norm +
                                        "; probablyLeft=" + false);
                          }
                          return getNext(norm, origTxt);
                        }
{DATE}                  { final String origTxt = yytext();
                          // Slash-separated dates get their slashes escaped only when the option is on.
                          final String txt = escapeForwardSlashAsterisk
                              ? LexerUtils.escapeChar(origTxt, '/')
                              : origTxt;
                          if (DEBUG) {
                            logger.info("Used {DATE} to recognize " + origTxt + " as " + txt);
                          }
                          return getNext(txt, origTxt);
                        }
/* Malaysian currency */
RM/{NUM}        { final String currencyCode = yytext();
                  // RM directly before a number is split off as its own token; the number is lexed next.
                  if (DEBUG) {
                    logger.info("Used Malaysian currency to recognize " + currencyCode);
                  }
                  return getNext(currencyCode, currencyCode);
                }
{NUMBER}        { handleHyphenatedNumber(yytext());
                  // The match is re-read after the helper call, as before; presumably the helper can shorten it -- TODO confirm.
                  final String origTxt = yytext();
                  final String norm = removeFromNumber(origTxt);
                  if (DEBUG) {
                    logger.info("Used {NUMBER} to recognize " + origTxt + " as " + norm);
                  }
                  return getNext(norm, origTxt);
                }
{SUBSUPNUM}     { final String scriptNum = yytext();
                  // Superscript or subscript digit runs are emitted unchanged.
                  if (DEBUG) {
                    logger.info("Used {SUBSUPNUM} to recognize " + scriptNum);
                  }
                  return getNext(scriptNum, scriptNum);
                }
{FRAC}          { final String origTxt = yytext();
                  // In strictFraction mode a space or non-breaking space ends the token:
                  // everything from that space onward is pushed back and only the leading part is returned.
                  final int spaceIndex = strictFraction ? indexOfSpace(origTxt) : -1;
                  if (spaceIndex >= 0) {
                    yypushback(origTxt.length() - spaceIndex);
                    final String leading = yytext();
                    if (DEBUG) {
                      logger.info("Used {FRAC} (strictFraction) to recognize " + leading);
                    }
                    return getNext(leading, leading);
                  }
                  // Otherwise keep the whole match, optionally escaping the slash and then hardening the space.
                  String norm = origTxt;
                  if (escapeForwardSlashAsterisk) {
                    norm = LexerUtils.escapeChar(norm, '/');
                  }
                  if (normalizeSpace) {
                    norm = norm.replace(' ', '\u00A0');
                  }
                  if (DEBUG) {
                    logger.info("Used {FRAC} to recognize " + origTxt + " as " + norm);
                  }
                  return getNext(norm, origTxt);
                }
{FRAC2}         { // Normalization is delegated entirely to LexerUtils.normalizeFractions,
                  // driven by the two option flags passed in.
                  final String raw = yytext();
                  final String normalized =
                      LexerUtils.normalizeFractions(normalizeFractions, escapeForwardSlashAsterisk, raw);
                  if (DEBUG) {
                    logger.info("Used {FRAC2} to recognize " + raw + " as " + normalized +
                                "; normalizeFractions=" + normalizeFractions +
                                ", escapeForwardSlashAsterisk=" + escapeForwardSlashAsterisk);
                  }
                  return getNext(normalized, raw);
                }
{TBSPEC}        { // Ampersand entities are normalized only when that option is enabled.
                  final String raw = yytext();
                  final String norm = normalizeAmpersandEntity ? LexerUtils.normalizeAmp(raw) : raw;
                  if (DEBUG) { logger.info("Used {TBSPEC} to recognize " + raw + " as " + norm); }
                  return getNext(norm, raw);
                }
{SWEARING}      { // Asterisks in the match are escaped in the normalized form when the option is on.
                  final String raw = yytext();
                  final String norm = escapeForwardSlashAsterisk
                      ? LexerUtils.escapeChar(raw, '*')
                      : raw;
                  if (DEBUG) { logger.info("Used {SWEARING} to recognize " + raw + " as " + norm); }
                  return getNext(norm, raw);
                }
{BANGWORDS}     { final String tok = yytext();
                  if (DEBUG) { logger.info("Used {BANGWORDS} to recognize "+ tok); }
                  return getNext(tok, tok);
                }
/* {BANGMAGAZINES} is only recognized when the word magazine follows as trailing
 * context; the two lexical states differ only in the whitespace macro used. */
<YyNotTokenizePerLine>{BANGMAGAZINES}/{SPACENL}magazine   {
                          final String tok = yytext();
                          if (DEBUG) { logger.info("Used {BANGMAGAZINES} to recognize "+ tok); }
                          return getNext(tok, tok);
                        }
<YyTokenizePerLine>{BANGMAGAZINES}/{SPACE}magazine   {
                          final String tok = yytext();
                          if (DEBUG) { logger.info("Used {BANGMAGAZINES} to recognize "+ tok); }
                          return getNext(tok, tok);
                        }
{THING3}                { // breakByHyphensSlashes runs first; the token text is read afterwards.
                          breakByHyphensSlashes(yytext());
                          final String tok = yytext();
                          if ( ! escapeForwardSlashAsterisk) {
                            if (DEBUG) { logger.info("Used {THING3} to recognize " + tok); }
                            return getNext(tok, tok);
                          }
                          final String escaped = LexerUtils.escapeChar(tok, '/');
                          if (DEBUG) { logger.info("Used {THING3} to recognize " + tok + " as " + escaped); }
                          return getNext(escaped, tok);
                        }
{DOLSIGN}               { // Emitted unchanged: normalized and original text are the same.
                          final String tok = yytext();
                          if (DEBUG) { logger.info("Used {DOLSIGN} to recognize " + tok); }
                          return getNext(tok, tok);
                        }
{DOLSIGN2}              { // Always normalized: fully when normalizeCurrency is set, minimally otherwise.
                          final String raw = yytext();
                          final String norm = normalizeCurrency
                              ? LexerUtils.normalizeCurrency(raw)
                              : LexerUtils.minimallyNormalizeCurrency(raw);
                          if (DEBUG) { logger.info("Used {DOLSIGN2} to recognize " + raw + " as " + norm); }
                          return getNext(norm, raw);
                        }
/* Any acronym can be treated as sentence final iff followed by this list of words (pronouns, determiners, and prepositions, etc.). "U.S." is the single big source of errors.  Character classes make this rule case sensitive! (This is needed!!). A one letter acronym candidate like "Z." or "I." in this context usually isn't, and so we return the letter and pushback the period for next time. We can't have "To" in list, as often get adjacent in headlines: "U.S. To Ask ...." */
<YyNotTokenizePerLine>{ABBREV2}/({SPACENLS})([A]|[A]bout|[A]ccording|[A]dditionally|[A]fter|[A]ll|[A]lso|[A]lthough|[A]n|[A]nother|[A]s|[A]t|[B]efore|[B]oth|[B]ut|[B]y|[D]id|[D]uring|[E]ach|[E]arlier|[F]ollowing|[F]or|[F]rom|[H]e|[H]er|[H]ere|[H]is|[H]ow|[H]owever|[I]f|[I]n|[I]t|[I]ts|[L]ast|[L]ater|[M]any|[M]ore|[M]ost|[M]rs?\.|[M]s\.|[N]ow|[O]n|[O]nce|[O]ne|[O]ther|[O]ur|[S]he|[S]ince|[S]o|[S]ome|[S]uch|[T]hat|[T]he|[T]heir|[T]hen|[T]here|[T]hese|[T]hey|[T]his|[T]wo|[U]nder|[U]pon|[W]e|[W]hen|[W]hile|[W]hat|[W]ho|[W]hy|[Y]et|[Y]ou|{SGML1})({SPACENL}|[?!]) {
                          // Variant for the not-per-line state: the lookahead uses the SPACENLS and SPACENL macros.
                          // All handling is in processAcronym(), which is defined outside this chunk.
                          return processAcronym();
                        }
<YyTokenizePerLine>{ABBREV2}/({SPACES})([A]|[A]bout|[A]ccording|[A]dditionally|[A]fter|[A]ll|[A]lso|[A]lthough|[A]n|[A]nother|[A]s|[A]t|[B]efore|[B]oth|[B]ut|[B]y|[D]id|[D]uring|[E]ach|[E]arlier|[F]ollowing|[F]or|[F]rom|[H]e|[H]er|[H]ere|[H]is|[H]ow|[H]owever|[I]f|[I]n|[I]t|[I]ts|[L]ast|[L]ater|[M]any|[M]ore|[M]ost|[M]rs?\.|[M]s\.|[N]ow|[O]n|[O]nce|[O]ne|[O]ther|[O]ur|[S]he|[S]ince|[S]o|[S]ome|[S]uch|[T]hat|[T]he|[T]heir|[T]hen|[T]here|[T]hese|[T]hey|[T]his|[T]wo|[U]nder|[U]pon|[W]e|[W]hen|[W]hile|[W]hat|[W]ho|[W]hy|[Y]et|[Y]ou|{SGML1})({SPACE}|[?!]) {
                          // Variant for the per-line state: same word list, but the lookahead uses SPACES and SPACE instead.
                          return processAcronym();
                        }

/* Special case to get ca., fig. or Prop. before numbers.
 * As with the other state-specific rule pairs in this section, the
 * YyTokenizePerLine variant uses {SPACE} in its lookahead where the
 * YyNotTokenizePerLine variant uses {SPACENL}. */
<YyNotTokenizePerLine>{ABBREV3}/{SPACENL}?[:digit:]   {
                          return processAbbrev3();
                        }
<YyTokenizePerLine>{ABBREV3}/{SPACE}?[:digit:]   {
                          return processAbbrev3();
                        }
/* {ABBREVSN} is kept as one token when Africa, Korea or Cal follows as trailing context. */
<YyNotTokenizePerLine>{ABBREVSN}/{SPACENL}+(Africa|Korea|Cal) { return getNext(); }
<YyTokenizePerLine>{ABBREVSN}/{SPACE}+(Africa|Korea|Cal) { return getNext(); }
/* Special case to get pty. ltd. or pty limited. Also added "Co." since someone complained, but usually a comma after it.
 * The company suffix (ltd, lim, llc) is trailing context only and is not consumed. */
(pty|pte|pvt|co)\./{SPACE}(ltd|lim|llc)  { return getNext(); }
/* Special case to get op. cit. or loc. cit.: the first abbreviation keeps its period when "cit." follows. */
(op|loc)\./{SPACE}cit\.  { return getNext(); }
/* {ABBREV1} followed by the state-specific sentence-end context ({SENTEND1} / {SENTEND2},
 * defined outside this chunk) is delegated to processAbbrev1(). */
<YyNotTokenizePerLine>{ABBREV1}/{SENTEND1}     {
                          return processAbbrev1();
                        }
<YyTokenizePerLine>{ABBREV1}/{SENTEND2}     {
                          return processAbbrev1();
                        }
/* {ABBREV1} (optionally plural) followed by at least two more characters is emitted as matched.
 * In the per-line state those two lookahead characters may not be CR or LF. */
<YyNotTokenizePerLine>{ABBREV1}s?/[^][^]        { return getNext(); }
<YyTokenizePerLine>{ABBREV1}s?/[^\r\n][^\r\n]        { return getNext(); }
{ABBREV1}s?             { // this one should only match if we're basically at the end of file
                          // since the last one matches two things, even newlines (if not tokenize per line)
                          return processAbbrev1();
                        }
{ABBREV2}s?             { // Emitted unchanged: normalized and original text are the same.
                          final String abbrev = yytext();
                          if (DEBUG) { logger.info("Used {ABBREV2} to recognize " + abbrev); }
                          return getNext(abbrev, abbrev);
                        }
/* Last millennium (in the WSJ) "Alex." was generally an abbreviation in "Alex. Brown", the brokers.
 * Recognize just this case: "Alex." keeps its period only when Brown follows as trailing context. */
<YyNotTokenizePerLine>Alex\./{SPACENL}Brown   { final String abbrev = yytext();
                                                if (DEBUG) { logger.info("Used {ALEX} to recognize " + abbrev); }
                                                return getNext(abbrev, abbrev);
                                              }

<YyTokenizePerLine>Alex\./{SPACE}Brown        { final String abbrev = yytext();
                                                if (DEBUG) { logger.info("Used {ALEX} (2) to recognize " + abbrev); }
                                                return getNext(abbrev, abbrev);
                                              }
{ABBREV2PRE}/{SPACENL}    { // Matched only with {SPACENL} as trailing context; emitted unchanged.
                          final String abbrev = yytext();
                          if (DEBUG) { logger.info("Used {ABBREV2PRE} to recognize " + abbrev); }
                          return getNext(abbrev, abbrev);
                        }
{ABBREV4}               { // Emitted unchanged.
                          final String abbrev = yytext();
                          if (DEBUG) { logger.info("Used {ABBREV4} to recognize " + abbrev); }
                          return getNext(abbrev, abbrev);
                        }
/* The rules below emit the matched text through the no-argument getNext(),
 * which is defined outside this chunk. {TBSPEC2} additionally requires
 * {SPACENL} as trailing context. */
{TBSPEC2}/{SPACENL}     { return getNext(); }
{ISO8601DATETIME}       { return getNext(); }
//{ISO8601DATE}           { return getNext(); }
{DEGREES}               { return getNext(); }
/* Ideally would factor this out for use in other tokenizers,
 * but the other tokenizers don't have TokenizerPerLine options.
 * A {FILENAME} is only recognized when followed by whitespace or one of the
 * listed punctuation characters; that follower is not consumed. */
<YyNotTokenizePerLine>{FILENAME}/({SPACENL}|[.?!,\"'<()])      { return getNext(); }
<YyTokenizePerLine>{FILENAME}/({SPACE}|[.?!,\"'<()])      { return getNext(); }
{WORD}\./{INSENTP}      { // A word plus period directly before in-sentence punctuation stays one token;
                          // soft hyphens are removed from the normalized form only.
                          final String raw = yytext();
                          final String cleaned = LexerUtils.removeSoftHyphens(raw);
                          if (DEBUG) { logger.info("Used {WORD} (3) to recognize " + raw + " as " + cleaned); }
                          return getNext(cleaned, raw);
                        }
{SSN}                   { return getNext(); }
{PHONE}                 { // Normalization order: optional space to non-breaking space, then parentheses.
                          final String raw = yytext();
                          final String spaced = normalizeSpace
                              ? raw.replace(' ', '\u00A0')
                              : raw;
                          final String norm = LexerUtils.pennNormalizeParens(spaced, normalizeParentheses);
                          if (DEBUG) { logger.info("Used {PHONE} to recognize " + raw + " as " + norm); }
                          return getNext(norm, raw);
                        }
{DBLQUOT}/[\p{Alpha}\p{Digit}$]  { // A double quote directly before a letter, digit or dollar sign
                                   // is passed to handleQuotes as probably a left (opening) quote.
                                   String tok = yytext();
                                   String norm = LexerUtils.handleQuotes(tok, true, quoteStyle);
                                   if (DEBUG) { logger.info("Used {DBLQUOT} to recognize " + tok + " as " + norm +
                                                            "; probablyLeft=" + true); }
                                   return getNext(norm, tok);
                                 }
{DBLQUOT}               { // Any other double quote is passed with probablyLeft=false.
                          // The debug label names this rule so traces are not confused
                          // with the separate SREDAUX rules.
                          String tok = yytext();
                          String norm = LexerUtils.handleQuotes(tok, false, quoteStyle);
                          if (DEBUG) { logger.info("Used {DBLQUOT} (2) to recognize " + tok + " as " + norm +
                                                   "; probablyLeft=" + false); }
                          return getNext(norm, tok);
                        }
{SMILEY}/[^\p{Alpha}\p{Digit}] { // Only a smiley when not directly followed by a letter or digit.
                  // Parentheses inside it go through the usual parenthesis normalization.
                  final String raw = yytext();
                  final String norm = LexerUtils.pennNormalizeParens(raw, normalizeParentheses);
                  if (DEBUG) { logger.info("Used {SMILEY} to recognize " + raw + " as " + norm); }
                  return getNext(norm, raw);
                }
{ASIANSMILEY}   { // Same parenthesis normalization as for SMILEY; no debug trace here.
                  final String raw = yytext();
                  final String norm = LexerUtils.pennNormalizeParens(raw, normalizeParentheses);
                  return getNext(norm, raw);
                }
{EMOJI}         { final String emoji = yytext();
                  if (DEBUG) { logger.info("Used {EMOJI} to recognize " + emoji); }
                  return getNext(emoji, emoji);
                }
{LESSTHAN}      { return getNext("<", yytext()); }
{GREATERTHAN}   { return getNext(">", yytext()); }
/* Curly and square brackets are replaced only when normalizeOtherBrackets is set;
 * round parentheses only when normalizeParentheses is set. */
\{              { if ( ! normalizeOtherBrackets) {
                    return getNext();
                  }
                  return getNext(openbrace, yytext());
                }
\}              { if ( ! normalizeOtherBrackets) {
                    return getNext();
                  }
                  return getNext(closebrace, yytext());
                }
\[              { if ( ! normalizeOtherBrackets) {
                    return getNext();
                  }
                  return getNext("-LSB-", yytext());
                }
\]              { if ( ! normalizeOtherBrackets) {
                    return getNext();
                  }
                  return getNext("-RSB-", yytext());
                }
\(              { if ( ! normalizeParentheses) {
                    return getNext();
                  }
                  return getNext(openparen, yytext());
                }
\)              { if ( ! normalizeParentheses) {
                    return getNext();
                  }
                  return getNext(closeparen, yytext());
                }
{HYPHENS}       { // Only runs of at most four characters go through dash normalization;
                  // longer runs are emitted as they are.
                  final String raw = yytext();
                  final String norm = (yylength() <= 4)
                      ? LexerUtils.handleDashes(raw, dashesStyle)
                      : raw;
                  if (DEBUG) { logger.info("Used {HYPHENS} to recognize " + raw + " as " + norm); }
                  return getNext(norm, raw);
                }

<YyNotTokenizePerLine>{LDOTS}/\.{SPACENLS}[:letter:]    {
                  /* An ellipsis followed by one more period, whitespace and a letter:
                     the extra period is left as trailing context so it can become a period. */
                  final String dots = yytext();
                  final String ellipsis = LexerUtils.handleEllipsis(dots, ellipsisStyle);
                  if (DEBUG) { logger.info("Used {LDOTS1} to recognize " + dots + " as " + ellipsis); }
                  return getNext(ellipsis, dots);
                }
<YyTokenizePerLine>{LDOTS}/\.{SPACES}[:letter:]    {
                  /* Per-line variant of the rule above. */
                  final String dots = yytext();
                  final String ellipsis = LexerUtils.handleEllipsis(dots, ellipsisStyle);
                  if (DEBUG) { logger.info("Used {LDOTS2} to recognize " + dots + " as " + ellipsis); }
                  return getNext(ellipsis, dots);
                }
<YyNotTokenizePerLine>{SPACEDLDOTS}/{SPACE}\.{SPACENLS}[:letter:]    {
                  /* Same idea for a spaced-out ellipsis: the extra period follows a space. */
                  final String dots = yytext();
                  final String ellipsis = LexerUtils.handleEllipsis(dots, ellipsisStyle);
                  if (DEBUG) { logger.info("Used {LDOTS3} to recognize " + dots + " as " + ellipsis); }
                  return getNext(ellipsis, dots);
                }
<YyTokenizePerLine>{SPACEDLDOTS}/{SPACE}\.{SPACES}[:letter:]    {
                  /* Per-line variant of the spaced-out ellipsis rule. */
                  final String dots = yytext();
                  final String ellipsis = LexerUtils.handleEllipsis(dots, ellipsisStyle);
                  if (DEBUG) { logger.info("Used {LDOTS4} to recognize " + dots + " as " + ellipsis); }
                  return getNext(ellipsis, dots);
                }
{LDOTS}|{SPACEDLDOTS}    { // Any other ellipsis, with no lookahead condition.
                           final String dots = yytext();
                           final String ellipsis = LexerUtils.handleEllipsis(dots, ellipsisStyle);
                           if (DEBUG) { logger.info("Used {LDOTS5} to recognize " + dots + " as " + ellipsis); }
                           return getNext(ellipsis, dots);
                         }
{FNMARKS}       { return getNext(); }
{ASTS}          { // Asterisks are escaped in the normalized form only when the option is on.
                  if ( ! escapeForwardSlashAsterisk) {
                    return getNext();
                  }
                  return getNext(LexerUtils.escapeChar(yytext(), '*'), yytext());
                }
{INSENTP}       { return getNext(); }
[?!]+|[\u2047\u2048]    { return getNext(); }
[.¡¿\u037E\u0589\u061F\u06D4\u0700-\u0702\u07FA\u3002]  { return getNext(); }
=+              { return getNext(); }
\/              { // A lone forward slash gets the same optional escaping.
                  if ( ! escapeForwardSlashAsterisk) {
                    return getNext();
                  }
                  return getNext(LexerUtils.escapeChar(yytext(), '/'), yytext());
                }
/* {HTHING}/[^\p{Alpha}\p{Digit}.+]    { return getNext(LexerUtils.removeSoftHyphens(yytext()),
                                               yytext()); } */
/* Hyphenated exceptions are emitted whole, with soft hyphens removed from the
 * normalized form; each also has a variant with a trailing period before {INSENTP}. */
{HTHINGEXCEPTIONWHOLE}  { final String tok = yytext();
                          return getNext(LexerUtils.removeSoftHyphens(tok), tok);
                        }
{HTHINGEXCEPTIONWHOLE}\./{INSENTP}  { final String tok = yytext();
                          return getNext(LexerUtils.removeSoftHyphens(tok), tok);
                        }
{HTHINGEXCEPTIONPREFIXED}  { final String tok = yytext();
                          return getNext(LexerUtils.removeSoftHyphens(tok), tok);
                        }
{HTHINGEXCEPTIONPREFIXED}\./{INSENTP}  { final String tok = yytext();
                          return getNext(LexerUtils.removeSoftHyphens(tok), tok);
                        }
{HTHINGEXCEPTIONSUFFIXED}  { final String tok = yytext();
                          return getNext(LexerUtils.removeSoftHyphens(tok), tok);
                        }
{HTHINGEXCEPTIONSUFFIXED}\./{INSENTP}  { final String tok = yytext();
                          return getNext(LexerUtils.removeSoftHyphens(tok), tok);
                        }
{HTHING}        { // breakByHyphensSlashes sees the full match; the token text is read afterwards.
                  breakByHyphensSlashes(yytext());
                  final String piece = yytext();
                  final String cleaned = LexerUtils.removeSoftHyphens(piece);
                  if (DEBUG) { logger.info("Used {HTHING} to recognize " + piece + " as " + cleaned); }
                  return getNext(cleaned, piece);
                }
{HTHING}\./{INSENTP}
                { // Same handling for the variant with a trailing period before in-sentence punctuation.
                  breakByHyphensSlashes(yytext());
                  final String piece = yytext();
                  final String cleaned = LexerUtils.removeSoftHyphens(piece);
                  if (DEBUG) { logger.info("Used {HTHING} (2) to recognize " + piece + " as " + cleaned); }
                  return getNext(cleaned, piece);
                }
/* {THING}\./{INSENTP}          { String tok = yytext();       // cdm [2017]: I don't understand what this was for, and it seems harmful....
                               /* A THING can contain quote like O'Malley */
                               String norm = LexerUtils.handleQuotes(tok, false, quoteStyle);
                                if (DEBUG) { logger.info("Used {THING} to recognize " + tok + " as " + norm +
                                                         "; probablyLeft=" + false); }
                                return getNext(norm, tok);
                              } */
{THING}         { // Historical note kept from the original: this call was once causing a failure
                  // from attempting to push back too much.
                  breakByHyphensSlashes(yytext());
                  final String piece = yytext();
                  /* A THING can contain a quote, as in O'Malley, so it goes through quote handling. */
                  final String quoted = LexerUtils.handleQuotes(piece, false, quoteStyle);
                  if (DEBUG) {
                    logger.info("Used {THING} (2) to recognize " + piece + " as " + quoted +
                                "; probablyLeft=" + false);
                  }
                  return getNext(quoted, piece);
                }
{THINGA}\./{INSENTP}    { // Ampersand entities are normalized only when that option is enabled.
                          final String raw = yytext();
                          final String norm = normalizeAmpersandEntity ? LexerUtils.normalizeAmp(raw) : raw;
                          if (DEBUG) { logger.info("Used {THINGA} to recognize " + raw + " as " + norm); }
                          return getNext(norm, raw);
                        }
{THINGA}                { // Same handling without the trailing period.
                          final String raw = yytext();
                          final String norm = normalizeAmpersandEntity ? LexerUtils.normalizeAmp(raw) : raw;
                          if (DEBUG) { logger.info("Used {THINGA} (2) to recognize " + raw + " as " + norm); }
                          return getNext(norm, raw);
                        }
/* Special case: prefer reading ''' as a single quote followed by a double quote (this happens in newswire).
 * Only the first quote is consumed; the rest is trailing context. */
'/''[^'\p{Alpha}]       { final String quote = yytext();
                          final String norm = LexerUtils.handleQuotes(quote, false, quoteStyle);
                          if (DEBUG) {
                            logger.info("Used {'/''} to recognize " + quote + " as " + norm +
                                           "; probablyLeft=" + false);
                          }
                          return getNext(norm, quote);
                        }
/* This QUOTES rule must precede (S)REDAUX (2) so that it matches a straight quote before a word by preference.
   Trying to collapse the first two cases seemed to break things (?!?). */
{QUOTES}/[:letter:]{NOT_SPACENL_ONE_CHAR}
                { // The extra lookahead character avoids matching clitics such as 'd.
                  // Digits are not allowed in the lookahead because of cases like '90s.
                  final String quote = yytext();
                  /* Treat as an opening quote - often but not always right. */
                  final String norm = LexerUtils.handleQuotes(quote, true, quoteStyle);
                  if (DEBUG) {
                    logger.info("Used {QUOTES} to recognize " + quote + " as " + norm +
                                           "; probablyLeft=" + true);
                  }
                  return getNext(norm, quote);
                }
{QUOTES}/[AaIiUu]{SPACENL_ONE_CHAR}
                { // Companion rule for quoted one-letter words such as "a", which the rule above
                  // cannot match because it needs a second non-space character.
                  final String quote = yytext();
                  /* Treat as an opening quote - often but not always right. */
                  final String norm = LexerUtils.handleQuotes(quote, true, quoteStyle);
                  if (DEBUG) {
                    logger.info("Used {QUOTES} (2) to recognize " + quote + " as " + norm +
                                           "; probablyLeft=" + true);
                  }
                  return getNext(norm, quote);
                }
{QUOTES}        { // Any other quote is handled with probablyLeft=false.
                  final String quote = yytext();
                  final String norm = LexerUtils.handleQuotes(quote, false, quoteStyle);
                  if (DEBUG) {
                    logger.info("Used {QUOTES} (3) to recognize " + quote + " as " + norm +
                                           "; probablyLeft=" + false);
                  }
                  return getNext(norm, quote);
                }
/* These (S)REDAUX (2) cases are needed in case the string ends on "it's". See: testJacobEisensteinApostropheCase */
{REDAUX}        { final String clitic = yytext();
                  if (DEBUG) { logger.info("Used {REDAUX} (2) to recognize " + clitic); }
                  return getNext(clitic, clitic);
                }
{SREDAUX}       { // Unlike REDAUX, this variant goes through quote handling.
                  final String clitic = yytext();
                  final String norm = LexerUtils.handleQuotes(clitic, false, quoteStyle);
                  if (DEBUG) {
                    logger.info("Used {SREDAUX} (2) to recognize " + clitic + " as " + norm +
                                           "; probablyLeft=" + false);
                  }
                  return getNext(norm, clitic);
                }

{FAKEDUCKFEET}  { return getNext(); }
{MISCSYMBOL}    { return getNext(); }
{CP1252_MISC_SYMBOL}  { // The normalized form comes from LexerUtils.processCp1252misc; the original text is kept.
                        final String raw = yytext();
                        final String mapped = LexerUtils.processCp1252misc(raw);
                        if (DEBUG) { logger.info("Used {CP1252_MISC_SYMBOL} to recognize " + raw + " as " + mapped); }
                        return getNext(mapped, raw);
                      }
{SPACES}|&nbsp;|[\u0000\u0008\u007F\u200B\u200E-\u200F\uFEFF]
                { // Whitespace and the listed characters produce no token. In invertible
                  // mode the text is appended to prevWordAfter.
                  if (invertible) {
                    prevWordAfter.append(yytext());
                  }
                }
{NEWLINE}       { // A newline becomes a token only when tokenizeNLs is set.
                  if (tokenizeNLs) {
                    return getNext(AbstractTokenizer.NEWLINE_TOKEN, yytext());
                  }
                  // Otherwise it is handled like other whitespace.
                  if (invertible) {
                    prevWordAfter.append(yytext());
                  }
                }
.       { // Fallback for a character that no earlier rule matched. The untokenizable
          // setting decides two independent things: whether a warning is logged
          // (never, first time only, always) and whether the character is kept
          // as a token or dropped.
          final String ch = yytext();
          final int codePoint = ch.codePointAt(0);
          final String msg = String.format("Untokenizable: %s (U+%s, decimal: %s)",
                          ch, Integer.toHexString(codePoint).toUpperCase(), Integer.toString(codePoint));
          // Step 1: warning policy.
          switch (untokenizable) {
            case FIRST_DELETE:
            case FIRST_KEEP:
              if ( ! this.seenUntokenizableCharacter) {
                logger.warning(msg);
                this.seenUntokenizableCharacter = true;
              }
              break;
            case ALL_DELETE:
            case ALL_KEEP:
              logger.warning(msg);
              this.seenUntokenizableCharacter = true;
              break;
            default:
              break;
          }
          // Step 2: keep the character as a token, or drop it. A dropped character
          // is still appended to prevWordAfter in invertible mode.
          switch (untokenizable) {
            case NONE_KEEP:
            case FIRST_KEEP:
            case ALL_KEEP:
              return getNext();
            case NONE_DELETE:
            case FIRST_DELETE:
            case ALL_DELETE:
              if (invertible) {
                prevWordAfter.append(ch);
              }
              break;
            default:
              break;
          }
        }
<<EOF>> { // End of input: no token is returned.
          if ( ! invertible) {
            return null;
          }
          // In invertible mode, attach the pending trailing text to the previous
          // word as its AfterAnnotation and clear the buffer.
          // prevWordAfter.append(yytext());
          final String trailing = prevWordAfter.toString();
          if (DEBUG) { logger.info("At end of text making after: |" + trailing + "|"); }
          prevWord.set(CoreAnnotations.AfterAnnotation.class, trailing);
          if (DEBUG) {
            logger.info("prevWord is |" + prevWord.get(CoreAnnotations.TextAnnotation.class) + "|, its after is " +
                                     "|" + prevWord.get(CoreAnnotations.AfterAnnotation.class) + "|");
          }
          prevWordAfter.setLength(0);
          return null;
        }