Skip to content

Commit

Permalink
Normalize all PTB produced tokens, not just the German ones, using NFC
Browse files Browse the repository at this point in the history
In a test on 0.1% of Wikipedia (from a few years ago), this slows down the English tokenizer by about 1.5%.
The German umlaut unit test still passes as well.
  • Loading branch information
AngledLuffa committed Apr 20, 2022
1 parent 58a2288 commit d46fecd
Show file tree
Hide file tree
Showing 3 changed files with 4 additions and 61 deletions.
Expand Up @@ -45,64 +45,6 @@ public static void mergeTokens(CoreLabel token, CoreLabel nextToken) {
token.setValue(token.word()+"-"+token.sentIndex());
}

/**
 * Rewrites a token so that umlauts typed as two characters are stored
 * as the single-character form.
 *<br>
 * German CoreNLP doesn't handle the two character versions correctly,
 * so both the token's value and its word are run through
 * {@link #condenseUmlauts(String)}.  A field is only overwritten when
 * that call returns a non-null result; a null result leaves the field
 * as it was.
 *
 * @param token the token whose value and word are condensed in place
 */
public static void condenseUmlauts(CoreLabel token) {
  // value first, then word: same order the fields were originally updated in
  final String condensedValue = condenseUmlauts(token.value());
  if (condensedValue != null) {
    token.setValue(condensedValue);
  }

  final String condensedWord = condenseUmlauts(token.word());
  if (condensedWord != null) {
    token.setWord(condensedWord);
  }
}

/**
 * Replaces every umlaut written as two characters -- one of a, o, u
 * (either case) followed by the combining diaeresis U+0308 -- with the
 * single precomposed character.
 *<br>
 * A combining diaeresis that follows any other character, or that is
 * the first character of the text, is kept unchanged.
 *
 * @param value the text to condense; may be null
 * @return the rewritten text if {@code value} contains at least one
 *         combining diaeresis (even if none could be merged), otherwise
 *         null, meaning there is nothing to update
 */
public static String condenseUmlauts(String value) {
  if (value == null) {
    // null already means "no change" to callers, so pass it through
    // instead of failing on value.length()
    return null;
  }

  // U+0308 COMBINING DIAERESIS (decimal 776)
  final char combiningDiaeresis = 0x0308;

  // Allocated lazily: stays null until the first combining diaeresis is seen,
  // so text without one costs no allocation and yields a null result
  StringBuilder ns = null;
  for (int i = 0; i < value.length(); ++i) {
    final char cur = value.charAt(i);
    if (cur != combiningDiaeresis) {
      if (ns != null) {
        ns.append(cur);
      }
      continue;
    }

    if (ns == null) {
      ns = new StringBuilder(value.length());
      // copy the untouched prefix without creating an intermediate String
      ns.append(value, 0, i);
    }

    // Look at the last character already emitted (not value.charAt(i - 1)),
    // so a vowel that was just merged is not merged a second time
    final int last = ns.length() - 1;
    char merged = 0;
    if (last >= 0) {
      switch (ns.charAt(last)) {
        case 'a': merged = '\u00e4'; break;  // a with diaeresis
        case 'A': merged = '\u00c4'; break;  // A with diaeresis
        case 'o': merged = '\u00f6'; break;  // o with diaeresis
        case 'O': merged = '\u00d6'; break;  // O with diaeresis
        case 'u': merged = '\u00fc'; break;  // u with diaeresis
        case 'U': merged = '\u00dc'; break;  // U with diaeresis
        default: break;
      }
    }

    if (merged != 0) {
      ns.setCharAt(last, merged);
    } else {
      // no base letter we know how to merge with: keep the mark as is
      ns.append(cur);
    }
  }

  return ns == null ? null : ns.toString();
}

@Override
public List<CoreLabel> process(List<CoreLabel> tokens) {
List<CoreLabel> processedTokens = new ArrayList<CoreLabel>();
Expand Down Expand Up @@ -134,9 +76,6 @@ public List<CoreLabel> process(List<CoreLabel> tokens) {
}
}

for (CoreLabel label : processedTokens) {
condenseUmlauts(label);
}
return processedTokens;
}

Expand Down
2 changes: 2 additions & 0 deletions src/edu/stanford/nlp/process/PTBLexer.flex
Expand Up @@ -27,6 +27,7 @@ package edu.stanford.nlp.process;


import java.io.Reader;
import java.text.Normalizer;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
Expand Down Expand Up @@ -488,6 +489,7 @@ import edu.stanford.nlp.util.logging.Redwood;
* @param originalText The original String that got transformed into txt
*/
private Object getNext(String txt, String originalText) {
txt = Normalizer.normalize(txt, Normalizer.Form.NFC);
int begin = Math.toIntExact(yychar);
if (invertible) {
String str = prevWordAfter.toString();
Expand Down
2 changes: 2 additions & 0 deletions src/edu/stanford/nlp/process/PTBLexer.java
Expand Up @@ -31,6 +31,7 @@


import java.io.Reader;
import java.text.Normalizer;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
Expand Down Expand Up @@ -88334,6 +88335,7 @@ private Object getNext() {
* @param originalText The original String that got transformed into txt
*/
private Object getNext(String txt, String originalText) {
txt = Normalizer.normalize(txt, Normalizer.Form.NFC);
int begin = Math.toIntExact(yychar);
if (invertible) {
String str = prevWordAfter.toString();
Expand Down

0 comments on commit d46fecd

Please sign in to comment.