
Commit 3ccb132
Remove specific vi stuff. The json files in the tokenizer can probably go away as well
AngledLuffa committed Mar 27, 2021
1 parent 53fabf5 commit 3ccb132
Showing 5 changed files with 14 additions and 48 deletions.
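With the special case removed, Vietnamese input reaches the tokenizer through the same raw-text path as every other language. A minimal usage sketch, assuming a downloaded Vietnamese tokenizer model; the sample sentence is only illustrative:

import stanza

# Vietnamese no longer needs paragraph chunking or dummy character labels;
# the tokenize processor hands the raw text straight to its DataLoader.
stanza.download('vi')
nlp = stanza.Pipeline(lang='vi', processors='tokenize')
doc = nlp('Xin chào thế giới. Hôm nay trời đẹp.')
for sentence in doc.sentences:
    print([token.text for token in sentence.tokens])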
7 changes: 1 addition & 6 deletions stanza/models/tokenization/vocab.py
@@ -24,12 +24,7 @@ def build_vocab(self):
 
     def normalize_unit(self, unit):
         # Normalize minimal units used by the tokenizer
-        # For Vietnamese this means a syllable, for other languages this means a character
-        normalized = unit
-        if self.lang.startswith('vi'):
-            normalized = normalized.lstrip()
-
-        return normalized
+        return unit
 
     def normalize_token(self, token):
         token = SPACE_RE.sub(' ', token.lstrip())
10 changes: 1 addition & 9 deletions stanza/pipeline/tokenize_processor.py
@@ -11,7 +11,6 @@
 from stanza.pipeline._constants import *
 from stanza.pipeline.processor import UDProcessor, register_processor
 from stanza.pipeline.registry import PROCESSOR_VARIANTS
-from stanza.utils.datasets.postprocess_vietnamese_tokenizer_data import paras_to_chunks
 from stanza.models.common import doc
 from stanza.pipeline.external.jieba import JiebaTokenizer
 from stanza.pipeline.external.spacy import SpacyTokenizer
@@ -79,14 +78,7 @@ def process(self, document):
 
         raw_text = '\n\n'.join(document) if isinstance(document, list) else document
         # set up batches
-        if self.config.get('lang') == 'vi':
-            # special processing is due for Vietnamese
-            text = '\n\n'.join([x.rstrip() for x in NEWLINE_WHITESPACE_RE.split(raw_text)]).rstrip()
-            dummy_labels = '\n\n'.join(['0' * len(x) for x in text.split('\n\n')])
-            data = paras_to_chunks(text, dummy_labels)
-            batches = DataLoader(self.config, input_data=data, vocab=self.vocab, evaluation=True)
-        else:
-            batches = DataLoader(self.config, input_text=raw_text, vocab=self.vocab, evaluation=True)
+        batches = DataLoader(self.config, input_text=raw_text, vocab=self.vocab, evaluation=True)
         # get dict data
         _, _, _, document = output_predictions(None, self.trainer, batches, self.vocab, None,
                                                self.config.get('max_seqlen', TokenizeProcessor.MAX_SEQ_LENGTH_DEFAULT),
8 changes: 1 addition & 7 deletions stanza/utils/datasets/prepare_tokenizer_treebank.py
@@ -31,7 +31,6 @@
 import tempfile
 
 import stanza.utils.datasets.common as common
-import stanza.utils.datasets.postprocess_vietnamese_tokenizer_data as postprocess_vietnamese_tokenizer_data
 import stanza.utils.datasets.prepare_tokenizer_data as prepare_tokenizer_data
 import stanza.utils.datasets.preprocess_ssj_data as preprocess_ssj_data
 
@@ -142,11 +141,6 @@ def prepare_dataset_labels(input_txt, input_conllu, tokenizer_dir, short_name, s
                                  "-o", f"{tokenizer_dir}/{short_name}-ud-{dataset}.toklabels",
                                  "-m", mwt_name(tokenizer_dir, short_name, dataset)])
 
-    if short_language == "vi":
-        postprocess_vietnamese_tokenizer_data.main([input_txt,
-                                                    "--char_level_pred", f"{tokenizer_dir}/{short_name}-ud-{dataset}.toklabels",
-                                                    "-o", f"{tokenizer_dir}/{short_name}-ud-{dataset}.json"])
-
 MWT_RE = re.compile("^[0-9]+[-][0-9]+")
 
 def strip_mwt_from_sentences(sents):
@@ -647,7 +641,7 @@ def process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_l
     """
     Process a normal UD treebank with train/dev/test splits
-    SL-SSJ and Vietnamese both use this code path as well.
+    SL-SSJ, the combined datasets, etc all use this code path as well.
     """
     prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "train", augment, prepare_labels)
     prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "dev", augment, prepare_labels)
8 changes: 2 additions & 6 deletions stanza/utils/training/run_ete.py
@@ -66,12 +66,8 @@ def run_ete(paths, dataset, short_name, command_args, extra_args):
     # TOKENIZE step
     # the raw data to process starts in tokenize_dir
     # retokenize it using the saved model
-    if short_language == 'vi':
-        tokenizer_type = "--json_file"
-        tokenizer_file = f"{tokenize_dir}/{test_short_name}-ud-{dataset}.json"
-    else:
-        tokenizer_type = "--txt_file"
-        tokenizer_file = f"{tokenize_dir}/{test_short_name}.{dataset}.txt"
+    tokenizer_type = "--txt_file"
+    tokenizer_file = f"{tokenize_dir}/{test_short_name}.{dataset}.txt"
 
     tokenizer_output = f"{ete_dir}/{short_name}.{dataset}.tokenizer.conllu"
 
29 changes: 9 additions & 20 deletions stanza/utils/training/run_tokenizer.py
@@ -31,26 +31,15 @@ def run_treebank(mode, paths, treebank, short_name,
     tokenize_dir = paths["TOKENIZE_DATA_DIR"]
 
     short_language = short_name.split("_")[0]
-    if short_language == 'vi':
-        label_type = "--json_file"
-        label_file = f"{tokenize_dir}/{short_name}-ud-train.json"
-        dev_type = "--json_file"
-        dev_file = f"{tokenize_dir}/{short_name}-ud-dev.json"
-        test_type = "--json_file"
-        test_file = f"{tokenize_dir}/{short_name}-ud-test.json"
-        train_type = "--txt_file"
-        train_file = f"{tokenize_dir}/{short_name}.train.txt"
-        train_dev_args = ["--dev_json_file", dev_file]
-    else:
-        label_type = "--label_file"
-        label_file = f"{tokenize_dir}/{short_name}-ud-train.toklabels"
-        dev_type = "--txt_file"
-        dev_file = f"{tokenize_dir}/{short_name}.dev.txt"
-        test_type = "--txt_file"
-        test_file = f"{tokenize_dir}/{short_name}.test.txt"
-        train_type = "--txt_file"
-        train_file = f"{tokenize_dir}/{short_name}.train.txt"
-        train_dev_args = ["--dev_txt_file", dev_file, "--dev_label_file", f"{tokenize_dir}/{short_name}-ud-dev.toklabels"]
+    label_type = "--label_file"
+    label_file = f"{tokenize_dir}/{short_name}-ud-train.toklabels"
+    dev_type = "--txt_file"
+    dev_file = f"{tokenize_dir}/{short_name}.dev.txt"
+    test_type = "--txt_file"
+    test_file = f"{tokenize_dir}/{short_name}.test.txt"
+    train_type = "--txt_file"
+    train_file = f"{tokenize_dir}/{short_name}.train.txt"
+    train_dev_args = ["--dev_txt_file", dev_file, "--dev_label_file", f"{tokenize_dir}/{short_name}-ud-dev.toklabels"]
 
     if short_language == "zh" or short_language.startswith("zh-"):
         extra_args = ["--skip_newline"] + extra_args
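With the json branch gone, a Vietnamese treebank is trained from the same toklabels and txt files as any other language, which is why the json files can eventually be dropped. A sketch of the file names the unified code resolves, using the hypothetical short name vi_vtb and a placeholder tokenize_dir (both are assumptions for illustration, not values from this diff):

# Hypothetical values; the real ones come from paths["TOKENIZE_DATA_DIR"]
# and the treebank short name passed to run_treebank.
tokenize_dir = "data/tokenize"
short_name = "vi_vtb"

# The same layout is used for every language, Vietnamese included:
label_file = f"{tokenize_dir}/{short_name}-ud-train.toklabels"  # data/tokenize/vi_vtb-ud-train.toklabels
train_file = f"{tokenize_dir}/{short_name}.train.txt"           # data/tokenize/vi_vtb.train.txt
dev_file = f"{tokenize_dir}/{short_name}.dev.txt"               # data/tokenize/vi_vtb.dev.txt
test_file = f"{tokenize_dir}/{short_name}.test.txt"             # data/tokenize/vi_vtb.test.txt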
