
Commit 3ccb132
Remove specific vi stuff. The json files in the tokenizer can probably go away as well
AngledLuffa committed Mar 27, 2021
1 parent 53fabf5 commit 3ccb132
Showing 5 changed files with 14 additions and 48 deletions.
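With the special case removed, Vietnamese input reaches the tokenizer through the same raw-text path as every other language. A minimal usage sketch, assuming a downloaded Vietnamese tokenizer model; the sample sentence is only illustrative:

import stanza

# Vietnamese no longer needs paragraph chunking or dummy character labels;
# the tokenize processor hands the raw text straight to its DataLoader.
stanza.download('vi')
nlp = stanza.Pipeline(lang='vi', processors='tokenize')
doc = nlp('Xin chào thế giới. Hôm nay trời đẹp.')
for sentence in doc.sentences:
    print([token.text for token in sentence.tokens])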
7 changes: 1 addition & 6 deletions stanza/models/tokenization/vocab.py
@@ -24,12 +24,7 @@ def build_vocab(self):
 
     def normalize_unit(self, unit):
         # Normalize minimal units used by the tokenizer
-        # For Vietnamese this means a syllable, for other languages this means a character
-        normalized = unit
-        if self.lang.startswith('vi'):
-            normalized = normalized.lstrip()
-
-        return normalized
+        return unit
 
     def normalize_token(self, token):
         token = SPACE_RE.sub(' ', token.lstrip())
10 changes: 1 addition & 9 deletions stanza/pipeline/tokenize_processor.py
@@ -11,7 +11,6 @@
 from stanza.pipeline._constants import *
 from stanza.pipeline.processor import UDProcessor, register_processor
 from stanza.pipeline.registry import PROCESSOR_VARIANTS
-from stanza.utils.datasets.postprocess_vietnamese_tokenizer_data import paras_to_chunks
 from stanza.models.common import doc
 from stanza.pipeline.external.jieba import JiebaTokenizer
 from stanza.pipeline.external.spacy import SpacyTokenizer
@@ -79,14 +78,7 @@ def process(self, document):
 
         raw_text = '\n\n'.join(document) if isinstance(document, list) else document
         # set up batches
-        if self.config.get('lang') == 'vi':
-            # special processing is due for Vietnamese
-            text = '\n\n'.join([x.rstrip() for x in NEWLINE_WHITESPACE_RE.split(raw_text)]).rstrip()
-            dummy_labels = '\n\n'.join(['0' * len(x) for x in text.split('\n\n')])
-            data = paras_to_chunks(text, dummy_labels)
-            batches = DataLoader(self.config, input_data=data, vocab=self.vocab, evaluation=True)
-        else:
-            batches = DataLoader(self.config, input_text=raw_text, vocab=self.vocab, evaluation=True)
+        batches = DataLoader(self.config, input_text=raw_text, vocab=self.vocab, evaluation=True)
         # get dict data
         _, _, _, document = output_predictions(None, self.trainer, batches, self.vocab, None,
                                                self.config.get('max_seqlen', TokenizeProcessor.MAX_SEQ_LENGTH_DEFAULT),
8 changes: 1 addition & 7 deletions stanza/utils/datasets/prepare_tokenizer_treebank.py
@@ -31,7 +31,6 @@
 import tempfile
 
 import stanza.utils.datasets.common as common
-import stanza.utils.datasets.postprocess_vietnamese_tokenizer_data as postprocess_vietnamese_tokenizer_data
 import stanza.utils.datasets.prepare_tokenizer_data as prepare_tokenizer_data
 import stanza.utils.datasets.preprocess_ssj_data as preprocess_ssj_data
 
@@ -142,11 +141,6 @@ def prepare_dataset_labels(input_txt, input_conllu, tokenizer_dir, short_name, s
                                  "-o", f"{tokenizer_dir}/{short_name}-ud-{dataset}.toklabels",
                                  "-m", mwt_name(tokenizer_dir, short_name, dataset)])
 
-    if short_language == "vi":
-        postprocess_vietnamese_tokenizer_data.main([input_txt,
-                                                    "--char_level_pred", f"{tokenizer_dir}/{short_name}-ud-{dataset}.toklabels",
-                                                    "-o", f"{tokenizer_dir}/{short_name}-ud-{dataset}.json"])
-
 MWT_RE = re.compile("^[0-9]+[-][0-9]+")
 
 def strip_mwt_from_sentences(sents):
@@ -647,7 +641,7 @@ def process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_l
     """
     Process a normal UD treebank with train/dev/test splits
-    SL-SSJ and Vietnamese both use this code path as well.
+    SL-SSJ, the combined datasets, etc all use this code path as well.
     """
     prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "train", augment, prepare_labels)
     prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "dev", augment, prepare_labels)
8 changes: 2 additions & 6 deletions stanza/utils/training/run_ete.py
@@ -66,12 +66,8 @@ def run_ete(paths, dataset, short_name, command_args, extra_args):
     # TOKENIZE step
     # the raw data to process starts in tokenize_dir
     # retokenize it using the saved model
-    if short_language == 'vi':
-        tokenizer_type = "--json_file"
-        tokenizer_file = f"{tokenize_dir}/{test_short_name}-ud-{dataset}.json"
-    else:
-        tokenizer_type = "--txt_file"
-        tokenizer_file = f"{tokenize_dir}/{test_short_name}.{dataset}.txt"
+    tokenizer_type = "--txt_file"
+    tokenizer_file = f"{tokenize_dir}/{test_short_name}.{dataset}.txt"
 
     tokenizer_output = f"{ete_dir}/{short_name}.{dataset}.tokenizer.conllu"
 
29 changes: 9 additions & 20 deletions stanza/utils/training/run_tokenizer.py
@@ -31,26 +31,15 @@ def run_treebank(mode, paths, treebank, short_name,
     tokenize_dir = paths["TOKENIZE_DATA_DIR"]
 
     short_language = short_name.split("_")[0]
-    if short_language == 'vi':
-        label_type = "--json_file"
-        label_file = f"{tokenize_dir}/{short_name}-ud-train.json"
-        dev_type = "--json_file"
-        dev_file = f"{tokenize_dir}/{short_name}-ud-dev.json"
-        test_type = "--json_file"
-        test_file = f"{tokenize_dir}/{short_name}-ud-test.json"
-        train_type = "--txt_file"
-        train_file = f"{tokenize_dir}/{short_name}.train.txt"
-        train_dev_args = ["--dev_json_file", dev_file]
-    else:
-        label_type = "--label_file"
-        label_file = f"{tokenize_dir}/{short_name}-ud-train.toklabels"
-        dev_type = "--txt_file"
-        dev_file = f"{tokenize_dir}/{short_name}.dev.txt"
-        test_type = "--txt_file"
-        test_file = f"{tokenize_dir}/{short_name}.test.txt"
-        train_type = "--txt_file"
-        train_file = f"{tokenize_dir}/{short_name}.train.txt"
-        train_dev_args = ["--dev_txt_file", dev_file, "--dev_label_file", f"{tokenize_dir}/{short_name}-ud-dev.toklabels"]
+    label_type = "--label_file"
+    label_file = f"{tokenize_dir}/{short_name}-ud-train.toklabels"
+    dev_type = "--txt_file"
+    dev_file = f"{tokenize_dir}/{short_name}.dev.txt"
+    test_type = "--txt_file"
+    test_file = f"{tokenize_dir}/{short_name}.test.txt"
+    train_type = "--txt_file"
+    train_file = f"{tokenize_dir}/{short_name}.train.txt"
+    train_dev_args = ["--dev_txt_file", dev_file, "--dev_label_file", f"{tokenize_dir}/{short_name}-ud-dev.toklabels"]
 
     if short_language == "zh" or short_language.startswith("zh-"):
         extra_args = ["--skip_newline"] + extra_args
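With the json branch gone, a Vietnamese treebank is trained from the same toklabels and txt files as any other language, which is why the json files can eventually be dropped. A sketch of the file names the unified code resolves, using the hypothetical short name vi_vtb and a placeholder tokenize_dir (both are assumptions for illustration, not values from this diff):

# Hypothetical values; the real ones come from paths["TOKENIZE_DATA_DIR"]
# and the treebank short name passed to run_treebank.
tokenize_dir = "data/tokenize"
short_name = "vi_vtb"

# The same layout is used for every language, Vietnamese included:
label_file = f"{tokenize_dir}/{short_name}-ud-train.toklabels"  # data/tokenize/vi_vtb-ud-train.toklabels
train_file = f"{tokenize_dir}/{short_name}.train.txt"           # data/tokenize/vi_vtb.train.txt
dev_file = f"{tokenize_dir}/{short_name}.dev.txt"               # data/tokenize/vi_vtb.dev.txt
test_file = f"{tokenize_dir}/{short_name}.test.txt"             # data/tokenize/vi_vtb.test.txt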
