Skip to content

Commit

Permalink
Bugfix tests hf tokenizers (#897)
Browse files Browse the repository at this point in the history
1. The test previously marked as xfail no longer fails (cloning now works),
   so the xfail marker is removed.
2. The delimiter needs to be " " for tokenizers v0.13; other delimiters do
   not seem to work.
  • Loading branch information
BenjaminBossan committed Sep 27, 2022
1 parent 078f5c5 commit 57ea797
Showing 1 changed file with 8 additions and 3 deletions.
11 changes: 8 additions & 3 deletions skorch/tests/test_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,14 +290,20 @@ def test_set_params(self, data):
tokenizer.set_params(
model__dropout=0.123,
trainer__vocab_size=123,
- pre_tokenizer__delimiter='*',
max_length=456,
+ # With v0.13 of tokenizers, it seems like delimiter always needs to
+ # be " ", otherwise this error is raised: Error while attempting to
+ # unpickle Tokenizer: data did not match any variant of untagged
+ # enum ModelWrapper at line 1 column 2586. So we cannot change its
+ # value in this test but we should still ensure that set_params
+ # doesn't fail, so we keep it.
+ pre_tokenizer__delimiter=' ',
)
tokenizer.fit(data)

assert tokenizer.tokenizer_.model.dropout == pytest.approx(0.123)
assert len(tokenizer.vocabulary_) == pytest.approx(123, abs=5)
- assert tokenizer.tokenizer_.pre_tokenizer.delimiter == '*'
+ assert tokenizer.tokenizer_.pre_tokenizer.delimiter == ' '
assert tokenizer.max_length == 456


Expand Down Expand Up @@ -379,7 +385,6 @@ def tokenizer(self, tokenizer_not_fitted, data):
def test_fixed_vocabulary(self, tokenizer):
assert tokenizer.fixed_vocabulary_ is False

- @pytest.mark.xfail
def test_clone(self, tokenizer):
# This might get fixed in a future release of tokenizers
# https://github.com/huggingface/tokenizers/issues/941
Expand Down

0 comments on commit 57ea797

Please sign in to comment.