Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add NLTK Moses tokenizer #58

Merged
merged 4 commits into from
Jul 10, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ torchtext.egg-info/
*/**/*.pyc
*/**/*~
*~
.cache
4 changes: 4 additions & 0 deletions build_tools/travis/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ pip install -r requirements.txt
# Install test-only dependencies; skipped entirely when SKIP_TESTS is set,
# so doc/lint-only CI jobs avoid the large model and data downloads below.
if [[ "$SKIP_TESTS" != "true" ]]; then
# SpaCy English models
# (backs the "spacy" option of torchtext.data.get_tokenizer)
python -m spacy download en

# NLTK data needed for Moses tokenizer
# (perluniprops + nonbreaking_prefixes corpora; MosesTokenizer raises
# LookupError at construction time without them)
python -m nltk.downloader perluniprops nonbreaking_prefixes

# PyTorch
# NOTE(review): installed from the "soumith" conda channel — presumably
# predates the official pytorch channel; confirm before reuse.
conda install --yes pytorch torchvision -c soumith
fi
18 changes: 0 additions & 18 deletions test/data/test_field.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from unittest import TestCase

import six
import torchtext.data as data


Expand Down Expand Up @@ -88,20 +87,3 @@ def test_pad(self):
field = data.Field(init_token="<bos>", eos_token="<eos>",
sequential=False, include_lengths=True)
assert field.pad(minibatch) == minibatch

def test_get_tokenizer(self):
# Test the default case with str.split
assert data.get_tokenizer(str.split) == str.split
test_str = "A string, particularly one with slightly complex punctuation."
assert data.get_tokenizer(str.split)(test_str) == str.split(test_str)

# Test SpaCy option, and verify it properly handles punctuation.
assert data.get_tokenizer("spacy")(six.text_type(test_str)) == [
"A", "string", ",", "particularly", "one", "with", "slightly",
"complex", "punctuation", "."]

# Test that errors are raised for invalid input arguments.
with self.assertRaises(ValueError):
data.get_tokenizer(1)
with self.assertRaises(ValueError):
data.get_tokenizer("some other string")
33 changes: 33 additions & 0 deletions test/data/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from unittest import TestCase

import six
import torchtext.data as data


class TestUtils(TestCase):
    """Tests for the helpers in torchtext.data.utils."""

    def test_get_tokenizer(self):
        """get_tokenizer accepts a callable or one of the known string names."""
        # A callable passed in is handed straight back.
        self.assertEqual(data.get_tokenizer(str.split), str.split)
        sample = "A string, particularly one with slightly complex punctuation."
        self.assertEqual(data.get_tokenizer(str.split)(sample), str.split(sample))

        # Both real tokenizers should split punctuation off the words.
        expected = [
            "A", "string", ",", "particularly", "one", "with", "slightly",
            "complex", "punctuation", "."]

        # SpaCy option (requires a unicode string on Python 2).
        self.assertEqual(data.get_tokenizer("spacy")(six.text_type(sample)), expected)

        # Moses option. Test strings taken from NLTK doctests.
        # Note that internally, MosesTokenizer converts to unicode if applicable
        tokenize = data.get_tokenizer("moses")
        self.assertEqual(tokenize(sample), expected)

        # Nonbreaking prefixes should tokenize the final period.
        self.assertEqual(tokenize(six.text_type("abc def.")), ["abc", "def", "."])

        # Anything that is neither callable nor a known name is rejected.
        for bad_arg in (1, "some other string"):
            with self.assertRaises(ValueError):
                data.get_tokenizer(bad_arg)
4 changes: 2 additions & 2 deletions torchtext/data/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ def fromtree(cls, data, fields, subtrees=False):
try:
from nltk.tree import Tree
except ImportError:
print('''Please install NLTK:
$ pip install nltk''')
print("Please install NLTK. "
"See the docs at http://nltk.org for more information.")
raise
tree = Tree.fromstring(data)
if subtrees:
Expand Down
22 changes: 18 additions & 4 deletions torchtext/data/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
def get_tokenizer(tokenizer):
if callable(tokenizer):
return tokenizer
if tokenizer == 'spacy':
if tokenizer == "spacy":
try:
import spacy
spacy_en = spacy.load('en')
Expand All @@ -14,10 +14,24 @@ def get_tokenizer(tokenizer):
print("Please install SpaCy and the SpaCy English tokenizer. "
"See the docs at https://spacy.io for more information.")
raise
elif tokenizer == "moses":
try:
from nltk.tokenize.moses import MosesTokenizer
moses_tokenizer = MosesTokenizer()
return moses_tokenizer.tokenize
except ImportError:
print("Please install NLTK. "
"See the docs at http://nltk.org for more information.")
raise
except LookupError:
print("Please install the necessary NLTK corpora. "
"See the docs at http://nltk.org for more information.")
raise
raise ValueError("Requested tokenizer {}, valid choices are a "
"callable that takes a single string as input "
"and \"spacy\" for the SpaCy English "
"tokenizer.".format(tokenizer))
"callable that takes a single string as input, "
"\"spacy\" for the SpaCy English tokenizer, or "
"\"moses\" for the NLTK port of the Moses tokenization "
"script.".format(tokenizer))


def interleave_keys(a, b):
Expand Down