Fix #166 to use specified tokenizer when tagging. #167

Merged: 3 commits, Nov 21, 2017
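In short, a blob's .tags property now respects the tokenizer passed to the TextBlob constructor. A minimal sketch of the fixed behavior, mirroring the test_tags_uses_custom_tokenizer test added below:

from nltk.tokenize.regexp import WordPunctTokenizer
from textblob import TextBlob

# Before this change, .tags re-tokenized the raw text itself and
# silently ignored the tokenizer passed here (issue #166).
blob = TextBlob("Good muffins cost $3.88\nin New York.",
                tokenizer=WordPunctTokenizer())

# WordPunctTokenizer splits "$3.88", so the tags now include
# (u'3', u'CD') and (u'88', u'CD') as separate pairs.
print(blob.tags)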
14 changes: 14 additions & 0 deletions tests/test_blob.py
@@ -752,6 +752,20 @@ def test_tokenize_method(self):
         # Pass in the TabTokenizer
         assert_equal(blob.tokenize(tokenizer), tb.WordList(["This is", "text."]))

+    def test_tags_uses_custom_tokenizer(self):
+        tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
+        blob = tb.TextBlob("Good muffins cost $3.88\nin New York.", tokenizer=tokenizer)
+        assert_equal(blob.tags, [(u'Good', u'JJ'), (u'muffins', u'NNS'), (u'cost', u'VBP'), (
+            u'3', u'CD'), (u'88', u'CD'), (u'in', u'IN'), (u'New', u'NNP'), (u'York', u'NNP')])
+
+    def test_tags_with_custom_tokenizer_and_tagger(self):
+        tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
+        tagger = tb.taggers.PatternTagger()
+        blob = tb.TextBlob("Good muffins cost $3.88\nin New York.", tokenizer=tokenizer, pos_tagger=tagger)
+        # PatternTagger takes raw text (not tokens) and handles tokenization itself.
+        assert_equal(blob.tags, [(u'Good', u'JJ'), (u'muffins', u'NNS'), (u'cost', u'NN'),
+                                 (u'3.88', u'CD'), (u'in', u'IN'), (u'New', u'NNP'), (u'York', u'NNP')])
+
     @mock.patch('textblob.translate.Translator.translate')
     def test_translate(self, mock_translate):
         mock_translate.return_value = 'Esta es una frase.'
2 changes: 1 addition & 1 deletion textblob/base.py
@@ -22,7 +22,7 @@ class BaseTagger(with_metaclass(ABCMeta)):
     @abstractmethod
     def tag(self, text, tokenize=True):
         """Return a list of tuples of the form (word, tag)
-        for a given set of text.
+        for a given set of text or BaseBlob instance.
        """
        return

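The widened docstring spells out a contract for custom taggers: tag() may receive either a plain string or a BaseBlob. A hypothetical subclass sketching how to handle both (the class and its tagging rule are invented for illustration; textblob.compat.text_type is the string type this PR checks against):

from textblob.base import BaseTagger
from textblob.compat import text_type

class UppercaseTagger(BaseTagger):
    """Toy tagger: capitalized words get NNP, everything else NN."""

    def tag(self, text, tokenize=True):
        # Unwrap a BaseBlob into its raw string, as PatternTagger does below.
        if not isinstance(text, text_type):
            text = text.raw
        return [(word, 'NNP' if word[:1].isupper() else 'NN')
                for word in text.split()]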
9 changes: 6 additions & 3 deletions textblob/blob.py
@@ -458,9 +458,12 @@ def pos_tags(self):

         :rtype: list of tuples
         """
-        return [(Word(word, pos_tag=t), unicode(t))
-                for word, t in self.pos_tagger.tag(self.raw)
-                if not PUNCTUATION_REGEX.match(unicode(t))]
+        if isinstance(self, TextBlob):
+            return [val for sublist in [s.pos_tags for s in self.sentences] for val in sublist]
+        else:
+            return [(Word(word, pos_tag=t), unicode(t))
+                    for word, t in self.pos_tagger.tag(self)
+                    if not PUNCTUATION_REGEX.match(unicode(t))]

     tags = pos_tags
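A Sentence is not a TextBlob, so it takes the else branch and passes itself (not just its raw string) to the tagger; a TextBlob tags sentence by sentence and flattens the per-sentence results. The nested comprehension in the TextBlob branch is equivalent to this spelled-out form:

# Inside pos_tags, for the TextBlob case:
tags = []
for sentence in self.sentences:
    tags.extend(sentence.pos_tags)
return tags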
19 changes: 11 additions & 8 deletions textblob/en/taggers.py
@@ -3,10 +3,11 @@
 from __future__ import absolute_import

 import nltk
+import textblob.compat

+import textblob as tb
 from textblob.en import tag as pattern_tag
 from textblob.decorators import requires_nltk_corpus
-from textblob.tokenizers import word_tokenize
 from textblob.base import BaseTagger

@@ -17,7 +18,9 @@ class PatternTagger(BaseTagger):
     """

     def tag(self, text, tokenize=True):
-        """Tag a string `text`."""
+        """Tag a string or BaseBlob."""
+        if not isinstance(text, textblob.compat.text_type):
+            text = text.raw
         return pattern_tag(text, tokenize)

@@ -27,9 +30,9 @@ class NLTKTagger(BaseTagger):
     """

     @requires_nltk_corpus
-    def tag(self, text, tokenize=True):
-        """Tag a string `text`."""
-        if tokenize:
-            text = list(word_tokenize(text))
-        tagged = nltk.tag.pos_tag(text)
-        return tagged
+    def tag(self, text):
+        """Tag a string or BaseBlob."""
+        if isinstance(text, textblob.compat.text_type):
+            text = tb.TextBlob(text)
+
+        return nltk.tag.pos_tag(text.tokens)
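Net effect: NLTKTagger wraps bare strings in a TextBlob and tags text.tokens, so the tokens always come from the blob's own tokenizer rather than from the removed word_tokenize call. PatternTagger, by contrast, unwraps a blob back to raw text because pattern does its own tokenization, which is why the second test above still expects u'3.88' as a single token. A usage sketch (textblob.taggers re-exports these classes, as the tests use tb.taggers):

from nltk.tokenize.regexp import WordPunctTokenizer
from textblob import TextBlob
from textblob.taggers import NLTKTagger

blob = TextBlob("Good muffins cost $3.88\nin New York.",
                tokenizer=WordPunctTokenizer())

# tag() reads blob.tokens (produced by WordPunctTokenizer),
# not a re-tokenization of blob.raw.
print(NLTKTagger().tag(blob))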