Merge remote-tracking branch 'upstream/dev' into pattern-sentiment-assessments
jeffakolb committed Nov 30, 2017
2 parents 2275489 + 7763b31 commit 7b1ffb4
Showing 13 changed files with 108 additions and 49 deletions.
35 changes: 21 additions & 14 deletions .travis.yml
@@ -1,20 +1,27 @@
 language: python
 sudo: false
 python:
-- "2.7"
-- "3.4"
-- "3.5"
-- "3.6"
+- "2.7"
+- "3.4"
+- "3.5"
+- "3.6"
 before_install:
-- "wget https://s3.amazonaws.com/textblob/nltk_data-0.11.0.tar.gz"
-- "tar -xzvf nltk_data-0.11.0.tar.gz -C ~"
-# Install dependencies
+- wget https://s3.amazonaws.com/textblob/nltk_data-0.11.0.tar.gz
+- tar -xzvf nltk_data-0.11.0.tar.gz -C ~
 install:
-- "pip install numpy"
-- "pip install -U ."
-- if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then pip install -r docs/requirements.txt; fi
-# Run tests
+- pip install numpy
+- pip install -U six
+- pip install -U .
+- if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then pip install -r docs/requirements.txt;
+  fi
 script:
-- "python run_tests.py"
-# Run doctests against py34
-- if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then cd docs && make doctest; fi
+- python run_tests.py
+- if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then cd docs && make doctest; fi
+deploy:
+  provider: pypi
+  user: sloria
+  password:
+    secure: aPoSh6zkeB6PnS77fmoeT/PzB/oeE7aM0g9ZrPd19ZwC5aORtF7/ifDfzYwYWhdyua4fLAzaEu3Z+pk5z644r1Zq8Jxryv18LeFzkzO/Sk/O9LxpJQ+ypbTIIK9Oc5LdQ0qCd5L3RtMV3zIvocvnpryVmkAm/vYBm77rCBFcMxg=
+  on:
+    tags: true
+  distributions: sdist bdist_wheel
3 changes: 3 additions & 0 deletions AUTHORS.rst
@@ -23,4 +23,7 @@ Contributors (chronological)
- Adrián López Calvo `@AdrianLC <https://github.com/AdrianLC>`_
- Nitish Kulshrestha `@nitkul <https://github.com/nitkul>`_
- Jhon Eslava `@EpicJhon <https://github.com/EpicJhon>`_
- `@jcalbert <https://github.com/jcalbert>`_
- Tyler James Harden `@tylerjharden <https://github.com/tylerjharden>`_
- `@pavelmalai <https://github.com/pavelmalai>`_
- Jeff Kolb `@jeffakolb <https://github.com/jeffakolb>`_
26 changes: 26 additions & 0 deletions CHANGELOG.rst
@@ -1,6 +1,32 @@
Changelog
=========

0.14.0 (2017-11-20)
-------------------

Features:

- Use specified tokenizer when tagging (:issue:`167`).
  Thanks :user:`jschnurr` for the PR.
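
  A minimal sketch of the new behavior, mirroring the test added to
  tests/test_blob.py in this commit (assumes the NLTK data is installed):

      import nltk
      import textblob as tb

      # The blob's tokenizer is now forwarded to the POS tagger.
      tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
      blob = tb.TextBlob("Good muffins cost $3.88\nin New York.", tokenizer=tokenizer)
      print(blob.tags)  # WordPunctTokenizer splits $3.88 into '3' and '88'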

0.13.1 (2017-11-11)
-------------------

Bug fixes:

- Avoid AttributeError when using pattern's sentiment analyzer
  (:issue:`178`). Thanks :user:`tylerjharden` for the catch and patch.
- Correctly pass ``format`` argument to ``NLTKClassifier.accuracy``
  (:issue:`177`). Thanks :user:`pavelmalai` for the catch and patch.
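
  With the fix, the ``format`` argument actually reaches the file reader; a
  sketch (``test.csv`` is a hypothetical file of text,label rows and ``cl`` a
  trained classifier):

      with open('test.csv') as fp:
          print(cl.accuracy(fp, format='csv'))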

0.13.0 (2017-08-15)
-------------------

Features:

- Performance improvements to `NaiveBayesClassifier` (:issue:`63`, :issue:`77`,
  :issue:`123`). Thanks :user:`jcalbert` for the PR.
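
  For reference, typical usage; the speedup comes from caching the training
  vocabulary, as in the classifiers.py change below:

      from textblob.classifiers import NaiveBayesClassifier

      train = [('I love this sandwich.', 'pos'),
               ('I do not like this restaurant.', 'neg')]
      cl = NaiveBayesClassifier(train)
      print(cl.classify('This is an amazing library!'))  # 'pos'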

0.12.0 (2017-02-27)
-------------------

4 changes: 2 additions & 2 deletions README.rst
@@ -2,11 +2,11 @@
 TextBlob: Simplified Text Processing
 ====================================
 
-.. image:: https://badge.fury.io/py/textblob.png
+.. image:: https://badge.fury.io/py/textblob.svg
    :target: http://badge.fury.io/py/textblob
    :alt: Latest version
 
-.. image:: https://travis-ci.org/sloria/TextBlob.png?branch=master
+.. image:: https://travis-ci.org/sloria/TextBlob.svg?branch=master
    :target: https://travis-ci.org/sloria/TextBlob
    :alt: Travis-CI
11 changes: 0 additions & 11 deletions tasks.py
@@ -49,14 +49,3 @@ def readme(ctx, browse=False):
 def doctest(ctx):
     os.chdir(docs_dir)
     ctx.run("make doctest")
-
-@task
-def publish(ctx, test=False):
-    """Publish to the cheeseshop."""
-    clean(ctx)
-    if test:
-        ctx.run('python setup.py register -r test sdist bdist_wheel', echo=True)
-        ctx.run('twine upload dist/* -r test', echo=True)
-    else:
-        ctx.run('python setup.py register sdist bdist_wheel', echo=True)
-        ctx.run('twine upload dist/*', echo=True)
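
Publishing is presumably superseded by the Travis ``deploy`` section added in
.travis.yml above, which uploads the sdist and wheel to PyPI on tagged builds.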
14 changes: 14 additions & 0 deletions tests/test_blob.py
@@ -752,6 +752,20 @@ def test_tokenize_method(self):
        # Pass in the TabTokenizer
        assert_equal(blob.tokenize(tokenizer), tb.WordList(["This is", "text."]))

    def test_tags_uses_custom_tokenizer(self):
        tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
        blob = tb.TextBlob("Good muffins cost $3.88\nin New York.", tokenizer=tokenizer)
        assert_equal(blob.tags, [(u'Good', u'JJ'), (u'muffins', u'NNS'), (u'cost', u'VBP'),
            (u'3', u'CD'), (u'88', u'CD'), (u'in', u'IN'), (u'New', u'NNP'), (u'York', u'NNP')])

    def test_tags_with_custom_tokenizer_and_tagger(self):
        tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
        tagger = tb.taggers.PatternTagger()
        blob = tb.TextBlob("Good muffins cost $3.88\nin New York.", tokenizer=tokenizer, pos_tagger=tagger)
        # PatternTagger takes raw text (not tokens), and handles tokenization itself.
        assert_equal(blob.tags, [(u'Good', u'JJ'), (u'muffins', u'NNS'), (u'cost', u'NN'),
            (u'3.88', u'CD'), (u'in', u'IN'), (u'New', u'NNP'), (u'York', u'NNP')])

    @mock.patch('textblob.translate.Translator.translate')
    def test_translate(self, mock_translate):
        mock_translate.return_value = 'Esta es una frase.'
4 changes: 2 additions & 2 deletions tests/test_translate.py
@@ -81,7 +81,7 @@ def test_detect_non_ascii(self):
     def test_translate_spaces(self):
         es_text = "Hola, me llamo Adrián! Cómo estás? Yo bien"
         to_en = self.translator.translate(es_text, from_lang="es", to_lang="en")
-        assert_equal(to_en, "Hello, my name is Adrian! How are you? I am good")
+        assert_equal(to_en, "Hi, my name is Adrián! How are you? I am good")
 
     def test_translate_missing_from_language_auto_detects(self):
         text = "Ich hole das Bier"
@@ -99,7 +99,7 @@ def test_translate_text(self):
     def test_translate_non_ascii(self):
         text = "ذات سيادة كاملة"
         translated = self.translator.translate(text, from_lang='ar', to_lang='en')
-        assert_equal(translated, "With full sovereignty")
+        assert_equal(translated, "Fully sovereign")
 
         text2 = "美丽比丑陋更好"
         translated = self.translator.translate(text2, from_lang="zh-CN", to_lang='en')
5 changes: 2 additions & 3 deletions textblob/__init__.py
@@ -1,13 +1,12 @@
 import os
+from .blob import TextBlob, Word, Sentence, Blobber, WordList
 
-__version__ = '0.12.0'
+__version__ = '0.14.0'
 __license__ = 'MIT'
 __author__ = 'Steven Loria'
 
 PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__))
 
-from .blob import TextBlob, Word, Sentence, Blobber, WordList
-
 __all__ = [
     'TextBlob',
     'Word',
2 changes: 1 addition & 1 deletion textblob/_text.py
@@ -815,7 +815,7 @@ def avg(assessments, weighted=lambda w: 1):
         # A synset id.
         # Sentiment("a-00193480") => horrible => (-0.6, 1.0) (English WordNet)
         # Sentiment("c_267") => verschrikkelijk => (-0.9, 1.0) (Dutch Cornetto)
-        elif isinstance(s, basestring) and RE_SYNSET.match(s):
+        elif isinstance(s, basestring) and RE_SYNSET.match(s) and hasattr(s, "synonyms"):
             a = [(s.synonyms[0],) + self.synset(s.id, pos=s.pos) + (None,)]
         # A string of words.
         # Sentiment("a horrible movie") => (-0.6, 1.0)
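
The extra ``hasattr`` guard keeps plain strings that merely look like synset
ids away from ``s.synonyms``; a quick sentiment sketch (expected values taken
from the comments above, using the default pattern analyzer):

    from textblob import TextBlob

    print(TextBlob("a horrible movie").sentiment)
    # roughly Sentiment(polarity=-0.6, subjectivity=1.0), per the comment above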
2 changes: 1 addition & 1 deletion textblob/base.py
@@ -22,7 +22,7 @@ class BaseTagger(with_metaclass(ABCMeta)):
     @abstractmethod
     def tag(self, text, tokenize=True):
         """Return a list of tuples of the form (word, tag)
-        for a given set of text.
+        for a given set of text or BaseBlob instance.
         """
         return
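
Concretely, a custom tagger should now tolerate either input; a minimal
sketch (WhitespaceTagger is hypothetical, not part of the library):

    from textblob.base import BaseTagger

    class WhitespaceTagger(BaseTagger):
        """Toy tagger: split on whitespace, tag everything as NN."""
        def tag(self, text, tokenize=True):
            raw = getattr(text, 'raw', text)  # accept a str or a BaseBlob
            return [(word, 'NN') for word in raw.split()]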

9 changes: 6 additions & 3 deletions textblob/blob.py
@@ -471,9 +471,12 @@ def pos_tags(self):
 
         :rtype: list of tuples
         """
-        return [(Word(word, pos_tag=t), unicode(t))
-                for word, t in self.pos_tagger.tag(self.raw)
-                if not PUNCTUATION_REGEX.match(unicode(t))]
+        if isinstance(self, TextBlob):
+            return [val for sublist in [s.pos_tags for s in self.sentences] for val in sublist]
+        else:
+            return [(Word(word, pos_tag=t), unicode(t))
+                    for word, t in self.pos_tagger.tag(self)
+                    if not PUNCTUATION_REGEX.match(unicode(t))]
 
     tags = pos_tags
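
In effect a TextBlob now tags itself sentence by sentence and concatenates
the results, so the following should hold (a sketch, with textblob imported
as tb):

    blob = tb.TextBlob("Simple is better. Complex is worse.")
    assert blob.tags == [tag for s in blob.sentences for tag in s.tags]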

23 changes: 19 additions & 4 deletions textblob/classifiers.py
@@ -76,9 +76,22 @@ def basic_extractor(document, train_set):
     :param document: The text to extract features from. Can be a string or an iterable.
     :param list train_set: Training data set, a list of tuples of the form
-        ``(words, label)``.
+        ``(words, label)`` OR an iterable of strings.
     """
-    word_features = _get_words_from_dataset(train_set)
+
+    try:
+        el_zero = next(iter(train_set))  # Infer input from first element.
+    except StopIteration:
+        return {}
+    if isinstance(el_zero, basestring):
+        word_features = [w for w in chain([el_zero], train_set)]
+    else:
+        try:
+            assert(isinstance(el_zero[0], basestring))
+            word_features = _get_words_from_dataset(chain([el_zero], train_set))
+        except:
+            raise ValueError('train_set is probably malformed.')
+
     tokens = _get_document_tokens(document)
     features = dict(((u'contains({0})'.format(word), (word in tokens))
                      for word in word_features))
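
With this change the extractor also accepts a bare vocabulary in place of a
training set; a small sketch:

    from textblob.classifiers import basic_extractor

    print(basic_extractor("I love this sandwich", ["love", "hate"]))
    # {'contains(love)': True, 'contains(hate)': False}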
@@ -123,6 +136,7 @@ def __init__(self, train_set, feature_extractor=basic_extractor, format=None, **kwargs):
             self.train_set = self._read_data(train_set, format)
         else:  # train_set is a list of tuples
             self.train_set = train_set
+        self._word_set = _get_words_from_dataset(self.train_set)  # Keep a hidden set of unique words.
         self.train_features = None
 
     def _read_data(self, dataset, format=None):
@@ -166,7 +180,7 @@ def extract_features(self, text):
         '''
         # Feature extractor may take one or two arguments
         try:
-            return self.feature_extractor(text, self.train_set)
+            return self.feature_extractor(text, self._word_set)
         except (TypeError, AttributeError):
             return self.feature_extractor(text)
@@ -246,7 +260,7 @@ def accuracy(self, test_set, format=None):
             file format.
         """
         if is_filelike(test_set):
-            test_data = self._read_data(test_set)
+            test_data = self._read_data(test_set, format)
         else:  # test_set is a list of tuples
             test_data = test_set
         test_features = [(self.extract_features(d), c) for d, c in test_data]
@@ -260,6 +274,7 @@ def update(self, new_data, *args, **kwargs):
             ``(text, label)``.
         """
         self.train_set += new_data
+        self._word_set.update(_get_words_from_dataset(new_data))
         self.train_features = [(self.extract_features(d), c)
                                for d, c in self.train_set]
         try:
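
Calling code is unchanged; ``update`` simply keeps the cached vocabulary in
sync (a sketch, reusing ``cl`` from the changelog example above):

    cl.update([('Your service was terrible.', 'neg')])
    print(cl.classify('The service was terrible'))  # likely 'neg'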
19 changes: 11 additions & 8 deletions textblob/en/taggers.py
@@ -3,10 +3,11 @@
 from __future__ import absolute_import
 
 import nltk
+import textblob.compat
 
+import textblob as tb
 from textblob.en import tag as pattern_tag
 from textblob.decorators import requires_nltk_corpus
-from textblob.tokenizers import word_tokenize
 from textblob.base import BaseTagger


@@ -17,7 +18,9 @@ class PatternTagger(BaseTagger):
     """
 
     def tag(self, text, tokenize=True):
-        """Tag a string `text`."""
+        """Tag a string or BaseBlob."""
+        if not isinstance(text, textblob.compat.text_type):
+            text = text.raw
         return pattern_tag(text, tokenize)


@@ -27,9 +30,9 @@ class NLTKTagger(BaseTagger):
     """
 
     @requires_nltk_corpus
-    def tag(self, text, tokenize=True):
-        """Tag a string `text`."""
-        if tokenize:
-            text = list(word_tokenize(text))
-        tagged = nltk.tag.pos_tag(text)
-        return tagged
+    def tag(self, text):
+        """Tag a string or BaseBlob."""
+        if isinstance(text, textblob.compat.text_type):
+            text = tb.TextBlob(text)
+
+        return nltk.tag.pos_tag(text.tokens)
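
Together with the blob.py change above, either tagger now accepts a blob or a
plain string; a short sketch mirroring the new tests:

    import textblob as tb

    blob = tb.TextBlob("Good muffins cost $3.88\nin New York.",
                       pos_tagger=tb.taggers.PatternTagger())
    print(blob.tags)  # PatternTagger receives the raw text and tokenizes it itself
    print(tb.taggers.NLTKTagger().tag("Simple is better than complex."))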
