Merge remote-tracking branch 'upstream/dev' into pattern-sentiment-assessments
jeffakolb committed Nov 30, 2017
2 parents 2275489 + 7763b31 commit 7b1ffb4
Showing 13 changed files with 108 additions and 49 deletions.
35 changes: 21 additions & 14 deletions .travis.yml
@@ -1,20 +1,27 @@
 language: python
 sudo: false
 python:
-- "2.7"
-- "3.4"
-- "3.5"
-- "3.6"
+- "2.7"
+- "3.4"
+- "3.5"
+- "3.6"
 before_install:
-- "wget https://s3.amazonaws.com/textblob/nltk_data-0.11.0.tar.gz"
-- "tar -xzvf nltk_data-0.11.0.tar.gz -C ~"
-# Install dependencies
+- wget https://s3.amazonaws.com/textblob/nltk_data-0.11.0.tar.gz
+- tar -xzvf nltk_data-0.11.0.tar.gz -C ~
 install:
-- "pip install numpy"
-- "pip install -U ."
-- if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then pip install -r docs/requirements.txt; fi
-# Run tests
+- pip install numpy
+- pip install -U six
+- pip install -U .
+- if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then pip install -r docs/requirements.txt;
+  fi
 script:
-- "python run_tests.py"
-# Run doctests against py34
-- if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then cd docs && make doctest; fi
+- python run_tests.py
+- if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then cd docs && make doctest; fi
+deploy:
+  provider: pypi
+  user: sloria
+  password:
+    secure: aPoSh6zkeB6PnS77fmoeT/PzB/oeE7aM0g9ZrPd19ZwC5aORtF7/ifDfzYwYWhdyua4fLAzaEu3Z+pk5z644r1Zq8Jxryv18LeFzkzO/Sk/O9LxpJQ+ypbTIIK9Oc5LdQ0qCd5L3RtMV3zIvocvnpryVmkAm/vYBm77rCBFcMxg=
+  on:
+    tags: true
+  distributions: sdist bdist_wheel
3 changes: 3 additions & 0 deletions AUTHORS.rst
@@ -23,4 +23,7 @@ Contributors (chronological)
- Adrián López Calvo `@AdrianLC <https://github.com/AdrianLC>`_
- Nitish Kulshrestha `@nitkul <https://github.com/nitkul>`_
- Jhon Eslava `@EpicJhon <https://github.com/EpicJhon>`_
- `@jcalbert <https://github.com/jcalbert>`_
- Tyler James Harden `@tylerjharden <https://github.com/tylerjharden>`_
- `@pavelmalai <https://github.com/pavelmalai>`_
- Jeff Kolb `@jeffakolb <https://github.com/jeffakolb>`_
26 changes: 26 additions & 0 deletions CHANGELOG.rst
@@ -1,6 +1,32 @@
Changelog
=========

0.14.0 (2017-11-20)
-------------------

Features:

- Use specified tokenizer when tagging (:issue:`167`).
  Thanks :user:`jschnurr` for the PR.
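
  A minimal sketch of the new behavior, mirroring the test added to
  tests/test_blob.py in this commit (assumes the NLTK data is installed):

      import nltk
      import textblob as tb

      # The blob's tokenizer is now forwarded to the POS tagger.
      tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
      blob = tb.TextBlob("Good muffins cost $3.88\nin New York.", tokenizer=tokenizer)
      print(blob.tags)  # WordPunctTokenizer splits $3.88 into '3' and '88'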

0.13.1 (2017-11-11)
-------------------

Bug fixes:

- Avoid AttributeError when using pattern's sentiment analyzer
  (:issue:`178`). Thanks :user:`tylerjharden` for the catch and patch.
- Correctly pass ``format`` argument to ``NLTKClassifier.accuracy``
  (:issue:`177`). Thanks :user:`pavelmalai` for the catch and patch.
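
  With the fix, the ``format`` argument actually reaches the file reader; a
  sketch (``test.csv`` is a hypothetical file of text,label rows and ``cl`` a
  trained classifier):

      with open('test.csv') as fp:
          print(cl.accuracy(fp, format='csv'))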

0.13.0 (2017-08-15)
-------------------

Features:

- Performance improvements to `NaiveBayesClassifier` (:issue:`63`, :issue:`77`,
  :issue:`123`). Thanks :user:`jcalbert` for the PR.
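
  For reference, typical usage; the speedup comes from caching the training
  vocabulary, as in the classifiers.py change below:

      from textblob.classifiers import NaiveBayesClassifier

      train = [('I love this sandwich.', 'pos'),
               ('I do not like this restaurant.', 'neg')]
      cl = NaiveBayesClassifier(train)
      print(cl.classify('This is an amazing library!'))  # 'pos'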

0.12.0 (2017-02-27)
-------------------

4 changes: 2 additions & 2 deletions README.rst
@@ -2,11 +2,11 @@
 TextBlob: Simplified Text Processing
 ====================================
 
-.. image:: https://badge.fury.io/py/textblob.png
+.. image:: https://badge.fury.io/py/textblob.svg
    :target: http://badge.fury.io/py/textblob
    :alt: Latest version
 
-.. image:: https://travis-ci.org/sloria/TextBlob.png?branch=master
+.. image:: https://travis-ci.org/sloria/TextBlob.svg?branch=master
    :target: https://travis-ci.org/sloria/TextBlob
    :alt: Travis-CI
11 changes: 0 additions & 11 deletions tasks.py
@@ -49,14 +49,3 @@ def readme(ctx, browse=False):
 def doctest(ctx):
     os.chdir(docs_dir)
     ctx.run("make doctest")
-
-@task
-def publish(ctx, test=False):
-    """Publish to the cheeseshop."""
-    clean(ctx)
-    if test:
-        ctx.run('python setup.py register -r test sdist bdist_wheel', echo=True)
-        ctx.run('twine upload dist/* -r test', echo=True)
-    else:
-        ctx.run('python setup.py register sdist bdist_wheel', echo=True)
-        ctx.run('twine upload dist/*', echo=True)
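
Publishing is presumably superseded by the Travis ``deploy`` section added in
.travis.yml above, which uploads the sdist and wheel to PyPI on tagged builds.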
14 changes: 14 additions & 0 deletions tests/test_blob.py
@@ -752,6 +752,20 @@ def test_tokenize_method(self):
        # Pass in the TabTokenizer
        assert_equal(blob.tokenize(tokenizer), tb.WordList(["This is", "text."]))

    def test_tags_uses_custom_tokenizer(self):
        tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
        blob = tb.TextBlob("Good muffins cost $3.88\nin New York.", tokenizer=tokenizer)
        assert_equal(blob.tags, [(u'Good', u'JJ'), (u'muffins', u'NNS'), (u'cost', u'VBP'),
            (u'3', u'CD'), (u'88', u'CD'), (u'in', u'IN'), (u'New', u'NNP'), (u'York', u'NNP')])

    def test_tags_with_custom_tokenizer_and_tagger(self):
        tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
        tagger = tb.taggers.PatternTagger()
        blob = tb.TextBlob("Good muffins cost $3.88\nin New York.", tokenizer=tokenizer, pos_tagger=tagger)
        # PatternTagger takes raw text (not tokens), and handles tokenization itself.
        assert_equal(blob.tags, [(u'Good', u'JJ'), (u'muffins', u'NNS'), (u'cost', u'NN'),
            (u'3.88', u'CD'), (u'in', u'IN'), (u'New', u'NNP'), (u'York', u'NNP')])

    @mock.patch('textblob.translate.Translator.translate')
    def test_translate(self, mock_translate):
        mock_translate.return_value = 'Esta es una frase.'
4 changes: 2 additions & 2 deletions tests/test_translate.py
@@ -81,7 +81,7 @@ def test_detect_non_ascii(self):
     def test_translate_spaces(self):
         es_text = "Hola, me llamo Adrián! Cómo estás? Yo bien"
         to_en = self.translator.translate(es_text, from_lang="es", to_lang="en")
-        assert_equal(to_en, "Hello, my name is Adrian! How are you? I am good")
+        assert_equal(to_en, "Hi, my name is Adrián! How are you? I am good")
 
     def test_translate_missing_from_language_auto_detects(self):
         text = "Ich hole das Bier"
@@ -99,7 +99,7 @@ def test_translate_text(self):
     def test_translate_non_ascii(self):
         text = "ذات سيادة كاملة"
         translated = self.translator.translate(text, from_lang='ar', to_lang='en')
-        assert_equal(translated, "With full sovereignty")
+        assert_equal(translated, "Fully sovereign")
 
         text2 = "美丽比丑陋更好"
         translated = self.translator.translate(text2, from_lang="zh-CN", to_lang='en')
5 changes: 2 additions & 3 deletions textblob/__init__.py
@@ -1,13 +1,12 @@
 import os
+from .blob import TextBlob, Word, Sentence, Blobber, WordList
 
-__version__ = '0.12.0'
+__version__ = '0.14.0'
 __license__ = 'MIT'
 __author__ = 'Steven Loria'
 
 PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__))
 
-from .blob import TextBlob, Word, Sentence, Blobber, WordList
-
 __all__ = [
     'TextBlob',
     'Word',
2 changes: 1 addition & 1 deletion textblob/_text.py
@@ -815,7 +815,7 @@ def avg(assessments, weighted=lambda w: 1):
         # A synset id.
         # Sentiment("a-00193480") => horrible => (-0.6, 1.0) (English WordNet)
         # Sentiment("c_267") => verschrikkelijk => (-0.9, 1.0) (Dutch Cornetto)
-        elif isinstance(s, basestring) and RE_SYNSET.match(s):
+        elif isinstance(s, basestring) and RE_SYNSET.match(s) and hasattr(s, "synonyms"):
             a = [(s.synonyms[0],) + self.synset(s.id, pos=s.pos) + (None,)]
         # A string of words.
         # Sentiment("a horrible movie") => (-0.6, 1.0)
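
The extra ``hasattr`` guard keeps plain strings that merely look like synset
ids away from ``s.synonyms``; a quick sentiment sketch (expected values taken
from the comments above, using the default pattern analyzer):

    from textblob import TextBlob

    print(TextBlob("a horrible movie").sentiment)
    # roughly Sentiment(polarity=-0.6, subjectivity=1.0), per the comment above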
2 changes: 1 addition & 1 deletion textblob/base.py
@@ -22,7 +22,7 @@ class BaseTagger(with_metaclass(ABCMeta)):
     @abstractmethod
     def tag(self, text, tokenize=True):
         """Return a list of tuples of the form (word, tag)
-        for a given set of text.
+        for a given set of text or BaseBlob instance.
         """
         return
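
Concretely, a custom tagger should now tolerate either input; a minimal
sketch (WhitespaceTagger is hypothetical, not part of the library):

    from textblob.base import BaseTagger

    class WhitespaceTagger(BaseTagger):
        """Toy tagger: split on whitespace, tag everything as NN."""
        def tag(self, text, tokenize=True):
            raw = getattr(text, 'raw', text)  # accept a str or a BaseBlob
            return [(word, 'NN') for word in raw.split()]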

9 changes: 6 additions & 3 deletions textblob/blob.py
@@ -471,9 +471,12 @@ def pos_tags(self):
 
         :rtype: list of tuples
         """
-        return [(Word(word, pos_tag=t), unicode(t))
-                for word, t in self.pos_tagger.tag(self.raw)
-                if not PUNCTUATION_REGEX.match(unicode(t))]
+        if isinstance(self, TextBlob):
+            return [val for sublist in [s.pos_tags for s in self.sentences] for val in sublist]
+        else:
+            return [(Word(word, pos_tag=t), unicode(t))
+                    for word, t in self.pos_tagger.tag(self)
+                    if not PUNCTUATION_REGEX.match(unicode(t))]
 
     tags = pos_tags
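
In effect a TextBlob now tags itself sentence by sentence and concatenates
the results, so the following should hold (a sketch, with textblob imported
as tb):

    blob = tb.TextBlob("Simple is better. Complex is worse.")
    assert blob.tags == [tag for s in blob.sentences for tag in s.tags]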

23 changes: 19 additions & 4 deletions textblob/classifiers.py
@@ -76,9 +76,22 @@ def basic_extractor(document, train_set):
     :param document: The text to extract features from. Can be a string or an iterable.
     :param list train_set: Training data set, a list of tuples of the form
-        ``(words, label)``.
+        ``(words, label)`` OR an iterable of strings.
     """
-    word_features = _get_words_from_dataset(train_set)
+
+    try:
+        el_zero = next(iter(train_set))  # Infer input from first element.
+    except StopIteration:
+        return {}
+    if isinstance(el_zero, basestring):
+        word_features = [w for w in chain([el_zero], train_set)]
+    else:
+        try:
+            assert(isinstance(el_zero[0], basestring))
+            word_features = _get_words_from_dataset(chain([el_zero], train_set))
+        except:
+            raise ValueError('train_set is probably malformed.')
+
     tokens = _get_document_tokens(document)
     features = dict(((u'contains({0})'.format(word), (word in tokens))
                      for word in word_features))
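
With this change the extractor also accepts a bare vocabulary in place of a
training set; a small sketch:

    from textblob.classifiers import basic_extractor

    print(basic_extractor("I love this sandwich", ["love", "hate"]))
    # {'contains(love)': True, 'contains(hate)': False}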
@@ -123,6 +136,7 @@ def __init__(self, train_set, feature_extractor=basic_extractor, format=None, **kwargs):
             self.train_set = self._read_data(train_set, format)
         else:  # train_set is a list of tuples
             self.train_set = train_set
+        self._word_set = _get_words_from_dataset(self.train_set)  # Keep a hidden set of unique words.
         self.train_features = None
 
     def _read_data(self, dataset, format=None):
@@ -166,7 +180,7 @@ def extract_features(self, text):
         '''
         # Feature extractor may take one or two arguments
         try:
-            return self.feature_extractor(text, self.train_set)
+            return self.feature_extractor(text, self._word_set)
         except (TypeError, AttributeError):
             return self.feature_extractor(text)
@@ -246,7 +260,7 @@ def accuracy(self, test_set, format=None):
             file format.
         """
         if is_filelike(test_set):
-            test_data = self._read_data(test_set)
+            test_data = self._read_data(test_set, format)
         else:  # test_set is a list of tuples
             test_data = test_set
         test_features = [(self.extract_features(d), c) for d, c in test_data]
@@ -260,6 +274,7 @@ def update(self, new_data, *args, **kwargs):
             ``(text, label)``.
         """
         self.train_set += new_data
+        self._word_set.update(_get_words_from_dataset(new_data))
         self.train_features = [(self.extract_features(d), c)
                                for d, c in self.train_set]
         try:
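
Calling code is unchanged; ``update`` simply keeps the cached vocabulary in
sync (a sketch, reusing ``cl`` from the changelog example above):

    cl.update([('Your service was terrible.', 'neg')])
    print(cl.classify('The service was terrible'))  # likely 'neg'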
19 changes: 11 additions & 8 deletions textblob/en/taggers.py
@@ -3,10 +3,11 @@
 from __future__ import absolute_import
 
 import nltk
+import textblob.compat
 
+import textblob as tb
 from textblob.en import tag as pattern_tag
 from textblob.decorators import requires_nltk_corpus
-from textblob.tokenizers import word_tokenize
 from textblob.base import BaseTagger


@@ -17,7 +18,9 @@ class PatternTagger(BaseTagger):
     """
 
     def tag(self, text, tokenize=True):
-        """Tag a string `text`."""
+        """Tag a string or BaseBlob."""
+        if not isinstance(text, textblob.compat.text_type):
+            text = text.raw
         return pattern_tag(text, tokenize)


@@ -27,9 +30,9 @@ class NLTKTagger(BaseTagger):
     """
 
     @requires_nltk_corpus
-    def tag(self, text, tokenize=True):
-        """Tag a string `text`."""
-        if tokenize:
-            text = list(word_tokenize(text))
-        tagged = nltk.tag.pos_tag(text)
-        return tagged
+    def tag(self, text):
+        """Tag a string or BaseBlob."""
+        if isinstance(text, textblob.compat.text_type):
+            text = tb.TextBlob(text)
+
+        return nltk.tag.pos_tag(text.tokens)
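
Together with the blob.py change above, either tagger now accepts a blob or a
plain string; a short sketch mirroring the new tests:

    import textblob as tb

    blob = tb.TextBlob("Good muffins cost $3.88\nin New York.",
                       pos_tagger=tb.taggers.PatternTagger())
    print(blob.tags)  # PatternTagger receives the raw text and tokenizes it itself
    print(tb.taggers.NLTKTagger().tag("Simple is better than complex."))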
