
Commit

Merge pull request #29 from Suor/py3-clean
Migrate code to support Python 3
kmike committed Mar 3, 2015
2 parents 225cc76 + 86d44e6 commit d7e2fae
Showing 14 changed files with 95 additions and 95 deletions.
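The diff below ports webstruct to a single Python 2.7 / 3.3+ codebase on top of the six compatibility layer. As a reading aid, here is a minimal sketch of the idioms that recur throughout the changed files (hypothetical helper names, not code from this commit):

    from __future__ import absolute_import, print_function

    import six
    from six.moves import zip  # itertools.izip on Python 2, builtin zip on Python 3


    def normalize_tokens(tokens):
        # ``unicode`` does not exist on Python 3; six.text_type is ``unicode``
        # on Python 2 and ``str`` on Python 3.
        return [six.text_type(tok) for tok in tokens]


    def pair_tokens_with_tags(tokens, tags):
        # zip() is lazy on Python 3, so materialize the result whenever it is
        # indexed or reused, as the changes below do with list(zip(...)).
        return list(zip(normalize_tokens(tokens), tags))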
2 changes: 1 addition & 1 deletion docs/intro.rst
@@ -38,7 +38,7 @@ and also to embed annotation results back into HTML.
Installation
------------

-Webstruct requires Python 2.7.
+Webstruct requires Python 2.7 or Python 3.3+.

To install Webstruct, use pip::

1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -1,5 +1,6 @@
-r requirements.txt
nose
+doctest-ignore-unicode
numpydoc
coverage
tox
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
+six
lxml
numpy
scipy
6 changes: 5 additions & 1 deletion runtests.sh
@@ -1 +1,5 @@
-nosetests --with-doctest "$@"
+nosetests \
+--with-doctest \
+--with-doctest-ignore-unicode \
+--doctest-options='+ELLIPSIS,+IGNORE_UNICODE' \
+"$@"
3 changes: 3 additions & 0 deletions setup.py
@@ -23,6 +23,9 @@
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 2",
"Programming Language :: Python :: 2.7",
+"Programming Language :: Python :: 3",
+"Programming Language :: Python :: 3.3",
+"Programming Language :: Python :: 3.4",
],
requires=['sklearn', 'lxml'],
)
18 changes: 10 additions & 8 deletions webstruct/feature_extraction.py
@@ -38,11 +38,13 @@
"""
-from __future__ import absolute_import
+from __future__ import absolute_import, print_function
import re
import copy
from itertools import chain, groupby
from collections import namedtuple, Counter
+import six
+from six.moves import zip

from lxml.etree import XPathEvaluator
from sklearn.base import BaseEstimator, TransformerMixin
@@ -166,12 +168,12 @@ def tokenize_single(self, tree):
>>> html_tokenizer = HtmlTokenizer(replace_html_tags={'b': 'strong'})
>>> tree = loader.loadbytes(b"<p>hello, <PER>John <b>Doe</b></PER> <br> <PER>Mary</PER> said</p>")
>>> html_tokens, tags = html_tokenizer.tokenize_single(tree)
->>> html_tokens # doctest: +ELLIPSIS
-[HtmlToken(token=u'hello', parent=<Element p at ...>, index=0), HtmlToken...]
+>>> html_tokens
+[HtmlToken(token='hello', parent=<Element p at ...>, index=0), HtmlToken...]
>>> tags
-['O', u'B-PER', u'I-PER', u'B-PER', 'O']
+['O', 'B-PER', 'I-PER', 'B-PER', 'O']
>>> for tok, iob_tag in zip(html_tokens, tags):
-... print "%5s" % iob_tag, tok.token, tok.elem.tag, tok.parent.tag
+... print("%5s" % iob_tag, tok.token, tok.elem.tag, tok.parent.tag)
O hello p p
B-PER John p p
I-PER Doe strong strong
@@ -187,9 +189,9 @@ def tokenize_single(self, tree):
tree = copy.deepcopy(tree)
self.sequence_encoder.reset()
self._prepare_tree(tree)
-res = zip(*(self._process_tree(tree)))
+res = list(zip(*self._process_tree(tree)))
if not res:
-return ([], [])
+return [], []
return list(res[0]), list(res[1])

def tokenize(self, trees):
@@ -290,7 +292,7 @@ def _cleanup_elem(self, elem):

def _tokenize_and_split(self, text):
input_tokens = self._limit_tags(self.text_tokenize_func(text or ''))
-input_tokens = map(unicode, input_tokens)
+input_tokens = map(six.text_type, input_tokens)
return self.sequence_encoder.encode_split(input_tokens)

def _limit_tags(self, input_tokens):
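A note on the tokenize_single() change above: zip() returns an iterator on Python 3, so the transposed (token, tag) pairs must be materialized before the emptiness check and the indexing. A standalone sketch of that idiom, with illustrative data only:

    pairs = [('hello', 'O'), ('John', 'B-PER'), ('Doe', 'I-PER')]

    res = list(zip(*pairs))   # [] for empty input, else [(tokens...), (tags...)]
    if not res:
        tokens, tags = [], []
    else:
        tokens, tags = list(res[0]), list(res[1])

    print(tokens)  # ['hello', 'John', 'Doe']
    print(tags)    # ['O', 'B-PER', 'I-PER']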
3 changes: 2 additions & 1 deletion webstruct/gazetteers/geonames.py
@@ -2,6 +2,7 @@
from __future__ import absolute_import
import os
import csv
+import six
import zipfile
import numpy as np

@@ -96,7 +97,7 @@ def _joined_names_column(df):
"""
return df.apply(
lambda row: ','.join(set([
-unicode(n)
+six.text_type(n)
for n in [row['main_name'], row['asciiname'], row['alternatenames']]
if n and n is not np.nan
])),
11 changes: 6 additions & 5 deletions webstruct/loaders.py
@@ -27,6 +27,7 @@
import glob
from itertools import chain
from collections import defaultdict
+import six
import lxml.html
import lxml.html.clean

@@ -113,7 +114,7 @@ class GateLoader(HtmlLoader):
>>> loader = GateLoader(known_entities={'ORG', 'CITY'})
>>> html = b"<html><body><p><ORG>Scrapinghub</ORG> has an <b>office</b> in <CITY>Montevideo</CITY></p></body></html>"
>>> tree = loader.loadbytes(html)
->>> lxml.html.tostring(tree)
+>>> lxml.html.tostring(tree).decode()
'<html><body><p> __START_ORG__ Scrapinghub __END_ORG__ has an <b>office</b> in __START_CITY__ Montevideo __END_CITY__ </p></body></html>'
"""
@@ -133,14 +134,14 @@ def loadbytes(self, data):
def _replace_entities(self, html_bytes):
# replace requested entities with unified tokens
open_re, close_re = self._entity_patterns(self.known_entities)
-html_bytes = re.sub(open_re, r' __START_\1__ ', html_bytes)
-html_bytes = re.sub(close_re, r' __END_\1__ ', html_bytes)
+html_bytes = re.sub(open_re, br' __START_\1__ ', html_bytes)
+html_bytes = re.sub(close_re, br' __END_\1__ ', html_bytes)
return html_bytes

def _entity_patterns(self, entities):
entities_pattern = '|'.join(list(entities))
-open_re = re.compile('<(%s)>' % entities_pattern, re.I)
-close_re = re.compile('</(%s)>' % entities_pattern, re.I)
+open_re = re.compile(six.b('<(%s)>' % entities_pattern), re.I)
+close_re = re.compile(six.b('</(%s)>' % entities_pattern), re.I)
return open_re, close_re


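The loaders.py hunk compiles the entity patterns as bytes via six.b() and switches the replacement strings to bytes literals, because Python 3's re module refuses to apply a str pattern to the bytes HTML that loadbytes() receives. A minimal standalone sketch of the same idea (not code from this commit):

    import re
    import six

    entities_pattern = '|'.join(['ORG', 'CITY'])
    # six.b() turns the textual pattern into bytes so it can match raw HTML
    # bytes; the replacement string must then be a bytes literal as well.
    open_re = re.compile(six.b('<(%s)>' % entities_pattern), re.I)

    html_bytes = b'<p><ORG>Scrapinghub</ORG> has an office</p>'
    print(re.sub(open_re, br' __START_\1__ ', html_bytes))
    # b'<p> __START_ORG__ Scrapinghub</ORG> has an office</p>'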
8 changes: 4 additions & 4 deletions webstruct/model.py
@@ -3,7 +3,7 @@
:mod:`webstruct.model` contains convetional wrappers for creating NER models.
"""
from __future__ import absolute_import
-import urllib2
+from six.moves.urllib.request import urlopen
from lxml.html import tostring

from webstruct.loaders import HtmlLoader
@@ -49,7 +49,7 @@ def extract_from_url(self, url):
A convenience wrapper for :meth:`extract` method that downloads
input data from a remote URL.
"""
-data = urllib2.urlopen(url).read()
+data = urlopen(url).read()
return self.extract(data)

def extract_raw(self, bytes_data):
@@ -92,7 +92,7 @@ def extract_groups_from_url(self, url, dont_penalize=None):
A convenience wrapper for :meth:`extract_groups` method that downloads
input data from a remote URL.
"""
-data = urllib2.urlopen(url).read()
+data = urlopen(url).read()
return self.extract_groups(data)

def build_entity(self, html_tokens, tag):
@@ -120,7 +120,7 @@ def annotate_url(self, url):
Return annotated HTML data in WebAnnotator format; input is downloaded
from ``url``.
"""
-data = urllib2.urlopen(url).read()
+data = urlopen(url).read()
return self.annotate(data)

def __getstate__(self):
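urllib2 is gone in Python 3; six.moves.urllib.request resolves to urllib2 on Python 2 and urllib.request on Python 3, so the three download helpers above keep working unchanged on both. A minimal sketch of the pattern (hypothetical wrapper, not part of the module):

    from six.moves.urllib.request import urlopen

    def fetch(url):
        # Returns raw bytes on both interpreters, which is what the HTML
        # loaders downstream expect.
        return urlopen(url).read()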
8 changes: 4 additions & 4 deletions webstruct/tests/test_feature_extraction.py
@@ -66,7 +66,7 @@ def assertTokenizationWorks(self, tree):
)

tree = html_tokens[0].root
-self.assertNotIn('__', tostring(tree))
+self.assertNotIn(b'__', tostring(tree))

def test_tokenize_single(self):
self.assertTokenizationWorks(self._load())
@@ -81,16 +81,16 @@ def test_detokenize_single(self):
tokenizer = HtmlTokenizer()
html_tokens, tags = tokenizer.tokenize_single(src_tree)
new_tree = html_tokens[0].root
-self.assertIn('__START_ORG__', tostring(src_tree))
-self.assertNotIn('__START_ORG__', tostring(new_tree))
+self.assertIn(b'__START_ORG__', tostring(src_tree))
+self.assertNotIn(b'__START_ORG__', tostring(new_tree))

self.assertHtmlTreeEqual(
new_tree,
html_document_fromstring(UNANNOTATED_HTML)
)

detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)
-self.assertIn('__START_ORG__', tostring(detokenized_tree))
+self.assertIn(b'__START_ORG__', tostring(detokenized_tree))

self.assertHtmlTreeEqual(
detokenized_tree,
10 changes: 5 additions & 5 deletions webstruct/tests/test_loaders.py
@@ -10,16 +10,16 @@ def test_wa_loader():
ld = WebAnnotatorLoader()
tree = ld.load(os.path.join(os.path.dirname(__file__), 'data', 'wa1.html'))
res = lxml.html.tostring(tree)
-assert "<p> __START_ORG__ Scrapinghub __END_ORG__ has an <b>office</b> in __START_CITY__ Montevideo __END_CITY__ </p>" in res, res
-assert "wa-" not in res, res
-assert "WA-" not in res, res
+assert b"<p> __START_ORG__ Scrapinghub __END_ORG__ has an <b>office</b> in __START_CITY__ Montevideo __END_CITY__ </p>" in res, res
+assert b"wa-" not in res, res
+assert b"WA-" not in res, res


def test_wa_loader_None_bug():
ld = WebAnnotatorLoader()
tree = ld.load(os.path.join(os.path.dirname(__file__), 'data', 'wa2.html'))
res = lxml.html.tostring(tree)
-assert '<em>Inc.</em> __END_ORG__ </p>' in res, res
+assert b'<em>Inc.</em> __END_ORG__ </p>' in res, res


def test_wa_loader_with_known_entities():
@@ -28,7 +28,7 @@ def test_wa_loader_with_known_entities():
html = b"<html><body><p><span wa-subtypes='' wa-id='227' wa-type='ORG' class='WebAnnotator_org'>Scrapinghub</span> has an <b>office</b> in <span wa-subtypes='' wa-id='228' wa-type='CITY' class='WebAnnotator_org'>Montevideo</span></p></body></html>"
tree = loader.loadbytes(html)
res = lxml.html.tostring(tree)
-assert '<html><body><p> __START_ORG__ Scrapinghub __END_ORG__ has an <b>office</b> in Montevideo</p></body></html>' in res
+assert b'<html><body><p> __START_ORG__ Scrapinghub __END_ORG__ has an <b>office</b> in Montevideo</p></body></html>' in res


def _assert_entities(fragment, known_entities, expected):
66 changes: 33 additions & 33 deletions webstruct/text_tokenizers.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-from __future__ import absolute_import
+from __future__ import absolute_import, unicode_literals
import re


@@ -8,69 +8,69 @@ class WordTokenizer(object):
that doesn't split on @ and ':' symbols and doesn't split contractions::
>>> from nltk.tokenize.treebank import TreebankWordTokenizer # doctest: +SKIP
->>> s = u'''Good muffins cost $3.88\nin New York. Email: muffins@gmail.com'''
->>> TreebankWordTokenizer().tokenize(s) # doctest: +SKIP
-[u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York.', u'Email', u':', u'muffins', u'@', u'gmail.com']
+>>> s = '''Good muffins cost $3.88\nin New York. Email: muffins@gmail.com'''
+>>> TreebankWordTokenizer().tokenize(s) # doctest: +SKIP
+['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email', ':', 'muffins', '@', 'gmail.com']
>>> WordTokenizer().tokenize(s)
-[u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York.', u'Email:', u'muffins@gmail.com']
+['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email:', 'muffins@gmail.com']
->>> s = u'''Shelbourne Road,'''
+>>> s = '''Shelbourne Road,'''
>>> WordTokenizer().tokenize(s)
-[u'Shelbourne', u'Road', u',']
+['Shelbourne', 'Road', ',']
->>> s = u'''population of 100,000'''
+>>> s = '''population of 100,000'''
>>> WordTokenizer().tokenize(s)
-[u'population', u'of', u'100,000']
+['population', 'of', '100,000']
->>> s = u'''Hello|World'''
+>>> s = '''Hello|World'''
>>> WordTokenizer().tokenize(s)
-[u'Hello', u'|', u'World']
+['Hello', '|', 'World']
->>> s2 = u'"We beat some pretty good teams to get here," Slocum said.'
+>>> s2 = '"We beat some pretty good teams to get here," Slocum said.'
>>> WordTokenizer().tokenize(s2) # doctest: +NORMALIZE_WHITESPACE
-[u'``', u'We', u'beat', u'some', u'pretty', u'good',
-u'teams', u'to', u'get', u'here', u',', u"''", u'Slocum', u'said', u'.']
->>> s3 = u'''Well, we couldn't have this predictable,
+['``', 'We', 'beat', 'some', 'pretty', 'good',
+'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.']
+>>> s3 = '''Well, we couldn't have this predictable,
... cliche-ridden, \"Touched by an
... Angel\" (a show creator John Masius
... worked on) wanna-be if she didn't.'''
>>> WordTokenizer().tokenize(s3) # doctest: +NORMALIZE_WHITESPACE
-[u'Well', u',', u'we', u"couldn't", u'have', u'this', u'predictable',
-u',', u'cliche-ridden', u',', u'``', u'Touched', u'by', u'an',
-u'Angel', u"''", u'(', u'a', u'show', u'creator', u'John', u'Masius',
-u'worked', u'on', u')', u'wanna-be', u'if', u'she', u"didn't", u'.']
+['Well', ',', 'we', "couldn't", 'have', 'this', 'predictable',
+',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an',
+'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius',
+'worked', 'on', ')', 'wanna-be', 'if', 'she', "didn't", '.']
Some issues:
>>> WordTokenizer().tokenize("Phone:855-349-1914") # doctest: +SKIP
-[u'Phone', u':', u'855-349-1914']
+['Phone', ':', '855-349-1914']
->>> WordTokenizer().tokenize(u"Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.") # doctest: +SKIP
-[u'Copyright', u'\xc2\xa9', u'2014', u'Wall', u'Decor', u'and', u'Home', u'Accents', u'.', u'All', u'Rights', u'Reserved', u'.']
+>>> WordTokenizer().tokenize("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.") # doctest: +SKIP
+['Copyright', '\xc2\xa9', '2014', 'Wall', 'Decor', 'and', 'Home', 'Accents', '.', 'All', 'Rights', 'Reserved', '.']
->>> WordTokenizer().tokenize(u"Powai Campus, Mumbai-400077") # doctest: +SKIP
-[u'Powai', u'Campus', u',', u'Mumbai", "-", "400077']
+>>> WordTokenizer().tokenize("Powai Campus, Mumbai-400077") # doctest: +SKIP
+['Powai', 'Campus', ',', 'Mumbai", "-", "400077']
->>> WordTokenizer().tokenize(u"1 5858/ 1800") # doctest: +SKIP
-[u'1', u'5858', u'/', u'1800']
+>>> WordTokenizer().tokenize("1 5858/ 1800") # doctest: +SKIP
+['1', '5858', '/', '1800']
->>> WordTokenizer().tokenize(u"Saudi Arabia-") # doctest: +SKIP
-[u'Saudi', u'Arabia', u'-']
+>>> WordTokenizer().tokenize("Saudi Arabia-") # doctest: +SKIP
+['Saudi', 'Arabia', '-']
"""

# regex, token
# if token is None - regex match group is taken
rules = [
(re.compile(r'\s+', re.UNICODE), ''),
-(re.compile(ur'“'), u"``"),
-(re.compile(ur'["”]'), u"''"),
+(re.compile(r'“'), "``"),
+(re.compile(r'["”]'), "''"),
(re.compile(r'``'), None),
-(re.compile(ur'…|\.\.\.'), u'...'),
+(re.compile(r'…|\.\.\.'), '...'),
(re.compile(r'--'), None),
(re.compile(r',(?=\D|$)'), None),
(re.compile(r'\.$'), None),
-(re.compile(ur'[;#$£%&|!?[\](){}<>]'), None),
+(re.compile(r'[;#$£%&|!?[\](){}<>]'), None),
(re.compile(r"'(?=\s)|''", re.UNICODE), None),
]

@@ -79,7 +79,7 @@ class WordTokenizer(object):
def _tokenize(self, text):
# this one cannot be placed in the loop because it requires
# position check (beginning of the string) or previous char value
-text = self.open_quotes.sub(ur'\1``', text)
+text = self.open_quotes.sub(r'\1``', text)

i = 0
token_start = 0
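The ur'...' literals had to go because that prefix is a syntax error on Python 3; once unicode_literals is imported, a plain r'...' raw string is already text on both interpreters. A quick standalone illustration (not from the diff):

    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import re

    # Previously spelled ur'“' on Python 2; ur'' is invalid syntax on Python 3,
    # while r'“' is a unicode raw string on both once unicode_literals is active.
    open_quote = re.compile(r'“')
    print(open_quote.sub('``', 'she said “hi”'))  # she said ``hi”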
