
Commit

Merge pull request #29 from Suor/py3-clean
Migrate code to support Python 3
kmike committed Mar 3, 2015
2 parents 225cc76 + 86d44e6 commit d7e2fae
Showing 14 changed files with 95 additions and 95 deletions.
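The diff below ports webstruct to a single Python 2.7 / 3.3+ codebase on top of the six compatibility layer. As a reading aid, here is a minimal sketch of the idioms that recur throughout the changed files (hypothetical helper names, not code from this commit):

    from __future__ import absolute_import, print_function

    import six
    from six.moves import zip  # itertools.izip on Python 2, builtin zip on Python 3


    def normalize_tokens(tokens):
        # ``unicode`` does not exist on Python 3; six.text_type is ``unicode``
        # on Python 2 and ``str`` on Python 3.
        return [six.text_type(tok) for tok in tokens]


    def pair_tokens_with_tags(tokens, tags):
        # zip() is lazy on Python 3, so materialize the result whenever it is
        # indexed or reused, as the changes below do with list(zip(...)).
        return list(zip(normalize_tokens(tokens), tags))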
2 changes: 1 addition & 1 deletion docs/intro.rst
@@ -38,7 +38,7 @@ and also to embed annotation results back into HTML.
Installation
------------

-Webstruct requires Python 2.7.
+Webstruct requires Python 2.7 or Python 3.3+.

To install Webstruct, use pip::

1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -1,5 +1,6 @@
-r requirements.txt
nose
+doctest-ignore-unicode
numpydoc
coverage
tox
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
+six
lxml
numpy
scipy
6 changes: 5 additions & 1 deletion runtests.sh
@@ -1 +1,5 @@
-nosetests --with-doctest "$@"
+nosetests \
+--with-doctest \
+--with-doctest-ignore-unicode \
+--doctest-options='+ELLIPSIS,+IGNORE_UNICODE' \
+"$@"
3 changes: 3 additions & 0 deletions setup.py
@@ -23,6 +23,9 @@
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 2",
"Programming Language :: Python :: 2.7",
+"Programming Language :: Python :: 3",
+"Programming Language :: Python :: 3.3",
+"Programming Language :: Python :: 3.4",
],
requires=['sklearn', 'lxml'],
)
18 changes: 10 additions & 8 deletions webstruct/feature_extraction.py
@@ -38,11 +38,13 @@
"""
-from __future__ import absolute_import
+from __future__ import absolute_import, print_function
import re
import copy
from itertools import chain, groupby
from collections import namedtuple, Counter
+import six
+from six.moves import zip

from lxml.etree import XPathEvaluator
from sklearn.base import BaseEstimator, TransformerMixin
@@ -166,12 +168,12 @@ def tokenize_single(self, tree):
>>> html_tokenizer = HtmlTokenizer(replace_html_tags={'b': 'strong'})
>>> tree = loader.loadbytes(b"<p>hello, <PER>John <b>Doe</b></PER> <br> <PER>Mary</PER> said</p>")
>>> html_tokens, tags = html_tokenizer.tokenize_single(tree)
->>> html_tokens # doctest: +ELLIPSIS
-[HtmlToken(token=u'hello', parent=<Element p at ...>, index=0), HtmlToken...]
+>>> html_tokens
+[HtmlToken(token='hello', parent=<Element p at ...>, index=0), HtmlToken...]
>>> tags
-['O', u'B-PER', u'I-PER', u'B-PER', 'O']
+['O', 'B-PER', 'I-PER', 'B-PER', 'O']
>>> for tok, iob_tag in zip(html_tokens, tags):
-... print "%5s" % iob_tag, tok.token, tok.elem.tag, tok.parent.tag
+... print("%5s" % iob_tag, tok.token, tok.elem.tag, tok.parent.tag)
O hello p p
B-PER John p p
I-PER Doe strong strong
@@ -187,9 +189,9 @@ def tokenize_single(self, tree):
tree = copy.deepcopy(tree)
self.sequence_encoder.reset()
self._prepare_tree(tree)
-res = zip(*(self._process_tree(tree)))
+res = list(zip(*self._process_tree(tree)))
if not res:
-return ([], [])
+return [], []
return list(res[0]), list(res[1])

def tokenize(self, trees):
@@ -290,7 +292,7 @@ def _cleanup_elem(self, elem):

def _tokenize_and_split(self, text):
input_tokens = self._limit_tags(self.text_tokenize_func(text or ''))
-input_tokens = map(unicode, input_tokens)
+input_tokens = map(six.text_type, input_tokens)
return self.sequence_encoder.encode_split(input_tokens)

def _limit_tags(self, input_tokens):
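A note on the tokenize_single() change above: zip() returns an iterator on Python 3, so the transposed (token, tag) pairs must be materialized before the emptiness check and the indexing. A standalone sketch of that idiom, with illustrative data only:

    pairs = [('hello', 'O'), ('John', 'B-PER'), ('Doe', 'I-PER')]

    res = list(zip(*pairs))   # [] for empty input, else [(tokens...), (tags...)]
    if not res:
        tokens, tags = [], []
    else:
        tokens, tags = list(res[0]), list(res[1])

    print(tokens)  # ['hello', 'John', 'Doe']
    print(tags)    # ['O', 'B-PER', 'I-PER']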
3 changes: 2 additions & 1 deletion webstruct/gazetteers/geonames.py
@@ -2,6 +2,7 @@
from __future__ import absolute_import
import os
import csv
+import six
import zipfile
import numpy as np

@@ -96,7 +97,7 @@ def _joined_names_column(df):
"""
return df.apply(
lambda row: ','.join(set([
-unicode(n)
+six.text_type(n)
for n in [row['main_name'], row['asciiname'], row['alternatenames']]
if n and n is not np.nan
])),
11 changes: 6 additions & 5 deletions webstruct/loaders.py
@@ -27,6 +27,7 @@
import glob
from itertools import chain
from collections import defaultdict
+import six
import lxml.html
import lxml.html.clean

@@ -113,7 +114,7 @@ class GateLoader(HtmlLoader):
>>> loader = GateLoader(known_entities={'ORG', 'CITY'})
>>> html = b"<html><body><p><ORG>Scrapinghub</ORG> has an <b>office</b> in <CITY>Montevideo</CITY></p></body></html>"
>>> tree = loader.loadbytes(html)
->>> lxml.html.tostring(tree)
+>>> lxml.html.tostring(tree).decode()
'<html><body><p> __START_ORG__ Scrapinghub __END_ORG__ has an <b>office</b> in __START_CITY__ Montevideo __END_CITY__ </p></body></html>'
"""
@@ -133,14 +134,14 @@ def loadbytes(self, data):
def _replace_entities(self, html_bytes):
# replace requested entities with unified tokens
open_re, close_re = self._entity_patterns(self.known_entities)
-html_bytes = re.sub(open_re, r' __START_\1__ ', html_bytes)
-html_bytes = re.sub(close_re, r' __END_\1__ ', html_bytes)
+html_bytes = re.sub(open_re, br' __START_\1__ ', html_bytes)
+html_bytes = re.sub(close_re, br' __END_\1__ ', html_bytes)
return html_bytes

def _entity_patterns(self, entities):
entities_pattern = '|'.join(list(entities))
-open_re = re.compile('<(%s)>' % entities_pattern, re.I)
-close_re = re.compile('</(%s)>' % entities_pattern, re.I)
+open_re = re.compile(six.b('<(%s)>' % entities_pattern), re.I)
+close_re = re.compile(six.b('</(%s)>' % entities_pattern), re.I)
return open_re, close_re


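The loaders.py hunk compiles the entity patterns as bytes via six.b() and switches the replacement strings to bytes literals, because Python 3's re module refuses to apply a str pattern to the bytes HTML that loadbytes() receives. A minimal standalone sketch of the same idea (not code from this commit):

    import re
    import six

    entities_pattern = '|'.join(['ORG', 'CITY'])
    # six.b() turns the textual pattern into bytes so it can match raw HTML
    # bytes; the replacement string must then be a bytes literal as well.
    open_re = re.compile(six.b('<(%s)>' % entities_pattern), re.I)

    html_bytes = b'<p><ORG>Scrapinghub</ORG> has an office</p>'
    print(re.sub(open_re, br' __START_\1__ ', html_bytes))
    # b'<p> __START_ORG__ Scrapinghub</ORG> has an office</p>'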
8 changes: 4 additions & 4 deletions webstruct/model.py
@@ -3,7 +3,7 @@
:mod:`webstruct.model` contains convetional wrappers for creating NER models.
"""
from __future__ import absolute_import
-import urllib2
+from six.moves.urllib.request import urlopen
from lxml.html import tostring

from webstruct.loaders import HtmlLoader
@@ -49,7 +49,7 @@ def extract_from_url(self, url):
A convenience wrapper for :meth:`extract` method that downloads
input data from a remote URL.
"""
-data = urllib2.urlopen(url).read()
+data = urlopen(url).read()
return self.extract(data)

def extract_raw(self, bytes_data):
@@ -92,7 +92,7 @@ def extract_groups_from_url(self, url, dont_penalize=None):
A convenience wrapper for :meth:`extract_groups` method that downloads
input data from a remote URL.
"""
-data = urllib2.urlopen(url).read()
+data = urlopen(url).read()
return self.extract_groups(data)

def build_entity(self, html_tokens, tag):
@@ -120,7 +120,7 @@ def annotate_url(self, url):
Return annotated HTML data in WebAnnotator format; input is downloaded
from ``url``.
"""
-data = urllib2.urlopen(url).read()
+data = urlopen(url).read()
return self.annotate(data)

def __getstate__(self):
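urllib2 is gone in Python 3; six.moves.urllib.request resolves to urllib2 on Python 2 and urllib.request on Python 3, so the three download helpers above keep working unchanged on both. A minimal sketch of the pattern (hypothetical wrapper, not part of the module):

    from six.moves.urllib.request import urlopen

    def fetch(url):
        # Returns raw bytes on both interpreters, which is what the HTML
        # loaders downstream expect.
        return urlopen(url).read()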
8 changes: 4 additions & 4 deletions webstruct/tests/test_feature_extraction.py
@@ -66,7 +66,7 @@ def assertTokenizationWorks(self, tree):
)

tree = html_tokens[0].root
-self.assertNotIn('__', tostring(tree))
+self.assertNotIn(b'__', tostring(tree))

def test_tokenize_single(self):
self.assertTokenizationWorks(self._load())
@@ -81,16 +81,16 @@ def test_detokenize_single(self):
tokenizer = HtmlTokenizer()
html_tokens, tags = tokenizer.tokenize_single(src_tree)
new_tree = html_tokens[0].root
-self.assertIn('__START_ORG__', tostring(src_tree))
-self.assertNotIn('__START_ORG__', tostring(new_tree))
+self.assertIn(b'__START_ORG__', tostring(src_tree))
+self.assertNotIn(b'__START_ORG__', tostring(new_tree))

self.assertHtmlTreeEqual(
new_tree,
html_document_fromstring(UNANNOTATED_HTML)
)

detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)
-self.assertIn('__START_ORG__', tostring(detokenized_tree))
+self.assertIn(b'__START_ORG__', tostring(detokenized_tree))

self.assertHtmlTreeEqual(
detokenized_tree,
10 changes: 5 additions & 5 deletions webstruct/tests/test_loaders.py
@@ -10,16 +10,16 @@ def test_wa_loader():
ld = WebAnnotatorLoader()
tree = ld.load(os.path.join(os.path.dirname(__file__), 'data', 'wa1.html'))
res = lxml.html.tostring(tree)
-assert "<p> __START_ORG__ Scrapinghub __END_ORG__ has an <b>office</b> in __START_CITY__ Montevideo __END_CITY__ </p>" in res, res
-assert "wa-" not in res, res
-assert "WA-" not in res, res
+assert b"<p> __START_ORG__ Scrapinghub __END_ORG__ has an <b>office</b> in __START_CITY__ Montevideo __END_CITY__ </p>" in res, res
+assert b"wa-" not in res, res
+assert b"WA-" not in res, res


def test_wa_loader_None_bug():
ld = WebAnnotatorLoader()
tree = ld.load(os.path.join(os.path.dirname(__file__), 'data', 'wa2.html'))
res = lxml.html.tostring(tree)
-assert '<em>Inc.</em> __END_ORG__ </p>' in res, res
+assert b'<em>Inc.</em> __END_ORG__ </p>' in res, res


def test_wa_loader_with_known_entities():
@@ -28,7 +28,7 @@ def test_wa_loader_with_known_entities():
html = b"<html><body><p><span wa-subtypes='' wa-id='227' wa-type='ORG' class='WebAnnotator_org'>Scrapinghub</span> has an <b>office</b> in <span wa-subtypes='' wa-id='228' wa-type='CITY' class='WebAnnotator_org'>Montevideo</span></p></body></html>"
tree = loader.loadbytes(html)
res = lxml.html.tostring(tree)
-assert '<html><body><p> __START_ORG__ Scrapinghub __END_ORG__ has an <b>office</b> in Montevideo</p></body></html>' in res
+assert b'<html><body><p> __START_ORG__ Scrapinghub __END_ORG__ has an <b>office</b> in Montevideo</p></body></html>' in res


def _assert_entities(fragment, known_entities, expected):
66 changes: 33 additions & 33 deletions webstruct/text_tokenizers.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-from __future__ import absolute_import
+from __future__ import absolute_import, unicode_literals
import re


@@ -8,69 +8,69 @@ class WordTokenizer(object):
that doesn't split on @ and ':' symbols and doesn't split contractions::
>>> from nltk.tokenize.treebank import TreebankWordTokenizer # doctest: +SKIP
->>> s = u'''Good muffins cost $3.88\nin New York. Email: muffins@gmail.com'''
->>> TreebankWordTokenizer().tokenize(s) # doctest: +SKIP
-[u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York.', u'Email', u':', u'muffins', u'@', u'gmail.com']
+>>> s = '''Good muffins cost $3.88\nin New York. Email: muffins@gmail.com'''
+>>> TreebankWordTokenizer().tokenize(s) # doctest: +SKIP
+['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email', ':', 'muffins', '@', 'gmail.com']
>>> WordTokenizer().tokenize(s)
-[u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York.', u'Email:', u'muffins@gmail.com']
+['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email:', 'muffins@gmail.com']
->>> s = u'''Shelbourne Road,'''
+>>> s = '''Shelbourne Road,'''
>>> WordTokenizer().tokenize(s)
-[u'Shelbourne', u'Road', u',']
+['Shelbourne', 'Road', ',']
->>> s = u'''population of 100,000'''
+>>> s = '''population of 100,000'''
>>> WordTokenizer().tokenize(s)
-[u'population', u'of', u'100,000']
+['population', 'of', '100,000']
->>> s = u'''Hello|World'''
+>>> s = '''Hello|World'''
>>> WordTokenizer().tokenize(s)
-[u'Hello', u'|', u'World']
+['Hello', '|', 'World']
->>> s2 = u'"We beat some pretty good teams to get here," Slocum said.'
+>>> s2 = '"We beat some pretty good teams to get here," Slocum said.'
>>> WordTokenizer().tokenize(s2) # doctest: +NORMALIZE_WHITESPACE
-[u'``', u'We', u'beat', u'some', u'pretty', u'good',
-u'teams', u'to', u'get', u'here', u',', u"''", u'Slocum', u'said', u'.']
->>> s3 = u'''Well, we couldn't have this predictable,
+['``', 'We', 'beat', 'some', 'pretty', 'good',
+'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.']
+>>> s3 = '''Well, we couldn't have this predictable,
... cliche-ridden, \"Touched by an
... Angel\" (a show creator John Masius
... worked on) wanna-be if she didn't.'''
>>> WordTokenizer().tokenize(s3) # doctest: +NORMALIZE_WHITESPACE
-[u'Well', u',', u'we', u"couldn't", u'have', u'this', u'predictable',
-u',', u'cliche-ridden', u',', u'``', u'Touched', u'by', u'an',
-u'Angel', u"''", u'(', u'a', u'show', u'creator', u'John', u'Masius',
-u'worked', u'on', u')', u'wanna-be', u'if', u'she', u"didn't", u'.']
+['Well', ',', 'we', "couldn't", 'have', 'this', 'predictable',
+',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an',
+'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius',
+'worked', 'on', ')', 'wanna-be', 'if', 'she', "didn't", '.']
Some issues:
>>> WordTokenizer().tokenize("Phone:855-349-1914") # doctest: +SKIP
-[u'Phone', u':', u'855-349-1914']
+['Phone', ':', '855-349-1914']
->>> WordTokenizer().tokenize(u"Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.") # doctest: +SKIP
-[u'Copyright', u'\xc2\xa9', u'2014', u'Wall', u'Decor', u'and', u'Home', u'Accents', u'.', u'All', u'Rights', u'Reserved', u'.']
+>>> WordTokenizer().tokenize("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.") # doctest: +SKIP
+['Copyright', '\xc2\xa9', '2014', 'Wall', 'Decor', 'and', 'Home', 'Accents', '.', 'All', 'Rights', 'Reserved', '.']
->>> WordTokenizer().tokenize(u"Powai Campus, Mumbai-400077") # doctest: +SKIP
-[u'Powai', u'Campus', u',', u'Mumbai", "-", "400077']
+>>> WordTokenizer().tokenize("Powai Campus, Mumbai-400077") # doctest: +SKIP
+['Powai', 'Campus', ',', 'Mumbai", "-", "400077']
->>> WordTokenizer().tokenize(u"1 5858/ 1800") # doctest: +SKIP
-[u'1', u'5858', u'/', u'1800']
+>>> WordTokenizer().tokenize("1 5858/ 1800") # doctest: +SKIP
+['1', '5858', '/', '1800']
->>> WordTokenizer().tokenize(u"Saudi Arabia-") # doctest: +SKIP
-[u'Saudi', u'Arabia', u'-']
+>>> WordTokenizer().tokenize("Saudi Arabia-") # doctest: +SKIP
+['Saudi', 'Arabia', '-']
"""

# regex, token
# if token is None - regex match group is taken
rules = [
(re.compile(r'\s+', re.UNICODE), ''),
-(re.compile(ur'“'), u"``"),
-(re.compile(ur'["”]'), u"''"),
+(re.compile(r'“'), "``"),
+(re.compile(r'["”]'), "''"),
(re.compile(r'``'), None),
-(re.compile(ur'…|\.\.\.'), u'...'),
+(re.compile(r'…|\.\.\.'), '...'),
(re.compile(r'--'), None),
(re.compile(r',(?=\D|$)'), None),
(re.compile(r'\.$'), None),
-(re.compile(ur'[;#$£%&|!?[\](){}<>]'), None),
+(re.compile(r'[;#$£%&|!?[\](){}<>]'), None),
(re.compile(r"'(?=\s)|''", re.UNICODE), None),
]

@@ -79,7 +79,7 @@ class WordTokenizer(object):
def _tokenize(self, text):
# this one cannot be placed in the loop because it requires
# position check (beginning of the string) or previous char value
-text = self.open_quotes.sub(ur'\1``', text)
+text = self.open_quotes.sub(r'\1``', text)

i = 0
token_start = 0
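The ur'...' literals had to go because that prefix is a syntax error on Python 3; once unicode_literals is imported, a plain r'...' raw string is already text on both interpreters. A quick standalone illustration (not from the diff):

    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import re

    # Previously spelled ur'“' on Python 2; ur'' is invalid syntax on Python 3,
    # while r'“' is a unicode raw string on both once unicode_literals is active.
    open_quote = re.compile(r'“')
    print(open_quote.sub('``', 'she said “hi”'))  # she said ``hi”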
