From 2f537b536f2355cdb69181279ac37177c3251f23 Mon Sep 17 00:00:00 2001 From: Ruairi Fahy Date: Tue, 21 Jun 2016 15:24:14 +0100 Subject: [PATCH 1/3] Add C files and use Cython for package setup --- .travis.yml | 4 ++-- MANIFEST.in | 4 +++- requirements.txt | 3 +-- setup.py | 16 +++++++++++----- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index e6b5e3f..00097fc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,8 +8,8 @@ env: install: - pip install cython -- pip install -U tox codecov - +- CYTHONIZE=1 python setup.py build +- pip install -U tox script: tox after_success: diff --git a/MANIFEST.in b/MANIFEST.in index a3b876d..69f062a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,4 @@ include scrapely/*.pyx -include scrapely/extraction/*.pyx \ No newline at end of file +include scrapely/extraction/*.pyx +include scrapely/*.c +include scrapely/extraction/*.c diff --git a/requirements.txt b/requirements.txt index c3f0da3..b59bcf1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ numpy w3lib -six -cython +six \ No newline at end of file diff --git a/setup.py b/setup.py index 871521f..c02fe82 100755 --- a/setup.py +++ b/setup.py @@ -1,17 +1,23 @@ #!/usr/bin/env python +import os from setuptools import setup, find_packages from setuptools.extension import Extension -from Cython.Build import cythonize import numpy as np + +USE_CYTHON = 'CYTHONIZE' in os.environ +ext = '.pyx' if USE_CYTHON else '.c' extensions = [ Extension("scrapely._htmlpage", - ["scrapely/_htmlpage.pyx"], + ["scrapely/_htmlpage%s" % ext], include_dirs=[np.get_include()]), Extension("scrapely.extraction._similarity", - ["scrapely/extraction/_similarity.pyx"], + ["scrapely/extraction/_similarity%s" % ext], include_dirs=[np.get_include()]), ] +if USE_CYTHON: + from Cython.Build import cythonize + extensions = cythonize(extensions) setup( @@ -38,6 +44,6 @@ 'Topic :: Internet :: WWW/HTTP', 'Topic :: Text Processing :: Markup :: HTML', ], - install_requires=['numpy', 'w3lib', 'six', 'cython'], - ext_modules=cythonize(extensions), + install_requires=['numpy', 'w3lib', 'six'], + ext_modules=extensions, ) From ac62a4e641132963f80deab2fce3e3a822c2d7e3 Mon Sep 17 00:00:00 2001 From: Ruairi Fahy Date: Tue, 8 Nov 2016 11:39:07 +0000 Subject: [PATCH 2/3] Python 3 fixes Add compatability function for some tests Add fallback if no c extenstions installed Fix comment parsing in c extension --- scrapely/_htmlpage.pyx | 16 +-- scrapely/compat.py | 6 + scrapely/extraction/regionextract.py | 11 +- scrapely/extraction/similarity.py | 16 ++- scrapely/extractors.py | 12 +- scrapely/htmlpage.py | 161 +++++++++++++++++++++++++-- setup.py | 5 +- tox.ini | 2 +- 8 files changed, 198 insertions(+), 31 deletions(-) create mode 100644 scrapely/compat.py diff --git a/scrapely/_htmlpage.pyx b/scrapely/_htmlpage.pyx index c138f6a..6939671 100644 --- a/scrapely/_htmlpage.pyx +++ b/scrapely/_htmlpage.pyx @@ -90,7 +90,7 @@ cdef class CommentParser: if self.open_count == 0: self.start = i - 3 self.open_state = 1 - self.open_count += 1 + self.open_count = 1 self.inside_comment = True if self.close_count < self.open_count: @@ -141,12 +141,12 @@ cdef class ScriptParser: self.state = 1 if ((self.state == 1 and c == u'<') or (self.state == 2 and c == u'/') or - (self.state == 3 and c == u's' or c == u'S') or - (self.state == 4 and c == u'c' or c == u'C') or - (self.state == 5 and c == u'r' or c == u'R') or - (self.state == 6 and c == u'i' or c == u'I') or - (self.state == 7 and c == u'p' or c == u'P') or 
- (self.state == 8 and c == u't' or c == u'T') or + (self.state == 3 and c in u'sS') or + (self.state == 4 and c in u'cC') or + (self.state == 5 and c in u'rR') or + (self.state == 6 and c in u'iI') or + (self.state == 7 and c in u'pP') or + (self.state == 8 and c in u'tT') or (self.state == 9 and c == u'>')): self.state += 1 else: @@ -288,7 +288,7 @@ cpdef parse_html(s): if tag_name != u'!doctype': parsed.append( HtmlTag(tag_type, tag_name, - tag_attributes, tag_start, tag_end + 1)) + tag_attributes, tag_start, tag_end + 1)) if tag_name == u'script': script = True if open_tag: diff --git a/scrapely/compat.py b/scrapely/compat.py new file mode 100644 index 0000000..78e06e7 --- /dev/null +++ b/scrapely/compat.py @@ -0,0 +1,6 @@ +try: + utext = unicode +except NameError: + class utext(str): + def __repr__(self): + return 'u{}'.format(super(utext, self).__repr__()) diff --git a/scrapely/extraction/regionextract.py b/scrapely/extraction/regionextract.py index 2eba3fa..f2f6cf2 100644 --- a/scrapely/extraction/regionextract.py +++ b/scrapely/extraction/regionextract.py @@ -64,17 +64,18 @@ class BasicTypeExtractor(object): annotations. For example: + >>> from scrapely.compat import utext >>> from scrapely.extraction.pageparsing import parse_strings >>> template, page = parse_strings( \ u'
<html><body><b data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x</b></body></html>', \
         u'<html><body><b> a name</b></body></html>
') >>> ex = BasicTypeExtractor(template.annotations[0]) - >>> ex.extract(page, 0, 1, None) + >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)] [(u'name', u' a name')] It supports attribute descriptors >>> descriptor = FieldDescriptor('name', None, lambda x: x.strip()) >>> ex = BasicTypeExtractor(template.annotations[0], {'name': descriptor}) - >>> ex.extract(page, 0, 1, None) + >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)] [(u'name', u'a name')] It supports ignoring regions @@ -82,7 +83,7 @@ class BasicTypeExtractor(object): u'
<html><body><div data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x<b> xx</b></div></body></html>',\
         u'<html><body><div>a name<b> id-9</b></div></body></html>
') >>> ex = BasicTypeExtractor(template.annotations[0]) - >>> ex.extract(page, 0, 3, [PageRegion(1, 2)]) + >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 3, [PageRegion(1, 2)])] [(u'name', u'a name')] """ @@ -640,12 +641,12 @@ def extract_text(self, text): pref_index = 0 if self.minprefix > 0: rev_idx, plen = longest_unique_subsequence(text[::-1], self.prefix) - if plen < self.minprefix: + if plen is None or plen < self.minprefix: return None pref_index = -rev_idx if self.minsuffix == 0: return text[pref_index:] sidx, slen = longest_unique_subsequence(text[pref_index:], self.suffix) - if slen < self.minsuffix: + if slen is None or slen < self.minsuffix: return None return text[pref_index:pref_index + sidx] diff --git a/scrapely/extraction/similarity.py b/scrapely/extraction/similarity.py index 81573fb..ac640b5 100644 --- a/scrapely/extraction/similarity.py +++ b/scrapely/extraction/similarity.py @@ -6,9 +6,17 @@ from operator import itemgetter from heapq import nlargest -# For typical use cases (small sequences and patterns) the naive approach actually -# runs faster than KMP algorithm -from . _similarity import naive_match_length +try: + # For typical use cases (small sequences and patterns) the naive approach + # actually runs faster than KMP algorithm + from . _similarity import naive_match_length +except ImportError: + def naive_match_length(to_search, subsequence, range_start, range_end): + startval = subsequence[0] + return ((i, common_prefix_length(to_search[i:], subsequence)) + for i in xrange(range_start, range_end) + if startval == to_search[i]) + def common_prefix_length(a, b): """Calculate the length of the common prefix in both sequences passed. @@ -46,7 +54,7 @@ def common_prefix(*sequences): def longest_unique_subsequence(to_search, subsequence, range_start=0, - range_end=None): + range_end=None): """Find the longest unique subsequence of items in an array or string. This searches to_search looking for the longest overlapping match with subsequence. If the largest match is unique (there is no other diff --git a/scrapely/extractors.py b/scrapely/extractors.py index 20dab22..4fe451a 100644 --- a/scrapely/extractors.py +++ b/scrapely/extractors.py @@ -81,7 +81,8 @@ def text(region): removing excessive whitespace, For example: - >>> t = lambda s: text(htmlregion(s)) + >>> from scrapely.compat import utext + >>> t = lambda s: utext(text(htmlregion(s))) >>> t(u'
<h1>test</h1>
') u'test' @@ -122,7 +123,8 @@ def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE, opening and closing tag is removed. For example: - >>> t = lambda s, keep=_TAGS_TO_KEEP: safehtml(htmlregion(s), keep) + >>> from scrapely.compat import utext + >>> t = lambda s, keep=_TAGS_TO_KEEP: utext(safehtml(htmlregion(s), keep)) >>> t(u'test test') u'test test' @@ -272,7 +274,8 @@ def extract_number(txt): >>> extract_number(' 45.3, 7') It will handle unescaped entities: - >>> extract_number(u'£129.99') + >>> from scrapely.compat import utext + >>> utext(extract_number(u'£129.99')) u'129.99' """ txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt) @@ -285,6 +288,7 @@ def extract_price(txt): """ Extracts numbers making some price format specific assumptions + >>> from scrapely.compat import utext >>> extract_price('asdf 234,234.45sdf ') '234234.45' >>> extract_price('234,23') @@ -298,7 +302,7 @@ def extract_price(txt): >>> extract_price('adsfg') >>> extract_price('stained, linseed oil finish, clear glas doors') >>> extract_price('') - >>> extract_price(u'£129.99') + >>> utext(extract_price(u'£129.99')) u'129.99' """ txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt) diff --git a/scrapely/htmlpage.py b/scrapely/htmlpage.py index f4f4134..62e2650 100644 --- a/scrapely/htmlpage.py +++ b/scrapely/htmlpage.py @@ -11,14 +11,158 @@ from six.moves.urllib.request import urlopen from copy import deepcopy from w3lib.encoding import html_to_unicode - -from . import _htmlpage - - -parse_html = _htmlpage.parse_html -HtmlDataFragment = _htmlpage.HtmlDataFragment -HtmlTag = _htmlpage.HtmlTag -HtmlTagType = _htmlpage.HtmlTagType +try: + from . import _htmlpage + parse_html = _htmlpage.parse_html + HtmlDataFragment = _htmlpage.HtmlDataFragment + HtmlTag = _htmlpage.HtmlTag + HtmlTagType = _htmlpage.HtmlTagType +except ImportError: + import re + from collections import OrderedDict + + class HtmlTagType(object): + OPEN_TAG = 1 + CLOSE_TAG = 2 + UNPAIRED_TAG = 3 + + class HtmlDataFragment(object): + __slots__ = ('start', 'end', 'is_text_content') + + def __init__(self, start, end, is_text_content=False): + self.start = start + self.end = end + self.is_text_content = is_text_content + + def __str__(self): + return "" % ( + self.start, self.end, self.is_text_content) + + def __repr__(self): + return str(self) + + class HtmlTag(HtmlDataFragment): + __slots__ = ('tag_type', 'tag', '_attributes', '_attr_text') + + def __init__(self, tag_type, tag, attr_text, start, end): + HtmlDataFragment.__init__(self, start, end) + self.tag_type = tag_type + self.tag = tag + if isinstance(attr_text, dict): + self._attributes = attr_text + self._attr_text = None + else: # defer loading attributes until necessary + self._attributes = OrderedDict() + self._attr_text = attr_text + + @property + def attributes(self): + if not self._attributes and self._attr_text: + for attr_match in _ATTR_REGEXP.findall(self._attr_text): + name = attr_match[0].lower() + values = [v for v in attr_match[1:] if v] + # According to HTML spec if attribute name is repeated only + # the first one is taken into account + if name not in self._attributes: + self._attributes[name] = values[0] if values else None + return self._attributes + + def __str__(self): + attributes = ', '.join( + sorted(["%s: %s" % (k, repr(v)) + for k, v in self.attributes.items()])) + return "" % ( + self.tag, attributes, self.tag_type, self.start, self.end) + + def __repr__(self): + return str(self) + + _ATTR = 
("((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|" + "([^>\s]+))?)?") + _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?" + _DOCTYPE = r"" + _SCRIPT = "()(.*?)()" + _COMMENT = "(|<\?.+?>)" + + _ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL) + _HTML_REGEXP = re.compile("%s|%s|%s" % (_COMMENT, _SCRIPT, _TAG), + re.I | re.DOTALL) + _DOCTYPE_REGEXP = re.compile("(?:%s)" % _DOCTYPE) + _COMMENT_REGEXP = re.compile(_COMMENT, re.DOTALL) + + def parse_html(text): + """Higher level html parser. Calls lower level parsers and joins sucesive + HtmlDataFragment elements in a single one. + """ + # If have doctype remove it. + start_pos = 0 + match = _DOCTYPE_REGEXP.match(text) + if match: + start_pos = match.end() + prev_end = start_pos + for match in _HTML_REGEXP.finditer(text, start_pos): + start = match.start() + end = match.end() + + if start > prev_end: + yield HtmlDataFragment(prev_end, start, True) + + if match.groups()[0] is not None: # comment + yield HtmlDataFragment(start, end) + elif match.groups()[1] is not None: # + for e in _parse_script(match): + yield e + else: # tag + yield _parse_tag(match) + prev_end = end + textlen = len(text) + if prev_end < textlen: + yield HtmlDataFragment(prev_end, textlen, True) + + def _parse_script(match): + """parse a region matched by _HTML_REGEXP""" + open_text, content, close_text = match.groups()[1:4] + + open_tag = _parse_tag(_HTML_REGEXP.match(open_text)) + open_tag.start = match.start() + open_tag.end = match.start() + len(open_text) + + close_tag = _parse_tag(_HTML_REGEXP.match(close_text)) + close_tag.start = match.end() - len(close_text) + close_tag.end = match.end() + + yield open_tag + if open_tag.end < close_tag.start: + start_pos = 0 + for m in _COMMENT_REGEXP.finditer(content): + if m.start() > start_pos: + yield HtmlDataFragment( + open_tag.end + start_pos, open_tag.end + m.start()) + yield HtmlDataFragment( + open_tag.end + m.start(), open_tag.end + m.end()) + start_pos = m.end() + if open_tag.end + start_pos < close_tag.start: + yield HtmlDataFragment( + open_tag.end + start_pos, close_tag.start) + yield close_tag + + def _parse_tag(match): + """ + parse a tag matched by _HTML_REGEXP + """ + data = match.groups() + closing, tag, attr_text = data[4:7] + # if tag is None then the match is a comment + if tag is not None: + unpaired = data[-1] + if closing: + tag_type = HtmlTagType.CLOSE_TAG + elif unpaired: + tag_type = HtmlTagType.UNPAIRED_TAG + else: + tag_type = HtmlTagType.OPEN_TAG + return HtmlTag(tag_type, tag.lower(), attr_text, match.start(), + match.end()) def url_to_page(url, encoding=None, default_encoding='utf-8'): @@ -164,6 +308,7 @@ def __new__(cls, htmlpage, start_index, end_index): text_start = htmlpage.parsed_body[start_index].start text_end = htmlpage.parsed_body[end_index or -1].end text = htmlpage.body[text_start:text_end] + return HtmlPageRegion.__new__(cls, htmlpage, text) def __init__(self, htmlpage, start_index, end_index): diff --git a/setup.py b/setup.py index c02fe82..c563751 100755 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ setup( name='scrapely', - version='0.12.0', + version='0.13.0b1', license='BSD', description='A pure-python HTML screen-scraping library', author='Scrapy project', @@ -45,5 +45,8 @@ 'Topic :: Text Processing :: Markup :: HTML', ], install_requires=['numpy', 'w3lib', 'six'], + extras_requires={ + 'speedup': ['cython'] + }, ext_modules=extensions, ) diff --git a/tox.ini b/tox.ini index 34a9a3a..77edb41 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then 
run "tox" from this directory. [tox] -envlist = py27,py33,py34 +envlist = py27,py34 usedevelop = True [testenv] From cf0ee1b8d2d8bc54bd3527dd48e9004240cc2b3a Mon Sep 17 00:00:00 2001 From: Ruairi Fahy Date: Thu, 10 Nov 2016 14:48:35 +0000 Subject: [PATCH 3/3] Handle parsing of ``. Use pypy as test environment. Test python parsing implementation Fallback to pure python parser if no cython available --- .travis.yml | 2 +- scrapely/_htmlpage.pyx | 9 +++- scrapely/compat.py | 6 --- scrapely/extraction/regionextract.py | 7 ++- scrapely/extractors.py | 12 ++--- scrapely/htmlpage.py | 2 +- setup.py | 8 +++- tests/test_htmlpage_data.py | 68 +++++++++++++++++++--------- tox.ini | 3 +- 9 files changed, 71 insertions(+), 46 deletions(-) delete mode 100644 scrapely/compat.py diff --git a/.travis.yml b/.travis.yml index 00097fc..767c039 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,8 +3,8 @@ python: 2.7 env: - TOXENV=py27 -- TOXENV=py33 - TOXENV=py34 +- TOXENV=pypy install: - pip install cython diff --git a/scrapely/_htmlpage.pyx b/scrapely/_htmlpage.pyx index 6939671..8d35720 100644 --- a/scrapely/_htmlpage.pyx +++ b/scrapely/_htmlpage.pyx @@ -84,8 +84,13 @@ cdef class CommentParser: (self.open_state == 4 and c == u'-')): self.open_state += 1 else: + # Handle comment + if self.open_state == 3 and c == u'>': + self.inside_comment = False + self.reset() + self.start, self.end = i - 2, i + return True self.open_state = 1 - if self.open_state == 5: if self.open_count == 0: self.start = i - 3 @@ -233,6 +238,8 @@ cpdef parse_html(s): parsed.append( HtmlDataFragment(comment_parser.start, tag_end + 1, False)) reset_tag = True + if (comment_parser.end - comment_parser.start) == 2: + open_tag = False if comment_parser.inside_comment: open_tag = False diff --git a/scrapely/compat.py b/scrapely/compat.py deleted file mode 100644 index 78e06e7..0000000 --- a/scrapely/compat.py +++ /dev/null @@ -1,6 +0,0 @@ -try: - utext = unicode -except NameError: - class utext(str): - def __repr__(self): - return 'u{}'.format(super(utext, self).__repr__()) diff --git a/scrapely/extraction/regionextract.py b/scrapely/extraction/regionextract.py index f2f6cf2..8469ce9 100644 --- a/scrapely/extraction/regionextract.py +++ b/scrapely/extraction/regionextract.py @@ -64,18 +64,17 @@ class BasicTypeExtractor(object): annotations. For example: - >>> from scrapely.compat import utext >>> from scrapely.extraction.pageparsing import parse_strings >>> template, page = parse_strings( \ u'
<html><body><b data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x</b></body></html>', \
         u'<html><body><b> a name</b></body></html>
') >>> ex = BasicTypeExtractor(template.annotations[0]) - >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)] + >>> ex.extract(page, 0, 1, None) [(u'name', u' a name')] It supports attribute descriptors >>> descriptor = FieldDescriptor('name', None, lambda x: x.strip()) >>> ex = BasicTypeExtractor(template.annotations[0], {'name': descriptor}) - >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)] + >>> ex.extract(page, 0, 1, None) [(u'name', u'a name')] It supports ignoring regions @@ -83,7 +82,7 @@ class BasicTypeExtractor(object): u'
<html><body><div data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x<b> xx</b></div></body></html>',\
         u'<html><body><div>a name<b> id-9</b></div></body></html>
') >>> ex = BasicTypeExtractor(template.annotations[0]) - >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 3, [PageRegion(1, 2)])] + >>> ex.extract(page, 0, 3, [PageRegion(1, 2)]) [(u'name', u'a name')] """ diff --git a/scrapely/extractors.py b/scrapely/extractors.py index 4fe451a..20dab22 100644 --- a/scrapely/extractors.py +++ b/scrapely/extractors.py @@ -81,8 +81,7 @@ def text(region): removing excessive whitespace, For example: - >>> from scrapely.compat import utext - >>> t = lambda s: utext(text(htmlregion(s))) + >>> t = lambda s: text(htmlregion(s)) >>> t(u'
<h1>test</h1>
') u'test' @@ -123,8 +122,7 @@ def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE, opening and closing tag is removed. For example: - >>> from scrapely.compat import utext - >>> t = lambda s, keep=_TAGS_TO_KEEP: utext(safehtml(htmlregion(s), keep)) + >>> t = lambda s, keep=_TAGS_TO_KEEP: safehtml(htmlregion(s), keep) >>> t(u'test test') u'test test' @@ -274,8 +272,7 @@ def extract_number(txt): >>> extract_number(' 45.3, 7') It will handle unescaped entities: - >>> from scrapely.compat import utext - >>> utext(extract_number(u'£129.99')) + >>> extract_number(u'£129.99') u'129.99' """ txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt) @@ -288,7 +285,6 @@ def extract_price(txt): """ Extracts numbers making some price format specific assumptions - >>> from scrapely.compat import utext >>> extract_price('asdf 234,234.45sdf ') '234234.45' >>> extract_price('234,23') @@ -302,7 +298,7 @@ def extract_price(txt): >>> extract_price('adsfg') >>> extract_price('stained, linseed oil finish, clear glas doors') >>> extract_price('') - >>> utext(extract_price(u'£129.99')) + >>> extract_price(u'£129.99') u'129.99' """ txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt) diff --git a/scrapely/htmlpage.py b/scrapely/htmlpage.py index 62e2650..5900c34 100644 --- a/scrapely/htmlpage.py +++ b/scrapely/htmlpage.py @@ -82,7 +82,7 @@ def __repr__(self): _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?" _DOCTYPE = r"" _SCRIPT = "()(.*?)()" - _COMMENT = "(|<\?.+?>)" + _COMMENT = "(""" +# for testing tags in different forms +PAGE3 = u""" + + + + Page name + + + + + + + +""" PARSED3 = [ - {'attributes': {}, 'end': 6, 'start': 0, 'tag': u'html', 'tag_type': 1}, - {'attributes': {}, 'end': 12, 'start': 6, 'tag': u'body', 'tag_type': 1}, - {'attributes': {}, 'end': 16, 'start': 12, 'tag': u'h1', 'tag_type': 1}, - {'end': 26, 'start': 16}, - {'attributes': {}, 'end': 31, 'start': 26, 'tag': u'h1', 'tag_type': 2}, - {'attributes': {}, 'end': 34, 'start': 31, 'tag': u'p', 'tag_type': 1}, - {'end': 51, 'start': 34}, - {'attributes': {}, 'end': 55, 'start': 51, 'tag': u'p', 'tag_type': 2}, - {'end': 70, 'start': 55, 'is_text_content': False}, - {'attributes': {u'type': u'text/javascript'}, 'end': 101, 'start': 70, 'tag': u'script', 'tag_type': 1}, - {'end': 104, 'start': 101, 'is_text_content': False}, - {'end': 118, 'start': 104, 'is_text_content': False}, - {'end': 124, 'start': 118, 'is_text_content': False}, - {'attributes': {}, 'end': 133, 'start': 124, 'tag': u'script', 'tag_type': 2}, - {'attributes': {}, 'end': 140, 'start': 133, 'tag': u'body', 'tag_type': 2}, - {'attributes': {}, 'end': 147, 'start': 140, 'tag': u'html', 'tag_type': 2} + {'end': 16, 'start': 15, 'is_text_content': True}, + {'end': 22, 'start': 16, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'html'}, + {'end': 27, 'start': 22, 'is_text_content': True}, + {'end': 33, 'start': 27, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'head'}, + {'end': 38, 'start': 33, 'is_text_content': True}, + {'end': 69, 'start': 38, 'is_text_content': False}, + {'end': 74, 'start': 69, 'is_text_content': True}, + {'end': 81, 'start': 74, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'title'}, + {'end': 90, 'start': 81, 'is_text_content': True}, + {'end': 98, 'start': 90, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'title'}, + {'end': 103, 'start': 98, 'is_text_content': True}, + {'end': 137, 'start': 103, 
'attributes': {'content': 'value', 'name': 'name'}, 'tag_type': 1, 'is_text_content': False, 'tag': 'meta'}, + {'end': 140, 'start': 137, 'is_text_content': False}, + {'end': 141, 'start': 140, 'is_text_content': True}, + {'end': 174, 'start': 141, 'is_text_content': False}, + {'end': 179, 'start': 174, 'is_text_content': True}, + {'end': 186, 'start': 179, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'head'}, + {'end': 192, 'start': 186, 'is_text_content': True}, + {'end': 320, 'start': 192, 'is_text_content': False}, + {'end': 325, 'start': 320, 'is_text_content': True}, + {'end': 331, 'start': 325, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'body'}, + {'end': 336, 'start': 331, 'is_text_content': True}, + {'end': 343, 'start': 336, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'body'}, + {'end': 344, 'start': 343, 'is_text_content': True}, + {'end': 351, 'start': 344, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'html'}, + {'end': 352, 'start': 351, 'is_text_content': True} ] # for testing tags inside scripts @@ -293,4 +318,3 @@ {"attributes": {}, "end": 91, "start": 84, "tag": "body", "tag_type": 2}, {"attributes": {}, "end": 98, "start": 91, "tag": "html", "tag_type": 2} ] - diff --git a/tox.ini b/tox.ini index 77edb41..1375cd9 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py27,py34 +envlist = py27,py34,pypy,pypy3 usedevelop = True [testenv] @@ -14,6 +14,7 @@ deps = nose-parameterized doctest-ignore-unicode coverage + cython commands = pip install -e . nosetests \
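
With the series applied, the C extensions become optional: a plain install compiles the shipped `.c` files, `CYTHONIZE=1 python setup.py build` regenerates them from the `.pyx` sources, and the pure-Python parser added to `scrapely/htmlpage.py` takes over when no compiled extension is importable. Below is a quick way to confirm which implementation an installed copy picked up — a hypothetical sanity check mirroring the try/except ImportError fallback in the patches above, not code from this series:

```python
from __future__ import print_function

# Probe the same import that scrapely/htmlpage.py attempts: the compiled
# extension is only present if the .pyx/.c sources were actually built.
try:
    from scrapely import _htmlpage  # noqa: F401
    impl = "cython extension"
except ImportError:
    impl = "pure-python fallback"
print("parser implementation:", impl)

from scrapely.htmlpage import parse_html, HtmlTag

html = u"<p>hello <!-- note --> world</p>"
for fragment in parse_html(html):
    # Every fragment carries start/end offsets into the source string;
    # HtmlTag instances additionally expose .tag and .tag_type.
    kind = fragment.tag if isinstance(fragment, HtmlTag) else "data"
    print(kind, repr(html[fragment.start:fragment.end]))
```

Both implementations should emit the same fragment sequence for this input, including a single non-text-content fragment covering the comment — which is exactly what the `<!-->` handling and the pure-Python parser tests in this patch are meant to guarantee.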