diff --git a/.travis.yml b/.travis.yml
index e6b5e3f..767c039 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,13 +3,13 @@
 python: 2.7
 env:
 - TOXENV=py27
-- TOXENV=py33
 - TOXENV=py34
+- TOXENV=pypy
 install:
 - pip install cython
-- pip install -U tox codecov
-
+- CYTHONIZE=1 python setup.py build
+- pip install -U tox
 script: tox
 after_success:
diff --git a/MANIFEST.in b/MANIFEST.in
index a3b876d..69f062a 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,2 +1,4 @@
 include scrapely/*.pyx
-include scrapely/extraction/*.pyx
\ No newline at end of file
+include scrapely/extraction/*.pyx
+include scrapely/*.c
+include scrapely/extraction/*.c
diff --git a/requirements.txt b/requirements.txt
index c3f0da3..b59bcf1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
 numpy
 w3lib
-six
-cython
+six
\ No newline at end of file
diff --git a/scrapely/_htmlpage.pyx b/scrapely/_htmlpage.pyx
index c138f6a..8d35720 100644
--- a/scrapely/_htmlpage.pyx
+++ b/scrapely/_htmlpage.pyx
@@ -84,13 +84,18 @@ cdef class CommentParser:
             (self.open_state == 4 and c == u'-')):
             self.open_state += 1
         else:
+            # Handle the empty <!> comment
+            if self.open_state == 3 and c == u'>':
+                self.inside_comment = False
+                self.reset()
+                self.start, self.end = i - 2, i
+                return True
             self.open_state = 1
-
         if self.open_state == 5:
             if self.open_count == 0:
                 self.start = i - 3
             self.open_state = 1
-            self.open_count += 1
+            self.open_count = 1
             self.inside_comment = True
 
         if self.close_count < self.open_count:
@@ -141,12 +146,12 @@ cdef class ScriptParser:
             self.state = 1
         if ((self.state == 1 and c == u'<') or
             (self.state == 2 and c == u'/') or
-            (self.state == 3 and c == u's' or c == u'S') or
-            (self.state == 4 and c == u'c' or c == u'C') or
-            (self.state == 5 and c == u'r' or c == u'R') or
-            (self.state == 6 and c == u'i' or c == u'I') or
-            (self.state == 7 and c == u'p' or c == u'P') or
-            (self.state == 8 and c == u't' or c == u'T') or
+            (self.state == 3 and c in u'sS') or
+            (self.state == 4 and c in u'cC') or
+            (self.state == 5 and c in u'rR') or
+            (self.state == 6 and c in u'iI') or
+            (self.state == 7 and c in u'pP') or
+            (self.state == 8 and c in u'tT') or
             (self.state == 9 and c == u'>')):
             self.state += 1
         else:
@@ -233,6 +238,8 @@ cpdef parse_html(s):
                 parsed.append(
                     HtmlDataFragment(comment_parser.start, tag_end + 1, False))
                 reset_tag = True
+                if (comment_parser.end - comment_parser.start) == 2:
+                    open_tag = False
 
         if comment_parser.inside_comment:
             open_tag = False
@@ -288,7 +295,7 @@ cpdef parse_html(s):
             if tag_name != u'!doctype':
                 parsed.append(
                     HtmlTag(tag_type, tag_name,
-                        tag_attributes, tag_start, tag_end + 1))
+                            tag_attributes, tag_start, tag_end + 1))
             if tag_name == u'script':
                 script = True
         if open_tag:
diff --git a/scrapely/extraction/regionextract.py b/scrapely/extraction/regionextract.py
index 2eba3fa..8469ce9 100644
--- a/scrapely/extraction/regionextract.py
+++ b/scrapely/extraction/regionextract.py
@@ -640,12 +640,12 @@ def extract_text(self, text):
         pref_index = 0
         if self.minprefix > 0:
             rev_idx, plen = longest_unique_subsequence(text[::-1], self.prefix)
-            if plen < self.minprefix:
+            if plen is None or plen < self.minprefix:
                 return None
             pref_index = -rev_idx
         if self.minsuffix == 0:
             return text[pref_index:]
         sidx, slen = longest_unique_subsequence(text[pref_index:], self.suffix)
-        if slen < self.minsuffix:
+        if slen is None or slen < self.minsuffix:
             return None
         return text[pref_index:pref_index + sidx]
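
The two `is None` guards added above are the substance of this hunk: `longest_unique_subsequence` (see scrapely/extraction/similarity.py below) returns `(None, None)` when the prefix or suffix never occurs in the text, or when the best match is not unique. Under Python 2 the old code limped through because `None` compares less than any integer; under Python 3 the comparison raises `TypeError`. A minimal sketch of the behaviour being guarded against, with invented sample sequences:

    from scrapely.extraction.similarity import longest_unique_subsequence

    # No occurrence of [9] in the haystack: index and length are both None.
    idx, plen = longest_unique_subsequence([1, 2, 3], [9])
    assert (idx, plen) == (None, None)
    # Python 2: None < 1 is True, so the old `plen < self.minprefix` check
    # accidentally returned None anyway; Python 3: `None < 1` raises
    # TypeError, hence the explicit `plen is None` guard.
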
diff --git a/scrapely/extraction/similarity.py b/scrapely/extraction/similarity.py
index 81573fb..ac640b5 100644
--- a/scrapely/extraction/similarity.py
+++ b/scrapely/extraction/similarity.py
@@ -6,9 +6,17 @@
 from operator import itemgetter
 from heapq import nlargest
 
-# For typical use cases (small sequences and patterns) the naive approach actually
-# runs faster than KMP algorithm
-from . _similarity import naive_match_length
+try:
+    # For typical use cases (small sequences and patterns) the naive approach
+    # actually runs faster than KMP algorithm
+    from . _similarity import naive_match_length
+except ImportError:
+    def naive_match_length(to_search, subsequence, range_start, range_end):
+        startval = subsequence[0]
+        return ((i, common_prefix_length(to_search[i:], subsequence))
+                for i in range(range_start, range_end)
+                if startval == to_search[i])
+
 
 def common_prefix_length(a, b):
     """Calculate the length of the common prefix in both sequences passed.
@@ -46,7 +54,7 @@ def common_prefix(*sequences):
 
 def longest_unique_subsequence(to_search, subsequence, range_start=0,
-        range_end=None):
+                               range_end=None):
     """Find the longest unique subsequence of items in an array or string.
 
     This searches to_search looking for the longest overlapping
     match with subsequence. If the largest match is unique (there is no other
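
The pure-Python `naive_match_length` mirrors the Cython `_similarity` helper: it yields an `(index, match_length)` pair for every position in `to_search` whose item equals the first item of `subsequence`. Note the fallback must use `range`, not `xrange`, since the py34 and pypy3 environments added in tox.ini below exercise it. A quick sanity check of the semantics, with invented sample data:

    from scrapely.extraction.similarity import (
        common_prefix_length, longest_unique_subsequence)

    # The shared prefix of the two sequences is 'scrap', so length 5.
    assert common_prefix_length('scrapely', 'scrapy') == 5

    # [2, 3] matches at index 1 (length 2) and index 3 (length 1); the
    # longest match is unique, so the result is (1, 2).
    assert longest_unique_subsequence([1, 2, 3, 2], [2, 3]) == (1, 2)
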
("((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|" + "([^>\s]+))?)?") + _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?" + _DOCTYPE = r"" + _SCRIPT = "()(.*?)()" + _COMMENT = "(""" +# for testing tags in different forms +PAGE3 = u""" + + + + Page name + + + + + + + +""" PARSED3 = [ - {'attributes': {}, 'end': 6, 'start': 0, 'tag': u'html', 'tag_type': 1}, - {'attributes': {}, 'end': 12, 'start': 6, 'tag': u'body', 'tag_type': 1}, - {'attributes': {}, 'end': 16, 'start': 12, 'tag': u'h1', 'tag_type': 1}, - {'end': 26, 'start': 16}, - {'attributes': {}, 'end': 31, 'start': 26, 'tag': u'h1', 'tag_type': 2}, - {'attributes': {}, 'end': 34, 'start': 31, 'tag': u'p', 'tag_type': 1}, - {'end': 51, 'start': 34}, - {'attributes': {}, 'end': 55, 'start': 51, 'tag': u'p', 'tag_type': 2}, - {'end': 70, 'start': 55, 'is_text_content': False}, - {'attributes': {u'type': u'text/javascript'}, 'end': 101, 'start': 70, 'tag': u'script', 'tag_type': 1}, - {'end': 104, 'start': 101, 'is_text_content': False}, - {'end': 118, 'start': 104, 'is_text_content': False}, - {'end': 124, 'start': 118, 'is_text_content': False}, - {'attributes': {}, 'end': 133, 'start': 124, 'tag': u'script', 'tag_type': 2}, - {'attributes': {}, 'end': 140, 'start': 133, 'tag': u'body', 'tag_type': 2}, - {'attributes': {}, 'end': 147, 'start': 140, 'tag': u'html', 'tag_type': 2} + {'end': 16, 'start': 15, 'is_text_content': True}, + {'end': 22, 'start': 16, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'html'}, + {'end': 27, 'start': 22, 'is_text_content': True}, + {'end': 33, 'start': 27, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'head'}, + {'end': 38, 'start': 33, 'is_text_content': True}, + {'end': 69, 'start': 38, 'is_text_content': False}, + {'end': 74, 'start': 69, 'is_text_content': True}, + {'end': 81, 'start': 74, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'title'}, + {'end': 90, 'start': 81, 'is_text_content': True}, + {'end': 98, 'start': 90, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'title'}, + {'end': 103, 'start': 98, 'is_text_content': True}, + {'end': 137, 'start': 103, 'attributes': {'content': 'value', 'name': 'name'}, 'tag_type': 1, 'is_text_content': False, 'tag': 'meta'}, + {'end': 140, 'start': 137, 'is_text_content': False}, + {'end': 141, 'start': 140, 'is_text_content': True}, + {'end': 174, 'start': 141, 'is_text_content': False}, + {'end': 179, 'start': 174, 'is_text_content': True}, + {'end': 186, 'start': 179, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'head'}, + {'end': 192, 'start': 186, 'is_text_content': True}, + {'end': 320, 'start': 192, 'is_text_content': False}, + {'end': 325, 'start': 320, 'is_text_content': True}, + {'end': 331, 'start': 325, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'body'}, + {'end': 336, 'start': 331, 'is_text_content': True}, + {'end': 343, 'start': 336, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'body'}, + {'end': 344, 'start': 343, 'is_text_content': True}, + {'end': 351, 'start': 344, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'html'}, + {'end': 352, 'start': 351, 'is_text_content': True} ] # for testing tags inside scripts @@ -293,4 +318,3 @@ {"attributes": {}, "end": 91, "start": 84, "tag": "body", "tag_type": 2}, {"attributes": {}, "end": 98, "start": 91, "tag": "html", "tag_type": 2} ] - diff --git a/tox.ini b/tox.ini index 34a9a3a..1375cd9 100644 --- a/tox.ini 
diff --git a/tests/test_htmlpage_data.py b/tests/test_htmlpage_data.py
--- a/tests/test_htmlpage_data.py
+++ b/tests/test_htmlpage_data.py
 """
+# for testing tags in different forms
+PAGE3 = u"""<!DOCTYPE html>
+<html>
+    <head>
+    <!-- ... -->
+    <title>Page name</title>
+    <meta name="name" content="value"><!>
+<?...?>
+    </head>
+
+    <!-- ... -->
+    <body>
+    </body>
+</html>
+"""
 PARSED3 = [
-{'attributes': {}, 'end': 6, 'start': 0, 'tag': u'html', 'tag_type': 1},
-{'attributes': {}, 'end': 12, 'start': 6, 'tag': u'body', 'tag_type': 1},
-{'attributes': {}, 'end': 16, 'start': 12, 'tag': u'h1', 'tag_type': 1},
-{'end': 26, 'start': 16},
-{'attributes': {}, 'end': 31, 'start': 26, 'tag': u'h1', 'tag_type': 2},
-{'attributes': {}, 'end': 34, 'start': 31, 'tag': u'p', 'tag_type': 1},
-{'end': 51, 'start': 34},
-{'attributes': {}, 'end': 55, 'start': 51, 'tag': u'p', 'tag_type': 2},
-{'end': 70, 'start': 55, 'is_text_content': False},
-{'attributes': {u'type': u'text/javascript'}, 'end': 101, 'start': 70, 'tag': u'script', 'tag_type': 1},
-{'end': 104, 'start': 101, 'is_text_content': False},
-{'end': 118, 'start': 104, 'is_text_content': False},
-{'end': 124, 'start': 118, 'is_text_content': False},
-{'attributes': {}, 'end': 133, 'start': 124, 'tag': u'script', 'tag_type': 2},
-{'attributes': {}, 'end': 140, 'start': 133, 'tag': u'body', 'tag_type': 2},
-{'attributes': {}, 'end': 147, 'start': 140, 'tag': u'html', 'tag_type': 2}
+{'end': 16, 'start': 15, 'is_text_content': True},
+{'end': 22, 'start': 16, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'html'},
+{'end': 27, 'start': 22, 'is_text_content': True},
+{'end': 33, 'start': 27, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'head'},
+{'end': 38, 'start': 33, 'is_text_content': True},
+{'end': 69, 'start': 38, 'is_text_content': False},
+{'end': 74, 'start': 69, 'is_text_content': True},
+{'end': 81, 'start': 74, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'title'},
+{'end': 90, 'start': 81, 'is_text_content': True},
+{'end': 98, 'start': 90, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'title'},
+{'end': 103, 'start': 98, 'is_text_content': True},
+{'end': 137, 'start': 103, 'attributes': {'content': 'value', 'name': 'name'}, 'tag_type': 1, 'is_text_content': False, 'tag': 'meta'},
+{'end': 140, 'start': 137, 'is_text_content': False},
+{'end': 141, 'start': 140, 'is_text_content': True},
+{'end': 174, 'start': 141, 'is_text_content': False},
+{'end': 179, 'start': 174, 'is_text_content': True},
+{'end': 186, 'start': 179, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'head'},
+{'end': 192, 'start': 186, 'is_text_content': True},
+{'end': 320, 'start': 192, 'is_text_content': False},
+{'end': 325, 'start': 320, 'is_text_content': True},
+{'end': 331, 'start': 325, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'body'},
+{'end': 336, 'start': 331, 'is_text_content': True},
+{'end': 343, 'start': 336, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'body'},
+{'end': 344, 'start': 343, 'is_text_content': True},
+{'end': 351, 'start': 344, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'html'},
+{'end': 352, 'start': 351, 'is_text_content': True}
 ]
@@ -293,4 +318,3 @@
 {"attributes": {}, "end": 91, "start": 84, "tag": "body", "tag_type": 2},
 {"attributes": {}, "end": 98, "start": 91, "tag": "html", "tag_type": 2}
 ]
-
diff --git a/tox.ini b/tox.ini
index 34a9a3a..1375cd9 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,7 @@
 # and then run "tox" from this directory.
 
 [tox]
-envlist = py27,py33,py34
+envlist = py27,py34,pypy,pypy3
 usedevelop = True
 
 [testenv]
@@ -14,6 +14,7 @@ deps =
     nose-parameterized
     doctest-ignore-unicode
     coverage
+    cython
 commands =
     pip install -e .
     nosetests \
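
Taken together, the packaging changes make Cython a build-time option rather than a hard dependency: requirements.txt drops cython, MANIFEST.in ships the generated .c files alongside the .pyx sources, the try/except ImportError fallbacks cover the case where no extension could be built at all, and CI regenerates the C sources explicitly via CYTHONIZE=1 python setup.py build. The matching setup.py is not part of this diff; a minimal sketch of how such a switch is commonly wired, with all names assumed rather than taken from the patch:

    import os
    from setuptools import setup, Extension

    # Compile from .pyx only when explicitly requested; otherwise fall back
    # to the pre-generated .c files that MANIFEST.in now ships in the sdist.
    CYTHONIZE = bool(os.environ.get('CYTHONIZE'))
    ext = '.pyx' if CYTHONIZE else '.c'
    extensions = [
        Extension('scrapely._htmlpage', ['scrapely/_htmlpage' + ext]),
        Extension('scrapely.extraction._similarity',
                  ['scrapely/extraction/_similarity' + ext]),
    ]
    if CYTHONIZE:
        from Cython.Build import cythonize
        extensions = cythonize(extensions)

    setup(name='scrapely', ext_modules=extensions)
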