Merged

Changes from all commits
6 changes: 3 additions & 3 deletions .travis.yml
@@ -3,13 +3,13 @@ python: 2.7

env:
- TOXENV=py27
- TOXENV=py33
- TOXENV=py34
- TOXENV=pypy

install:
- pip install cython
- pip install -U tox codecov

- CYTHONIZE=1 python setup.py build
- pip install -U tox
script: tox

after_success:
4 changes: 3 additions & 1 deletion MANIFEST.in
@@ -1,2 +1,4 @@
include scrapely/*.pyx
include scrapely/extraction/*.pyx
include scrapely/extraction/*.pyx
include scrapely/*.c
include scrapely/extraction/*.c
3 changes: 1 addition & 2 deletions requirements.txt
@@ -1,4 +1,3 @@
numpy
w3lib
six
cython
six
25 changes: 16 additions & 9 deletions scrapely/_htmlpage.pyx
@@ -84,13 +84,18 @@ cdef class CommentParser:
(self.open_state == 4 and c == u'-')):
self.open_state += 1
else:
# Handle <!> comment
if self.open_state == 3 and c == u'>':
self.inside_comment = False
self.reset()
self.start, self.end = i - 2, i
return True
self.open_state = 1

if self.open_state == 5:
if self.open_count == 0:
self.start = i - 3
self.open_state = 1
self.open_count += 1
self.open_count = 1
self.inside_comment = True

if self.close_count < self.open_count:
@@ -141,12 +146,12 @@ cdef class ScriptParser:
self.state = 1
if ((self.state == 1 and c == u'<') or
(self.state == 2 and c == u'/') or
(self.state == 3 and c == u's' or c == u'S') or
(self.state == 4 and c == u'c' or c == u'C') or
(self.state == 5 and c == u'r' or c == u'R') or
(self.state == 6 and c == u'i' or c == u'I') or
(self.state == 7 and c == u'p' or c == u'P') or
(self.state == 8 and c == u't' or c == u'T') or
(self.state == 3 and c in u'sS') or
(self.state == 4 and c in u'cC') or
(self.state == 5 and c in u'rR') or
(self.state == 6 and c in u'iI') or
(self.state == 7 and c in u'pP') or
(self.state == 8 and c in u'tT') or
(self.state == 9 and c == u'>')):
self.state += 1
else:
@@ -233,6 +238,8 @@ cpdef parse_html(s):
parsed.append(
HtmlDataFragment(comment_parser.start, tag_end + 1, False))
reset_tag = True
if (comment_parser.end - comment_parser.start) == 2:
open_tag = False

if comment_parser.inside_comment:
open_tag = False
@@ -288,7 +295,7 @@ cpdef parse_html(s):
if tag_name != u'!doctype':
parsed.append(
HtmlTag(tag_type, tag_name,
tag_attributes, tag_start, tag_end + 1))
tag_attributes, tag_start, tag_end + 1))
if tag_name == u'script':
script = True
if open_tag:
4 changes: 2 additions & 2 deletions scrapely/extraction/regionextract.py
@@ -640,12 +640,12 @@ def extract_text(self, text):
pref_index = 0
if self.minprefix > 0:
rev_idx, plen = longest_unique_subsequence(text[::-1], self.prefix)
if plen < self.minprefix:
if plen is None or plen < self.minprefix:
return None
pref_index = -rev_idx
if self.minsuffix == 0:
return text[pref_index:]
sidx, slen = longest_unique_subsequence(text[pref_index:], self.suffix)
if slen < self.minsuffix:
if slen is None or slen < self.minsuffix:
return None
return text[pref_index:pref_index + sidx]
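For context (a hedged note, not part of the diff): longest_unique_subsequence presumably returns (None, None) when no unique match is found, and on Python 3 comparing None with an integer raises TypeError, so the None check has to come before the length comparison. A minimal sketch of the pattern:

# Hedged sketch: guard against an assumed (None, None) "no match" result before
# comparing lengths; the short-circuit avoids the None < int comparison.
sidx, slen = None, None   # assumed result when nothing matches
minsuffix = 2
if slen is None or slen < minsuffix:
    print("no usable match")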
16 changes: 12 additions & 4 deletions scrapely/extraction/similarity.py
@@ -6,9 +6,17 @@
from operator import itemgetter
from heapq import nlargest

# For typical use cases (small sequences and patterns) the naive approach actually
# runs faster than KMP algorithm
from . _similarity import naive_match_length
try:
# For typical use cases (small sequences and patterns) the naive approach
# actually runs faster than KMP algorithm
from . _similarity import naive_match_length
except ImportError:
def naive_match_length(to_search, subsequence, range_start, range_end):
startval = subsequence[0]
return ((i, common_prefix_length(to_search[i:], subsequence))
for i in xrange(range_start, range_end)
if startval == to_search[i])


def common_prefix_length(a, b):
"""Calculate the length of the common prefix in both sequences passed.
@@ -46,7 +54,7 @@ def common_prefix(*sequences):


def longest_unique_subsequence(to_search, subsequence, range_start=0,
range_end=None):
range_end=None):
"""Find the longest unique subsequence of items in an array or string. This
searches to_search looking for the longest overlapping
match with subsequence. If the largest match is unique (there is no other
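For reference, a self-contained sketch of what the pure-Python fallback above computes (helper names mirror the diff; common_prefix_length's behaviour is assumed from its docstring): it yields (start index, common prefix length) pairs for every position whose first item matches the subsequence's first item.

# Self-contained sketch, not scrapely's exact code.
def common_prefix_length(a, b):
    # length of the common leading run of the two sequences
    length = 0
    for x, y in zip(a, b):
        if x != y:
            break
        length += 1
    return length

def naive_match_length(to_search, subsequence, range_start, range_end):
    # yield (index, match length) for every candidate start position
    startval = subsequence[0]
    return ((i, common_prefix_length(to_search[i:], subsequence))
            for i in range(range_start, range_end)
            if startval == to_search[i])

print(list(naive_match_length([1, 2, 3, 1, 2, 4], [1, 2, 4], 0, 4)))
# -> [(0, 2), (3, 3)]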
161 changes: 153 additions & 8 deletions scrapely/htmlpage.py
@@ -11,14 +11,158 @@
from six.moves.urllib.request import urlopen
from copy import deepcopy
from w3lib.encoding import html_to_unicode

from . import _htmlpage


parse_html = _htmlpage.parse_html
HtmlDataFragment = _htmlpage.HtmlDataFragment
HtmlTag = _htmlpage.HtmlTag
HtmlTagType = _htmlpage.HtmlTagType
try:
from . import _htmlpage
parse_html = _htmlpage.parse_html
HtmlDataFragment = _htmlpage.HtmlDataFragment
HtmlTag = _htmlpage.HtmlTag
HtmlTagType = _htmlpage.HtmlTagType
except ImportError:
import re
from collections import OrderedDict

class HtmlTagType(object):
OPEN_TAG = 1
CLOSE_TAG = 2
UNPAIRED_TAG = 3

class HtmlDataFragment(object):
__slots__ = ('start', 'end', 'is_text_content')

def __init__(self, start, end, is_text_content=False):
self.start = start
self.end = end
self.is_text_content = is_text_content

def __str__(self):
return "<HtmlDataFragment [%s:%s] is_text_content: %s>" % (
self.start, self.end, self.is_text_content)

def __repr__(self):
return str(self)

class HtmlTag(HtmlDataFragment):
__slots__ = ('tag_type', 'tag', '_attributes', '_attr_text')

def __init__(self, tag_type, tag, attr_text, start, end):
HtmlDataFragment.__init__(self, start, end)
self.tag_type = tag_type
self.tag = tag
if isinstance(attr_text, dict):
self._attributes = attr_text
self._attr_text = None
else: # defer loading attributes until necessary
self._attributes = OrderedDict()
self._attr_text = attr_text

@property
def attributes(self):
if not self._attributes and self._attr_text:
for attr_match in _ATTR_REGEXP.findall(self._attr_text):
name = attr_match[0].lower()
values = [v for v in attr_match[1:] if v]
# According to HTML spec if attribute name is repeated only
# the first one is taken into account
if name not in self._attributes:
self._attributes[name] = values[0] if values else None
return self._attributes

def __str__(self):
attributes = ', '.join(
sorted(["%s: %s" % (k, repr(v))
for k, v in self.attributes.items()]))
return "<HtmlTag tag='%s' attributes={%s} type='%d' [%s:%s]>" % (
self.tag, attributes, self.tag_type, self.start, self.end)

def __repr__(self):
return str(self)

_ATTR = ("((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|"
"([^>\s]+))?)?")
_TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?"
_DOCTYPE = r"<!DOCTYPE.*?>"
_SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
_COMMENT = "(<!--.*?--!?>|<\?.+?>|<!>)"

_ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL)
_HTML_REGEXP = re.compile("%s|%s|%s" % (_COMMENT, _SCRIPT, _TAG),
re.I | re.DOTALL)
_DOCTYPE_REGEXP = re.compile("(?:%s)" % _DOCTYPE)
_COMMENT_REGEXP = re.compile(_COMMENT, re.DOTALL)

def parse_html(text):
"""Higher level html parser. Calls lower level parsers and joins sucesive
HtmlDataFragment elements in a single one.
"""
# If have doctype remove it.
start_pos = 0
match = _DOCTYPE_REGEXP.match(text)
if match:
start_pos = match.end()
prev_end = start_pos
for match in _HTML_REGEXP.finditer(text, start_pos):
start = match.start()
end = match.end()

if start > prev_end:
yield HtmlDataFragment(prev_end, start, True)

if match.groups()[0] is not None: # comment
yield HtmlDataFragment(start, end)
elif match.groups()[1] is not None: # <script>...</script>
for e in _parse_script(match):
yield e
else: # tag
yield _parse_tag(match)
prev_end = end
textlen = len(text)
if prev_end < textlen:
yield HtmlDataFragment(prev_end, textlen, True)

def _parse_script(match):
"""parse a <script>...</script> region matched by _HTML_REGEXP"""
open_text, content, close_text = match.groups()[1:4]

open_tag = _parse_tag(_HTML_REGEXP.match(open_text))
open_tag.start = match.start()
open_tag.end = match.start() + len(open_text)

close_tag = _parse_tag(_HTML_REGEXP.match(close_text))
close_tag.start = match.end() - len(close_text)
close_tag.end = match.end()

yield open_tag
if open_tag.end < close_tag.start:
start_pos = 0
for m in _COMMENT_REGEXP.finditer(content):
if m.start() > start_pos:
yield HtmlDataFragment(
open_tag.end + start_pos, open_tag.end + m.start())
yield HtmlDataFragment(
open_tag.end + m.start(), open_tag.end + m.end())
start_pos = m.end()
if open_tag.end + start_pos < close_tag.start:
yield HtmlDataFragment(
open_tag.end + start_pos, close_tag.start)
yield close_tag

def _parse_tag(match):
"""
parse a tag matched by _HTML_REGEXP
"""
data = match.groups()
closing, tag, attr_text = data[4:7]
# if tag is None then the match is a comment
if tag is not None:
unpaired = data[-1]
if closing:
tag_type = HtmlTagType.CLOSE_TAG
elif unpaired:
tag_type = HtmlTagType.UNPAIRED_TAG
else:
tag_type = HtmlTagType.OPEN_TAG
return HtmlTag(tag_type, tag.lower(), attr_text, match.start(),
match.end())


def url_to_page(url, encoding=None, default_encoding='utf-8'):
@@ -164,6 +308,7 @@ def __new__(cls, htmlpage, start_index, end_index):
text_start = htmlpage.parsed_body[start_index].start
text_end = htmlpage.parsed_body[end_index or -1].end
text = htmlpage.body[text_start:text_end]

return HtmlPageRegion.__new__(cls, htmlpage, text)

def __init__(self, htmlpage, start_index, end_index):
25 changes: 19 additions & 6 deletions setup.py
@@ -1,22 +1,32 @@
#!/usr/bin/env python
import os
import platform
from setuptools import setup, find_packages
from setuptools.extension import Extension
from Cython.Build import cythonize
import numpy as np


USE_CYTHON = 'CYTHONIZE' in os.environ
IS_PYPY = platform.python_implementation() == 'PyPy'
ext = '.pyx' if USE_CYTHON else '.c'
extensions = [
Extension("scrapely._htmlpage",
["scrapely/_htmlpage.pyx"],
["scrapely/_htmlpage%s" % ext],
include_dirs=[np.get_include()]),
Extension("scrapely.extraction._similarity",
["scrapely/extraction/_similarity.pyx"],
["scrapely/extraction/_similarity%s" % ext],
include_dirs=[np.get_include()]),
]
if USE_CYTHON and not IS_PYPY:
from Cython.Build import cythonize
extensions = cythonize(extensions)
if IS_PYPY:
extensions = []
Member:
If both IS_PYPY and USE_CYTHON are True, the extensions will be cythonized but not used. I think it makes sense to either respect USE_CYTHON on PyPy (their cpyext layer is improving, so maybe it compiles and speed is not worse), or avoid compiling the extension if IS_PYPY is True.

Collaborator (Author):
You're right, it shouldn't build with Cython when using PyPy. Under PyPy the compiled extension is about 10 times slower than the pure-Python code, so it is better for PyPy not to use the extension at this time.
The PR has been updated to reflect this.

setup(
name='scrapely',
version='0.12.0',
version='0.13.0b1',
license='BSD',
description='A pure-python HTML screen-scraping library',
author='Scrapy project',
@@ -38,6 +48,9 @@
'Topic :: Internet :: WWW/HTTP',
'Topic :: Text Processing :: Markup :: HTML',
],
install_requires=['numpy', 'w3lib', 'six', 'cython'],
ext_modules=cythonize(extensions),
install_requires=['numpy', 'w3lib', 'six'],
extras_require={
'speedup': ['cython']
Member:
What is it for? If a user installs scrapely[speedup], there won't be any speedup for the user, right?

Collaborator (Author):
If Cython is installed, the C extension will be built from the included _htmlpage.pyx file. If you're installing from PyPI and the _htmlpage.c file is included, that will be used to create the extension instead.

},
ext_modules=extensions,
)
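To illustrate the net effect of the change (a hedged usage sketch, not part of the diff): scrapely.htmlpage now imports whether or not the compiled _htmlpage extension was built, falling back to the regex-based parser on ImportError, and callers see the same API either way.

# Hedged sketch: parsing works the same with or without the compiled extension.
from scrapely.htmlpage import parse_html, HtmlTag

html = u'<html><body><p class="intro">Hi</p><!-- note --></body></html>'
for fragment in parse_html(html):
    if isinstance(fragment, HtmlTag):
        print(fragment.tag, dict(fragment.attributes))
    else:
        print("data:", html[fragment.start:fragment.end])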