From 2f537b536f2355cdb69181279ac37177c3251f23 Mon Sep 17 00:00:00 2001 From: Ruairi Fahy Date: Tue, 21 Jun 2016 15:24:14 +0100 Subject: [PATCH 1/3] Add C files and use Cython for package setup --- .travis.yml | 4 ++-- MANIFEST.in | 4 +++- requirements.txt | 3 +-- setup.py | 16 +++++++++++----- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index e6b5e3f..00097fc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,8 +8,8 @@ env: install: - pip install cython -- pip install -U tox codecov - +- CYTHONIZE=1 python setup.py build +- pip install -U tox script: tox after_success: diff --git a/MANIFEST.in b/MANIFEST.in index a3b876d..69f062a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,4 @@ include scrapely/*.pyx -include scrapely/extraction/*.pyx \ No newline at end of file +include scrapely/extraction/*.pyx +include scrapely/*.c +include scrapely/extraction/*.c diff --git a/requirements.txt b/requirements.txt index c3f0da3..b59bcf1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ numpy w3lib -six -cython +six \ No newline at end of file diff --git a/setup.py b/setup.py index 871521f..c02fe82 100755 --- a/setup.py +++ b/setup.py @@ -1,17 +1,23 @@ #!/usr/bin/env python +import os from setuptools import setup, find_packages from setuptools.extension import Extension -from Cython.Build import cythonize import numpy as np + +USE_CYTHON = 'CYTHONIZE' in os.environ +ext = '.pyx' if USE_CYTHON else '.c' extensions = [ Extension("scrapely._htmlpage", - ["scrapely/_htmlpage.pyx"], + ["scrapely/_htmlpage%s" % ext], include_dirs=[np.get_include()]), Extension("scrapely.extraction._similarity", - ["scrapely/extraction/_similarity.pyx"], + ["scrapely/extraction/_similarity%s" % ext], include_dirs=[np.get_include()]), ] +if USE_CYTHON: + from Cython.Build import cythonize + extensions = cythonize(extensions) setup( @@ -38,6 +44,6 @@ 'Topic :: Internet :: WWW/HTTP', 'Topic :: Text Processing :: Markup :: HTML', ], - install_requires=['numpy', 'w3lib', 'six', 'cython'], - ext_modules=cythonize(extensions), + install_requires=['numpy', 'w3lib', 'six'], + ext_modules=extensions, ) From ac62a4e641132963f80deab2fce3e3a822c2d7e3 Mon Sep 17 00:00:00 2001 From: Ruairi Fahy Date: Tue, 8 Nov 2016 11:39:07 +0000 Subject: [PATCH 2/3] Python 3 fixes Add compatability function for some tests Add fallback if no c extenstions installed Fix comment parsing in c extension --- scrapely/_htmlpage.pyx | 16 +-- scrapely/compat.py | 6 + scrapely/extraction/regionextract.py | 11 +- scrapely/extraction/similarity.py | 16 ++- scrapely/extractors.py | 12 +- scrapely/htmlpage.py | 161 +++++++++++++++++++++++++-- setup.py | 5 +- tox.ini | 2 +- 8 files changed, 198 insertions(+), 31 deletions(-) create mode 100644 scrapely/compat.py diff --git a/scrapely/_htmlpage.pyx b/scrapely/_htmlpage.pyx index c138f6a..6939671 100644 --- a/scrapely/_htmlpage.pyx +++ b/scrapely/_htmlpage.pyx @@ -90,7 +90,7 @@ cdef class CommentParser: if self.open_count == 0: self.start = i - 3 self.open_state = 1 - self.open_count += 1 + self.open_count = 1 self.inside_comment = True if self.close_count < self.open_count: @@ -141,12 +141,12 @@ cdef class ScriptParser: self.state = 1 if ((self.state == 1 and c == u'<') or (self.state == 2 and c == u'/') or - (self.state == 3 and c == u's' or c == u'S') or - (self.state == 4 and c == u'c' or c == u'C') or - (self.state == 5 and c == u'r' or c == u'R') or - (self.state == 6 and c == u'i' or c == u'I') or - (self.state == 7 and c == u'p' or c == u'P') or 
- (self.state == 8 and c == u't' or c == u'T') or + (self.state == 3 and c in u'sS') or + (self.state == 4 and c in u'cC') or + (self.state == 5 and c in u'rR') or + (self.state == 6 and c in u'iI') or + (self.state == 7 and c in u'pP') or + (self.state == 8 and c in u'tT') or (self.state == 9 and c == u'>')): self.state += 1 else: @@ -288,7 +288,7 @@ cpdef parse_html(s): if tag_name != u'!doctype': parsed.append( HtmlTag(tag_type, tag_name, - tag_attributes, tag_start, tag_end + 1)) + tag_attributes, tag_start, tag_end + 1)) if tag_name == u'script': script = True if open_tag: diff --git a/scrapely/compat.py b/scrapely/compat.py new file mode 100644 index 0000000..78e06e7 --- /dev/null +++ b/scrapely/compat.py @@ -0,0 +1,6 @@ +try: + utext = unicode +except NameError: + class utext(str): + def __repr__(self): + return 'u{}'.format(super(utext, self).__repr__()) diff --git a/scrapely/extraction/regionextract.py b/scrapely/extraction/regionextract.py index 2eba3fa..f2f6cf2 100644 --- a/scrapely/extraction/regionextract.py +++ b/scrapely/extraction/regionextract.py @@ -64,17 +64,18 @@ class BasicTypeExtractor(object): annotations. For example: + >>> from scrapely.compat import utext >>> from scrapely.extraction.pageparsing import parse_strings >>> template, page = parse_strings( \ u'
<html><body><b data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x</b></body></html>', \
         u'<html><body><b> a name</b></body></html>
') >>> ex = BasicTypeExtractor(template.annotations[0]) - >>> ex.extract(page, 0, 1, None) + >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)] [(u'name', u' a name')] It supports attribute descriptors >>> descriptor = FieldDescriptor('name', None, lambda x: x.strip()) >>> ex = BasicTypeExtractor(template.annotations[0], {'name': descriptor}) - >>> ex.extract(page, 0, 1, None) + >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)] [(u'name', u'a name')] It supports ignoring regions @@ -82,7 +83,7 @@ class BasicTypeExtractor(object): u'
<html><body><div data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x<b> xx</b></div></body></html>',\
         u'<html><body><div>a name<b> id-9</b></div></body></html>
') >>> ex = BasicTypeExtractor(template.annotations[0]) - >>> ex.extract(page, 0, 3, [PageRegion(1, 2)]) + >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 3, [PageRegion(1, 2)])] [(u'name', u'a name')] """ @@ -640,12 +641,12 @@ def extract_text(self, text): pref_index = 0 if self.minprefix > 0: rev_idx, plen = longest_unique_subsequence(text[::-1], self.prefix) - if plen < self.minprefix: + if plen is None or plen < self.minprefix: return None pref_index = -rev_idx if self.minsuffix == 0: return text[pref_index:] sidx, slen = longest_unique_subsequence(text[pref_index:], self.suffix) - if slen < self.minsuffix: + if slen is None or slen < self.minsuffix: return None return text[pref_index:pref_index + sidx] diff --git a/scrapely/extraction/similarity.py b/scrapely/extraction/similarity.py index 81573fb..ac640b5 100644 --- a/scrapely/extraction/similarity.py +++ b/scrapely/extraction/similarity.py @@ -6,9 +6,17 @@ from operator import itemgetter from heapq import nlargest -# For typical use cases (small sequences and patterns) the naive approach actually -# runs faster than KMP algorithm -from . _similarity import naive_match_length +try: + # For typical use cases (small sequences and patterns) the naive approach + # actually runs faster than KMP algorithm + from . _similarity import naive_match_length +except ImportError: + def naive_match_length(to_search, subsequence, range_start, range_end): + startval = subsequence[0] + return ((i, common_prefix_length(to_search[i:], subsequence)) + for i in xrange(range_start, range_end) + if startval == to_search[i]) + def common_prefix_length(a, b): """Calculate the length of the common prefix in both sequences passed. @@ -46,7 +54,7 @@ def common_prefix(*sequences): def longest_unique_subsequence(to_search, subsequence, range_start=0, - range_end=None): + range_end=None): """Find the longest unique subsequence of items in an array or string. This searches to_search looking for the longest overlapping match with subsequence. If the largest match is unique (there is no other diff --git a/scrapely/extractors.py b/scrapely/extractors.py index 20dab22..4fe451a 100644 --- a/scrapely/extractors.py +++ b/scrapely/extractors.py @@ -81,7 +81,8 @@ def text(region): removing excessive whitespace, For example: - >>> t = lambda s: text(htmlregion(s)) + >>> from scrapely.compat import utext + >>> t = lambda s: utext(text(htmlregion(s))) >>> t(u'
<h1>test</h1>
') u'test' @@ -122,7 +123,8 @@ def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE, opening and closing tag is removed. For example: - >>> t = lambda s, keep=_TAGS_TO_KEEP: safehtml(htmlregion(s), keep) + >>> from scrapely.compat import utext + >>> t = lambda s, keep=_TAGS_TO_KEEP: utext(safehtml(htmlregion(s), keep)) >>> t(u'test test') u'test test' @@ -272,7 +274,8 @@ def extract_number(txt): >>> extract_number(' 45.3, 7') It will handle unescaped entities: - >>> extract_number(u'£129.99') + >>> from scrapely.compat import utext + >>> utext(extract_number(u'£129.99')) u'129.99' """ txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt) @@ -285,6 +288,7 @@ def extract_price(txt): """ Extracts numbers making some price format specific assumptions + >>> from scrapely.compat import utext >>> extract_price('asdf 234,234.45sdf ') '234234.45' >>> extract_price('234,23') @@ -298,7 +302,7 @@ def extract_price(txt): >>> extract_price('adsfg') >>> extract_price('stained, linseed oil finish, clear glas doors') >>> extract_price('') - >>> extract_price(u'£129.99') + >>> utext(extract_price(u'£129.99')) u'129.99' """ txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt) diff --git a/scrapely/htmlpage.py b/scrapely/htmlpage.py index f4f4134..62e2650 100644 --- a/scrapely/htmlpage.py +++ b/scrapely/htmlpage.py @@ -11,14 +11,158 @@ from six.moves.urllib.request import urlopen from copy import deepcopy from w3lib.encoding import html_to_unicode - -from . import _htmlpage - - -parse_html = _htmlpage.parse_html -HtmlDataFragment = _htmlpage.HtmlDataFragment -HtmlTag = _htmlpage.HtmlTag -HtmlTagType = _htmlpage.HtmlTagType +try: + from . import _htmlpage + parse_html = _htmlpage.parse_html + HtmlDataFragment = _htmlpage.HtmlDataFragment + HtmlTag = _htmlpage.HtmlTag + HtmlTagType = _htmlpage.HtmlTagType +except ImportError: + import re + from collections import OrderedDict + + class HtmlTagType(object): + OPEN_TAG = 1 + CLOSE_TAG = 2 + UNPAIRED_TAG = 3 + + class HtmlDataFragment(object): + __slots__ = ('start', 'end', 'is_text_content') + + def __init__(self, start, end, is_text_content=False): + self.start = start + self.end = end + self.is_text_content = is_text_content + + def __str__(self): + return "" % ( + self.start, self.end, self.is_text_content) + + def __repr__(self): + return str(self) + + class HtmlTag(HtmlDataFragment): + __slots__ = ('tag_type', 'tag', '_attributes', '_attr_text') + + def __init__(self, tag_type, tag, attr_text, start, end): + HtmlDataFragment.__init__(self, start, end) + self.tag_type = tag_type + self.tag = tag + if isinstance(attr_text, dict): + self._attributes = attr_text + self._attr_text = None + else: # defer loading attributes until necessary + self._attributes = OrderedDict() + self._attr_text = attr_text + + @property + def attributes(self): + if not self._attributes and self._attr_text: + for attr_match in _ATTR_REGEXP.findall(self._attr_text): + name = attr_match[0].lower() + values = [v for v in attr_match[1:] if v] + # According to HTML spec if attribute name is repeated only + # the first one is taken into account + if name not in self._attributes: + self._attributes[name] = values[0] if values else None + return self._attributes + + def __str__(self): + attributes = ', '.join( + sorted(["%s: %s" % (k, repr(v)) + for k, v in self.attributes.items()])) + return "" % ( + self.tag, attributes, self.tag_type, self.start, self.end) + + def __repr__(self): + return str(self) + + _ATTR = 
("((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|" + "([^>\s]+))?)?") + _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?" + _DOCTYPE = r"" + _SCRIPT = "()(.*?)()" + _COMMENT = "(|<\?.+?>)" + + _ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL) + _HTML_REGEXP = re.compile("%s|%s|%s" % (_COMMENT, _SCRIPT, _TAG), + re.I | re.DOTALL) + _DOCTYPE_REGEXP = re.compile("(?:%s)" % _DOCTYPE) + _COMMENT_REGEXP = re.compile(_COMMENT, re.DOTALL) + + def parse_html(text): + """Higher level html parser. Calls lower level parsers and joins sucesive + HtmlDataFragment elements in a single one. + """ + # If have doctype remove it. + start_pos = 0 + match = _DOCTYPE_REGEXP.match(text) + if match: + start_pos = match.end() + prev_end = start_pos + for match in _HTML_REGEXP.finditer(text, start_pos): + start = match.start() + end = match.end() + + if start > prev_end: + yield HtmlDataFragment(prev_end, start, True) + + if match.groups()[0] is not None: # comment + yield HtmlDataFragment(start, end) + elif match.groups()[1] is not None: # + for e in _parse_script(match): + yield e + else: # tag + yield _parse_tag(match) + prev_end = end + textlen = len(text) + if prev_end < textlen: + yield HtmlDataFragment(prev_end, textlen, True) + + def _parse_script(match): + """parse a region matched by _HTML_REGEXP""" + open_text, content, close_text = match.groups()[1:4] + + open_tag = _parse_tag(_HTML_REGEXP.match(open_text)) + open_tag.start = match.start() + open_tag.end = match.start() + len(open_text) + + close_tag = _parse_tag(_HTML_REGEXP.match(close_text)) + close_tag.start = match.end() - len(close_text) + close_tag.end = match.end() + + yield open_tag + if open_tag.end < close_tag.start: + start_pos = 0 + for m in _COMMENT_REGEXP.finditer(content): + if m.start() > start_pos: + yield HtmlDataFragment( + open_tag.end + start_pos, open_tag.end + m.start()) + yield HtmlDataFragment( + open_tag.end + m.start(), open_tag.end + m.end()) + start_pos = m.end() + if open_tag.end + start_pos < close_tag.start: + yield HtmlDataFragment( + open_tag.end + start_pos, close_tag.start) + yield close_tag + + def _parse_tag(match): + """ + parse a tag matched by _HTML_REGEXP + """ + data = match.groups() + closing, tag, attr_text = data[4:7] + # if tag is None then the match is a comment + if tag is not None: + unpaired = data[-1] + if closing: + tag_type = HtmlTagType.CLOSE_TAG + elif unpaired: + tag_type = HtmlTagType.UNPAIRED_TAG + else: + tag_type = HtmlTagType.OPEN_TAG + return HtmlTag(tag_type, tag.lower(), attr_text, match.start(), + match.end()) def url_to_page(url, encoding=None, default_encoding='utf-8'): @@ -164,6 +308,7 @@ def __new__(cls, htmlpage, start_index, end_index): text_start = htmlpage.parsed_body[start_index].start text_end = htmlpage.parsed_body[end_index or -1].end text = htmlpage.body[text_start:text_end] + return HtmlPageRegion.__new__(cls, htmlpage, text) def __init__(self, htmlpage, start_index, end_index): diff --git a/setup.py b/setup.py index c02fe82..c563751 100755 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ setup( name='scrapely', - version='0.12.0', + version='0.13.0b1', license='BSD', description='A pure-python HTML screen-scraping library', author='Scrapy project', @@ -45,5 +45,8 @@ 'Topic :: Text Processing :: Markup :: HTML', ], install_requires=['numpy', 'w3lib', 'six'], + extras_requires={ + 'speedup': ['cython'] + }, ext_modules=extensions, ) diff --git a/tox.ini b/tox.ini index 34a9a3a..77edb41 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then 
run "tox" from this directory. [tox] -envlist = py27,py33,py34 +envlist = py27,py34 usedevelop = True [testenv] From cf0ee1b8d2d8bc54bd3527dd48e9004240cc2b3a Mon Sep 17 00:00:00 2001 From: Ruairi Fahy Date: Thu, 10 Nov 2016 14:48:35 +0000 Subject: [PATCH 3/3] Handle parsing of ``. Use pypy as test environment. Test python parsing implementation Fallback to pure python parser if no cython available --- .travis.yml | 2 +- scrapely/_htmlpage.pyx | 9 +++- scrapely/compat.py | 6 --- scrapely/extraction/regionextract.py | 7 ++- scrapely/extractors.py | 12 ++--- scrapely/htmlpage.py | 2 +- setup.py | 8 +++- tests/test_htmlpage_data.py | 68 +++++++++++++++++++--------- tox.ini | 3 +- 9 files changed, 71 insertions(+), 46 deletions(-) delete mode 100644 scrapely/compat.py diff --git a/.travis.yml b/.travis.yml index 00097fc..767c039 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,8 +3,8 @@ python: 2.7 env: - TOXENV=py27 -- TOXENV=py33 - TOXENV=py34 +- TOXENV=pypy install: - pip install cython diff --git a/scrapely/_htmlpage.pyx b/scrapely/_htmlpage.pyx index 6939671..8d35720 100644 --- a/scrapely/_htmlpage.pyx +++ b/scrapely/_htmlpage.pyx @@ -84,8 +84,13 @@ cdef class CommentParser: (self.open_state == 4 and c == u'-')): self.open_state += 1 else: + # Handle comment + if self.open_state == 3 and c == u'>': + self.inside_comment = False + self.reset() + self.start, self.end = i - 2, i + return True self.open_state = 1 - if self.open_state == 5: if self.open_count == 0: self.start = i - 3 @@ -233,6 +238,8 @@ cpdef parse_html(s): parsed.append( HtmlDataFragment(comment_parser.start, tag_end + 1, False)) reset_tag = True + if (comment_parser.end - comment_parser.start) == 2: + open_tag = False if comment_parser.inside_comment: open_tag = False diff --git a/scrapely/compat.py b/scrapely/compat.py deleted file mode 100644 index 78e06e7..0000000 --- a/scrapely/compat.py +++ /dev/null @@ -1,6 +0,0 @@ -try: - utext = unicode -except NameError: - class utext(str): - def __repr__(self): - return 'u{}'.format(super(utext, self).__repr__()) diff --git a/scrapely/extraction/regionextract.py b/scrapely/extraction/regionextract.py index f2f6cf2..8469ce9 100644 --- a/scrapely/extraction/regionextract.py +++ b/scrapely/extraction/regionextract.py @@ -64,18 +64,17 @@ class BasicTypeExtractor(object): annotations. For example: - >>> from scrapely.compat import utext >>> from scrapely.extraction.pageparsing import parse_strings >>> template, page = parse_strings( \ u'
<html><body><b data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x</b></body></html>', \
         u'<html><body><b> a name</b></body></html>
') >>> ex = BasicTypeExtractor(template.annotations[0]) - >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)] + >>> ex.extract(page, 0, 1, None) [(u'name', u' a name')] It supports attribute descriptors >>> descriptor = FieldDescriptor('name', None, lambda x: x.strip()) >>> ex = BasicTypeExtractor(template.annotations[0], {'name': descriptor}) - >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)] + >>> ex.extract(page, 0, 1, None) [(u'name', u'a name')] It supports ignoring regions @@ -83,7 +82,7 @@ class BasicTypeExtractor(object): u'
<html><body><div data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x<b> xx</b></div></body></html>',\
         u'<html><body><div>a name<b> id-9</b></div></body></html>
') >>> ex = BasicTypeExtractor(template.annotations[0]) - >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 3, [PageRegion(1, 2)])] + >>> ex.extract(page, 0, 3, [PageRegion(1, 2)]) [(u'name', u'a name')] """ diff --git a/scrapely/extractors.py b/scrapely/extractors.py index 4fe451a..20dab22 100644 --- a/scrapely/extractors.py +++ b/scrapely/extractors.py @@ -81,8 +81,7 @@ def text(region): removing excessive whitespace, For example: - >>> from scrapely.compat import utext - >>> t = lambda s: utext(text(htmlregion(s))) + >>> t = lambda s: text(htmlregion(s)) >>> t(u'
<h1>test</h1>
') u'test' @@ -123,8 +122,7 @@ def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE, opening and closing tag is removed. For example: - >>> from scrapely.compat import utext - >>> t = lambda s, keep=_TAGS_TO_KEEP: utext(safehtml(htmlregion(s), keep)) + >>> t = lambda s, keep=_TAGS_TO_KEEP: safehtml(htmlregion(s), keep) >>> t(u'test test') u'test test' @@ -274,8 +272,7 @@ def extract_number(txt): >>> extract_number(' 45.3, 7') It will handle unescaped entities: - >>> from scrapely.compat import utext - >>> utext(extract_number(u'£129.99')) + >>> extract_number(u'£129.99') u'129.99' """ txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt) @@ -288,7 +285,6 @@ def extract_price(txt): """ Extracts numbers making some price format specific assumptions - >>> from scrapely.compat import utext >>> extract_price('asdf 234,234.45sdf ') '234234.45' >>> extract_price('234,23') @@ -302,7 +298,7 @@ def extract_price(txt): >>> extract_price('adsfg') >>> extract_price('stained, linseed oil finish, clear glas doors') >>> extract_price('') - >>> utext(extract_price(u'£129.99')) + >>> extract_price(u'£129.99') u'129.99' """ txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt) diff --git a/scrapely/htmlpage.py b/scrapely/htmlpage.py index 62e2650..5900c34 100644 --- a/scrapely/htmlpage.py +++ b/scrapely/htmlpage.py @@ -82,7 +82,7 @@ def __repr__(self): _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?" _DOCTYPE = r"" _SCRIPT = "()(.*?)()" - _COMMENT = "(|<\?.+?>)" + _COMMENT = "(""" +# for testing tags in different forms +PAGE3 = u""" + + + + Page name + + + + + + + +""" PARSED3 = [ - {'attributes': {}, 'end': 6, 'start': 0, 'tag': u'html', 'tag_type': 1}, - {'attributes': {}, 'end': 12, 'start': 6, 'tag': u'body', 'tag_type': 1}, - {'attributes': {}, 'end': 16, 'start': 12, 'tag': u'h1', 'tag_type': 1}, - {'end': 26, 'start': 16}, - {'attributes': {}, 'end': 31, 'start': 26, 'tag': u'h1', 'tag_type': 2}, - {'attributes': {}, 'end': 34, 'start': 31, 'tag': u'p', 'tag_type': 1}, - {'end': 51, 'start': 34}, - {'attributes': {}, 'end': 55, 'start': 51, 'tag': u'p', 'tag_type': 2}, - {'end': 70, 'start': 55, 'is_text_content': False}, - {'attributes': {u'type': u'text/javascript'}, 'end': 101, 'start': 70, 'tag': u'script', 'tag_type': 1}, - {'end': 104, 'start': 101, 'is_text_content': False}, - {'end': 118, 'start': 104, 'is_text_content': False}, - {'end': 124, 'start': 118, 'is_text_content': False}, - {'attributes': {}, 'end': 133, 'start': 124, 'tag': u'script', 'tag_type': 2}, - {'attributes': {}, 'end': 140, 'start': 133, 'tag': u'body', 'tag_type': 2}, - {'attributes': {}, 'end': 147, 'start': 140, 'tag': u'html', 'tag_type': 2} + {'end': 16, 'start': 15, 'is_text_content': True}, + {'end': 22, 'start': 16, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'html'}, + {'end': 27, 'start': 22, 'is_text_content': True}, + {'end': 33, 'start': 27, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'head'}, + {'end': 38, 'start': 33, 'is_text_content': True}, + {'end': 69, 'start': 38, 'is_text_content': False}, + {'end': 74, 'start': 69, 'is_text_content': True}, + {'end': 81, 'start': 74, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'title'}, + {'end': 90, 'start': 81, 'is_text_content': True}, + {'end': 98, 'start': 90, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'title'}, + {'end': 103, 'start': 98, 'is_text_content': True}, + {'end': 137, 'start': 103, 
'attributes': {'content': 'value', 'name': 'name'}, 'tag_type': 1, 'is_text_content': False, 'tag': 'meta'}, + {'end': 140, 'start': 137, 'is_text_content': False}, + {'end': 141, 'start': 140, 'is_text_content': True}, + {'end': 174, 'start': 141, 'is_text_content': False}, + {'end': 179, 'start': 174, 'is_text_content': True}, + {'end': 186, 'start': 179, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'head'}, + {'end': 192, 'start': 186, 'is_text_content': True}, + {'end': 320, 'start': 192, 'is_text_content': False}, + {'end': 325, 'start': 320, 'is_text_content': True}, + {'end': 331, 'start': 325, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'body'}, + {'end': 336, 'start': 331, 'is_text_content': True}, + {'end': 343, 'start': 336, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'body'}, + {'end': 344, 'start': 343, 'is_text_content': True}, + {'end': 351, 'start': 344, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'html'}, + {'end': 352, 'start': 351, 'is_text_content': True} ] # for testing tags inside scripts @@ -293,4 +318,3 @@ {"attributes": {}, "end": 91, "start": 84, "tag": "body", "tag_type": 2}, {"attributes": {}, "end": 98, "start": 91, "tag": "html", "tag_type": 2} ] - diff --git a/tox.ini b/tox.ini index 77edb41..1375cd9 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py27,py34 +envlist = py27,py34,pypy,pypy3 usedevelop = True [testenv] @@ -14,6 +14,7 @@ deps = nose-parameterized doctest-ignore-unicode coverage + cython commands = pip install -e . nosetests \
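
With the series applied, the C extensions become optional: a plain install compiles the shipped `.c` files, `CYTHONIZE=1 python setup.py build` regenerates them from the `.pyx` sources, and the pure-Python parser added to `scrapely/htmlpage.py` takes over when no compiled extension is importable. Below is a quick way to confirm which implementation an installed copy picked up — a hypothetical sanity check mirroring the try/except ImportError fallback in the patches above, not code from this series:

```python
from __future__ import print_function

# Probe the same import that scrapely/htmlpage.py attempts: the compiled
# extension is only present if the .pyx/.c sources were actually built.
try:
    from scrapely import _htmlpage  # noqa: F401
    impl = "cython extension"
except ImportError:
    impl = "pure-python fallback"
print("parser implementation:", impl)

from scrapely.htmlpage import parse_html, HtmlTag

html = u"<p>hello <!-- note --> world</p>"
for fragment in parse_html(html):
    # Every fragment carries start/end offsets into the source string;
    # HtmlTag instances additionally expose .tag and .tag_type.
    kind = fragment.tag if isinstance(fragment, HtmlTag) else "data"
    print(kind, repr(html[fragment.start:fragment.end]))
```

Both implementations should emit the same fragment sequence for this input, including a single non-text-content fragment covering the comment — which is exactly what the `<!-->` handling and the pure-Python parser tests in this patch are meant to guarantee.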