diff --git a/.travis.yml b/.travis.yml
index e6b5e3f..767c039 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,13 +3,13 @@
 python: 2.7
 env:
 - TOXENV=py27
-- TOXENV=py33
 - TOXENV=py34
+- TOXENV=pypy
 install:
 - pip install cython
-- pip install -U tox codecov
-
+- CYTHONIZE=1 python setup.py build
+- pip install -U tox
 script: tox
 after_success:
diff --git a/MANIFEST.in b/MANIFEST.in
index a3b876d..69f062a 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,2 +1,4 @@
 include scrapely/*.pyx
-include scrapely/extraction/*.pyx
\ No newline at end of file
+include scrapely/extraction/*.pyx
+include scrapely/*.c
+include scrapely/extraction/*.c
diff --git a/requirements.txt b/requirements.txt
index c3f0da3..b59bcf1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
 numpy
 w3lib
-six
-cython
+six
\ No newline at end of file
diff --git a/scrapely/_htmlpage.pyx b/scrapely/_htmlpage.pyx
index c138f6a..8d35720 100644
--- a/scrapely/_htmlpage.pyx
+++ b/scrapely/_htmlpage.pyx
@@ -84,13 +84,18 @@ cdef class CommentParser:
             (self.open_state == 4 and c == u'-')):
             self.open_state += 1
         else:
+            # Handle the empty <!> comment
+            if self.open_state == 3 and c == u'>':
+                self.inside_comment = False
+                self.reset()
+                self.start, self.end = i - 2, i
+                return True
             self.open_state = 1
-
         if self.open_state == 5:
             if self.open_count == 0:
                 self.start = i - 3
             self.open_state = 1
-            self.open_count += 1
+            self.open_count = 1
             self.inside_comment = True
 
         if self.close_count < self.open_count:
@@ -141,12 +146,12 @@ cdef class ScriptParser:
             self.state = 1
         if ((self.state == 1 and c == u'<') or
             (self.state == 2 and c == u'/') or
-            (self.state == 3 and c == u's' or c == u'S') or
-            (self.state == 4 and c == u'c' or c == u'C') or
-            (self.state == 5 and c == u'r' or c == u'R') or
-            (self.state == 6 and c == u'i' or c == u'I') or
-            (self.state == 7 and c == u'p' or c == u'P') or
-            (self.state == 8 and c == u't' or c == u'T') or
+            (self.state == 3 and c in u'sS') or
+            (self.state == 4 and c in u'cC') or
+            (self.state == 5 and c in u'rR') or
+            (self.state == 6 and c in u'iI') or
+            (self.state == 7 and c in u'pP') or
+            (self.state == 8 and c in u'tT') or
             (self.state == 9 and c == u'>')):
             self.state += 1
         else:
@@ -233,6 +238,8 @@ cpdef parse_html(s):
                 parsed.append(
                     HtmlDataFragment(comment_parser.start, tag_end + 1, False))
                 reset_tag = True
+                if (comment_parser.end - comment_parser.start) == 2:
+                    open_tag = False
 
         if comment_parser.inside_comment:
             open_tag = False
@@ -288,7 +295,7 @@ cpdef parse_html(s):
             if tag_name != u'!doctype':
                 parsed.append(
                     HtmlTag(tag_type, tag_name,
-                        tag_attributes, tag_start, tag_end + 1))
+                            tag_attributes, tag_start, tag_end + 1))
             if tag_name == u'script':
                 script = True
         if open_tag:
diff --git a/scrapely/extraction/regionextract.py b/scrapely/extraction/regionextract.py
index 2eba3fa..8469ce9 100644
--- a/scrapely/extraction/regionextract.py
+++ b/scrapely/extraction/regionextract.py
@@ -640,12 +640,12 @@ def extract_text(self, text):
         pref_index = 0
         if self.minprefix > 0:
             rev_idx, plen = longest_unique_subsequence(text[::-1], self.prefix)
-            if plen < self.minprefix:
+            if plen is None or plen < self.minprefix:
                 return None
             pref_index = -rev_idx
         if self.minsuffix == 0:
             return text[pref_index:]
         sidx, slen = longest_unique_subsequence(text[pref_index:], self.suffix)
-        if slen < self.minsuffix:
+        if slen is None or slen < self.minsuffix:
             return None
         return text[pref_index:pref_index + sidx]
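
The two `is None` guards added above are the substance of this hunk: `longest_unique_subsequence` (see scrapely/extraction/similarity.py below) returns `(None, None)` when the prefix or suffix never occurs in the text, or when the best match is not unique. Under Python 2 the old code limped through because `None` compares less than any integer; under Python 3 the comparison raises `TypeError`. A minimal sketch of the behaviour being guarded against, with invented sample sequences:

    from scrapely.extraction.similarity import longest_unique_subsequence

    # No occurrence of [9] in the haystack: index and length are both None.
    idx, plen = longest_unique_subsequence([1, 2, 3], [9])
    assert (idx, plen) == (None, None)
    # Python 2: None < 1 is True, so the old `plen < self.minprefix` check
    # accidentally returned None anyway; Python 3: `None < 1` raises
    # TypeError, hence the explicit `plen is None` guard.
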
diff --git a/scrapely/extraction/similarity.py b/scrapely/extraction/similarity.py
index 81573fb..ac640b5 100644
--- a/scrapely/extraction/similarity.py
+++ b/scrapely/extraction/similarity.py
@@ -6,9 +6,17 @@
 from operator import itemgetter
 from heapq import nlargest
 
-# For typical use cases (small sequences and patterns) the naive approach actually
-# runs faster than KMP algorithm
-from . _similarity import naive_match_length
+try:
+    # For typical use cases (small sequences and patterns) the naive approach
+    # actually runs faster than KMP algorithm
+    from . _similarity import naive_match_length
+except ImportError:
+    def naive_match_length(to_search, subsequence, range_start, range_end):
+        startval = subsequence[0]
+        return ((i, common_prefix_length(to_search[i:], subsequence))
+                for i in range(range_start, range_end)
+                if startval == to_search[i])
+
 
 def common_prefix_length(a, b):
     """Calculate the length of the common prefix in both sequences passed.
@@ -46,7 +54,7 @@ def common_prefix(*sequences):
 
 def longest_unique_subsequence(to_search, subsequence, range_start=0,
-        range_end=None):
+                               range_end=None):
     """Find the longest unique subsequence of items in an array or string.
 
     This searches to_search looking for the longest overlapping
     match with subsequence. If the largest match is unique (there is no other
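
The pure-Python `naive_match_length` mirrors the Cython `_similarity` helper: it yields an `(index, match_length)` pair for every position in `to_search` whose item equals the first item of `subsequence`. Note the fallback must use `range`, not `xrange`, since the py34 and pypy3 environments added in tox.ini below exercise it. A quick sanity check of the semantics, with invented sample data:

    from scrapely.extraction.similarity import (
        common_prefix_length, longest_unique_subsequence)

    # The shared prefix of the two sequences is 'scrap', so length 5.
    assert common_prefix_length('scrapely', 'scrapy') == 5

    # [2, 3] matches at index 1 (length 2) and index 3 (length 1); the
    # longest match is unique, so the result is (1, 2).
    assert longest_unique_subsequence([1, 2, 3, 2], [2, 3]) == (1, 2)
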
("((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|" + "([^>\s]+))?)?") + _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?" + _DOCTYPE = r"" + _SCRIPT = "()(.*?)()" + _COMMENT = "(""" +# for testing tags in different forms +PAGE3 = u""" + + + + Page name + + + + + + + +""" PARSED3 = [ - {'attributes': {}, 'end': 6, 'start': 0, 'tag': u'html', 'tag_type': 1}, - {'attributes': {}, 'end': 12, 'start': 6, 'tag': u'body', 'tag_type': 1}, - {'attributes': {}, 'end': 16, 'start': 12, 'tag': u'h1', 'tag_type': 1}, - {'end': 26, 'start': 16}, - {'attributes': {}, 'end': 31, 'start': 26, 'tag': u'h1', 'tag_type': 2}, - {'attributes': {}, 'end': 34, 'start': 31, 'tag': u'p', 'tag_type': 1}, - {'end': 51, 'start': 34}, - {'attributes': {}, 'end': 55, 'start': 51, 'tag': u'p', 'tag_type': 2}, - {'end': 70, 'start': 55, 'is_text_content': False}, - {'attributes': {u'type': u'text/javascript'}, 'end': 101, 'start': 70, 'tag': u'script', 'tag_type': 1}, - {'end': 104, 'start': 101, 'is_text_content': False}, - {'end': 118, 'start': 104, 'is_text_content': False}, - {'end': 124, 'start': 118, 'is_text_content': False}, - {'attributes': {}, 'end': 133, 'start': 124, 'tag': u'script', 'tag_type': 2}, - {'attributes': {}, 'end': 140, 'start': 133, 'tag': u'body', 'tag_type': 2}, - {'attributes': {}, 'end': 147, 'start': 140, 'tag': u'html', 'tag_type': 2} + {'end': 16, 'start': 15, 'is_text_content': True}, + {'end': 22, 'start': 16, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'html'}, + {'end': 27, 'start': 22, 'is_text_content': True}, + {'end': 33, 'start': 27, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'head'}, + {'end': 38, 'start': 33, 'is_text_content': True}, + {'end': 69, 'start': 38, 'is_text_content': False}, + {'end': 74, 'start': 69, 'is_text_content': True}, + {'end': 81, 'start': 74, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'title'}, + {'end': 90, 'start': 81, 'is_text_content': True}, + {'end': 98, 'start': 90, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'title'}, + {'end': 103, 'start': 98, 'is_text_content': True}, + {'end': 137, 'start': 103, 'attributes': {'content': 'value', 'name': 'name'}, 'tag_type': 1, 'is_text_content': False, 'tag': 'meta'}, + {'end': 140, 'start': 137, 'is_text_content': False}, + {'end': 141, 'start': 140, 'is_text_content': True}, + {'end': 174, 'start': 141, 'is_text_content': False}, + {'end': 179, 'start': 174, 'is_text_content': True}, + {'end': 186, 'start': 179, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'head'}, + {'end': 192, 'start': 186, 'is_text_content': True}, + {'end': 320, 'start': 192, 'is_text_content': False}, + {'end': 325, 'start': 320, 'is_text_content': True}, + {'end': 331, 'start': 325, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'body'}, + {'end': 336, 'start': 331, 'is_text_content': True}, + {'end': 343, 'start': 336, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'body'}, + {'end': 344, 'start': 343, 'is_text_content': True}, + {'end': 351, 'start': 344, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'html'}, + {'end': 352, 'start': 351, 'is_text_content': True} ] # for testing tags inside scripts @@ -293,4 +318,3 @@ {"attributes": {}, "end": 91, "start": 84, "tag": "body", "tag_type": 2}, {"attributes": {}, "end": 98, "start": 91, "tag": "html", "tag_type": 2} ] - diff --git a/tox.ini b/tox.ini index 34a9a3a..1375cd9 100644 --- a/tox.ini 
diff --git a/tests/test_htmlpage_data.py b/tests/test_htmlpage_data.py
--- a/tests/test_htmlpage_data.py
+++ b/tests/test_htmlpage_data.py
 """
+# for testing tags in different forms
+PAGE3 = u"""<!DOCTYPE html>
+<html>
+    <head>
+    <!-- ... -->
+    <title>Page name</title>
+    <meta name="name" content="value"><!>
+<?...?>
+    </head>
+
+    <!-- ... -->
+    <body>
+    </body>
+</html>
+"""
 PARSED3 = [
-{'attributes': {}, 'end': 6, 'start': 0, 'tag': u'html', 'tag_type': 1},
-{'attributes': {}, 'end': 12, 'start': 6, 'tag': u'body', 'tag_type': 1},
-{'attributes': {}, 'end': 16, 'start': 12, 'tag': u'h1', 'tag_type': 1},
-{'end': 26, 'start': 16},
-{'attributes': {}, 'end': 31, 'start': 26, 'tag': u'h1', 'tag_type': 2},
-{'attributes': {}, 'end': 34, 'start': 31, 'tag': u'p', 'tag_type': 1},
-{'end': 51, 'start': 34},
-{'attributes': {}, 'end': 55, 'start': 51, 'tag': u'p', 'tag_type': 2},
-{'end': 70, 'start': 55, 'is_text_content': False},
-{'attributes': {u'type': u'text/javascript'}, 'end': 101, 'start': 70, 'tag': u'script', 'tag_type': 1},
-{'end': 104, 'start': 101, 'is_text_content': False},
-{'end': 118, 'start': 104, 'is_text_content': False},
-{'end': 124, 'start': 118, 'is_text_content': False},
-{'attributes': {}, 'end': 133, 'start': 124, 'tag': u'script', 'tag_type': 2},
-{'attributes': {}, 'end': 140, 'start': 133, 'tag': u'body', 'tag_type': 2},
-{'attributes': {}, 'end': 147, 'start': 140, 'tag': u'html', 'tag_type': 2}
+{'end': 16, 'start': 15, 'is_text_content': True},
+{'end': 22, 'start': 16, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'html'},
+{'end': 27, 'start': 22, 'is_text_content': True},
+{'end': 33, 'start': 27, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'head'},
+{'end': 38, 'start': 33, 'is_text_content': True},
+{'end': 69, 'start': 38, 'is_text_content': False},
+{'end': 74, 'start': 69, 'is_text_content': True},
+{'end': 81, 'start': 74, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'title'},
+{'end': 90, 'start': 81, 'is_text_content': True},
+{'end': 98, 'start': 90, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'title'},
+{'end': 103, 'start': 98, 'is_text_content': True},
+{'end': 137, 'start': 103, 'attributes': {'content': 'value', 'name': 'name'}, 'tag_type': 1, 'is_text_content': False, 'tag': 'meta'},
+{'end': 140, 'start': 137, 'is_text_content': False},
+{'end': 141, 'start': 140, 'is_text_content': True},
+{'end': 174, 'start': 141, 'is_text_content': False},
+{'end': 179, 'start': 174, 'is_text_content': True},
+{'end': 186, 'start': 179, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'head'},
+{'end': 192, 'start': 186, 'is_text_content': True},
+{'end': 320, 'start': 192, 'is_text_content': False},
+{'end': 325, 'start': 320, 'is_text_content': True},
+{'end': 331, 'start': 325, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'body'},
+{'end': 336, 'start': 331, 'is_text_content': True},
+{'end': 343, 'start': 336, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'body'},
+{'end': 344, 'start': 343, 'is_text_content': True},
+{'end': 351, 'start': 344, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'html'},
+{'end': 352, 'start': 351, 'is_text_content': True}
 ]
@@ -293,4 +318,3 @@
 {"attributes": {}, "end": 91, "start": 84, "tag": "body", "tag_type": 2},
 {"attributes": {}, "end": 98, "start": 91, "tag": "html", "tag_type": 2}
 ]
-
diff --git a/tox.ini b/tox.ini
index 34a9a3a..1375cd9 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,7 @@
 # and then run "tox" from this directory.
 
 [tox]
-envlist = py27,py33,py34
+envlist = py27,py34,pypy,pypy3
 usedevelop = True
 
 [testenv]
@@ -14,6 +14,7 @@ deps =
     nose-parameterized
     doctest-ignore-unicode
     coverage
+    cython
 commands =
     pip install -e .
     nosetests \
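
Taken together, the packaging changes make Cython a build-time option rather than a hard dependency: requirements.txt drops cython, MANIFEST.in ships the generated .c files alongside the .pyx sources, the try/except ImportError fallbacks cover the case where no extension could be built at all, and CI regenerates the C sources explicitly via CYTHONIZE=1 python setup.py build. The matching setup.py is not part of this diff; a minimal sketch of how such a switch is commonly wired, with all names assumed rather than taken from the patch:

    import os
    from setuptools import setup, Extension

    # Compile from .pyx only when explicitly requested; otherwise fall back
    # to the pre-generated .c files that MANIFEST.in now ships in the sdist.
    CYTHONIZE = bool(os.environ.get('CYTHONIZE'))
    ext = '.pyx' if CYTHONIZE else '.c'
    extensions = [
        Extension('scrapely._htmlpage', ['scrapely/_htmlpage' + ext]),
        Extension('scrapely.extraction._similarity',
                  ['scrapely/extraction/_similarity' + ext]),
    ]
    if CYTHONIZE:
        from Cython.Build import cythonize
        extensions = cythonize(extensions)

    setup(name='scrapely', ext_modules=extensions)
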