Skip to content
Permalink
Browse files

RegexLinkExtractor: encode URL unicode value when creating Links

  • Loading branch information
redapple authored and dangra committed Feb 5, 2014
1 parent 8da65de commit d0ee545a8f4a770fc34dc995be65e3b0f88d9175
Showing 1 changed file with 4 additions and 2 deletions.
  1. +4 −2 scrapy/contrib/linkextractors/regex.py
@@ -7,7 +7,7 @@
from .sgml import SgmlLinkExtractor

linkre = re.compile(
"<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>",
"<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>",
re.DOTALL | re.IGNORECASE)

def clean_link(link_text):
@@ -25,6 +25,8 @@ def _extract_links(self, response_text, response_url, response_encoding, base_ur
clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

links_text = linkre.findall(response_text)
urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])
urlstext = set([(clean_url(url).encode(response_encoding), clean_text(text))
for url, _, text in links_text])


return [Link(url, text) for url, text in urlstext]

0 comments on commit d0ee545

Please sign in to comment.
You can’t perform that action at this time.