Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
RegexLinkExtractor: encode URL unicode value when creating Links
- Loading branch information
Showing 1 changed file with 4 additions and 2 deletions.
-
+4
−2
scrapy/contrib/linkextractors/regex.py
|
@@ -7,7 +7,7 @@ |
|
|
from .sgml import SgmlLinkExtractor |
|
|
|
|
|
linkre = re.compile( |
|
|
"<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>", |
|
|
"<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>", |
|
|
re.DOTALL | re.IGNORECASE) |
|
|
|
|
|
def clean_link(link_text): |
|
@@ -25,6 +25,8 @@ def _extract_links(self, response_text, response_url, response_encoding, base_ur |
|
|
clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip() |
|
|
|
|
|
links_text = linkre.findall(response_text) |
|
|
urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text]) |
|
|
urlstext = set([(clean_url(url).encode(response_encoding), clean_text(text)) |
|
|
for url, _, text in links_text]) |
|
|
|
|
|
|
|
|
return [Link(url, text) for url, text in urlstext] |