Skip to content

Commit

Permalink
Merge pull request #6298 from lorenzoverardo/bug-6292/robots.txt-non-UTF-8
Browse files Browse the repository at this point in the history

Handle robots.txt files not UTF-8 encoded
  • Loading branch information
wRAR committed Apr 5, 2024
2 parents f7bf3f7 + 7b37dcd commit 5d31e89
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 1 deletion.
2 changes: 1 addition & 1 deletion scrapy/robotstxt.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def decode_robotstxt(robotstxt_body, spider, to_native_str_type=False):
if to_native_str_type:
robotstxt_body = to_unicode(robotstxt_body)
else:
robotstxt_body = robotstxt_body.decode("utf-8")
robotstxt_body = robotstxt_body.decode("utf-8", errors="ignore")
except UnicodeDecodeError:
# If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
# Switch to 'allow all' state.
Expand Down
21 changes: 21 additions & 0 deletions tests/test_robotstxt_interface.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from twisted.trial import unittest

from scrapy.robotstxt import decode_robotstxt


def reppy_available():
# check if reppy parser is installed
Expand Down Expand Up @@ -141,6 +143,25 @@ def test_unicode_url_and_useragent(self):
)


class DecodeRobotsTxtTest(unittest.TestCase):
    """Unit tests for scrapy.robotstxt.decode_robotstxt covering the three
    decoding paths: native-string conversion, plain UTF-8 bytes, and bytes
    containing sequences that are invalid UTF-8 (which must be dropped)."""

    # The text every test expects after decoding.
    expected = "User-agent: *\nDisallow: /\n"

    def test_native_string_conversion(self):
        # to_native_str_type=True routes through to_unicode() instead of
        # a plain .decode() call; the result must be the same text.
        raw = self.expected.encode("utf-8")
        self.assertEqual(
            decode_robotstxt(raw, spider=None, to_native_str_type=True),
            self.expected,
        )

    def test_decode_utf8(self):
        # Default path: valid UTF-8 bytes decode to the identical string.
        raw = self.expected.encode("utf-8")
        self.assertEqual(decode_robotstxt(raw, spider=None), self.expected)

    def test_decode_non_utf8(self):
        # A stray 0xFF byte is not valid UTF-8; decoding must skip it
        # rather than raise, yielding the surrounding valid text.
        raw = b"User-agent: *\n\xFFDisallow: /\n"
        self.assertEqual(decode_robotstxt(raw, spider=None), self.expected)


class PythonRobotParserTest(BaseRobotParserTest, unittest.TestCase):
def setUp(self):
from scrapy.robotstxt import PythonRobotParser
Expand Down

0 comments on commit 5d31e89

Please sign in to comment.